author    Christian Heim <phreak@gentoo.org>  2006-07-15 14:47:37 +0000
committer Christian Heim <phreak@gentoo.org>  2006-07-15 14:47:37 +0000
commit    e4c83cd472e7986c2fce3dbd0c12b9edce2299ce (patch)
tree      8740eab35358cab40fb55f26fb412c40a78c7ced /openvz-sources
parent    Adding the missing patch to 026.015-r1 (diff)
download  misc-e4c83cd472e7986c2fce3dbd0c12b9edce2299ce.tar.gz
          misc-e4c83cd472e7986c2fce3dbd0c12b9edce2299ce.tar.bz2
          misc-e4c83cd472e7986c2fce3dbd0c12b9edce2299ce.zip
Fixing #140444 / CVE-2006-3626
svn path=/; revision=404
Diffstat (limited to 'openvz-sources')
-rw-r--r--  openvz-sources/026.015-r2/0001_linux-2.6.0-nonintconfig.patch        |    99
-rw-r--r--  openvz-sources/026.015-r2/0100_patch-026test015-core.patch           | 91083
-rw-r--r--  openvz-sources/026.015-r2/0101_diff-merge-2.6.16.24-20060714.patch   |    19
-rw-r--r--  openvz-sources/026.015-r2/0102_procfs-dumpable-race.patch            |    20
4 files changed, 91221 insertions, 0 deletions
diff --git a/openvz-sources/026.015-r2/0001_linux-2.6.0-nonintconfig.patch b/openvz-sources/026.015-r2/0001_linux-2.6.0-nonintconfig.patch
new file mode 100644
index 0000000..a7fe97d
--- /dev/null
+++ b/openvz-sources/026.015-r2/0001_linux-2.6.0-nonintconfig.patch
@@ -0,0 +1,99 @@
+--- ./scripts/kconfig/Makefile.nonint 2006-01-03 06:21:10.000000000 +0300
++++ ./scripts/kconfig/Makefile 2006-01-16 16:59:19.000000000 +0300
+@@ -42,6 +42,10 @@ update-po-config: $(obj)/kxgettext
+ $(Q)rm -f arch/um/Kconfig_arch
+ $(Q)rm -f scripts/kconfig/linux_*.pot scripts/kconfig/config.pot
+
++nonint_oldconfig: scripts/kconfig/conf
++ ./scripts/kconfig/conf -b arch/$(ARCH)/Kconfig
++
++
+ .PHONY: randconfig allyesconfig allnoconfig allmodconfig defconfig
+
+ randconfig: $(obj)/conf
+--- ./scripts/kconfig/conf.c.nonint 2006-01-03 06:21:10.000000000 +0300
++++ ./scripts/kconfig/conf.c 2006-01-16 16:10:30.000000000 +0300
+@@ -20,6 +20,7 @@ enum {
+ ask_all,
+ ask_new,
+ ask_silent,
++ dont_ask,
+ set_default,
+ set_yes,
+ set_mod,
+@@ -36,6 +37,8 @@ static struct menu *rootEntry;
+
+ static char nohelp_text[] = N_("Sorry, no help available for this option yet.\n");
+
++static int return_value = 0;
++
+ static void strip(char *str)
+ {
+ char *p = str;
+@@ -102,6 +105,12 @@ static void conf_askvalue(struct symbol
+ fflush(stdout);
+ fgets(line, 128, stdin);
+ return;
++ case dont_ask:
++ if (!sym_has_value(sym)) {
++ fprintf(stderr,"CONFIG_%s\n",sym->name);
++ return_value++;
++ }
++ return;
+ case set_default:
+ printf("%s\n", def);
+ return;
+@@ -346,6 +355,10 @@ static int conf_choice(struct menu *menu
+ printf("?");
+ printf("]: ");
+ switch (input_mode) {
++ case dont_ask:
++ cnt = def;
++ printf("%d\n", cnt);
++ break;
+ case ask_new:
+ case ask_silent:
+ if (!is_new) {
+@@ -482,7 +495,10 @@ static void check_conf(struct menu *menu
+ if (!conf_cnt++)
+ printf(_("*\n* Restart config...\n*\n"));
+ rootEntry = menu_get_parent_menu(menu);
+- conf(rootEntry);
++ if (input_mode == dont_ask)
++ fprintf(stderr,"CONFIG_%s\n",sym->name);
++ else
++ conf(rootEntry);
+ }
+ }
+
+@@ -501,6 +517,9 @@ int main(int ac, char **av)
+ case 'o':
+ input_mode = ask_new;
+ break;
++ case 'b':
++ input_mode = dont_ask;
++ break;
+ case 's':
+ input_mode = ask_silent;
+ valid_stdin = isatty(0) && isatty(1) && isatty(2);
+@@ -565,6 +584,7 @@ int main(int ac, char **av)
+ }
+ case ask_all:
+ case ask_new:
++ case dont_ask:
+ conf_read(NULL);
+ break;
+ case set_no:
+@@ -603,10 +623,10 @@ int main(int ac, char **av)
+ do {
+ conf_cnt = 0;
+ check_conf(&rootmenu);
+- } while (conf_cnt);
++ } while ((conf_cnt) && (input_mode != dont_ask));
+ if (conf_write(NULL)) {
+ fprintf(stderr, _("\n*** Error during writing of the kernel configuration.\n\n"));
+ return 1;
+ }
+- return 0;
++ return return_value;
+ }
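The nonint_oldconfig target added by the patch above behaves like oldconfig but never prompts: any symbol missing from the existing .config is printed to stderr as CONFIG_<name>, and the number of missing symbols becomes conf's return value, so the build fails instead of hanging on an interactive question. A minimal usage sketch follows (not part of this commit; the copied config path and the missing.txt file name are illustrative only):

    # start from an existing kernel config in the patched source tree
    cp /boot/config-2.6.16 .config
    # non-interactive oldconfig: exits non-zero if any option is unset
    make nonint_oldconfig 2> missing.txt || cat missing.txt

This is the behaviour a scripted (e.g. ebuild-driven) kernel build relies on to detect an incomplete configuration rather than blocking on a prompt.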
diff --git a/openvz-sources/026.015-r2/0100_patch-026test015-core.patch b/openvz-sources/026.015-r2/0100_patch-026test015-core.patch
new file mode 100644
index 0000000..94452f7
--- /dev/null
+++ b/openvz-sources/026.015-r2/0100_patch-026test015-core.patch
@@ -0,0 +1,91083 @@
+diff -upr linux-2.6.16.orig/COPYING.SWsoft linux-2.6.16-026test015/COPYING.SWsoft
+--- linux-2.6.16.orig/COPYING.SWsoft 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/COPYING.SWsoft 2006-07-04 14:41:37.000000000 +0400
+@@ -0,0 +1,350 @@
++
++Nothing in this license should be construed as a grant by SWsoft of any rights
++beyond the rights specified in the GNU General Public License, and nothing in
++this license should be construed as a waiver by SWsoft of its patent, copyright
++and/or trademark rights, beyond the waiver required by the GNU General Public
++License. This license is expressly inapplicable to any product that is not
++within the scope of the GNU General Public License
++
++----------------------------------------
++
++ GNU GENERAL PUBLIC LICENSE
++ Version 2, June 1991
++
++ Copyright (C) 1989, 1991 Free Software Foundation, Inc.
++ 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
++ Everyone is permitted to copy and distribute verbatim copies
++ of this license document, but changing it is not allowed.
++
++ Preamble
++
++ The licenses for most software are designed to take away your
++freedom to share and change it. By contrast, the GNU General Public
++License is intended to guarantee your freedom to share and change free
++software--to make sure the software is free for all its users. This
++General Public License applies to most of the Free Software
++Foundation's software and to any other program whose authors commit to
++using it. (Some other Free Software Foundation software is covered by
++the GNU Library General Public License instead.) You can apply it to
++your programs, too.
++
++ When we speak of free software, we are referring to freedom, not
++price. Our General Public Licenses are designed to make sure that you
++have the freedom to distribute copies of free software (and charge for
++this service if you wish), that you receive source code or can get it
++if you want it, that you can change the software or use pieces of it
++in new free programs; and that you know you can do these things.
++
++ To protect your rights, we need to make restrictions that forbid
++anyone to deny you these rights or to ask you to surrender the rights.
++These restrictions translate to certain responsibilities for you if you
++distribute copies of the software, or if you modify it.
++
++ For example, if you distribute copies of such a program, whether
++gratis or for a fee, you must give the recipients all the rights that
++you have. You must make sure that they, too, receive or can get the
++source code. And you must show them these terms so they know their
++rights.
++
++ We protect your rights with two steps: (1) copyright the software, and
++(2) offer you this license which gives you legal permission to copy,
++distribute and/or modify the software.
++
++ Also, for each author's protection and ours, we want to make certain
++that everyone understands that there is no warranty for this free
++software. If the software is modified by someone else and passed on, we
++want its recipients to know that what they have is not the original, so
++that any problems introduced by others will not reflect on the original
++authors' reputations.
++
++ Finally, any free program is threatened constantly by software
++patents. We wish to avoid the danger that redistributors of a free
++program will individually obtain patent licenses, in effect making the
++program proprietary. To prevent this, we have made it clear that any
++patent must be licensed for everyone's free use or not licensed at all.
++
++ The precise terms and conditions for copying, distribution and
++modification follow.
++
++ GNU GENERAL PUBLIC LICENSE
++ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
++
++ 0. This License applies to any program or other work which contains
++a notice placed by the copyright holder saying it may be distributed
++under the terms of this General Public License. The "Program", below,
++refers to any such program or work, and a "work based on the Program"
++means either the Program or any derivative work under copyright law:
++that is to say, a work containing the Program or a portion of it,
++either verbatim or with modifications and/or translated into another
++language. (Hereinafter, translation is included without limitation in
++the term "modification".) Each licensee is addressed as "you".
++
++Activities other than copying, distribution and modification are not
++covered by this License; they are outside its scope. The act of
++running the Program is not restricted, and the output from the Program
++is covered only if its contents constitute a work based on the
++Program (independent of having been made by running the Program).
++Whether that is true depends on what the Program does.
++
++ 1. You may copy and distribute verbatim copies of the Program's
++source code as you receive it, in any medium, provided that you
++conspicuously and appropriately publish on each copy an appropriate
++copyright notice and disclaimer of warranty; keep intact all the
++notices that refer to this License and to the absence of any warranty;
++and give any other recipients of the Program a copy of this License
++along with the Program.
++
++You may charge a fee for the physical act of transferring a copy, and
++you may at your option offer warranty protection in exchange for a fee.
++
++ 2. You may modify your copy or copies of the Program or any portion
++of it, thus forming a work based on the Program, and copy and
++distribute such modifications or work under the terms of Section 1
++above, provided that you also meet all of these conditions:
++
++ a) You must cause the modified files to carry prominent notices
++ stating that you changed the files and the date of any change.
++
++ b) You must cause any work that you distribute or publish, that in
++ whole or in part contains or is derived from the Program or any
++ part thereof, to be licensed as a whole at no charge to all third
++ parties under the terms of this License.
++
++ c) If the modified program normally reads commands interactively
++ when run, you must cause it, when started running for such
++ interactive use in the most ordinary way, to print or display an
++ announcement including an appropriate copyright notice and a
++ notice that there is no warranty (or else, saying that you provide
++ a warranty) and that users may redistribute the program under
++ these conditions, and telling the user how to view a copy of this
++ License. (Exception: if the Program itself is interactive but
++ does not normally print such an announcement, your work based on
++ the Program is not required to print an announcement.)
++
++These requirements apply to the modified work as a whole. If
++identifiable sections of that work are not derived from the Program,
++and can be reasonably considered independent and separate works in
++themselves, then this License, and its terms, do not apply to those
++sections when you distribute them as separate works. But when you
++distribute the same sections as part of a whole which is a work based
++on the Program, the distribution of the whole must be on the terms of
++this License, whose permissions for other licensees extend to the
++entire whole, and thus to each and every part regardless of who wrote it.
++
++Thus, it is not the intent of this section to claim rights or contest
++your rights to work written entirely by you; rather, the intent is to
++exercise the right to control the distribution of derivative or
++collective works based on the Program.
++
++In addition, mere aggregation of another work not based on the Program
++with the Program (or with a work based on the Program) on a volume of
++a storage or distribution medium does not bring the other work under
++the scope of this License.
++
++ 3. You may copy and distribute the Program (or a work based on it,
++under Section 2) in object code or executable form under the terms of
++Sections 1 and 2 above provided that you also do one of the following:
++
++ a) Accompany it with the complete corresponding machine-readable
++ source code, which must be distributed under the terms of Sections
++ 1 and 2 above on a medium customarily used for software interchange; or,
++
++ b) Accompany it with a written offer, valid for at least three
++ years, to give any third party, for a charge no more than your
++ cost of physically performing source distribution, a complete
++ machine-readable copy of the corresponding source code, to be
++ distributed under the terms of Sections 1 and 2 above on a medium
++ customarily used for software interchange; or,
++
++ c) Accompany it with the information you received as to the offer
++ to distribute corresponding source code. (This alternative is
++ allowed only for noncommercial distribution and only if you
++ received the program in object code or executable form with such
++ an offer, in accord with Subsection b above.)
++
++The source code for a work means the preferred form of the work for
++making modifications to it. For an executable work, complete source
++code means all the source code for all modules it contains, plus any
++associated interface definition files, plus the scripts used to
++control compilation and installation of the executable. However, as a
++special exception, the source code distributed need not include
++anything that is normally distributed (in either source or binary
++form) with the major components (compiler, kernel, and so on) of the
++operating system on which the executable runs, unless that component
++itself accompanies the executable.
++
++If distribution of executable or object code is made by offering
++access to copy from a designated place, then offering equivalent
++access to copy the source code from the same place counts as
++distribution of the source code, even though third parties are not
++compelled to copy the source along with the object code.
++
++ 4. You may not copy, modify, sublicense, or distribute the Program
++except as expressly provided under this License. Any attempt
++otherwise to copy, modify, sublicense or distribute the Program is
++void, and will automatically terminate your rights under this License.
++However, parties who have received copies, or rights, from you under
++this License will not have their licenses terminated so long as such
++parties remain in full compliance.
++
++ 5. You are not required to accept this License, since you have not
++signed it. However, nothing else grants you permission to modify or
++distribute the Program or its derivative works. These actions are
++prohibited by law if you do not accept this License. Therefore, by
++modifying or distributing the Program (or any work based on the
++Program), you indicate your acceptance of this License to do so, and
++all its terms and conditions for copying, distributing or modifying
++the Program or works based on it.
++
++ 6. Each time you redistribute the Program (or any work based on the
++Program), the recipient automatically receives a license from the
++original licensor to copy, distribute or modify the Program subject to
++these terms and conditions. You may not impose any further
++restrictions on the recipients' exercise of the rights granted herein.
++You are not responsible for enforcing compliance by third parties to
++this License.
++
++ 7. If, as a consequence of a court judgment or allegation of patent
++infringement or for any other reason (not limited to patent issues),
++conditions are imposed on you (whether by court order, agreement or
++otherwise) that contradict the conditions of this License, they do not
++excuse you from the conditions of this License. If you cannot
++distribute so as to satisfy simultaneously your obligations under this
++License and any other pertinent obligations, then as a consequence you
++may not distribute the Program at all. For example, if a patent
++license would not permit royalty-free redistribution of the Program by
++all those who receive copies directly or indirectly through you, then
++the only way you could satisfy both it and this License would be to
++refrain entirely from distribution of the Program.
++
++If any portion of this section is held invalid or unenforceable under
++any particular circumstance, the balance of the section is intended to
++apply and the section as a whole is intended to apply in other
++circumstances.
++
++It is not the purpose of this section to induce you to infringe any
++patents or other property right claims or to contest validity of any
++such claims; this section has the sole purpose of protecting the
++integrity of the free software distribution system, which is
++implemented by public license practices. Many people have made
++generous contributions to the wide range of software distributed
++through that system in reliance on consistent application of that
++system; it is up to the author/donor to decide if he or she is willing
++to distribute software through any other system and a licensee cannot
++impose that choice.
++
++This section is intended to make thoroughly clear what is believed to
++be a consequence of the rest of this License.
++
++ 8. If the distribution and/or use of the Program is restricted in
++certain countries either by patents or by copyrighted interfaces, the
++original copyright holder who places the Program under this License
++may add an explicit geographical distribution limitation excluding
++those countries, so that distribution is permitted only in or among
++countries not thus excluded. In such case, this License incorporates
++the limitation as if written in the body of this License.
++
++ 9. The Free Software Foundation may publish revised and/or new versions
++of the General Public License from time to time. Such new versions will
++be similar in spirit to the present version, but may differ in detail to
++address new problems or concerns.
++
++Each version is given a distinguishing version number. If the Program
++specifies a version number of this License which applies to it and "any
++later version", you have the option of following the terms and conditions
++either of that version or of any later version published by the Free
++Software Foundation. If the Program does not specify a version number of
++this License, you may choose any version ever published by the Free Software
++Foundation.
++
++ 10. If you wish to incorporate parts of the Program into other free
++programs whose distribution conditions are different, write to the author
++to ask for permission. For software which is copyrighted by the Free
++Software Foundation, write to the Free Software Foundation; we sometimes
++make exceptions for this. Our decision will be guided by the two goals
++of preserving the free status of all derivatives of our free software and
++of promoting the sharing and reuse of software generally.
++
++ NO WARRANTY
++
++ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
++FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
++OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
++PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
++OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
++TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
++PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
++REPAIR OR CORRECTION.
++
++ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
++WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
++REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
++INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
++OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
++TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
++YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
++PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
++POSSIBILITY OF SUCH DAMAGES.
++
++ END OF TERMS AND CONDITIONS
++
++ How to Apply These Terms to Your New Programs
++
++ If you develop a new program, and you want it to be of the greatest
++possible use to the public, the best way to achieve this is to make it
++free software which everyone can redistribute and change under these terms.
++
++ To do so, attach the following notices to the program. It is safest
++to attach them to the start of each source file to most effectively
++convey the exclusion of warranty; and each file should have at least
++the "copyright" line and a pointer to where the full notice is found.
++
++ <one line to give the program's name and a brief idea of what it does.>
++ Copyright (C) <year> <name of author>
++
++ This program is free software; you can redistribute it and/or modify
++ it under the terms of the GNU General Public License as published by
++ the Free Software Foundation; either version 2 of the License, or
++ (at your option) any later version.
++
++ This program is distributed in the hope that it will be useful,
++ but WITHOUT ANY WARRANTY; without even the implied warranty of
++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ GNU General Public License for more details.
++
++ You should have received a copy of the GNU General Public License
++ along with this program; if not, write to the Free Software
++ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
++
++
++Also add information on how to contact you by electronic and paper mail.
++
++If the program is interactive, make it output a short notice like this
++when it starts in an interactive mode:
++
++ Gnomovision version 69, Copyright (C) year name of author
++ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
++ This is free software, and you are welcome to redistribute it
++ under certain conditions; type `show c' for details.
++
++The hypothetical commands `show w' and `show c' should show the appropriate
++parts of the General Public License. Of course, the commands you use may
++be called something other than `show w' and `show c'; they could even be
++mouse-clicks or menu items--whatever suits your program.
++
++You should also get your employer (if you work as a programmer) or your
++school, if any, to sign a "copyright disclaimer" for the program, if
++necessary. Here is a sample; alter the names:
++
++ Yoyodyne, Inc., hereby disclaims all copyright interest in the program
++ `Gnomovision' (which makes passes at compilers) written by James Hacker.
++
++ <signature of Ty Coon>, 1 April 1989
++ Ty Coon, President of Vice
++
++This General Public License does not permit incorporating your program into
++proprietary programs. If your program is a subroutine library, you may
++consider it more useful to permit linking proprietary applications with the
++library. If this is what you want to do, use the GNU Library General
++Public License instead of this License.
+diff -upr linux-2.6.16.orig/Documentation/dvb/get_dvb_firmware linux-2.6.16-026test015/Documentation/dvb/get_dvb_firmware
+--- linux-2.6.16.orig/Documentation/dvb/get_dvb_firmware 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/Documentation/dvb/get_dvb_firmware 2006-07-04 14:41:36.000000000 +0400
+@@ -240,9 +240,9 @@ sub dibusb {
+ }
+
+ sub nxt2002 {
+- my $sourcefile = "Broadband4PC_4_2_11.zip";
++ my $sourcefile = "Technisat_DVB-PC_4_4_COMPACT.zip";
+ my $url = "http://www.bbti.us/download/windows/$sourcefile";
+- my $hash = "c6d2ea47a8f456d887ada0cfb718ff2a";
++ my $hash = "476befae8c7c1bb9648954060b1eec1f";
+ my $outfile = "dvb-fe-nxt2002.fw";
+ my $tmpdir = tempdir(DIR => "/tmp", CLEANUP => 1);
+
+@@ -250,8 +250,8 @@ sub nxt2002 {
+
+ wgetfile($sourcefile, $url);
+ unzip($sourcefile, $tmpdir);
+- verify("$tmpdir/SkyNETU.sys", $hash);
+- extract("$tmpdir/SkyNETU.sys", 375832, 5908, $outfile);
++ verify("$tmpdir/SkyNET.sys", $hash);
++ extract("$tmpdir/SkyNET.sys", 331624, 5908, $outfile);
+
+ $outfile;
+ }
+diff -upr linux-2.6.16.orig/Documentation/vsched.txt linux-2.6.16-026test015/Documentation/vsched.txt
+--- linux-2.6.16.orig/Documentation/vsched.txt 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/Documentation/vsched.txt 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,83 @@
++Copyright (C) 2005 SWsoft. All rights reserved.
++Licensing governed by "linux/COPYING.SWsoft" file.
++
++Hierarchical CPU schedulers
++~~~~~~~~~~~~~~~~~~~~~~~~~~~
++
++Hierarchical CPU scheduler is a stack of CPU schedulers which allows
++to organize different policies of scheduling in the system and/or between
++groups of processes.
++
++Virtuozzo uses a hierarchical Fair CPU scheduler organized as a 2-stage
++CPU scheduler, where the scheduling decisions are made in 2 steps:
++1. On the first step Fair CPU scheduler selects a group of processes
++ which should get some CPU time.
++2. Then standard Linux scheduler chooses a process inside the group.
++Such scheduler efficiently allows to isolate one group of processes
++from another and still allows a group to use more than 1 CPU on SMP systems.
++
++This document describes a new middle layer of Virtuozzo hierarchical CPU
++scheduler which makes decisions after Fair scheduler, but before Linux
++scheduler and which is called VCPU scheduler.
++
++
++Where VCPU scheduler comes from?
++~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
++
++Existing hierarchical CPU scheduler uses isolated algorithms on each stage
++of decision making, i.e. every scheduler makes its decisions without
++taking into account the details of other schedulers. This can lead to a number
++of problems described below.
++
++On SMP systems there are possible situations when the first CPU scheduler
++in the hierarchy (e.g. Fair scheduler) wants to schedule some group of
++processes on the physical CPU, but the underlying process scheduler
++(e.g. Linux O(1) CPU scheduler) is unable to schedule any processes
++on this physical CPU. Usually this happens due to the fact that Linux
++kernel scheduler uses per-physical CPU runqueues.
++
++Another problem is that Linux scheduler also knows nothing about
++Fair scheduler and can't balance efficiently without taking into account
++statistics about process groups from Fair scheduler. Without such
++statistics Linux scheduler can concentrate all processes on one physical
++CPU, thus making CPU consuming highly inefficient.
++
++VCPU scheduler solves these problems by adding a new layer between
++Fair schedule and Linux scheduler.
++
++VCPU scheduler
++~~~~~~~~~~~~~~
++
++VCPU scheduler is a CPU scheduler which splits notion of
++physical and virtual CPUs (VCPU and PCPU). This means that tasks are
++running on virtual CPU runqueues, while VCPUs are running on PCPUs.
++
++The Virtuozzo hierarchical fair scheduler becomes 3 stage CPU scheduler:
++1. First, Fair CPU scheduler select a group of processes.
++2. Then VCPU scheduler select a virtual CPU to run (this is actually
++ a runqueue).
++3. Standard Linux scheduler chooses a process from the runqueue.
++
++For example on the picture below PCPU0 executes tasks from
++VCPU1 runqueue and PCPU1 is idle:
++
++ virtual | physical | virtual
++ idle CPUs | CPUs | CPUS
++--------------------|------------------------|--------------------------
++ | | -----------------
++ | | | virtual sched X |
++ | | | ----------- |
++ | | | | VCPU0 | |
++ | | | ----------- |
++ ------------ | ----------- | ----------- |
++| idle VCPU0 | | | PCPU0 | <---> | | VCPU1 | |
++ ------------ | ----------- | ----------- |
++ | | -----------------
++ | |
++ | | -----------------
++ | | | virtual sched Y |
++ ------------ ----------- | | ----------- |
++| idle VCPU1 | <---> | PCPU1 | | | | VCPU0 | |
++ ------------ ----------- | | ----------- |
++ | | -----------------
++ | |
+diff -upr linux-2.6.16.orig/Makefile linux-2.6.16-026test015/Makefile
+--- linux-2.6.16.orig/Makefile 2006-07-04 14:41:39.000000000 +0400
++++ linux-2.6.16-026test015/Makefile 2006-07-04 14:41:39.000000000 +0400
+@@ -1,7 +1,7 @@
+ VERSION = 2
+ PATCHLEVEL = 6
+ SUBLEVEL = 16
+-EXTRAVERSION =
++EXTRAVERSION = -026test015
+ NAME=Sliding Snow Leopard
+
+ # *DOCUMENTATION*
+diff -upr linux-2.6.16.orig/arch/alpha/kernel/setup.c linux-2.6.16-026test015/arch/alpha/kernel/setup.c
+--- linux-2.6.16.orig/arch/alpha/kernel/setup.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/alpha/kernel/setup.c 2006-07-04 14:41:36.000000000 +0400
+@@ -24,6 +24,7 @@
+ #include <linux/config.h> /* CONFIG_ALPHA_LCA etc */
+ #include <linux/mc146818rtc.h>
+ #include <linux/console.h>
++#include <linux/cpu.h>
+ #include <linux/errno.h>
+ #include <linux/init.h>
+ #include <linux/string.h>
+@@ -477,6 +478,22 @@ page_is_ram(unsigned long pfn)
+ #undef PFN_PHYS
+ #undef PFN_MAX
+
++static int __init
++register_cpus(void)
++{
++ int i;
++
++ for_each_possible_cpu(i) {
++ struct cpu *p = kzalloc(sizeof(*p), GFP_KERNEL);
++ if (!p)
++ return -ENOMEM;
++ register_cpu(p, i, NULL);
++ }
++ return 0;
++}
++
++arch_initcall(register_cpus);
++
+ void __init
+ setup_arch(char **cmdline_p)
+ {
+diff -upr linux-2.6.16.orig/arch/alpha/kernel/smp.c linux-2.6.16-026test015/arch/alpha/kernel/smp.c
+--- linux-2.6.16.orig/arch/alpha/kernel/smp.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/alpha/kernel/smp.c 2006-07-04 14:41:36.000000000 +0400
+@@ -439,7 +439,7 @@ setup_smp(void)
+ if ((cpu->flags & 0x1cc) == 0x1cc) {
+ smp_num_probed++;
+ /* Assume here that "whami" == index */
+- cpu_set(i, cpu_possible_map);
++ cpu_set(i, cpu_present_mask);
+ cpu->pal_revision = boot_cpu_palrev;
+ }
+
+@@ -450,9 +450,8 @@ setup_smp(void)
+ }
+ } else {
+ smp_num_probed = 1;
+- cpu_set(boot_cpuid, cpu_possible_map);
++ cpu_set(boot_cpuid, cpu_present_mask);
+ }
+- cpu_present_mask = cpumask_of_cpu(boot_cpuid);
+
+ printk(KERN_INFO "SMP: %d CPUs probed -- cpu_present_mask = %lx\n",
+ smp_num_probed, cpu_possible_map.bits[0]);
+@@ -488,9 +487,8 @@ void __devinit
+ smp_prepare_boot_cpu(void)
+ {
+ /*
+- * Mark the boot cpu (current cpu) as both present and online
++ * Mark the boot cpu (current cpu) as online
+ */
+- cpu_set(smp_processor_id(), cpu_present_mask);
+ cpu_set(smp_processor_id(), cpu_online_map);
+ }
+
+diff -upr linux-2.6.16.orig/arch/alpha/lib/strncpy.S linux-2.6.16-026test015/arch/alpha/lib/strncpy.S
+--- linux-2.6.16.orig/arch/alpha/lib/strncpy.S 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/alpha/lib/strncpy.S 2006-07-04 14:41:36.000000000 +0400
+@@ -43,8 +43,8 @@ strncpy:
+
+ .align 4
+ $multiword:
+- subq $24, 1, $2 # clear the final bits in the prev word
+- or $2, $24, $2
++ subq $27, 1, $2 # clear the final bits in the prev word
++ or $2, $27, $2
+ zapnot $1, $2, $1
+ subq $18, 1, $18
+
+@@ -70,8 +70,8 @@ $multiword:
+ bne $18, 0b
+
+ 1: ldq_u $1, 0($16) # clear the leading bits in the final word
+- subq $27, 1, $2
+- or $2, $27, $2
++ subq $24, 1, $2
++ or $2, $24, $2
+
+ zap $1, $2, $1
+ stq_u $1, 0($16)
+diff -upr linux-2.6.16.orig/arch/arm/kernel/smp.c linux-2.6.16-026test015/arch/arm/kernel/smp.c
+--- linux-2.6.16.orig/arch/arm/kernel/smp.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/arm/kernel/smp.c 2006-07-04 14:41:38.000000000 +0400
+@@ -197,7 +197,7 @@ int __cpuexit __cpu_disable(void)
+ local_flush_tlb_all();
+
+ read_lock(&tasklist_lock);
+- for_each_process(p) {
++ for_each_process_all(p) {
+ if (p->mm)
+ cpu_clear(cpu, p->mm->cpu_vm_mask);
+ }
+diff -upr linux-2.6.16.orig/arch/frv/mm/mmu-context.c linux-2.6.16-026test015/arch/frv/mm/mmu-context.c
+--- linux-2.6.16.orig/arch/frv/mm/mmu-context.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/frv/mm/mmu-context.c 2006-07-04 14:41:38.000000000 +0400
+@@ -181,7 +181,7 @@ int cxn_pin_by_pid(pid_t pid)
+
+ /* get a handle on the mm_struct */
+ read_lock(&tasklist_lock);
+- tsk = find_task_by_pid(pid);
++ tsk = find_task_by_pid_ve(pid);
+ if (tsk) {
+ ret = -EINVAL;
+
+diff -upr linux-2.6.16.orig/arch/i386/Kconfig linux-2.6.16-026test015/arch/i386/Kconfig
+--- linux-2.6.16.orig/arch/i386/Kconfig 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/i386/Kconfig 2006-07-04 14:41:39.000000000 +0400
+@@ -216,6 +216,8 @@ config NR_CPUS
+ This is purely to save memory - each supported CPU adds
+ approximately eight kilobytes to the kernel image.
+
++source "kernel/Kconfig.fairsched"
++
+ config SCHED_SMT
+ bool "SMT (Hyperthreading) scheduler support"
+ depends on SMP
+@@ -268,6 +270,14 @@ config X86_VISWS_APIC
+ depends on X86_VISWS
+ default y
+
++config NMI_WATCHDOG
++ bool "NMI Watchdog"
++ default y
++ help
++ If you say Y here the kernel will activate NMI watchdog by default
++ on boot. You can still activate NMI watchdog via nmi_watchdog
++ command line option even if you say N here.
++
+ config X86_MCE
+ bool "Machine Check Exception"
+ depends on !X86_VOYAGER
+@@ -1071,12 +1081,16 @@ endmenu
+
+ source "arch/i386/Kconfig.debug"
+
++source "kernel/Kconfig.openvz"
++
+ source "security/Kconfig"
+
+ source "crypto/Kconfig"
+
+ source "lib/Kconfig"
+
++source "kernel/ub/Kconfig"
++
+ #
+ # Use the generic interrupt handling code in kernel/irq/:
+ #
+diff -upr linux-2.6.16.orig/arch/i386/kernel/apic.c linux-2.6.16-026test015/arch/i386/kernel/apic.c
+--- linux-2.6.16.orig/arch/i386/kernel/apic.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/i386/kernel/apic.c 2006-07-04 14:41:38.000000000 +0400
+@@ -1177,6 +1177,7 @@ inline void smp_local_timer_interrupt(st
+ fastcall void smp_apic_timer_interrupt(struct pt_regs *regs)
+ {
+ int cpu = smp_processor_id();
++ struct ve_struct *ve;
+
+ /*
+ * the NMI deadlock-detector uses this.
+@@ -1193,9 +1194,11 @@ fastcall void smp_apic_timer_interrupt(s
+ * Besides, if we don't timer interrupts ignore the global
+ * interrupt lock, which is the WrongThing (tm) to do.
+ */
++ ve = set_exec_env(get_ve0());
+ irq_enter();
+ smp_local_timer_interrupt(regs);
+ irq_exit();
++ (void)set_exec_env(ve);
+ }
+
+ #ifndef CONFIG_SMP
+diff -upr linux-2.6.16.orig/arch/i386/kernel/apm.c linux-2.6.16-026test015/arch/i386/kernel/apm.c
+--- linux-2.6.16.orig/arch/i386/kernel/apm.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/i386/kernel/apm.c 2006-07-04 14:41:36.000000000 +0400
+@@ -1081,7 +1081,7 @@ static int apm_console_blank(int blank)
+ break;
+ }
+
+- if (error == APM_NOT_ENGAGED && state != APM_STATE_READY) {
++ if (error == APM_NOT_ENGAGED) {
+ static int tried;
+ int eng_error;
+ if (tried++ == 0) {
+diff -upr linux-2.6.16.orig/arch/i386/kernel/cpu/amd.c linux-2.6.16-026test015/arch/i386/kernel/cpu/amd.c
+--- linux-2.6.16.orig/arch/i386/kernel/cpu/amd.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/i386/kernel/cpu/amd.c 2006-07-04 14:41:36.000000000 +0400
+@@ -207,6 +207,8 @@ static void __init init_amd(struct cpuin
+ set_bit(X86_FEATURE_K7, c->x86_capability);
+ break;
+ }
++ if (c->x86 >= 6)
++ set_bit(X86_FEATURE_FXSAVE_LEAK, c->x86_capability);
+
+ display_cacheinfo(c);
+
+diff -upr linux-2.6.16.orig/arch/i386/kernel/cpu/cpufreq/Kconfig linux-2.6.16-026test015/arch/i386/kernel/cpu/cpufreq/Kconfig
+--- linux-2.6.16.orig/arch/i386/kernel/cpu/cpufreq/Kconfig 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/i386/kernel/cpu/cpufreq/Kconfig 2006-07-04 14:41:36.000000000 +0400
+@@ -203,6 +203,7 @@ config X86_LONGRUN
+ config X86_LONGHAUL
+ tristate "VIA Cyrix III Longhaul"
+ select CPU_FREQ_TABLE
++ depends on BROKEN
+ help
+ This adds the CPUFreq driver for VIA Samuel/CyrixIII,
+ VIA Cyrix Samuel/C3, VIA Cyrix Ezra and VIA Cyrix Ezra-T
+diff -upr linux-2.6.16.orig/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c linux-2.6.16-026test015/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c
+--- linux-2.6.16.orig/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c 2006-07-04 14:41:36.000000000 +0400
+@@ -244,7 +244,7 @@ static int cpufreq_p4_cpu_init(struct cp
+ for (i=1; (p4clockmod_table[i].frequency != CPUFREQ_TABLE_END); i++) {
+ if ((i<2) && (has_N44_O17_errata[policy->cpu]))
+ p4clockmod_table[i].frequency = CPUFREQ_ENTRY_INVALID;
+- else if (has_N60_errata[policy->cpu] && p4clockmod_table[i].frequency < 2000000)
++ else if (has_N60_errata[policy->cpu] && ((stock_freq * i)/8) < 2000000)
+ p4clockmod_table[i].frequency = CPUFREQ_ENTRY_INVALID;
+ else
+ p4clockmod_table[i].frequency = (stock_freq * i)/8;
+diff -upr linux-2.6.16.orig/arch/i386/kernel/cpu/cpufreq/speedstep-smi.c linux-2.6.16-026test015/arch/i386/kernel/cpu/cpufreq/speedstep-smi.c
+--- linux-2.6.16.orig/arch/i386/kernel/cpu/cpufreq/speedstep-smi.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/i386/kernel/cpu/cpufreq/speedstep-smi.c 2006-07-04 14:41:36.000000000 +0400
+@@ -75,7 +75,9 @@ static int speedstep_smi_ownership (void
+ __asm__ __volatile__(
+ "out %%al, (%%dx)\n"
+ : "=D" (result)
+- : "a" (command), "b" (function), "c" (0), "d" (smi_port), "D" (0), "S" (magic)
++ : "a" (command), "b" (function), "c" (0), "d" (smi_port),
++ "D" (0), "S" (magic)
++ : "memory"
+ );
+
+ dprintk("result is %x\n", result);
+diff -upr linux-2.6.16.orig/arch/i386/kernel/cpu/mtrr/if.c linux-2.6.16-026test015/arch/i386/kernel/cpu/mtrr/if.c
+--- linux-2.6.16.orig/arch/i386/kernel/cpu/mtrr/if.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/i386/kernel/cpu/mtrr/if.c 2006-07-04 14:41:38.000000000 +0400
+@@ -392,7 +392,7 @@ static int __init mtrr_if_init(void)
+ return -ENODEV;
+
+ proc_root_mtrr =
+- create_proc_entry("mtrr", S_IWUSR | S_IRUGO, &proc_root);
++ create_proc_entry("mtrr", S_IWUSR | S_IRUGO, NULL);
+ if (proc_root_mtrr) {
+ proc_root_mtrr->owner = THIS_MODULE;
+ proc_root_mtrr->proc_fops = &mtrr_fops;
+diff -upr linux-2.6.16.orig/arch/i386/kernel/dmi_scan.c linux-2.6.16-026test015/arch/i386/kernel/dmi_scan.c
+--- linux-2.6.16.orig/arch/i386/kernel/dmi_scan.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/i386/kernel/dmi_scan.c 2006-07-04 14:41:36.000000000 +0400
+@@ -106,7 +106,7 @@ static void __init dmi_save_devices(stru
+ struct dmi_device *dev;
+
+ for (i = 0; i < count; i++) {
+- char *d = ((char *) dm) + (i * 2);
++ char *d = (char *)(dm + 1) + (i * 2);
+
+ /* Skip disabled device */
+ if ((*d & 0x80) == 0)
+diff -upr linux-2.6.16.orig/arch/i386/kernel/irq.c linux-2.6.16-026test015/arch/i386/kernel/irq.c
+--- linux-2.6.16.orig/arch/i386/kernel/irq.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/i386/kernel/irq.c 2006-07-04 14:41:38.000000000 +0400
+@@ -59,7 +59,9 @@ fastcall unsigned int do_IRQ(struct pt_r
+ union irq_ctx *curctx, *irqctx;
+ u32 *isp;
+ #endif
++ struct ve_struct *ve;
+
++ ve = set_exec_env(get_ve0());
+ irq_enter();
+ #ifdef CONFIG_DEBUG_STACKOVERFLOW
+ /* Debugging check for stack overflow: is there less than 1KB free? */
+@@ -108,6 +110,7 @@ fastcall unsigned int do_IRQ(struct pt_r
+ __do_IRQ(irq, regs);
+
+ irq_exit();
++ (void)set_exec_env(ve);
+
+ return 1;
+ }
+diff -upr linux-2.6.16.orig/arch/i386/kernel/ldt.c linux-2.6.16-026test015/arch/i386/kernel/ldt.c
+--- linux-2.6.16.orig/arch/i386/kernel/ldt.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/i386/kernel/ldt.c 2006-07-04 14:41:39.000000000 +0400
+@@ -13,6 +13,7 @@
+ #include <linux/smp_lock.h>
+ #include <linux/vmalloc.h>
+ #include <linux/slab.h>
++#include <linux/module.h>
+
+ #include <asm/uaccess.h>
+ #include <asm/system.h>
+@@ -20,6 +21,8 @@
+ #include <asm/desc.h>
+ #include <asm/mmu_context.h>
+
++#include <ub/ub_mem.h>
++
+ #ifdef CONFIG_SMP /* avoids "defined but not used" warnig */
+ static void flush_ldt(void *null)
+ {
+@@ -39,9 +42,9 @@ static int alloc_ldt(mm_context_t *pc, i
+ oldsize = pc->size;
+ mincount = (mincount+511)&(~511);
+ if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
+- newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
++ newldt = ub_vmalloc(mincount*LDT_ENTRY_SIZE);
+ else
+- newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
++ newldt = ub_kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
+
+ if (!newldt)
+ return -ENOMEM;
+@@ -105,6 +108,7 @@ int init_new_context(struct task_struct
+ }
+ return retval;
+ }
++EXPORT_SYMBOL_GPL(init_new_context);
+
+ /*
+ * No need to lock the MM as we are the last user
+@@ -251,3 +255,5 @@ asmlinkage int sys_modify_ldt(int func,
+ }
+ return ret;
+ }
++
++EXPORT_SYMBOL_GPL(default_ldt);
+diff -upr linux-2.6.16.orig/arch/i386/kernel/nmi.c linux-2.6.16-026test015/arch/i386/kernel/nmi.c
+--- linux-2.6.16.orig/arch/i386/kernel/nmi.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/i386/kernel/nmi.c 2006-07-04 14:41:37.000000000 +0400
+@@ -32,7 +32,13 @@
+
+ #include "mach_traps.h"
+
+-unsigned int nmi_watchdog = NMI_NONE;
++#ifdef CONFIG_NMI_WATCHDOG
++#define NMI_DEFAULT NMI_IO_APIC
++#else
++#define NMI_DEFAULT NMI_NONE
++#endif
++
++unsigned int nmi_watchdog = NMI_DEFAULT;
+ extern int unknown_nmi_panic;
+ static unsigned int nmi_hz = HZ;
+ static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */
+@@ -521,7 +527,22 @@ void touch_nmi_watchdog (void)
+
+ extern void die_nmi(struct pt_regs *, const char *msg);
+
+-void nmi_watchdog_tick (struct pt_regs * regs)
++void smp_show_regs(struct pt_regs *regs, void *info)
++{
++ static DEFINE_SPINLOCK(show_regs_lock);
++
++ if (regs == NULL)
++ return;
++
++ bust_spinlocks(1);
++ spin_lock(&show_regs_lock);
++ printk("----------- IPI show regs -----------");
++ show_regs(regs);
++ spin_unlock(&show_regs_lock);
++ bust_spinlocks(0);
++}
++
++void nmi_watchdog_tick(struct pt_regs *regs)
+ {
+
+ /*
+diff -upr linux-2.6.16.orig/arch/i386/kernel/process.c linux-2.6.16-026test015/arch/i386/kernel/process.c
+--- linux-2.6.16.orig/arch/i386/kernel/process.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/i386/kernel/process.c 2006-07-04 14:41:39.000000000 +0400
+@@ -59,6 +59,7 @@
+ #include <asm/cpu.h>
+
+ asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
++EXPORT_SYMBOL_GPL(ret_from_fork);
+
+ static int hlt_counter;
+
+@@ -289,11 +290,15 @@ __setup("idle=", idle_setup);
+ void show_regs(struct pt_regs * regs)
+ {
+ unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
++ extern int die_counter;
+
+ printk("\n");
+- printk("Pid: %d, comm: %20s\n", current->pid, current->comm);
+- printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id());
+- print_symbol("EIP is at %s\n", regs->eip);
++ printk("Pid: %d, comm: %20s, oopses: %d\n",
++ current->pid, current->comm, die_counter);
++ printk("EIP: %04x:[<%08lx>] CPU: %d, VCPU: %d:%d\n",0xffff & regs->xcs,regs->eip, smp_processor_id(),
++ task_vsched_id(current), task_cpu(current));
++ if (decode_call_traces)
++ print_symbol("EIP is at %s\n", regs->eip);
+
+ if (user_mode(regs))
+ printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp);
+@@ -314,6 +319,8 @@ void show_regs(struct pt_regs * regs)
+ cr4 = read_cr4_safe();
+ printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4);
+ show_trace(NULL, &regs->esp);
++ if (!decode_call_traces)
++ printk(" EIP: [<%08lx>]\n",regs->eip);
+ }
+
+ /*
+@@ -339,6 +346,13 @@ int kernel_thread(int (*fn)(void *), voi
+ {
+ struct pt_regs regs;
+
++ /* Don't allow kernel_thread() inside VE */
++ if (!ve_is_super(get_exec_env())) {
++ printk("kernel_thread call inside VE\n");
++ dump_stack();
++ return -EPERM;
++ }
++
+ memset(&regs, 0, sizeof(regs));
+
+ regs.ebx = (unsigned long) fn;
+diff -upr linux-2.6.16.orig/arch/i386/kernel/ptrace.c linux-2.6.16-026test015/arch/i386/kernel/ptrace.c
+--- linux-2.6.16.orig/arch/i386/kernel/ptrace.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/i386/kernel/ptrace.c 2006-07-04 14:41:39.000000000 +0400
+@@ -706,7 +706,9 @@ int do_syscall_trace(struct pt_regs *reg
+ /* the 0x80 provides a way for the tracing parent to distinguish
+ between a syscall stop and SIGTRAP delivery */
+ /* Note that the debugger could change the result of test_thread_flag!*/
++ set_pn_state(current, entryexit ? PN_STOP_LEAVE : PN_STOP_ENTRY);
+ ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80:0));
++ clear_pn_state(current);
+
+ /*
+ * this isn't the same as continuing with a signal, but it will do
+diff -upr linux-2.6.16.orig/arch/i386/kernel/signal.c linux-2.6.16-026test015/arch/i386/kernel/signal.c
+--- linux-2.6.16.orig/arch/i386/kernel/signal.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/i386/kernel/signal.c 2006-07-04 14:41:36.000000000 +0400
+@@ -582,7 +582,7 @@ static void fastcall do_signal(struct pt
+ if (!user_mode(regs))
+ return;
+
+- if (try_to_freeze())
++ if (try_to_freeze() && !signal_pending(current))
+ goto no_signal;
+
+ if (test_thread_flag(TIF_RESTORE_SIGMASK))
+diff -upr linux-2.6.16.orig/arch/i386/kernel/smp.c linux-2.6.16-026test015/arch/i386/kernel/smp.c
+--- linux-2.6.16.orig/arch/i386/kernel/smp.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/i386/kernel/smp.c 2006-07-04 14:41:37.000000000 +0400
+@@ -21,6 +21,7 @@
+ #include <linux/cpu.h>
+ #include <linux/module.h>
+
++#include <asm/nmi.h>
+ #include <asm/mtrr.h>
+ #include <asm/tlbflush.h>
+ #include <mach_apic.h>
+@@ -566,6 +567,89 @@ int smp_call_function (void (*func) (voi
+ }
+ EXPORT_SYMBOL(smp_call_function);
+
++static spinlock_t nmi_call_lock = SPIN_LOCK_UNLOCKED;
++static struct nmi_call_data_struct {
++ smp_nmi_function func;
++ void *info;
++ atomic_t started;
++ atomic_t finished;
++ cpumask_t cpus_called;
++ int wait;
++} *nmi_call_data;
++
++static int smp_nmi_callback(struct pt_regs * regs, int cpu)
++{
++ smp_nmi_function func;
++ void *info;
++ int wait;
++
++ func = nmi_call_data->func;
++ info = nmi_call_data->info;
++ wait = nmi_call_data->wait;
++ ack_APIC_irq();
++ /* prevent from calling func() multiple times */
++ if (cpu_test_and_set(cpu, nmi_call_data->cpus_called))
++ return 0;
++ /*
++ * notify initiating CPU that I've grabbed the data and am
++ * about to execute the function
++ */
++ mb();
++ atomic_inc(&nmi_call_data->started);
++ /* at this point the nmi_call_data structure is out of scope */
++ irq_enter();
++ func(regs, info);
++ irq_exit();
++ if (wait)
++ atomic_inc(&nmi_call_data->finished);
++
++ return 0;
++}
++
++/*
++ * This function tries to call func(regs, info) on each cpu.
++ * Func must be fast and non-blocking.
++ * May be called with disabled interrupts and from any context.
++ */
++int smp_nmi_call_function(smp_nmi_function func, void *info, int wait)
++{
++ struct nmi_call_data_struct data;
++ int cpus;
++
++ cpus = num_online_cpus() - 1;
++ if (!cpus)
++ return 0;
++
++ data.func = func;
++ data.info = info;
++ data.wait = wait;
++ atomic_set(&data.started, 0);
++ atomic_set(&data.finished, 0);
++ cpus_clear(data.cpus_called);
++ /* prevent this cpu from calling func if NMI happens */
++ cpu_set(smp_processor_id(), data.cpus_called);
++
++ if (!spin_trylock(&nmi_call_lock))
++ return -1;
++
++ nmi_call_data = &data;
++ set_nmi_ipi_callback(smp_nmi_callback);
++ mb();
++
++ /* Send a message to all other CPUs and wait for them to respond */
++ send_IPI_allbutself(APIC_DM_NMI);
++ while (atomic_read(&data.started) != cpus)
++ barrier();
++
++ unset_nmi_ipi_callback();
++ if (wait)
++ while (atomic_read(&data.finished) != cpus)
++ barrier();
++ spin_unlock(&nmi_call_lock);
++
++ return 0;
++}
++
+ static void stop_this_cpu (void * dummy)
+ {
+ /*
+diff -upr linux-2.6.16.orig/arch/i386/kernel/smpboot.c linux-2.6.16-026test015/arch/i386/kernel/smpboot.c
+--- linux-2.6.16.orig/arch/i386/kernel/smpboot.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/i386/kernel/smpboot.c 2006-07-04 14:41:38.000000000 +0400
+@@ -317,6 +317,10 @@ static void __init synchronize_tsc_bp (v
+ }
+ if (!buggy)
+ printk("passed.\n");
++#ifdef CONFIG_VE
++ /* TSC reset. kill whatever might rely on old values */
++ VE_TASK_INFO(current)->wakeup_stamp = 0;
++#endif
+ }
+
+ static void __init synchronize_tsc_ap (void)
+@@ -342,6 +346,10 @@ static void __init synchronize_tsc_ap (v
+ atomic_inc(&tsc_count_stop);
+ while (atomic_read(&tsc_count_stop) != num_booting_cpus()) mb();
+ }
++#ifdef CONFIG_VE
++ /* TSC reset. kill whatever might rely on old values */
++ VE_TASK_INFO(current)->wakeup_stamp = 0;
++#endif
+ }
+ #undef NR_LOOPS
+
+@@ -908,6 +916,13 @@ static int __devinit do_boot_cpu(int api
+ if (IS_ERR(idle))
+ panic("failed fork for CPU %d", cpu);
+ idle->thread.eip = (unsigned long) start_secondary;
++
++#ifdef CONFIG_VE
++ /* Cosmetic: sleep_time won't be changed afterwards for the idle
++ * thread; keep it 0 rather than -cycles. */
++ VE_TASK_INFO(idle)->sleep_time = 0;
++#endif
++
+ /* start_eip had better be page-aligned! */
+ start_eip = setup_trampoline();
+
+diff -upr linux-2.6.16.orig/arch/i386/kernel/sys_i386.c linux-2.6.16-026test015/arch/i386/kernel/sys_i386.c
+--- linux-2.6.16.orig/arch/i386/kernel/sys_i386.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/i386/kernel/sys_i386.c 2006-07-04 14:41:38.000000000 +0400
+@@ -217,7 +217,7 @@ asmlinkage int sys_uname(struct old_utsn
+ if (!name)
+ return -EFAULT;
+ down_read(&uts_sem);
+- err=copy_to_user(name, &system_utsname, sizeof (*name));
++ err=copy_to_user(name, &ve_utsname, sizeof (*name));
+ up_read(&uts_sem);
+ return err?-EFAULT:0;
+ }
+@@ -233,15 +233,15 @@ asmlinkage int sys_olduname(struct oldol
+
+ down_read(&uts_sem);
+
+- error = __copy_to_user(&name->sysname,&system_utsname.sysname,__OLD_UTS_LEN);
++ error = __copy_to_user(name->sysname,ve_utsname.sysname,__OLD_UTS_LEN);
+ error |= __put_user(0,name->sysname+__OLD_UTS_LEN);
+- error |= __copy_to_user(&name->nodename,&system_utsname.nodename,__OLD_UTS_LEN);
++ error |= __copy_to_user(name->nodename,ve_utsname.nodename,__OLD_UTS_LEN);
+ error |= __put_user(0,name->nodename+__OLD_UTS_LEN);
+- error |= __copy_to_user(&name->release,&system_utsname.release,__OLD_UTS_LEN);
++ error |= __copy_to_user(name->release,ve_utsname.release,__OLD_UTS_LEN);
+ error |= __put_user(0,name->release+__OLD_UTS_LEN);
+- error |= __copy_to_user(&name->version,&system_utsname.version,__OLD_UTS_LEN);
++ error |= __copy_to_user(name->version,ve_utsname.version,__OLD_UTS_LEN);
+ error |= __put_user(0,name->version+__OLD_UTS_LEN);
+- error |= __copy_to_user(&name->machine,&system_utsname.machine,__OLD_UTS_LEN);
++ error |= __copy_to_user(name->machine,ve_utsname.machine,__OLD_UTS_LEN);
+ error |= __put_user(0,name->machine+__OLD_UTS_LEN);
+
+ up_read(&uts_sem);
+diff -upr linux-2.6.16.orig/arch/i386/kernel/syscall_table.S linux-2.6.16-026test015/arch/i386/kernel/syscall_table.S
+--- linux-2.6.16.orig/arch/i386/kernel/syscall_table.S 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/i386/kernel/syscall_table.S 2006-07-04 14:41:39.000000000 +0400
+@@ -310,3 +310,21 @@ ENTRY(sys_call_table)
+ .long sys_pselect6
+ .long sys_ppoll
+ .long sys_unshare /* 310 */
++
++ .rept 500-(.-sys_call_table)/4
++ .long sys_ni_syscall
++ .endr
++ .long sys_fairsched_mknod /* 500 */
++ .long sys_fairsched_rmnod
++ .long sys_fairsched_chwt
++ .long sys_fairsched_mvpr
++ .long sys_fairsched_rate
++
++ .rept 510-(.-sys_call_table)/4
++ .long sys_ni_syscall
++ .endr
++
++ .long sys_getluid /* 510 */
++ .long sys_setluid
++ .long sys_setublimit
++ .long sys_ubstat
+diff -upr linux-2.6.16.orig/arch/i386/kernel/timers/timer_tsc.c linux-2.6.16-026test015/arch/i386/kernel/timers/timer_tsc.c
+--- linux-2.6.16.orig/arch/i386/kernel/timers/timer_tsc.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/i386/kernel/timers/timer_tsc.c 2006-07-04 14:41:38.000000000 +0400
+@@ -94,7 +94,7 @@ static int count2; /* counter for mark_o
+ * Equal to 2^32 * (1 / (clocks per usec) ).
+ * Initialized in time_init.
+ */
+-static unsigned long fast_gettimeoffset_quotient;
++unsigned long fast_gettimeoffset_quotient;
+
+ static unsigned long get_offset_tsc(void)
+ {
+diff -upr linux-2.6.16.orig/arch/i386/kernel/traps.c linux-2.6.16-026test015/arch/i386/kernel/traps.c
+--- linux-2.6.16.orig/arch/i386/kernel/traps.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/i386/kernel/traps.c 2006-07-04 14:41:39.000000000 +0400
+@@ -116,8 +116,10 @@ static void print_addr_and_symbol(unsign
+ {
+ printk(log_lvl);
+ printk(" [<%08lx>] ", addr);
+- print_symbol("%s", addr);
+- printk("\n");
++ if (decode_call_traces) {
++ print_symbol("%s", addr);
++ printk("\n");
++ }
+ }
+
+ static inline unsigned long print_context_stack(struct thread_info *tinfo,
+@@ -167,7 +169,10 @@ static void show_trace_log_lvl(struct ta
+ if (!stack)
+ break;
+ printk(log_lvl);
+- printk(" =======================\n");
++ if (decode_call_traces)
++ printk(" =======================\n");
++ else
++ printk(" =<ctx>= ");
+ }
+ }
+
+@@ -203,8 +208,13 @@ static void show_stack_log_lvl(struct ta
+ }
+ printk("\n");
+ printk(log_lvl);
+- printk("Call Trace:\n");
++ if (decode_call_traces)
++ printk("Call Trace:\n");
++ else
++ printk("Call Trace: ");
+ show_trace_log_lvl(task, esp, log_lvl);
++ if (!decode_call_traces)
++ printk("\n");
+ }
+
+ void show_stack(struct task_struct *task, unsigned long *esp)
+@@ -220,6 +230,8 @@ void dump_stack(void)
+ unsigned long stack;
+
+ show_trace(current, &stack);
++ if (!decode_call_traces)
++ printk("\n");
+ }
+
+ EXPORT_SYMBOL(dump_stack);
+@@ -239,9 +251,10 @@ void show_registers(struct pt_regs *regs
+ ss = regs->xss & 0xffff;
+ }
+ print_modules();
+- printk(KERN_EMERG "CPU: %d\nEIP: %04x:[<%08lx>] %s VLI\n"
++ printk(KERN_EMERG "CPU: %d, VCPU: %d:%d\nEIP: %04x:[<%08lx>] %s VLI\n"
+ "EFLAGS: %08lx (%s %.*s) \n",
+- smp_processor_id(), 0xffff & regs->xcs, regs->eip,
++ smp_processor_id(), task_vsched_id(current), task_cpu(current),
++ 0xffff & regs->xcs, regs->eip,
+ print_tainted(), regs->eflags, system_utsname.release,
+ (int)strcspn(system_utsname.version, " "),
+ system_utsname.version);
+@@ -252,8 +265,11 @@ void show_registers(struct pt_regs *regs
+ regs->esi, regs->edi, regs->ebp, esp);
+ printk(KERN_EMERG "ds: %04x es: %04x ss: %04x\n",
+ regs->xds & 0xffff, regs->xes & 0xffff, ss);
+- printk(KERN_EMERG "Process %s (pid: %d, threadinfo=%p task=%p)",
+- current->comm, current->pid, current_thread_info(), current);
++ printk(KERN_EMERG "Process %s (pid: %d, veid=%d, threadinfo=%p task=%p)",
++ current->comm, current->pid,
++ VEID(VE_TASK_INFO(current)->owner_env),
++ current_thread_info(), current);
++
+ /*
+ * When in-kernel, we also print out the stack and code at the
+ * time of the fault..
+@@ -299,9 +315,9 @@ static void handle_BUG(struct pt_regs *r
+ goto no_bug;
+ if (ud2 != 0x0b0f)
+ goto no_bug;
+- if (__get_user(line, (unsigned short __user *)(eip + 2)))
++ if (__get_user(line, (unsigned short __user *)(eip + 4)))
+ goto bug;
+- if (__get_user(file, (char * __user *)(eip + 4)) ||
++ if (__get_user(file, (char * __user *)(eip + 7)) ||
+ (unsigned long)file < PAGE_OFFSET || __get_user(c, file))
+ file = "<bad filename>";
+
+@@ -316,6 +332,15 @@ bug:
+ printk(KERN_EMERG "Kernel BUG\n");
+ }
+
++int die_counter = 0;
++
++static void inline check_kernel_csum_bug(void)
++{
++ if (kernel_text_csum_broken)
++ printk("Kernel code checksum mismatch detected %d times\n",
++ kernel_text_csum_broken);
++}
++
+ /* This is gone through when something in the kernel
+ * has done something bad and is about to be terminated.
+ */
+@@ -330,7 +355,6 @@ void die(const char * str, struct pt_reg
+ .lock_owner = -1,
+ .lock_owner_depth = 0
+ };
+- static int die_counter;
+ unsigned long flags;
+
+ if (die.lock_owner != raw_smp_processor_id()) {
+@@ -370,6 +394,7 @@ void die(const char * str, struct pt_reg
+ } else
+ printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
+
++ check_kernel_csum_bug();
+ bust_spinlocks(0);
+ die.lock_owner = -1;
+ spin_unlock_irqrestore(&die.lock, flags);
+@@ -597,12 +622,27 @@ static void unknown_nmi_error(unsigned c
+ printk("Do you have a strange power saving mode enabled?\n");
+ }
+
+-static DEFINE_SPINLOCK(nmi_print_lock);
++/*
++ * Voyager doesn't implement these
++ */
++void __attribute__((weak)) smp_show_regs(struct pt_regs *regs, void *info)
++{
++}
++
++#ifdef CONFIG_SMP
++int __attribute__((weak))
++smp_nmi_call_function(smp_nmi_function func, void *info, int wait)
++{
++ return 0;
++}
++#endif
+
+ void die_nmi (struct pt_regs *regs, const char *msg)
+ {
++ static DEFINE_SPINLOCK(nmi_print_lock);
++
+ if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 0, SIGINT) ==
+- NOTIFY_STOP)
++ NOTIFY_STOP)
+ return;
+
+ spin_lock(&nmi_print_lock);
+@@ -615,7 +655,11 @@ void die_nmi (struct pt_regs *regs, cons
+ printk(" on CPU%d, eip %08lx, registers:\n",
+ smp_processor_id(), regs->eip);
+ show_registers(regs);
+- printk(KERN_EMERG "console shuts up ...\n");
++ smp_nmi_call_function(smp_show_regs, NULL, 1);
++ bust_spinlocks(1);
++	/* the current CPU's messages should go at the bottom */
++ if (!decode_call_traces)
++ smp_show_regs(regs, NULL);
+ console_silent();
+ spin_unlock(&nmi_print_lock);
+ bust_spinlocks(0);
+@@ -631,6 +675,14 @@ void die_nmi (struct pt_regs *regs, cons
+ do_exit(SIGSEGV);
+ }
+
++static int dummy_nmi_callback(struct pt_regs * regs, int cpu)
++{
++ return 0;
++}
++
++static nmi_callback_t nmi_callback = dummy_nmi_callback;
++static nmi_callback_t nmi_ipi_callback = dummy_nmi_callback;
++
+ static void default_do_nmi(struct pt_regs * regs)
+ {
+ unsigned char reason = 0;
+@@ -653,6 +705,9 @@ static void default_do_nmi(struct pt_reg
+ return;
+ }
+ #endif
++ if (nmi_ipi_callback != dummy_nmi_callback)
++ return;
++
+ unknown_nmi_error(reason, regs);
+ return;
+ }
+@@ -669,13 +724,6 @@ static void default_do_nmi(struct pt_reg
+ reassert_nmi();
+ }
+
+-static int dummy_nmi_callback(struct pt_regs * regs, int cpu)
+-{
+- return 0;
+-}
+-
+-static nmi_callback_t nmi_callback = dummy_nmi_callback;
+-
+ fastcall void do_nmi(struct pt_regs * regs, long error_code)
+ {
+ int cpu;
+@@ -689,9 +737,20 @@ fastcall void do_nmi(struct pt_regs * re
+ if (!rcu_dereference(nmi_callback)(regs, cpu))
+ default_do_nmi(regs);
+
++ nmi_ipi_callback(regs, cpu);
+ nmi_exit();
+ }
+
++void set_nmi_ipi_callback(nmi_callback_t callback)
++{
++ nmi_ipi_callback = callback;
++}
++
++void unset_nmi_ipi_callback(void)
++{
++ nmi_ipi_callback = dummy_nmi_callback;
++}
++
+ void set_nmi_callback(nmi_callback_t callback)
+ {
+ rcu_assign_pointer(nmi_callback, callback);
+diff -upr linux-2.6.16.orig/arch/i386/kernel/vm86.c linux-2.6.16-026test015/arch/i386/kernel/vm86.c
+--- linux-2.6.16.orig/arch/i386/kernel/vm86.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/i386/kernel/vm86.c 2006-07-04 14:41:36.000000000 +0400
+@@ -43,6 +43,7 @@
+ #include <linux/smp_lock.h>
+ #include <linux/highmem.h>
+ #include <linux/ptrace.h>
++#include <linux/audit.h>
+
+ #include <asm/uaccess.h>
+ #include <asm/io.h>
+@@ -252,6 +253,7 @@ out:
+ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk)
+ {
+ struct tss_struct *tss;
++ long eax;
+ /*
+ * make sure the vm86() system call doesn't try to do anything silly
+ */
+@@ -305,13 +307,19 @@ static void do_sys_vm86(struct kernel_vm
+ tsk->thread.screen_bitmap = info->screen_bitmap;
+ if (info->flags & VM86_SCREEN_BITMAP)
+ mark_screen_rdonly(tsk->mm);
++ __asm__ __volatile__("xorl %eax,%eax; movl %eax,%fs; movl %eax,%gs\n\t");
++ __asm__ __volatile__("movl %%eax, %0\n" :"=r"(eax));
++
++	/* call audit_syscall_exit since we do not exit via the normal paths */
++ if (unlikely(current->audit_context))
++ audit_syscall_exit(current, AUDITSC_RESULT(eax), eax);
++
+ __asm__ __volatile__(
+- "xorl %%eax,%%eax; movl %%eax,%%fs; movl %%eax,%%gs\n\t"
+ "movl %0,%%esp\n\t"
+ "movl %1,%%ebp\n\t"
+ "jmp resume_userspace"
+ : /* no outputs */
+- :"r" (&info->regs), "r" (task_thread_info(tsk)) : "ax");
++ :"r" (&info->regs), "r" (task_thread_info(tsk)));
+ /* we never return here */
+ }
+
+diff -upr linux-2.6.16.orig/arch/i386/mm/fault.c linux-2.6.16-026test015/arch/i386/mm/fault.c
+--- linux-2.6.16.orig/arch/i386/mm/fault.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/i386/mm/fault.c 2006-07-04 14:41:37.000000000 +0400
+@@ -31,32 +31,6 @@
+ extern void die(const char *,struct pt_regs *,long);
+
+ /*
+- * Unlock any spinlocks which will prevent us from getting the
+- * message out
+- */
+-void bust_spinlocks(int yes)
+-{
+- int loglevel_save = console_loglevel;
+-
+- if (yes) {
+- oops_in_progress = 1;
+- return;
+- }
+-#ifdef CONFIG_VT
+- unblank_screen();
+-#endif
+- oops_in_progress = 0;
+- /*
+- * OK, the message is on the console. Now we call printk()
+- * without oops_in_progress set so that printk will give klogd
+- * a poke. Hold onto your hats...
+- */
+- console_loglevel = 15; /* NMI oopser may have shut the console up */
+- printk(" ");
+- console_loglevel = loglevel_save;
+-}
+-
+-/*
+ * Return EIP plus the CS segment base. The segment limit is also
+ * adjusted, clamped to the kernel/user address space (whichever is
+ * appropriate), and returned in *eip_limit.
+@@ -347,7 +321,6 @@ good_area:
+ goto bad_area;
+ }
+
+- survive:
+ /*
+ * If for any reason at all we couldn't handle the fault,
+ * make sure we exit gracefully rather than endlessly redo
+@@ -485,14 +458,14 @@ no_context:
+ */
+ out_of_memory:
+ up_read(&mm->mmap_sem);
+- if (tsk->pid == 1) {
+- yield();
+- down_read(&mm->mmap_sem);
+- goto survive;
++ if (error_code & 4) {
++ /*
++		 * 0-order allocations always succeed unless something
++		 * really fatal happens: beancounter overdraft or OOM.
++ */
++ force_sig(SIGKILL, tsk);
++ return;
+ }
+- printk("VM: killing process %s\n", tsk->comm);
+- if (error_code & 4)
+- do_exit(SIGKILL);
+ goto no_context;
+
+ do_sigbus:
+diff -upr linux-2.6.16.orig/arch/i386/mm/hugetlbpage.c linux-2.6.16-026test015/arch/i386/mm/hugetlbpage.c
+--- linux-2.6.16.orig/arch/i386/mm/hugetlbpage.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/i386/mm/hugetlbpage.c 2006-07-04 14:41:39.000000000 +0400
+@@ -14,6 +14,7 @@
+ #include <linux/slab.h>
+ #include <linux/err.h>
+ #include <linux/sysctl.h>
++#include <linux/module.h>
+ #include <asm/mman.h>
+ #include <asm/tlb.h>
+ #include <asm/tlbflush.h>
+@@ -110,6 +111,7 @@ int pmd_huge(pmd_t pmd)
+ {
+ return !!(pmd_val(pmd) & _PAGE_PSE);
+ }
++EXPORT_SYMBOL(pmd_huge);
+
+ struct page *
+ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
+diff -upr linux-2.6.16.orig/arch/i386/mm/init.c linux-2.6.16-026test015/arch/i386/mm/init.c
+--- linux-2.6.16.orig/arch/i386/mm/init.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/i386/mm/init.c 2006-07-04 14:41:37.000000000 +0400
+@@ -677,7 +677,7 @@ void __init pgtable_cache_init(void)
+ pmd_cache = kmem_cache_create("pmd",
+ PTRS_PER_PMD*sizeof(pmd_t),
+ PTRS_PER_PMD*sizeof(pmd_t),
+- 0,
++ SLAB_UBC,
+ pmd_ctor,
+ NULL);
+ if (!pmd_cache)
+@@ -686,7 +686,7 @@ void __init pgtable_cache_init(void)
+ pgd_cache = kmem_cache_create("pgd",
+ PTRS_PER_PGD*sizeof(pgd_t),
+ PTRS_PER_PGD*sizeof(pgd_t),
+- 0,
++ SLAB_UBC,
+ pgd_ctor,
+ PTRS_PER_PMD == 1 ? pgd_dtor : NULL);
+ if (!pgd_cache)
+diff -upr linux-2.6.16.orig/arch/i386/mm/pgtable.c linux-2.6.16-026test015/arch/i386/mm/pgtable.c
+--- linux-2.6.16.orig/arch/i386/mm/pgtable.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/i386/mm/pgtable.c 2006-07-04 14:41:38.000000000 +0400
+@@ -5,8 +5,10 @@
+ #include <linux/config.h>
+ #include <linux/sched.h>
+ #include <linux/kernel.h>
++#include <linux/module.h>
+ #include <linux/errno.h>
+ #include <linux/mm.h>
++#include <linux/vmalloc.h>
+ #include <linux/swap.h>
+ #include <linux/smp.h>
+ #include <linux/highmem.h>
+@@ -64,7 +66,9 @@ void show_mem(void)
+ printk(KERN_INFO "%lu pages mapped\n", ps.nr_mapped);
+ printk(KERN_INFO "%lu pages slab\n", ps.nr_slab);
+ printk(KERN_INFO "%lu pages pagetables\n", ps.nr_page_table_pages);
++ vprintstat();
+ }
++EXPORT_SYMBOL(show_mem);
+
+ /*
+ * Associate a virtual page frame with a given physical page frame
+@@ -159,9 +163,11 @@ struct page *pte_alloc_one(struct mm_str
+ struct page *pte;
+
+ #ifdef CONFIG_HIGHPTE
+- pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
++ pte = alloc_pages(GFP_KERNEL_UBC|__GFP_SOFT_UBC|__GFP_HIGHMEM|
++ __GFP_REPEAT|__GFP_ZERO, 0);
+ #else
+- pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
++ pte = alloc_pages(GFP_KERNEL_UBC|__GFP_SOFT_UBC|
++ __GFP_REPEAT|__GFP_ZERO, 0);
+ #endif
+ return pte;
+ }
+diff -upr linux-2.6.16.orig/arch/ia64/Kconfig linux-2.6.16-026test015/arch/ia64/Kconfig
+--- linux-2.6.16.orig/arch/ia64/Kconfig 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/ia64/Kconfig 2006-07-04 14:41:39.000000000 +0400
+@@ -283,6 +283,8 @@ config PREEMPT
+ Say Y here if you are building a kernel for a desktop, embedded
+ or real-time system. Say N if you are unsure.
+
++source "kernel/Kconfig.fairsched"
++
+ source "mm/Kconfig"
+
+ config ARCH_SELECT_MEMORY_MODEL
+@@ -464,6 +466,10 @@ endmenu
+
+ source "arch/ia64/Kconfig.debug"
+
++source "kernel/Kconfig.openvz"
++
+ source "security/Kconfig"
+
+ source "crypto/Kconfig"
++
++source "kernel/ub/Kconfig"
+diff -upr linux-2.6.16.orig/arch/ia64/ia32/binfmt_elf32.c linux-2.6.16-026test015/arch/ia64/ia32/binfmt_elf32.c
+--- linux-2.6.16.orig/arch/ia64/ia32/binfmt_elf32.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/ia64/ia32/binfmt_elf32.c 2006-07-04 14:41:37.000000000 +0400
+@@ -136,6 +136,12 @@ ia64_elf32_init (struct pt_regs *regs)
+ up_write(&current->mm->mmap_sem);
+ }
+
++ if (ub_memory_charge(current->mm, PAGE_ALIGN(IA32_LDT_ENTRIES *
++ IA32_LDT_ENTRY_SIZE),
++ VM_READ|VM_WRITE|VM_MAYREAD|VM_MAYWRITE,
++ NULL, UB_SOFT))
++ goto skip;
++
+ /*
+ * Install LDT as anonymous memory. This gives us all-zero segment descriptors
+ * until a task modifies them via modify_ldt().
+@@ -157,7 +163,12 @@ ia64_elf32_init (struct pt_regs *regs)
+ }
+ }
+ up_write(&current->mm->mmap_sem);
+- }
++ } else
++ ub_memory_uncharge(current->mm, PAGE_ALIGN(IA32_LDT_ENTRIES *
++ IA32_LDT_ENTRY_SIZE),
++ VM_READ|VM_WRITE|VM_MAYREAD|VM_MAYWRITE, NULL);
++
++skip:
+
+ ia64_psr(regs)->ac = 0; /* turn off alignment checking */
+ regs->loadrs = 0;
+@@ -212,9 +223,15 @@ ia32_setup_arg_pages (struct linux_binpr
+ bprm->loader += stack_base;
+ bprm->exec += stack_base;
+
++ ret = -ENOMEM;
++ if (ub_memory_charge(mm, IA32_STACK_TOP -
++ (PAGE_MASK & (unsigned long)bprm->p),
++ VM_STACK_FLAGS, NULL, UB_SOFT))
++ goto err_charge;
++
+ mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+ if (!mpnt)
+- return -ENOMEM;
++ goto err_alloc;
+
+ memset(mpnt, 0, sizeof(*mpnt));
+
+@@ -231,11 +248,8 @@ ia32_setup_arg_pages (struct linux_binpr
+ mpnt->vm_flags = VM_STACK_FLAGS;
+ mpnt->vm_page_prot = (mpnt->vm_flags & VM_EXEC)?
+ PAGE_COPY_EXEC: PAGE_COPY;
+- if ((ret = insert_vm_struct(current->mm, mpnt))) {
+- up_write(&current->mm->mmap_sem);
+- kmem_cache_free(vm_area_cachep, mpnt);
+- return ret;
+- }
++ if ((ret = insert_vm_struct(current->mm, mpnt)))
++ goto err_insert;
+ current->mm->stack_vm = current->mm->total_vm = vma_pages(mpnt);
+ }
+
+@@ -254,6 +268,16 @@ ia32_setup_arg_pages (struct linux_binpr
+ current->thread.ppl = ia32_init_pp_list();
+
+ return 0;
++
++err_insert:
++ up_write(&current->mm->mmap_sem);
++ kmem_cache_free(vm_area_cachep, mpnt);
++err_alloc:
++ ub_memory_uncharge(mm, IA32_STACK_TOP -
++ (PAGE_MASK & (unsigned long)bprm->p),
++ VM_STACK_FLAGS, NULL);
++err_charge:
++ return ret;
+ }
+
+ static void
+diff -upr linux-2.6.16.orig/arch/ia64/kernel/asm-offsets.c linux-2.6.16-026test015/arch/ia64/kernel/asm-offsets.c
+--- linux-2.6.16.orig/arch/ia64/kernel/asm-offsets.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/ia64/kernel/asm-offsets.c 2006-07-04 14:41:38.000000000 +0400
+@@ -44,11 +44,21 @@ void foo(void)
+ DEFINE(IA64_TASK_CLEAR_CHILD_TID_OFFSET,offsetof (struct task_struct, clear_child_tid));
+ DEFINE(IA64_TASK_GROUP_LEADER_OFFSET, offsetof (struct task_struct, group_leader));
+ DEFINE(IA64_TASK_PENDING_OFFSET,offsetof (struct task_struct, pending));
++#ifdef CONFIG_VE
++ DEFINE(IA64_TASK_PID_OFFSET, offsetof
++ (struct task_struct, pids[PIDTYPE_PID].vnr));
++#else
+ DEFINE(IA64_TASK_PID_OFFSET, offsetof (struct task_struct, pid));
++#endif
+ DEFINE(IA64_TASK_REAL_PARENT_OFFSET, offsetof (struct task_struct, real_parent));
+ DEFINE(IA64_TASK_SIGHAND_OFFSET,offsetof (struct task_struct, sighand));
+ DEFINE(IA64_TASK_SIGNAL_OFFSET,offsetof (struct task_struct, signal));
++#ifdef CONFIG_VE
++ DEFINE(IA64_TASK_TGID_OFFSET, offsetof
++ (struct task_struct, pids[PIDTYPE_TGID].vnr));
++#else
+ DEFINE(IA64_TASK_TGID_OFFSET, offsetof (struct task_struct, tgid));
++#endif
+ DEFINE(IA64_TASK_THREAD_KSP_OFFSET, offsetof (struct task_struct, thread.ksp));
+ DEFINE(IA64_TASK_THREAD_ON_USTACK_OFFSET, offsetof (struct task_struct, thread.on_ustack));
+
+diff -upr linux-2.6.16.orig/arch/ia64/kernel/entry.S linux-2.6.16-026test015/arch/ia64/kernel/entry.S
+--- linux-2.6.16.orig/arch/ia64/kernel/entry.S 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/ia64/kernel/entry.S 2006-07-04 14:41:39.000000000 +0400
+@@ -1620,4 +1620,17 @@ sys_call_table:
+ data8 sys_ni_syscall // 1295 reserved for ppoll
+ data8 sys_unshare
+
++.rept 1500-1297
++ data8 sys_ni_syscall
++.endr
++ data8 sys_fairsched_mknod // 1500
++ data8 sys_fairsched_rmnod
++ data8 sys_fairsched_chwt
++ data8 sys_fairsched_mvpr
++ data8 sys_fairsched_rate
++ data8 sys_getluid // 1505
++ data8 sys_setluid
++ data8 sys_setublimit
++ data8 sys_ubstat
++
+ .org sys_call_table + 8*NR_syscalls // guard against failures to increase NR_syscalls
+diff -upr linux-2.6.16.orig/arch/ia64/kernel/fsys.S linux-2.6.16-026test015/arch/ia64/kernel/fsys.S
+--- linux-2.6.16.orig/arch/ia64/kernel/fsys.S 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/ia64/kernel/fsys.S 2006-07-04 14:41:38.000000000 +0400
+@@ -72,6 +72,7 @@ ENTRY(fsys_getpid)
+ FSYS_RETURN
+ END(fsys_getpid)
+
++#ifndef CONFIG_VE
+ ENTRY(fsys_getppid)
+ .prologue
+ .altrp b6
+@@ -118,6 +119,7 @@ ENTRY(fsys_getppid)
+ #endif
+ FSYS_RETURN
+ END(fsys_getppid)
++#endif
+
+ ENTRY(fsys_set_tid_address)
+ .prologue
+@@ -665,7 +667,11 @@ fsyscall_table:
+ data8 0 // chown
+ data8 0 // lseek // 1040
+ data8 fsys_getpid // getpid
++#ifdef CONFIG_VE
++ data8 0
++#else
+ data8 fsys_getppid // getppid
++#endif
+ data8 0 // mount
+ data8 0 // umount
+ data8 0 // setuid // 1045
+diff -upr linux-2.6.16.orig/arch/ia64/kernel/irq.c linux-2.6.16-026test015/arch/ia64/kernel/irq.c
+--- linux-2.6.16.orig/arch/ia64/kernel/irq.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/ia64/kernel/irq.c 2006-07-04 14:41:38.000000000 +0400
+@@ -163,7 +163,9 @@ void fixup_irqs(void)
+ {
+ unsigned int irq;
+ extern void ia64_process_pending_intr(void);
++ struct ve_struct *ve;
+
++ ve = set_exec_env(get_ve0());
+ ia64_set_itv(1<<16);
+ /*
+ * Phase 1: Locate irq's bound to this cpu and
+@@ -197,5 +199,6 @@ void fixup_irqs(void)
+ */
+ max_xtp();
+ local_irq_disable();
++ (void)set_exec_env(ve);
+ }
+ #endif
+diff -upr linux-2.6.16.orig/arch/ia64/kernel/irq_ia64.c linux-2.6.16-026test015/arch/ia64/kernel/irq_ia64.c
+--- linux-2.6.16.orig/arch/ia64/kernel/irq_ia64.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/ia64/kernel/irq_ia64.c 2006-07-04 14:41:38.000000000 +0400
+@@ -103,6 +103,7 @@ void
+ ia64_handle_irq (ia64_vector vector, struct pt_regs *regs)
+ {
+ unsigned long saved_tpr;
++ struct ve_struct *ve;
+
+ #if IRQ_DEBUG
+ {
+@@ -139,6 +140,7 @@ ia64_handle_irq (ia64_vector vector, str
+ * 16 (without this, it would be ~240, which could easily lead
+ * to kernel stack overflows).
+ */
++ ve = set_exec_env(get_ve0());
+ irq_enter();
+ saved_tpr = ia64_getreg(_IA64_REG_CR_TPR);
+ ia64_srlz_d();
+@@ -164,6 +166,7 @@ ia64_handle_irq (ia64_vector vector, str
+ * come through until ia64_eoi() has been done.
+ */
+ irq_exit();
++	(void)set_exec_env(ve);
+ }
+
+ #ifdef CONFIG_HOTPLUG_CPU
+@@ -176,9 +179,11 @@ void ia64_process_pending_intr(void)
+ ia64_vector vector;
+ unsigned long saved_tpr;
+ extern unsigned int vectors_in_migration[NR_IRQS];
++ struct ve_struct *ve;
+
+ vector = ia64_get_ivr();
+
++ ve = set_exec_env(get_ve0());
+ irq_enter();
+ saved_tpr = ia64_getreg(_IA64_REG_CR_TPR);
+ ia64_srlz_d();
+@@ -210,6 +215,7 @@ void ia64_process_pending_intr(void)
+ vector = ia64_get_ivr();
+ }
+ irq_exit();
++ (void)set_exec_env(ve);
+ }
+ #endif
+
+diff -upr linux-2.6.16.orig/arch/ia64/kernel/mca.c linux-2.6.16-026test015/arch/ia64/kernel/mca.c
+--- linux-2.6.16.orig/arch/ia64/kernel/mca.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/ia64/kernel/mca.c 2006-07-04 14:41:38.000000000 +0400
+@@ -1241,10 +1241,10 @@ default_monarch_init_process(struct noti
+ }
+ printk("\n\n");
+ if (read_trylock(&tasklist_lock)) {
+- do_each_thread (g, t) {
++ do_each_thread_all (g, t) {
+ printk("\nBacktrace of pid %d (%s)\n", t->pid, t->comm);
+ show_stack(t, NULL);
+- } while_each_thread (g, t);
++ } while_each_thread_all (g, t);
+ read_unlock(&tasklist_lock);
+ }
+ return NOTIFY_DONE;
+diff -upr linux-2.6.16.orig/arch/ia64/kernel/perfmon.c linux-2.6.16-026test015/arch/ia64/kernel/perfmon.c
+--- linux-2.6.16.orig/arch/ia64/kernel/perfmon.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/ia64/kernel/perfmon.c 2006-07-04 14:41:38.000000000 +0400
+@@ -2624,7 +2624,7 @@ pfm_get_task(pfm_context_t *ctx, pid_t p
+
+ read_lock(&tasklist_lock);
+
+- p = find_task_by_pid(pid);
++ p = find_task_by_pid_ve(pid);
+
+ /* make sure task cannot go away while we operate on it */
+ if (p) get_task_struct(p);
+@@ -4188,12 +4188,12 @@ pfm_check_task_exist(pfm_context_t *ctx)
+
+ read_lock(&tasklist_lock);
+
+- do_each_thread (g, t) {
++ do_each_thread_ve (g, t) {
+ if (t->thread.pfm_context == ctx) {
+ ret = 0;
+ break;
+ }
+- } while_each_thread (g, t);
++ } while_each_thread_ve (g, t);
+
+ read_unlock(&tasklist_lock);
+
+diff -upr linux-2.6.16.orig/arch/ia64/kernel/process.c linux-2.6.16-026test015/arch/ia64/kernel/process.c
+--- linux-2.6.16.orig/arch/ia64/kernel/process.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/ia64/kernel/process.c 2006-07-04 14:41:39.000000000 +0400
+@@ -109,7 +109,8 @@ show_regs (struct pt_regs *regs)
+ unsigned long ip = regs->cr_iip + ia64_psr(regs)->ri;
+
+ print_modules();
+- printk("\nPid: %d, CPU %d, comm: %20s\n", current->pid, smp_processor_id(), current->comm);
++ printk("\nPid: %d, CPU %d, VCPU %d:%d, comm: %20s\n", current->pid, smp_processor_id(),
++ task_vsched_id(current), task_cpu(current), current->comm);
+ printk("psr : %016lx ifs : %016lx ip : [<%016lx>] %s\n",
+ regs->cr_ipsr, regs->cr_ifs, ip, print_tainted());
+ print_symbol("ip is at %s\n", ip);
+@@ -681,6 +682,13 @@ kernel_thread (int (*fn)(void *), void *
+ struct pt_regs pt;
+ } regs;
+
++ /* Don't allow kernel_thread() inside VE */
++ if (!ve_is_super(get_exec_env())) {
++ printk("kernel_thread call inside VE\n");
++ dump_stack();
++ return -EPERM;
++ }
++
+ memset(&regs, 0, sizeof(regs));
+ regs.pt.cr_iip = helper_fptr[0]; /* set entry point (IP) */
+ regs.pt.r1 = helper_fptr[1]; /* set GP */
+diff -upr linux-2.6.16.orig/arch/ia64/kernel/ptrace.c linux-2.6.16-026test015/arch/ia64/kernel/ptrace.c
+--- linux-2.6.16.orig/arch/ia64/kernel/ptrace.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/ia64/kernel/ptrace.c 2006-07-04 14:41:38.000000000 +0400
+@@ -1433,7 +1433,7 @@ sys_ptrace (long request, pid_t pid, uns
+ ret = -ESRCH;
+ read_lock(&tasklist_lock);
+ {
+- child = find_task_by_pid(pid);
++ child = find_task_by_pid_ve(pid);
+ if (child) {
+ if (peek_or_poke)
+ child = find_thread_for_addr(child, addr);
+diff -upr linux-2.6.16.orig/arch/ia64/kernel/signal.c linux-2.6.16-026test015/arch/ia64/kernel/signal.c
+--- linux-2.6.16.orig/arch/ia64/kernel/signal.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/ia64/kernel/signal.c 2006-07-04 14:41:38.000000000 +0400
+@@ -270,7 +270,7 @@ ia64_rt_sigreturn (struct sigscratch *sc
+ si.si_signo = SIGSEGV;
+ si.si_errno = 0;
+ si.si_code = SI_KERNEL;
+- si.si_pid = current->pid;
++ si.si_pid = virt_pid(current);
+ si.si_uid = current->uid;
+ si.si_addr = sc;
+ force_sig_info(SIGSEGV, &si, current);
+@@ -375,7 +375,7 @@ force_sigsegv_info (int sig, void __user
+ si.si_signo = SIGSEGV;
+ si.si_errno = 0;
+ si.si_code = SI_KERNEL;
+- si.si_pid = current->pid;
++ si.si_pid = virt_pid(current);
+ si.si_uid = current->uid;
+ si.si_addr = addr;
+ force_sig_info(SIGSEGV, &si, current);
+@@ -641,7 +641,7 @@ set_sigdelayed(pid_t pid, int signo, int
+ for (i = 1; i <= 3; ++i) {
+ switch (i) {
+ case 1:
+- t = find_task_by_pid(pid);
++ t = find_task_by_pid_ve(pid);
+ if (t)
+ start_time = start_time_ul(t);
+ break;
+@@ -682,7 +682,7 @@ do_sigdelayed(void)
+ siginfo.si_code = current_thread_info()->sigdelayed.code;
+ siginfo.si_addr = current_thread_info()->sigdelayed.addr;
+ pid = current_thread_info()->sigdelayed.pid;
+- t = find_task_by_pid(pid);
++ t = find_task_by_pid_ve(pid);
+ if (!t)
+ return;
+ if (current_thread_info()->sigdelayed.start_time != start_time_ul(t))
+diff -upr linux-2.6.16.orig/arch/ia64/kernel/traps.c linux-2.6.16-026test015/arch/ia64/kernel/traps.c
+--- linux-2.6.16.orig/arch/ia64/kernel/traps.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/ia64/kernel/traps.c 2006-07-04 14:41:37.000000000 +0400
+@@ -54,34 +54,6 @@ trap_init (void)
+ fpswa_interface = __va(ia64_boot_param->fpswa);
+ }
+
+-/*
+- * Unlock any spinlocks which will prevent us from getting the message out (timerlist_lock
+- * is acquired through the console unblank code)
+- */
+-void
+-bust_spinlocks (int yes)
+-{
+- int loglevel_save = console_loglevel;
+-
+- if (yes) {
+- oops_in_progress = 1;
+- return;
+- }
+-
+-#ifdef CONFIG_VT
+- unblank_screen();
+-#endif
+- oops_in_progress = 0;
+- /*
+- * OK, the message is on the console. Now we call printk() without
+- * oops_in_progress set so that printk will give klogd a poke. Hold onto
+- * your hats...
+- */
+- console_loglevel = 15; /* NMI oopser may have shut the console up */
+- printk(" ");
+- console_loglevel = loglevel_save;
+-}
+-
+ void
+ die (const char *str, struct pt_regs *regs, long err)
+ {
+diff -upr linux-2.6.16.orig/arch/ia64/kernel/unaligned.c linux-2.6.16-026test015/arch/ia64/kernel/unaligned.c
+--- linux-2.6.16.orig/arch/ia64/kernel/unaligned.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/ia64/kernel/unaligned.c 2006-07-04 14:41:37.000000000 +0400
+@@ -1290,7 +1290,7 @@ within_logging_rate_limit (void)
+ {
+ static unsigned long count, last_time;
+
+- if (jiffies - last_time > 5*HZ)
++ if (jiffies - last_time > 60 * HZ)
+ count = 0;
+ if (count < 5) {
+ last_time = jiffies;
+diff -upr linux-2.6.16.orig/arch/ia64/mm/contig.c linux-2.6.16-026test015/arch/ia64/mm/contig.c
+--- linux-2.6.16.orig/arch/ia64/mm/contig.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/ia64/mm/contig.c 2006-07-04 14:41:38.000000000 +0400
+@@ -64,6 +64,7 @@ show_mem (void)
+ printk("%ld pages in page table cache\n",
+ pgtable_quicklist_total_size());
+ }
++EXPORT_SYMBOL(show_mem);
+
+ /* physical address where the bootmem map is located */
+ unsigned long bootmap_start;
+diff -upr linux-2.6.16.orig/arch/ia64/mm/discontig.c linux-2.6.16-026test015/arch/ia64/mm/discontig.c
+--- linux-2.6.16.orig/arch/ia64/mm/discontig.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/ia64/mm/discontig.c 2006-07-04 14:41:38.000000000 +0400
+@@ -594,6 +594,7 @@ void show_mem(void)
+ pgtable_quicklist_total_size());
+ printk("%d free buffer pages\n", nr_free_buffer_pages());
+ }
++EXPORT_SYMBOL(show_mem);
+
+ /**
+ * call_pernode_memory - use SRAT to call callback functions with node info
+diff -upr linux-2.6.16.orig/arch/ia64/mm/fault.c linux-2.6.16-026test015/arch/ia64/mm/fault.c
+--- linux-2.6.16.orig/arch/ia64/mm/fault.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/ia64/mm/fault.c 2006-07-04 14:41:37.000000000 +0400
+@@ -116,7 +116,6 @@ ia64_do_page_fault (unsigned long addres
+ if ((vma->vm_flags & mask) != mask)
+ goto bad_area;
+
+- survive:
+ /*
+ * If for any reason at all we couldn't handle the fault, make
+ * sure we exit gracefully rather than endlessly redo the
+@@ -241,13 +240,13 @@ ia64_do_page_fault (unsigned long addres
+
+ out_of_memory:
+ up_read(&mm->mmap_sem);
+- if (current->pid == 1) {
+- yield();
+- down_read(&mm->mmap_sem);
+- goto survive;
+- }
+- printk(KERN_CRIT "VM: killing process %s\n", current->comm);
+- if (user_mode(regs))
+- do_exit(SIGKILL);
++ if (user_mode(regs)) {
++ /*
++		 * 0-order allocations always succeed unless something
++		 * really fatal happens: beancounter overdraft or OOM.
++ */
++ force_sig(SIGKILL, current);
++ return;
++ }
+ goto no_context;
+ }
+diff -upr linux-2.6.16.orig/arch/ia64/mm/init.c linux-2.6.16-026test015/arch/ia64/mm/init.c
+--- linux-2.6.16.orig/arch/ia64/mm/init.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/ia64/mm/init.c 2006-07-04 14:41:37.000000000 +0400
+@@ -37,6 +37,8 @@
+ #include <asm/unistd.h>
+ #include <asm/mca.h>
+
++#include <ub/ub_vmpages.h>
++
+ DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+
+ DEFINE_PER_CPU(unsigned long *, __pgtable_quicklist);
+@@ -96,7 +98,7 @@ check_pgt_cache(void)
+ preempt_disable();
+ while (unlikely((pages_to_free = min_pages_to_free()) > 0)) {
+ while (pages_to_free--) {
+- free_page((unsigned long)pgtable_quicklist_alloc());
++ free_page((unsigned long)pgtable_quicklist_alloc(0));
+ }
+ preempt_enable();
+ preempt_disable();
+@@ -146,6 +148,10 @@ ia64_init_addr_space (void)
+
+ ia64_set_rbs_bot();
+
++ if (ub_memory_charge(current->mm, PAGE_SIZE, VM_DATA_DEFAULT_FLAGS,
++ NULL, UB_SOFT))
++ goto skip;
++
+ /*
+ * If we're out of memory and kmem_cache_alloc() returns NULL, we simply ignore
+ * the problem. When the process attempts to write to the register backing store
+@@ -166,8 +172,11 @@ ia64_init_addr_space (void)
+ return;
+ }
+ up_write(&current->mm->mmap_sem);
+- }
++ } else
++ ub_memory_uncharge(current->mm, PAGE_SIZE,
++ VM_DATA_DEFAULT_FLAGS, NULL);
+
++skip:
+ /* map NaT-page at address zero to speed up speculative dereferencing of NULL: */
+ if (!(current->personality & MMAP_PAGE_ZERO)) {
+ vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+diff -upr linux-2.6.16.orig/arch/m32r/kernel/m32r_ksyms.c linux-2.6.16-026test015/arch/m32r/kernel/m32r_ksyms.c
+--- linux-2.6.16.orig/arch/m32r/kernel/m32r_ksyms.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/m32r/kernel/m32r_ksyms.c 2006-07-04 14:41:36.000000000 +0400
+@@ -38,10 +38,6 @@ EXPORT_SYMBOL(__udelay);
+ EXPORT_SYMBOL(__delay);
+ EXPORT_SYMBOL(__const_udelay);
+
+-EXPORT_SYMBOL(__get_user_1);
+-EXPORT_SYMBOL(__get_user_2);
+-EXPORT_SYMBOL(__get_user_4);
+-
+ EXPORT_SYMBOL(strpbrk);
+ EXPORT_SYMBOL(strstr);
+
+diff -upr linux-2.6.16.orig/arch/m32r/kernel/setup.c linux-2.6.16-026test015/arch/m32r/kernel/setup.c
+--- linux-2.6.16.orig/arch/m32r/kernel/setup.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/m32r/kernel/setup.c 2006-07-04 14:41:36.000000000 +0400
+@@ -9,6 +9,7 @@
+
+ #include <linux/config.h>
+ #include <linux/init.h>
++#include <linux/kernel.h>
+ #include <linux/stddef.h>
+ #include <linux/fs.h>
+ #include <linux/sched.h>
+@@ -218,8 +219,6 @@ static unsigned long __init setup_memory
+ extern unsigned long setup_memory(void);
+ #endif /* CONFIG_DISCONTIGMEM */
+
+-#define M32R_PCC_PCATCR 0x00ef7014 /* will move to m32r.h */
+-
+ void __init setup_arch(char **cmdline_p)
+ {
+ ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV);
+@@ -268,15 +267,14 @@ void __init setup_arch(char **cmdline_p)
+ paging_init();
+ }
+
+-static struct cpu cpu[NR_CPUS];
++static struct cpu cpu_devices[NR_CPUS];
+
+ static int __init topology_init(void)
+ {
+- int cpu_id;
++ int i;
+
+- for (cpu_id = 0; cpu_id < NR_CPUS; cpu_id++)
+- if (cpu_possible(cpu_id))
+- register_cpu(&cpu[cpu_id], cpu_id, NULL);
++ for_each_present_cpu(i)
++ register_cpu(&cpu_devices[i], i, NULL);
+
+ return 0;
+ }
+diff -upr linux-2.6.16.orig/arch/m32r/kernel/smpboot.c linux-2.6.16-026test015/arch/m32r/kernel/smpboot.c
+--- linux-2.6.16.orig/arch/m32r/kernel/smpboot.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/m32r/kernel/smpboot.c 2006-07-04 14:41:36.000000000 +0400
+@@ -39,8 +39,10 @@
+ * Martin J. Bligh : Added support for multi-quad systems
+ */
+
++#include <linux/module.h>
+ #include <linux/config.h>
+ #include <linux/init.h>
++#include <linux/kernel.h>
+ #include <linux/mm.h>
+ #include <linux/smp_lock.h>
+ #include <linux/irq.h>
+@@ -72,11 +74,15 @@ physid_mask_t phys_cpu_present_map;
+
+ /* Bitmask of currently online CPUs */
+ cpumask_t cpu_online_map;
++EXPORT_SYMBOL(cpu_online_map);
+
+ cpumask_t cpu_bootout_map;
+ cpumask_t cpu_bootin_map;
+-cpumask_t cpu_callout_map;
+ static cpumask_t cpu_callin_map;
++cpumask_t cpu_callout_map;
++EXPORT_SYMBOL(cpu_callout_map);
++cpumask_t cpu_possible_map = CPU_MASK_ALL;
++EXPORT_SYMBOL(cpu_possible_map);
+
+ /* Per CPU bogomips and other parameters */
+ struct cpuinfo_m32r cpu_data[NR_CPUS] __cacheline_aligned;
+@@ -110,7 +116,6 @@ static unsigned int calibration_result;
+
+ void smp_prepare_boot_cpu(void);
+ void smp_prepare_cpus(unsigned int);
+-static void smp_tune_scheduling(void);
+ static void init_ipi_lock(void);
+ static void do_boot_cpu(int);
+ int __cpu_up(unsigned int);
+@@ -177,6 +182,9 @@ void __init smp_prepare_cpus(unsigned in
+ }
+ for (phys_id = 0 ; phys_id < nr_cpu ; phys_id++)
+ physid_set(phys_id, phys_cpu_present_map);
++#ifndef CONFIG_HOTPLUG_CPU
++ cpu_present_map = cpu_possible_map;
++#endif
+
+ show_mp_info(nr_cpu);
+
+@@ -186,7 +194,6 @@ void __init smp_prepare_cpus(unsigned in
+ * Setup boot CPU information
+ */
+ smp_store_cpu_info(0); /* Final full version of the data */
+- smp_tune_scheduling();
+
+ /*
+ * If SMP should be disabled, then really disable it!
+@@ -230,11 +237,6 @@ smp_done:
+ Dprintk("Boot done.\n");
+ }
+
+-static void __init smp_tune_scheduling(void)
+-{
+- /* Nothing to do. */
+-}
+-
+ /*
+ * init_ipi_lock : Initialize IPI locks.
+ */
+@@ -629,4 +631,3 @@ static void __init unmap_cpu_to_physid(i
+ physid_2_cpu[phys_id] = -1;
+ cpu_2_physid[cpu_id] = -1;
+ }
+-
+diff -upr linux-2.6.16.orig/arch/m32r/lib/Makefile linux-2.6.16-026test015/arch/m32r/lib/Makefile
+--- linux-2.6.16.orig/arch/m32r/lib/Makefile 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/m32r/lib/Makefile 2006-07-04 14:41:36.000000000 +0400
+@@ -2,6 +2,6 @@
+ # Makefile for M32R-specific library files..
+ #
+
+-lib-y := checksum.o ashxdi3.o memset.o memcpy.o getuser.o \
+- putuser.o delay.o strlen.o usercopy.o csum_partial_copy.o
++lib-y := checksum.o ashxdi3.o memset.o memcpy.o \
++ delay.o strlen.o usercopy.o csum_partial_copy.o
+
+diff -upr linux-2.6.16.orig/arch/mips/kernel/branch.c linux-2.6.16-026test015/arch/mips/kernel/branch.c
+--- linux-2.6.16.orig/arch/mips/kernel/branch.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/mips/kernel/branch.c 2006-07-04 14:41:36.000000000 +0400
+@@ -184,7 +184,7 @@ int __compute_return_epc(struct pt_regs
+ bit = (insn.i_format.rt >> 2);
+ bit += (bit != 0);
+ bit += 23;
+- switch (insn.i_format.rt) {
++ switch (insn.i_format.rt & 3) {
+ case 0: /* bc1f */
+ case 2: /* bc1fl */
+ if (~fcr31 & (1 << bit))
+diff -upr linux-2.6.16.orig/arch/mips/kernel/irixelf.c linux-2.6.16-026test015/arch/mips/kernel/irixelf.c
+--- linux-2.6.16.orig/arch/mips/kernel/irixelf.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/mips/kernel/irixelf.c 2006-07-04 14:41:37.000000000 +0400
+@@ -432,7 +432,7 @@ static inline int look_for_irix_interpre
+ if (retval < 0)
+ goto out;
+
+- file = open_exec(*name);
++ file = open_exec(*name, bprm);
+ if (IS_ERR(file)) {
+ retval = PTR_ERR(file);
+ goto out;
+diff -upr linux-2.6.16.orig/arch/mips/kernel/sysirix.c linux-2.6.16-026test015/arch/mips/kernel/sysirix.c
+--- linux-2.6.16.orig/arch/mips/kernel/sysirix.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/mips/kernel/sysirix.c 2006-07-04 14:41:38.000000000 +0400
+@@ -110,7 +110,7 @@ asmlinkage int irix_prctl(unsigned optio
+ printk("irix_prctl[%s:%d]: Wants PR_ISBLOCKED\n",
+ current->comm, current->pid);
+ read_lock(&tasklist_lock);
+- task = find_task_by_pid(va_arg(args, pid_t));
++ task = find_task_by_pid_ve(va_arg(args, pid_t));
+ error = -ESRCH;
+ if (error)
+ error = (task->run_list.next != NULL);
+diff -upr linux-2.6.16.orig/arch/mips/mm/c-r4k.c linux-2.6.16-026test015/arch/mips/mm/c-r4k.c
+--- linux-2.6.16.orig/arch/mips/mm/c-r4k.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/mips/mm/c-r4k.c 2006-07-04 14:41:36.000000000 +0400
+@@ -154,7 +154,8 @@ static inline void blast_icache32_r4600_
+
+ static inline void tx49_blast_icache32_page_indexed(unsigned long page)
+ {
+- unsigned long start = page;
++ unsigned long indexmask = current_cpu_data.icache.waysize - 1;
++ unsigned long start = INDEX_BASE + (page & indexmask);
+ unsigned long end = start + PAGE_SIZE;
+ unsigned long ws_inc = 1UL << current_cpu_data.icache.waybit;
+ unsigned long ws_end = current_cpu_data.icache.ways <<
+diff -upr linux-2.6.16.orig/arch/powerpc/Kconfig linux-2.6.16-026test015/arch/powerpc/Kconfig
+--- linux-2.6.16.orig/arch/powerpc/Kconfig 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/powerpc/Kconfig 2006-07-04 14:41:39.000000000 +0400
+@@ -517,6 +517,7 @@ config HIGHMEM
+ bool "High memory support"
+ depends on PPC32
+
++source "kernel/Kconfig.fairsched"
+ source kernel/Kconfig.hz
+ source kernel/Kconfig.preempt
+ source "fs/Kconfig.binfmt"
+@@ -956,6 +957,8 @@ source "arch/powerpc/platforms/iseries/K
+
+ source "lib/Kconfig"
+
++source "kernel/ub/Kconfig"
++
+ menu "Instrumentation Support"
+ depends on EXPERIMENTAL
+
+@@ -974,6 +977,8 @@ endmenu
+
+ source "arch/powerpc/Kconfig.debug"
+
++source "kernel/Kconfig.openvz"
++
+ source "security/Kconfig"
+
+ config KEYS_COMPAT
+diff -upr linux-2.6.16.orig/arch/powerpc/kernel/irq.c linux-2.6.16-026test015/arch/powerpc/kernel/irq.c
+--- linux-2.6.16.orig/arch/powerpc/kernel/irq.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/powerpc/kernel/irq.c 2006-07-04 14:41:38.000000000 +0400
+@@ -50,6 +50,8 @@
+ #include <linux/profile.h>
+ #include <linux/bitops.h>
+
++#include <ub/beancounter.h>
++
+ #include <asm/uaccess.h>
+ #include <asm/system.h>
+ #include <asm/io.h>
+@@ -189,7 +191,11 @@ void do_IRQ(struct pt_regs *regs)
+ #ifdef CONFIG_IRQSTACKS
+ struct thread_info *curtp, *irqtp;
+ #endif
++ struct ve_struct *ve;
++ struct user_beancounter *ub;
+
++ ve = set_exec_env(get_ve0());
++ ub = set_exec_ub(get_ub0());
+ irq_enter();
+
+ #ifdef CONFIG_DEBUG_STACKOVERFLOW
+@@ -236,6 +242,8 @@ void do_IRQ(struct pt_regs *regs)
+ ppc_spurious_interrupts++;
+
+ irq_exit();
++ (void)set_exec_ub(ub);
++ (void)set_exec_env(ve);
+
+ #ifdef CONFIG_PPC_ISERIES
+ if (get_lppaca()->int_dword.fields.decr_int) {
+diff -upr linux-2.6.16.orig/arch/powerpc/kernel/misc_32.S linux-2.6.16-026test015/arch/powerpc/kernel/misc_32.S
+--- linux-2.6.16.orig/arch/powerpc/kernel/misc_32.S 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/powerpc/kernel/misc_32.S 2006-07-04 14:41:37.000000000 +0400
+@@ -973,7 +973,7 @@ _GLOBAL(_get_SP)
+ * Create a kernel thread
+ * kernel_thread(fn, arg, flags)
+ */
+-_GLOBAL(kernel_thread)
++_GLOBAL(ppc_kernel_thread)
+ stwu r1,-16(r1)
+ stw r30,8(r1)
+ stw r31,12(r1)
+diff -upr linux-2.6.16.orig/arch/powerpc/kernel/misc_64.S linux-2.6.16-026test015/arch/powerpc/kernel/misc_64.S
+--- linux-2.6.16.orig/arch/powerpc/kernel/misc_64.S 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/powerpc/kernel/misc_64.S 2006-07-04 14:41:37.000000000 +0400
+@@ -677,7 +677,7 @@ _GLOBAL(scom970_write)
+ * Create a kernel thread
+ * kernel_thread(fn, arg, flags)
+ */
+-_GLOBAL(kernel_thread)
++_GLOBAL(ppc_kernel_thread)
+ std r29,-24(r1)
+ std r30,-16(r1)
+ stdu r1,-STACK_FRAME_OVERHEAD(r1)
+diff -upr linux-2.6.16.orig/arch/powerpc/kernel/pci_64.c linux-2.6.16-026test015/arch/powerpc/kernel/pci_64.c
+--- linux-2.6.16.orig/arch/powerpc/kernel/pci_64.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/powerpc/kernel/pci_64.c 2006-07-04 14:41:36.000000000 +0400
+@@ -78,6 +78,7 @@ int global_phb_number; /* Global phb co
+
+ /* Cached ISA bridge dev. */
+ struct pci_dev *ppc64_isabridge_dev = NULL;
++EXPORT_SYMBOL_GPL(ppc64_isabridge_dev);
+
+ static void fixup_broken_pcnet32(struct pci_dev* dev)
+ {
+diff -upr linux-2.6.16.orig/arch/powerpc/kernel/process.c linux-2.6.16-026test015/arch/powerpc/kernel/process.c
+--- linux-2.6.16.orig/arch/powerpc/kernel/process.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/powerpc/kernel/process.c 2006-07-04 14:41:39.000000000 +0400
+@@ -429,7 +429,7 @@ void show_regs(struct pt_regs * regs)
+ current, current->pid, current->comm, task_thread_info(current));
+
+ #ifdef CONFIG_SMP
+- printk(" CPU: %d", smp_processor_id());
++	printk(" CPU: %d VCPU %d:%d", smp_processor_id(), task_vsched_id(current), task_cpu(current));
+ #endif /* CONFIG_SMP */
+
+ for (i = 0; i < 32; i++) {
+@@ -774,12 +774,12 @@ static int validate_sp(unsigned long sp,
+ return 1;
+
+ #ifdef CONFIG_IRQSTACKS
+- stack_page = (unsigned long) hardirq_ctx[task_cpu(p)];
++ stack_page = (unsigned long) hardirq_ctx[task_pcpu(p)];
+ if (sp >= stack_page + sizeof(struct thread_struct)
+ && sp <= stack_page + THREAD_SIZE - nbytes)
+ return 1;
+
+- stack_page = (unsigned long) softirq_ctx[task_cpu(p)];
++ stack_page = (unsigned long) softirq_ctx[task_pcpu(p)];
+ if (sp >= stack_page + sizeof(struct thread_struct)
+ && sp <= stack_page + THREAD_SIZE - nbytes)
+ return 1;
+@@ -889,6 +889,20 @@ void dump_stack(void)
+ }
+ EXPORT_SYMBOL(dump_stack);
+
++long kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
++{
++ extern long ppc_kernel_thread(int (*fn)(void *), void *arg,
++ unsigned long flags);
++
++ if (!ve_is_super(get_exec_env())) {
++ printk("kernel_thread call inside VE\n");
++ dump_stack();
++ return -EPERM;
++ }
++
++ return ppc_kernel_thread(fn, arg, flags);
++}
++
+ #ifdef CONFIG_PPC64
+ void ppc64_runlatch_on(void)
+ {
+diff -upr linux-2.6.16.orig/arch/powerpc/kernel/setup_64.c linux-2.6.16-026test015/arch/powerpc/kernel/setup_64.c
+--- linux-2.6.16.orig/arch/powerpc/kernel/setup_64.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/powerpc/kernel/setup_64.c 2006-07-04 14:41:36.000000000 +0400
+@@ -256,12 +256,10 @@ void __init early_setup(unsigned long dt
+ /*
+ * Initialize stab / SLB management except on iSeries
+ */
+- if (!firmware_has_feature(FW_FEATURE_ISERIES)) {
+- if (cpu_has_feature(CPU_FTR_SLB))
+- slb_initialize();
+- else
+- stab_initialize(lpaca->stab_real);
+- }
++ if (cpu_has_feature(CPU_FTR_SLB))
++ slb_initialize();
++ else if (!firmware_has_feature(FW_FEATURE_ISERIES))
++ stab_initialize(lpaca->stab_real);
+
+ DBG(" <- early_setup()\n");
+ }
+diff -upr linux-2.6.16.orig/arch/powerpc/kernel/signal_32.c linux-2.6.16-026test015/arch/powerpc/kernel/signal_32.c
+--- linux-2.6.16.orig/arch/powerpc/kernel/signal_32.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/powerpc/kernel/signal_32.c 2006-07-04 14:41:36.000000000 +0400
+@@ -802,10 +802,13 @@ static int do_setcontext(struct ucontext
+ if (__get_user(cmcp, &ucp->uc_regs))
+ return -EFAULT;
+ mcp = (struct mcontext __user *)(u64)cmcp;
++ /* no need to check access_ok(mcp), since mcp < 4GB */
+ }
+ #else
+ if (__get_user(mcp, &ucp->uc_regs))
+ return -EFAULT;
++ if (!access_ok(VERIFY_READ, mcp, sizeof(*mcp)))
++ return -EFAULT;
+ #endif
+ restore_sigmask(&set);
+ if (restore_user_regs(regs, mcp, sig))
+@@ -907,13 +910,14 @@ int sys_debug_setcontext(struct ucontext
+ {
+ struct sig_dbg_op op;
+ int i;
++ unsigned char tmp;
+ unsigned long new_msr = regs->msr;
+ #if defined(CONFIG_4xx) || defined(CONFIG_BOOKE)
+ unsigned long new_dbcr0 = current->thread.dbcr0;
+ #endif
+
+ for (i=0; i<ndbg; i++) {
+- if (__copy_from_user(&op, dbg, sizeof(op)))
++ if (copy_from_user(&op, dbg + i, sizeof(op)))
+ return -EFAULT;
+ switch (op.dbg_type) {
+ case SIG_DBG_SINGLE_STEPPING:
+@@ -958,6 +962,11 @@ int sys_debug_setcontext(struct ucontext
+ current->thread.dbcr0 = new_dbcr0;
+ #endif
+
++ if (!access_ok(VERIFY_READ, ctx, sizeof(*ctx))
++ || __get_user(tmp, (u8 __user *) ctx)
++ || __get_user(tmp, (u8 __user *) (ctx + 1) - 1))
++ return -EFAULT;
++
+ /*
+ * If we get a fault copying the context into the kernel's
+ * image of the user's registers, we can't just return -EFAULT
+diff -upr linux-2.6.16.orig/arch/powerpc/kernel/signal_64.c linux-2.6.16-026test015/arch/powerpc/kernel/signal_64.c
+--- linux-2.6.16.orig/arch/powerpc/kernel/signal_64.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/powerpc/kernel/signal_64.c 2006-07-04 14:41:36.000000000 +0400
+@@ -183,6 +183,8 @@ static long restore_sigcontext(struct pt
+ err |= __get_user(msr, &sc->gp_regs[PT_MSR]);
+ if (err)
+ return err;
++ if (v_regs && !access_ok(VERIFY_READ, v_regs, 34 * sizeof(vector128)))
++ return -EFAULT;
+ /* Copy 33 vec registers (vr0..31 and vscr) from the stack */
+ if (v_regs != 0 && (msr & MSR_VEC) != 0)
+ err |= __copy_from_user(current->thread.vr, v_regs,
+@@ -213,7 +215,7 @@ static inline void __user * get_sigframe
+ /* Default to using normal stack */
+ newsp = regs->gpr[1];
+
+- if (ka->sa.sa_flags & SA_ONSTACK) {
++ if ((ka->sa.sa_flags & SA_ONSTACK) && current->sas_ss_size) {
+ if (! on_sig_stack(regs->gpr[1]))
+ newsp = (current->sas_ss_sp + current->sas_ss_size);
+ }
+diff -upr linux-2.6.16.orig/arch/powerpc/kernel/syscalls.c linux-2.6.16-026test015/arch/powerpc/kernel/syscalls.c
+--- linux-2.6.16.orig/arch/powerpc/kernel/syscalls.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/powerpc/kernel/syscalls.c 2006-07-04 14:41:38.000000000 +0400
+@@ -259,7 +259,7 @@ long ppc_newuname(struct new_utsname __u
+ int err = 0;
+
+ down_read(&uts_sem);
+- if (copy_to_user(name, &system_utsname, sizeof(*name)))
++ if (copy_to_user(name, &ve_utsname, sizeof(*name)))
+ err = -EFAULT;
+ up_read(&uts_sem);
+ if (!err)
+@@ -272,7 +272,7 @@ int sys_uname(struct old_utsname __user
+ int err = 0;
+
+ down_read(&uts_sem);
+- if (copy_to_user(name, &system_utsname, sizeof(*name)))
++ if (copy_to_user(name, &ve_utsname, sizeof(*name)))
+ err = -EFAULT;
+ up_read(&uts_sem);
+ if (!err)
+@@ -288,19 +288,19 @@ int sys_olduname(struct oldold_utsname _
+ return -EFAULT;
+
+ down_read(&uts_sem);
+- error = __copy_to_user(&name->sysname, &system_utsname.sysname,
++ error = __copy_to_user(&name->sysname, &ve_utsname.sysname,
+ __OLD_UTS_LEN);
+ error |= __put_user(0, name->sysname + __OLD_UTS_LEN);
+- error |= __copy_to_user(&name->nodename, &system_utsname.nodename,
++ error |= __copy_to_user(&name->nodename, &ve_utsname.nodename,
+ __OLD_UTS_LEN);
+ error |= __put_user(0, name->nodename + __OLD_UTS_LEN);
+- error |= __copy_to_user(&name->release, &system_utsname.release,
++ error |= __copy_to_user(&name->release, &ve_utsname.release,
+ __OLD_UTS_LEN);
+ error |= __put_user(0, name->release + __OLD_UTS_LEN);
+- error |= __copy_to_user(&name->version, &system_utsname.version,
++ error |= __copy_to_user(&name->version, &ve_utsname.version,
+ __OLD_UTS_LEN);
+ error |= __put_user(0, name->version + __OLD_UTS_LEN);
+- error |= __copy_to_user(&name->machine, &system_utsname.machine,
++ error |= __copy_to_user(&name->machine, &ve_utsname.machine,
+ __OLD_UTS_LEN);
+ error |= override_machine(name->machine);
+ up_read(&uts_sem);
+diff -upr linux-2.6.16.orig/arch/powerpc/kernel/systbl.S linux-2.6.16-026test015/arch/powerpc/kernel/systbl.S
+--- linux-2.6.16.orig/arch/powerpc/kernel/systbl.S 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/powerpc/kernel/systbl.S 2006-07-04 14:41:37.000000000 +0400
+@@ -322,3 +322,12 @@ SYSCALL(spu_create)
+ COMPAT_SYS(pselect6)
+ COMPAT_SYS(ppoll)
+ SYSCALL(unshare)
++
++.rept 410 - (. - sys_call_table)/8
++SYSX(sys_ni_syscall, sys_ni_syscall, sys_ni_syscall)
++.endr
++
++SYSX(sys_getluid, sys_ni_syscall, sys_getluid)
++SYSX(sys_setluid, sys_ni_syscall, sys_setluid)
++SYSX(sys_setublimit, sys_ni_syscall, sys_setublimit)
++SYSX(sys_ubstat, sys_ni_syscall, sys_ubstat)
+diff -upr linux-2.6.16.orig/arch/powerpc/kernel/time.c linux-2.6.16-026test015/arch/powerpc/kernel/time.c
+--- linux-2.6.16.orig/arch/powerpc/kernel/time.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/powerpc/kernel/time.c 2006-07-04 14:41:38.000000000 +0400
+@@ -431,12 +431,14 @@ void timer_interrupt(struct pt_regs * re
+ int next_dec;
+ int cpu = smp_processor_id();
+ unsigned long ticks;
++ struct ve_struct *ve;
+
+ #ifdef CONFIG_PPC32
+ if (atomic_read(&ppc_n_lost_interrupts) != 0)
+ do_IRQ(regs);
+ #endif
+
++ ve = set_exec_env(get_ve0());
+ irq_enter();
+
+ profile_tick(CPU_PROFILING, regs);
+@@ -496,6 +498,7 @@ void timer_interrupt(struct pt_regs * re
+ #endif
+
+ irq_exit();
++ (void)set_exec_env(ve);
+ }
+
+ void wakeup_decrementer(void)
+diff -upr linux-2.6.16.orig/arch/powerpc/mm/fault.c linux-2.6.16-026test015/arch/powerpc/mm/fault.c
+--- linux-2.6.16.orig/arch/powerpc/mm/fault.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/powerpc/mm/fault.c 2006-07-04 14:41:37.000000000 +0400
+@@ -307,7 +307,6 @@ good_area:
+ * make sure we exit gracefully rather than endlessly redo
+ * the fault.
+ */
+- survive:
+ switch (handle_mm_fault(mm, vma, address, is_write)) {
+
+ case VM_FAULT_MINOR:
+@@ -351,14 +350,12 @@ bad_area_nosemaphore:
+ */
+ out_of_memory:
+ up_read(&mm->mmap_sem);
+- if (current->pid == 1) {
+- yield();
+- down_read(&mm->mmap_sem);
+- goto survive;
+- }
+- printk("VM: killing process %s\n", current->comm);
+ if (user_mode(regs))
+- do_exit(SIGKILL);
++ /*
++		 * 0-order allocations always succeed unless something
++		 * really fatal happens: beancounter overdraft or OOM. Den
++ */
++ force_sig(SIGKILL, current);
+ return SIGKILL;
+
+ do_sigbus:
+diff -upr linux-2.6.16.orig/arch/powerpc/mm/init_64.c linux-2.6.16-026test015/arch/powerpc/mm/init_64.c
+--- linux-2.6.16.orig/arch/powerpc/mm/init_64.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/powerpc/mm/init_64.c 2006-07-04 14:41:37.000000000 +0400
+@@ -225,7 +225,8 @@ void pgtable_cache_init(void)
+ pgtable_cache[i] = kmem_cache_create(name,
+ size, size,
+ SLAB_HWCACHE_ALIGN |
+- SLAB_MUST_HWCACHE_ALIGN,
++ SLAB_MUST_HWCACHE_ALIGN |
++ SLAB_UBC | SLAB_NO_CHARGE,
+ zero_ctor,
+ NULL);
+ if (! pgtable_cache[i])
+diff -upr linux-2.6.16.orig/arch/powerpc/mm/mem.c linux-2.6.16-026test015/arch/powerpc/mm/mem.c
+--- linux-2.6.16.orig/arch/powerpc/mm/mem.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/powerpc/mm/mem.c 2006-07-04 14:41:38.000000000 +0400
+@@ -222,6 +222,7 @@ void show_mem(void)
+ printk("%ld pages shared\n", shared);
+ printk("%ld pages swap cached\n", cached);
+ }
++EXPORT_SYMBOL(show_mem);
+
+ /*
+ * Initialize the bootmem system and give it all the memory we
+diff -upr linux-2.6.16.orig/arch/powerpc/mm/pgtable_32.c linux-2.6.16-026test015/arch/powerpc/mm/pgtable_32.c
+--- linux-2.6.16.orig/arch/powerpc/mm/pgtable_32.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/powerpc/mm/pgtable_32.c 2006-07-04 14:41:37.000000000 +0400
+@@ -85,7 +85,8 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
+ {
+ pgd_t *ret;
+
+- ret = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, PGDIR_ORDER);
++ ret = (pgd_t *)__get_free_pages(GFP_KERNEL_UBC | __GFP_SOFT_UBC |
++ __GFP_ZERO, PGDIR_ORDER);
+ return ret;
+ }
+
+@@ -119,6 +120,7 @@ struct page *pte_alloc_one(struct mm_str
+ #else
+ gfp_t flags = GFP_KERNEL | __GFP_REPEAT;
+ #endif
++ flags |= (__GFP_UBC | __GFP_SOFT_UBC);
+
+ ptepage = alloc_pages(flags, 0);
+ if (ptepage)
+diff -upr linux-2.6.16.orig/arch/powerpc/platforms/powermac/setup.c linux-2.6.16-026test015/arch/powerpc/platforms/powermac/setup.c
+--- linux-2.6.16.orig/arch/powerpc/platforms/powermac/setup.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/powerpc/platforms/powermac/setup.c 2006-07-04 14:41:36.000000000 +0400
+@@ -456,11 +456,23 @@ static int pmac_pm_finish(suspend_state_
+ return 0;
+ }
+
++static int pmac_pm_valid(suspend_state_t state)
++{
++ switch (state) {
++ case PM_SUSPEND_DISK:
++ return 1;
++ /* can't do any other states via generic mechanism yet */
++ default:
++ return 0;
++ }
++}
++
+ static struct pm_ops pmac_pm_ops = {
+ .pm_disk_mode = PM_DISK_SHUTDOWN,
+ .prepare = pmac_pm_prepare,
+ .enter = pmac_pm_enter,
+ .finish = pmac_pm_finish,
++ .valid = pmac_pm_valid,
+ };
+
+ #endif /* CONFIG_SOFTWARE_SUSPEND */
+diff -upr linux-2.6.16.orig/arch/ppc/Kconfig linux-2.6.16-026test015/arch/ppc/Kconfig
+--- linux-2.6.16.orig/arch/ppc/Kconfig 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/ppc/Kconfig 2006-07-04 14:41:39.000000000 +0400
+@@ -920,6 +920,7 @@ config NR_CPUS
+ config HIGHMEM
+ bool "High memory support"
+
++source "kernel/Kconfig.fairsched"
+ source kernel/Kconfig.hz
+ source kernel/Kconfig.preempt
+ source "mm/Kconfig"
+@@ -1394,6 +1395,10 @@ source "arch/powerpc/oprofile/Kconfig"
+
+ source "arch/ppc/Kconfig.debug"
+
++source "kernel/Kconfig.openvz"
++
+ source "security/Kconfig"
+
++source "kernel/ub/Kconfig"
++
+ source "crypto/Kconfig"
+diff -upr linux-2.6.16.orig/arch/ppc/kernel/misc.S linux-2.6.16-026test015/arch/ppc/kernel/misc.S
+--- linux-2.6.16.orig/arch/ppc/kernel/misc.S 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/ppc/kernel/misc.S 2006-07-04 14:41:37.000000000 +0400
+@@ -1004,7 +1004,7 @@ _GLOBAL(_get_SP)
+ * Create a kernel thread
+ * kernel_thread(fn, arg, flags)
+ */
+-_GLOBAL(kernel_thread)
++_GLOBAL(ppc_kernel_thread)
+ stwu r1,-16(r1)
+ stw r30,8(r1)
+ stw r31,12(r1)
+diff -upr linux-2.6.16.orig/arch/ppc/kernel/time.c linux-2.6.16-026test015/arch/ppc/kernel/time.c
+--- linux-2.6.16.orig/arch/ppc/kernel/time.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/ppc/kernel/time.c 2006-07-04 14:41:38.000000000 +0400
+@@ -58,6 +58,8 @@
+ #include <linux/init.h>
+ #include <linux/profile.h>
+
++#include <ub/beancounter.h>
++
+ #include <asm/io.h>
+ #include <asm/nvram.h>
+ #include <asm/cache.h>
+@@ -136,10 +138,14 @@ void timer_interrupt(struct pt_regs * re
+ unsigned long cpu = smp_processor_id();
+ unsigned jiffy_stamp = last_jiffy_stamp(cpu);
+ extern void do_IRQ(struct pt_regs *);
++ struct ve_struct *ve;
++ struct user_beancounter *ub;
+
+ if (atomic_read(&ppc_n_lost_interrupts) != 0)
+ do_IRQ(regs);
+
++ ve = set_exec_env(get_ve0());
++ ub = set_exec_ub(get_ub0());
+ irq_enter();
+
+ while ((next_dec = tb_ticks_per_jiffy - tb_delta(&jiffy_stamp)) <= 0) {
+@@ -192,6 +198,8 @@ void timer_interrupt(struct pt_regs * re
+ ppc_md.heartbeat();
+
+ irq_exit();
++ (void)set_exec_ub(ub);
++ (void)set_exec_env(ve);
+ }
+
+ /*
+diff -upr linux-2.6.16.orig/arch/ppc/mm/fault.c linux-2.6.16-026test015/arch/ppc/mm/fault.c
+--- linux-2.6.16.orig/arch/ppc/mm/fault.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/ppc/mm/fault.c 2006-07-04 14:41:37.000000000 +0400
+@@ -247,7 +247,6 @@ good_area:
+ * make sure we exit gracefully rather than endlessly redo
+ * the fault.
+ */
+- survive:
+ switch (handle_mm_fault(mm, vma, address, is_write)) {
+ case VM_FAULT_MINOR:
+ current->min_flt++;
+@@ -290,14 +289,12 @@ bad_area:
+ */
+ out_of_memory:
+ up_read(&mm->mmap_sem);
+- if (current->pid == 1) {
+- yield();
+- down_read(&mm->mmap_sem);
+- goto survive;
+- }
+- printk("VM: killing process %s\n", current->comm);
+ if (user_mode(regs))
+- do_exit(SIGKILL);
++ /*
++		 * 0-order allocations always succeed unless something
++		 * really fatal happens: beancounter overdraft or OOM. Den
++ */
++ force_sig(SIGKILL, current);
+ return SIGKILL;
+
+ do_sigbus:
+diff -upr linux-2.6.16.orig/arch/ppc/mm/init.c linux-2.6.16-026test015/arch/ppc/mm/init.c
+--- linux-2.6.16.orig/arch/ppc/mm/init.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/ppc/mm/init.c 2006-07-04 14:41:38.000000000 +0400
+@@ -132,6 +132,7 @@ void show_mem(void)
+ printk("%d pages shared\n",shared);
+ printk("%d pages swap cached\n",cached);
+ }
++EXPORT_SYMBOL(show_mem);
+
+ /* Free up now-unused memory */
+ static void free_sec(unsigned long start, unsigned long end, const char *name)
+diff -upr linux-2.6.16.orig/arch/ppc/mm/pgtable.c linux-2.6.16-026test015/arch/ppc/mm/pgtable.c
+--- linux-2.6.16.orig/arch/ppc/mm/pgtable.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/ppc/mm/pgtable.c 2006-07-04 14:41:37.000000000 +0400
+@@ -84,7 +84,8 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
+ {
+ pgd_t *ret;
+
+- ret = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, PGDIR_ORDER);
++ ret = (pgd_t *)__get_free_pages(GFP_KERNEL_UBC | __GFP_SOFT_UBC |
++ __GFP_ZERO, PGDIR_ORDER);
+ return ret;
+ }
+
+@@ -118,6 +119,7 @@ struct page *pte_alloc_one(struct mm_str
+ #else
+ gfp_t flags = GFP_KERNEL | __GFP_REPEAT;
+ #endif
++ flags |= (__GFP_UBC | __GFP_SOFT_UBC);
+
+ ptepage = alloc_pages(flags, 0);
+ if (ptepage)
+diff -upr linux-2.6.16.orig/arch/s390/Kconfig linux-2.6.16-026test015/arch/s390/Kconfig
+--- linux-2.6.16.orig/arch/s390/Kconfig 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/s390/Kconfig 2006-07-04 14:41:37.000000000 +0400
+@@ -472,8 +472,12 @@ source "arch/s390/oprofile/Kconfig"
+
+ source "arch/s390/Kconfig.debug"
+
++source "kernel/Kconfig.openvz"
++
+ source "security/Kconfig"
+
+ source "crypto/Kconfig"
+
+ source "lib/Kconfig"
++
++source "kernel/ub/Kconfig"
+diff -upr linux-2.6.16.orig/arch/s390/kernel/process.c linux-2.6.16-026test015/arch/s390/kernel/process.c
+--- linux-2.6.16.orig/arch/s390/kernel/process.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/s390/kernel/process.c 2006-07-04 14:41:38.000000000 +0400
+@@ -164,9 +164,10 @@ void show_regs(struct pt_regs *regs)
+ struct task_struct *tsk = current;
+
+ printk("CPU: %d %s\n", task_thread_info(tsk)->cpu, print_tainted());
+- printk("Process %s (pid: %d, task: %p, ksp: %p)\n",
+- current->comm, current->pid, (void *) tsk,
+- (void *) tsk->thread.ksp);
++ printk("Process %s (pid: %d, veid: %d, task: %p, ksp: %p)\n",
++ current->comm, current->pid,
++ VEID(VE_TASK_INFO(current)->owner_env),
++ (void *) tsk, (void *) tsk->thread.ksp);
+
+ show_registers(regs);
+ /* Show stack backtrace if pt_regs is from kernel mode */
+@@ -187,6 +188,13 @@ int kernel_thread(int (*fn)(void *), voi
+ {
+ struct pt_regs regs;
+
++ if (!ve_is_super(get_exec_env())) {
++ /* Don't allow kernel_thread() inside VE */
++ printk("kernel_thread call inside VE\n");
++ dump_stack();
++ return -EPERM;
++ }
++
+ memset(&regs, 0, sizeof(regs));
+ regs.psw.mask = PSW_KERNEL_BITS | PSW_MASK_IO | PSW_MASK_EXT;
+ regs.psw.addr = (unsigned long) kernel_thread_starter | PSW_ADDR_AMODE;
+diff -upr linux-2.6.16.orig/arch/s390/kernel/s390_ext.c linux-2.6.16-026test015/arch/s390/kernel/s390_ext.c
+--- linux-2.6.16.orig/arch/s390/kernel/s390_ext.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/s390/kernel/s390_ext.c 2006-07-04 14:41:38.000000000 +0400
+@@ -114,7 +114,9 @@ void do_extint(struct pt_regs *regs, uns
+ {
+ ext_int_info_t *p;
+ int index;
++ struct ve_struct *envid;
+
++ envid = set_exec_env(get_ve0());
+ irq_enter();
+ asm volatile ("mc 0,0");
+ if (S390_lowcore.int_clock >= S390_lowcore.jiffy_timer)
+@@ -132,6 +134,7 @@ void do_extint(struct pt_regs *regs, uns
+ }
+ }
+ irq_exit();
++ (void)set_exec_env(envid);
+ }
+
+ EXPORT_SYMBOL(register_external_interrupt);
+diff -upr linux-2.6.16.orig/arch/s390/kernel/smp.c linux-2.6.16-026test015/arch/s390/kernel/smp.c
+--- linux-2.6.16.orig/arch/s390/kernel/smp.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/s390/kernel/smp.c 2006-07-04 14:41:38.000000000 +0400
+@@ -526,6 +526,17 @@ int __devinit start_secondary(void *cpuv
+ {
+ /* Setup the cpu */
+ cpu_init();
++
++#ifdef CONFIG_VE
++ /* TSC reset. kill whatever might rely on old values */
++ VE_TASK_INFO(current)->wakeup_stamp = 0;
++ /*
++ * Cosmetic: sleep_time won't be changed afterwards for the idle
++ * thread; keep it 0 rather than -cycles.
++ */
++ VE_TASK_INFO(idle)->sleep_time = 0;
++#endif
++
+ preempt_disable();
+ /* init per CPU timer */
+ init_cpu_timer();
+@@ -834,6 +845,11 @@ void __init smp_prepare_cpus(unsigned in
+ for_each_cpu(cpu)
+ if (cpu != smp_processor_id())
+ smp_create_idle(cpu);
++
++#ifdef CONFIG_VE
++ /* TSC reset. kill whatever might rely on old values */
++ VE_TASK_INFO(current)->wakeup_stamp = 0;
++#endif
+ }
+
+ void __devinit smp_prepare_boot_cpu(void)
+diff -upr linux-2.6.16.orig/arch/s390/kernel/syscalls.S linux-2.6.16-026test015/arch/s390/kernel/syscalls.S
+--- linux-2.6.16.orig/arch/s390/kernel/syscalls.S 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/s390/kernel/syscalls.S 2006-07-04 14:41:37.000000000 +0400
+@@ -312,3 +312,12 @@ SYSCALL(sys_faccessat,sys_faccessat,sys_
+ SYSCALL(sys_pselect6,sys_pselect6,compat_sys_pselect6_wrapper)
+ SYSCALL(sys_ppoll,sys_ppoll,compat_sys_ppoll_wrapper)
+ SYSCALL(sys_unshare,sys_unshare,sys_unshare_wrapper)
++
++.rept 410-(.-sys_call_table)/4
++ NI_SYSCALL
++.endr
++
++SYSCALL(sys_getluid, sys_getluid, sys_ni_syscall) /* 410 */
++SYSCALL(sys_setluid, sys_setluid, sys_ni_syscall)
++SYSCALL(sys_setublimit, sys_setublimit, sys_ni_syscall)
++SYSCALL(sys_ubstat, sys_ubstat, sys_ni_syscall)
+diff -upr linux-2.6.16.orig/arch/s390/mm/fault.c linux-2.6.16-026test015/arch/s390/mm/fault.c
+--- linux-2.6.16.orig/arch/s390/mm/fault.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/s390/mm/fault.c 2006-07-04 14:41:37.000000000 +0400
+@@ -61,17 +61,9 @@ void bust_spinlocks(int yes)
+ if (yes) {
+ oops_in_progress = 1;
+ } else {
+- int loglevel_save = console_loglevel;
+ console_unblank();
+ oops_in_progress = 0;
+- /*
+- * OK, the message is on the console. Now we call printk()
+- * without oops_in_progress set so that printk will give klogd
+- * a poke. Hold onto your hats...
+- */
+- console_loglevel = 15;
+- printk(" ");
+- console_loglevel = loglevel_save;
++ wake_up_klogd();
+ }
+ }
+
+diff -upr linux-2.6.16.orig/arch/s390/mm/init.c linux-2.6.16-026test015/arch/s390/mm/init.c
+--- linux-2.6.16.orig/arch/s390/mm/init.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/s390/mm/init.c 2006-07-04 14:41:38.000000000 +0400
+@@ -89,6 +89,7 @@ void show_mem(void)
+ printk("%d pages shared\n",shared);
+ printk("%d pages swap cached\n",cached);
+ }
++EXPORT_SYMBOL(show_mem);
+
+ /* References to section boundaries */
+
+diff -upr linux-2.6.16.orig/arch/sh/kernel/kgdb_stub.c linux-2.6.16-026test015/arch/sh/kernel/kgdb_stub.c
+--- linux-2.6.16.orig/arch/sh/kernel/kgdb_stub.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/sh/kernel/kgdb_stub.c 2006-07-04 14:41:38.000000000 +0400
+@@ -412,7 +412,7 @@ static struct task_struct *get_thread(in
+ if (pid == PID_MAX) pid = 0;
+
+ /* First check via PID */
+- thread = find_task_by_pid(pid);
++ thread = find_task_by_pid_all(pid);
+
+ if (thread)
+ return thread;
+diff -upr linux-2.6.16.orig/arch/sh64/kernel/process.c linux-2.6.16-026test015/arch/sh64/kernel/process.c
+--- linux-2.6.16.orig/arch/sh64/kernel/process.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/sh64/kernel/process.c 2006-07-04 14:41:38.000000000 +0400
+@@ -906,7 +906,7 @@ asids_proc_info(char *buf, char **start,
+ int len=0;
+ struct task_struct *p;
+ read_lock(&tasklist_lock);
+- for_each_process(p) {
++ for_each_process_ve(p) {
+ int pid = p->pid;
+ struct mm_struct *mm;
+ if (!pid) continue;
+diff -upr linux-2.6.16.orig/arch/sparc64/kernel/pci_iommu.c linux-2.6.16-026test015/arch/sparc64/kernel/pci_iommu.c
+--- linux-2.6.16.orig/arch/sparc64/kernel/pci_iommu.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/sparc64/kernel/pci_iommu.c 2006-07-04 14:41:36.000000000 +0400
+@@ -219,7 +219,7 @@ static inline void iommu_free_ctx(struct
+ * DMA for PCI device PDEV. Return non-NULL cpu-side address if
+ * successful and set *DMA_ADDRP to the PCI side dma address.
+ */
+-void *pci_alloc_consistent(struct pci_dev *pdev, size_t size, dma_addr_t *dma_addrp)
++void *__pci_alloc_consistent(struct pci_dev *pdev, size_t size, dma_addr_t *dma_addrp, gfp_t gfp)
+ {
+ struct pcidev_cookie *pcp;
+ struct pci_iommu *iommu;
+@@ -233,7 +233,7 @@ void *pci_alloc_consistent(struct pci_de
+ if (order >= 10)
+ return NULL;
+
+- first_page = __get_free_pages(GFP_ATOMIC, order);
++ first_page = __get_free_pages(gfp, order);
+ if (first_page == 0UL)
+ return NULL;
+ memset((char *)first_page, 0, PAGE_SIZE << order);
+diff -upr linux-2.6.16.orig/arch/sparc64/kernel/setup.c linux-2.6.16-026test015/arch/sparc64/kernel/setup.c
+--- linux-2.6.16.orig/arch/sparc64/kernel/setup.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/sparc64/kernel/setup.c 2006-07-04 14:41:38.000000000 +0400
+@@ -156,7 +156,7 @@ int prom_callback(long *args)
+ pte_t *ptep;
+ pte_t pte;
+
+- for_each_process(p) {
++ for_each_process_all(p) {
+ mm = p->mm;
+ if (CTX_NRBITS(mm->context) == ctx)
+ break;
+diff -upr linux-2.6.16.orig/arch/sparc64/kernel/sparc64_ksyms.c linux-2.6.16-026test015/arch/sparc64/kernel/sparc64_ksyms.c
+--- linux-2.6.16.orig/arch/sparc64/kernel/sparc64_ksyms.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/sparc64/kernel/sparc64_ksyms.c 2006-07-04 14:41:36.000000000 +0400
+@@ -221,7 +221,7 @@ EXPORT_SYMBOL(insl);
+ EXPORT_SYMBOL(ebus_chain);
+ EXPORT_SYMBOL(isa_chain);
+ EXPORT_SYMBOL(pci_memspace_mask);
+-EXPORT_SYMBOL(pci_alloc_consistent);
++EXPORT_SYMBOL(__pci_alloc_consistent);
+ EXPORT_SYMBOL(pci_free_consistent);
+ EXPORT_SYMBOL(pci_map_single);
+ EXPORT_SYMBOL(pci_unmap_single);
+diff -upr linux-2.6.16.orig/arch/sparc64/lib/checksum.S linux-2.6.16-026test015/arch/sparc64/lib/checksum.S
+--- linux-2.6.16.orig/arch/sparc64/lib/checksum.S 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/sparc64/lib/checksum.S 2006-07-04 14:41:36.000000000 +0400
+@@ -165,8 +165,9 @@ csum_partial_end_cruft:
+ sll %g1, 8, %g1
+ or %o5, %g1, %o4
+
+-1: add %o2, %o4, %o2
++1: addcc %o2, %o4, %o2
++ addc %g0, %o2, %o2
+
+ csum_partial_finish:
+ retl
+- mov %o2, %o0
++ srl %o2, 0, %o0
+diff -upr linux-2.6.16.orig/arch/sparc64/lib/csum_copy.S linux-2.6.16-026test015/arch/sparc64/lib/csum_copy.S
+--- linux-2.6.16.orig/arch/sparc64/lib/csum_copy.S 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/sparc64/lib/csum_copy.S 2006-07-04 14:41:36.000000000 +0400
+@@ -221,11 +221,12 @@ FUNC_NAME: /* %o0=src, %o1=dst, %o2=len
+ sll %g1, 8, %g1
+ or %o5, %g1, %o4
+
+-1: add %o3, %o4, %o3
++1: addcc %o3, %o4, %o3
++ addc %g0, %o3, %o3
+
+ 70:
+ retl
+- mov %o3, %o0
++ srl %o3, 0, %o0
+
+ 95: mov 0, GLOBAL_SPARE
+ brlez,pn %o2, 4f
+diff -upr linux-2.6.16.orig/arch/um/drivers/mconsole_kern.c linux-2.6.16-026test015/arch/um/drivers/mconsole_kern.c
+--- linux-2.6.16.orig/arch/um/drivers/mconsole_kern.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/um/drivers/mconsole_kern.c 2006-07-04 14:41:38.000000000 +0400
+@@ -600,7 +600,7 @@ static void do_stack_trace(struct mc_req
+
+ from = current;
+
+- to = find_task_by_pid(pid_requested);
++ to = find_task_by_pid_all(pid_requested);
+ if((to == NULL) || (pid_requested == 0)) {
+ mconsole_reply(req, "Couldn't find that pid", 1, 0);
+ return;
+diff -upr linux-2.6.16.orig/arch/um/kernel/skas/process_kern.c linux-2.6.16-026test015/arch/um/kernel/skas/process_kern.c
+--- linux-2.6.16.orig/arch/um/kernel/skas/process_kern.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/um/kernel/skas/process_kern.c 2006-07-04 14:41:38.000000000 +0400
+@@ -197,7 +197,7 @@ void kill_off_processes_skas(void)
+ int pid, me;
+
+ me = os_getpid();
+- for_each_process(p){
++ for_each_process_all(p){
+ if(p->mm == NULL)
+ continue;
+
+diff -upr linux-2.6.16.orig/arch/um/kernel/tt/process_kern.c linux-2.6.16-026test015/arch/um/kernel/tt/process_kern.c
+--- linux-2.6.16.orig/arch/um/kernel/tt/process_kern.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/um/kernel/tt/process_kern.c 2006-07-04 14:41:38.000000000 +0400
+@@ -301,7 +301,7 @@ void kill_off_processes_tt(void)
+ int me;
+
+ me = os_getpid();
+- for_each_process(p){
++ for_each_process_all(p){
+ if(p->thread.mode.tt.extern_pid != me)
+ os_kill_process(p->thread.mode.tt.extern_pid, 0);
+ }
+@@ -444,7 +444,7 @@ int is_valid_pid(int pid)
+ struct task_struct *task;
+
+ read_lock(&tasklist_lock);
+- for_each_process(task){
++ for_each_process_all(task){
+ if(task->thread.mode.tt.extern_pid == pid){
+ read_unlock(&tasklist_lock);
+ return(1);
+diff -upr linux-2.6.16.orig/arch/x86_64/Kconfig linux-2.6.16-026test015/arch/x86_64/Kconfig
+--- linux-2.6.16.orig/arch/x86_64/Kconfig 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/x86_64/Kconfig 2006-07-04 14:41:39.000000000 +0400
+@@ -246,6 +246,8 @@ config SCHED_SMT
+ cost of slightly increased overhead in some places. If unsure say
+ N here.
+
++source "kernel/Kconfig.fairsched"
++
+ source "kernel/Kconfig.preempt"
+
+ config NUMA
+@@ -588,8 +590,12 @@ endmenu
+
+ source "arch/x86_64/Kconfig.debug"
+
++source "kernel/Kconfig.openvz"
++
+ source "security/Kconfig"
+
+ source "crypto/Kconfig"
+
+ source "lib/Kconfig"
++
++source "kernel/ub/Kconfig"
+diff -upr linux-2.6.16.orig/arch/x86_64/boot/compressed/head.S linux-2.6.16-026test015/arch/x86_64/boot/compressed/head.S
+--- linux-2.6.16.orig/arch/x86_64/boot/compressed/head.S 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/x86_64/boot/compressed/head.S 2006-07-04 14:41:39.000000000 +0400
+@@ -34,7 +34,7 @@
+ startup_32:
+ cld
+ cli
+- movl $(__KERNEL_DS),%eax
++ movl $(__BOOT_DS),%eax
+ movl %eax,%ds
+ movl %eax,%es
+ movl %eax,%fs
+@@ -76,7 +76,7 @@ startup_32:
+ jnz 3f
+ addl $8,%esp
+ xorl %ebx,%ebx
+- ljmp $(__KERNEL_CS), $__PHYSICAL_START
++ ljmp $(__BOOT_CS), $__PHYSICAL_START
+
+ /*
+ * We come here, if we were loaded high.
+@@ -104,7 +104,7 @@ startup_32:
+ popl %eax # hcount
+ movl $__PHYSICAL_START,%edi
+ cli # make sure we don't get interrupted
+- ljmp $(__KERNEL_CS), $0x1000 # and jump to the move routine
++ ljmp $(__BOOT_CS), $0x1000 # and jump to the move routine
+
+ /*
+ * Routine (template) for moving the decompressed kernel in place,
+@@ -127,7 +127,7 @@ move_routine_start:
+ movsl
+ movl %ebx,%esi # Restore setup pointer
+ xorl %ebx,%ebx
+- ljmp $(__KERNEL_CS), $__PHYSICAL_START
++ ljmp $(__BOOT_CS), $__PHYSICAL_START
+ move_routine_end:
+
+
+@@ -137,5 +137,5 @@ user_stack:
+ .fill 4096,4,0
+ stack_start:
+ .long user_stack+4096
+- .word __KERNEL_DS
++ .word __BOOT_DS
+
+diff -upr linux-2.6.16.orig/arch/x86_64/boot/setup.S linux-2.6.16-026test015/arch/x86_64/boot/setup.S
+--- linux-2.6.16.orig/arch/x86_64/boot/setup.S 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/x86_64/boot/setup.S 2006-07-04 14:41:39.000000000 +0400
+@@ -729,7 +729,7 @@ flush_instr:
+ subw $DELTA_INITSEG, %si
+ shll $4, %esi # Convert to 32-bit pointer
+ # NOTE: For high loaded big kernels we need a
+-# jmpi 0x100000,__KERNEL_CS
++# jmpi 0x100000,__BOOT_CS
+ #
+ # but we yet haven't reloaded the CS register, so the default size
+ # of the target offset still is 16 bit.
+@@ -740,7 +740,7 @@ flush_instr:
+ .byte 0x66, 0xea # prefix + jmpi-opcode
+ code32: .long 0x1000 # will be set to 0x100000
+ # for big kernels
+- .word __KERNEL_CS
++ .word __BOOT_CS
+
+ # Here's a bunch of information about your current kernel..
+ kernel_version: .ascii UTS_RELEASE
+diff -upr linux-2.6.16.orig/arch/x86_64/ia32/Makefile linux-2.6.16-026test015/arch/x86_64/ia32/Makefile
+--- linux-2.6.16.orig/arch/x86_64/ia32/Makefile 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/x86_64/ia32/Makefile 2006-07-04 14:41:36.000000000 +0400
+@@ -27,5 +27,5 @@ $(obj)/vsyscall-sysenter.so $(obj)/vsysc
+ $(obj)/vsyscall-%.so: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE
+ $(call if_changed,syscall)
+
+-AFLAGS_vsyscall-sysenter.o = -m32
+-AFLAGS_vsyscall-syscall.o = -m32
++AFLAGS_vsyscall-sysenter.o = -m32 -Wa,-32
++AFLAGS_vsyscall-syscall.o = -m32 -Wa,-32
+diff -upr linux-2.6.16.orig/arch/x86_64/ia32/ia32_aout.c linux-2.6.16-026test015/arch/x86_64/ia32/ia32_aout.c
+--- linux-2.6.16.orig/arch/x86_64/ia32/ia32_aout.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/x86_64/ia32/ia32_aout.c 2006-07-04 14:41:38.000000000 +0400
+@@ -347,14 +347,14 @@ static int load_aout_binary(struct linux
+ if ((ex.a_text & 0xfff || ex.a_data & 0xfff) &&
+ (N_MAGIC(ex) != NMAGIC) && (jiffies-error_time2) > 5*HZ)
+ {
+- printk(KERN_NOTICE "executable not page aligned\n");
++ ve_printk(VE_LOG, KERN_NOTICE "executable not page aligned\n");
+ error_time2 = jiffies;
+ }
+
+ if ((fd_offset & ~PAGE_MASK) != 0 &&
+ (jiffies-error_time) > 5*HZ)
+ {
+- printk(KERN_WARNING
++ ve_printk(VE_LOG, KERN_WARNING
+ "fd_offset is not page aligned. Please convert program: %s\n",
+ bprm->file->f_dentry->d_name.name);
+ error_time = jiffies;
+@@ -467,7 +467,7 @@ static int load_aout_library(struct file
+ static unsigned long error_time;
+ if ((jiffies-error_time) > 5*HZ)
+ {
+- printk(KERN_WARNING
++ ve_printk(VE_LOG, KERN_WARNING
+ "N_TXTOFF is not page aligned. Please convert library: %s\n",
+ file->f_dentry->d_name.name);
+ error_time = jiffies;
+diff -upr linux-2.6.16.orig/arch/x86_64/ia32/ia32_binfmt.c linux-2.6.16-026test015/arch/x86_64/ia32/ia32_binfmt.c
+--- linux-2.6.16.orig/arch/x86_64/ia32/ia32_binfmt.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/x86_64/ia32/ia32_binfmt.c 2006-07-04 14:41:39.000000000 +0400
+@@ -27,12 +27,14 @@
+ #include <asm/ia32.h>
+ #include <asm/vsyscall32.h>
+
++#include <ub/ub_vmpages.h>
++
+ #define ELF_NAME "elf/i386"
+
+ #define AT_SYSINFO 32
+ #define AT_SYSINFO_EHDR 33
+
+-int sysctl_vsyscall32 = 1;
++int sysctl_vsyscall32 = 0;
+
+ #define ARCH_DLINFO do { \
+ if (sysctl_vsyscall32) { \
+@@ -347,9 +349,15 @@ int ia32_setup_arg_pages(struct linux_bi
+ bprm->loader += stack_base;
+ bprm->exec += stack_base;
+
++ ret = -ENOMEM;
++ if (ub_memory_charge(mm, IA32_STACK_TOP -
++ (PAGE_MASK & (unsigned long)bprm->p),
++ VM_STACK_FLAGS, NULL, UB_SOFT))
++ goto err_charge;
++
+ mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+ if (!mpnt)
+- return -ENOMEM;
++ goto err_alloc;
+
+ memset(mpnt, 0, sizeof(*mpnt));
+
+@@ -366,11 +374,8 @@ int ia32_setup_arg_pages(struct linux_bi
+ mpnt->vm_flags = VM_STACK_FLAGS;
+ mpnt->vm_page_prot = (mpnt->vm_flags & VM_EXEC) ?
+ PAGE_COPY_EXEC : PAGE_COPY;
+- if ((ret = insert_vm_struct(mm, mpnt))) {
+- up_write(&mm->mmap_sem);
+- kmem_cache_free(vm_area_cachep, mpnt);
+- return ret;
+- }
++ if ((ret = insert_vm_struct(mm, mpnt)))
++ goto err_insert;
+ mm->stack_vm = mm->total_vm = vma_pages(mpnt);
+ }
+
+@@ -385,6 +390,16 @@ int ia32_setup_arg_pages(struct linux_bi
+ up_write(&mm->mmap_sem);
+
+ return 0;
++
++err_insert:
++ up_write(&mm->mmap_sem);
++ kmem_cache_free(vm_area_cachep, mpnt);
++err_alloc:
++ ub_memory_uncharge(mm, IA32_STACK_TOP -
++ (PAGE_MASK & (unsigned long)bprm->p),
++ VM_STACK_FLAGS, NULL);
++err_charge:
++ return ret;
+ }
+ EXPORT_SYMBOL(ia32_setup_arg_pages);
+
+diff -upr linux-2.6.16.orig/arch/x86_64/ia32/ia32_signal.c linux-2.6.16-026test015/arch/x86_64/ia32/ia32_signal.c
+--- linux-2.6.16.orig/arch/x86_64/ia32/ia32_signal.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/x86_64/ia32/ia32_signal.c 2006-07-04 14:41:39.000000000 +0400
+@@ -39,7 +39,6 @@
+
+ #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
+
+-asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset);
+ void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
+
+ int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from)
+@@ -118,22 +117,17 @@ asmlinkage long
+ sys32_sigsuspend(int history0, int history1, old_sigset_t mask,
+ struct pt_regs *regs)
+ {
+- sigset_t saveset;
+-
+ mask &= _BLOCKABLE;
+ spin_lock_irq(&current->sighand->siglock);
+- saveset = current->blocked;
++ current->saved_sigmask = current->blocked;
+ siginitset(&current->blocked, mask);
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+
+- regs->rax = -EINTR;
+- while (1) {
+- current->state = TASK_INTERRUPTIBLE;
+- schedule();
+- if (do_signal(regs, &saveset))
+- return -EINTR;
+- }
++ current->state = TASK_INTERRUPTIBLE;
++ schedule();
++ set_thread_flag(TIF_RESTORE_SIGMASK);
++ return -ERESTARTNOHAND;
+ }
+
+ asmlinkage long
+@@ -510,11 +504,11 @@ int ia32_setup_frame(int sig, struct k_s
+ current->comm, current->pid, frame, regs->rip, frame->pretcode);
+ #endif
+
+- return 1;
++ return 0;
+
+ give_sigsegv:
+ force_sigsegv(sig, current);
+- return 0;
++ return -EFAULT;
+ }
+
+ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+@@ -606,9 +600,9 @@ int ia32_setup_rt_frame(int sig, struct
+ current->comm, current->pid, frame, regs->rip, frame->pretcode);
+ #endif
+
+- return 1;
++ return 0;
+
+ give_sigsegv:
+ force_sigsegv(sig, current);
+- return 0;
++ return -EFAULT;
+ }
+diff -upr linux-2.6.16.orig/arch/x86_64/ia32/sys_ia32.c linux-2.6.16-026test015/arch/x86_64/ia32/sys_ia32.c
+--- linux-2.6.16.orig/arch/x86_64/ia32/sys_ia32.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/x86_64/ia32/sys_ia32.c 2006-07-04 14:41:38.000000000 +0400
+@@ -527,7 +527,7 @@ int sys32_ni_syscall(int call)
+ static char lastcomm[sizeof(me->comm)];
+
+ if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) {
+- printk(KERN_INFO "IA32 syscall %d from %s not implemented\n",
++ ve_printk(VE_LOG, KERN_INFO "IA32 syscall %d from %s not implemented\n",
+ call, me->comm);
+ strncpy(lastcomm, me->comm, sizeof(lastcomm));
+ }
+@@ -890,13 +890,13 @@ asmlinkage long sys32_olduname(struct ol
+
+ down_read(&uts_sem);
+
+- error = __copy_to_user(&name->sysname,&system_utsname.sysname,__OLD_UTS_LEN);
++ error = __copy_to_user(&name->sysname,&ve_utsname.sysname,__OLD_UTS_LEN);
+ __put_user(0,name->sysname+__OLD_UTS_LEN);
+- __copy_to_user(&name->nodename,&system_utsname.nodename,__OLD_UTS_LEN);
++ __copy_to_user(&name->nodename,&ve_utsname.nodename,__OLD_UTS_LEN);
+ __put_user(0,name->nodename+__OLD_UTS_LEN);
+- __copy_to_user(&name->release,&system_utsname.release,__OLD_UTS_LEN);
++ __copy_to_user(&name->release,&ve_utsname.release,__OLD_UTS_LEN);
+ __put_user(0,name->release+__OLD_UTS_LEN);
+- __copy_to_user(&name->version,&system_utsname.version,__OLD_UTS_LEN);
++ __copy_to_user(&name->version,&ve_utsname.version,__OLD_UTS_LEN);
+ __put_user(0,name->version+__OLD_UTS_LEN);
+ {
+ char *arch = "x86_64";
+@@ -919,7 +919,7 @@ long sys32_uname(struct old_utsname __us
+ if (!name)
+ return -EFAULT;
+ down_read(&uts_sem);
+- err=copy_to_user(name, &system_utsname, sizeof (*name));
++ err=copy_to_user(name, &ve_utsname, sizeof (*name));
+ up_read(&uts_sem);
+ if (personality(current->personality) == PER_LINUX32)
+ err |= copy_to_user(&name->machine, "i686", 5);
+@@ -1005,7 +1005,7 @@ long sys32_vm86_warning(void)
+ struct task_struct *me = current;
+ static char lastcomm[sizeof(me->comm)];
+ if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) {
+- printk(KERN_INFO "%s: vm86 mode not supported on 64 bit kernel\n",
++		ve_printk(VE_LOG, KERN_INFO "%s: vm86 mode not supported on 64 bit kernel\n",
+ me->comm);
+ strncpy(lastcomm, me->comm, sizeof(lastcomm));
+ }
+diff -upr linux-2.6.16.orig/arch/x86_64/ia32/syscall32.c linux-2.6.16-026test015/arch/x86_64/ia32/syscall32.c
+--- linux-2.6.16.orig/arch/x86_64/ia32/syscall32.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/x86_64/ia32/syscall32.c 2006-07-04 14:41:37.000000000 +0400
+@@ -14,6 +14,8 @@
+ #include <asm/tlbflush.h>
+ #include <asm/ia32_unistd.h>
+
++#include <ub/ub_vmpages.h>
++
+ extern unsigned char syscall32_syscall[], syscall32_syscall_end[];
+ extern unsigned char syscall32_sysenter[], syscall32_sysenter_end[];
+ extern int sysctl_vsyscall32;
+@@ -47,32 +49,45 @@ int syscall32_setup_pages(struct linux_b
+ int npages = (VSYSCALL32_END - VSYSCALL32_BASE) >> PAGE_SHIFT;
+ struct vm_area_struct *vma;
+ struct mm_struct *mm = current->mm;
++ unsigned long flags;
+ int ret;
+
++ flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC | VM_MAYWRITE |
++ mm->def_flags;
++
++ ret = -ENOMEM;
++ if (ub_memory_charge(mm, VSYSCALL32_END - VSYSCALL32_BASE,
++ flags, NULL, UB_SOFT))
++ goto err_charge;
++
+ vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+ if (!vma)
+- return -ENOMEM;
++ goto err_alloc;
+
+ memset(vma, 0, sizeof(struct vm_area_struct));
+ /* Could randomize here */
+ vma->vm_start = VSYSCALL32_BASE;
+ vma->vm_end = VSYSCALL32_END;
+ /* MAYWRITE to allow gdb to COW and set breakpoints */
+- vma->vm_flags = VM_READ|VM_EXEC|VM_MAYREAD|VM_MAYEXEC|VM_MAYWRITE;
+- vma->vm_flags |= mm->def_flags;
++ vma->vm_flags = flags;
+ vma->vm_page_prot = protection_map[vma->vm_flags & 7];
+ vma->vm_ops = &syscall32_vm_ops;
+ vma->vm_mm = mm;
+
+ down_write(&mm->mmap_sem);
+- if ((ret = insert_vm_struct(mm, vma))) {
+- up_write(&mm->mmap_sem);
+- kmem_cache_free(vm_area_cachep, vma);
+- return ret;
+- }
++ if ((ret = insert_vm_struct(mm, vma)))
++ goto err_ins;
+ mm->total_vm += npages;
+ up_write(&mm->mmap_sem);
+ return 0;
++
++err_ins:
++ up_write(&mm->mmap_sem);
++ kmem_cache_free(vm_area_cachep, vma);
++err_alloc:
++ ub_memory_uncharge(mm, VSYSCALL32_END - VSYSCALL32_BASE, flags, NULL);
++err_charge:
++ return ret;
+ }
+
+ static int __init init_syscall32(void)
+diff -upr linux-2.6.16.orig/arch/x86_64/kernel/acpi/wakeup.S linux-2.6.16-026test015/arch/x86_64/kernel/acpi/wakeup.S
+--- linux-2.6.16.orig/arch/x86_64/kernel/acpi/wakeup.S 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/x86_64/kernel/acpi/wakeup.S 2006-07-04 14:41:39.000000000 +0400
+@@ -77,7 +77,7 @@ wakeup_code:
+
+ .byte 0x66, 0xea # prefix + jmpi-opcode
+ .long wakeup_32 - __START_KERNEL_map
+- .word __KERNEL_CS
++ .word __BOOT_CS
+
+ .code32
+ wakeup_32:
+@@ -96,13 +96,13 @@ wakeup_32:
+ jnc bogus_cpu
+ movl %edx,%edi
+
+- movw $__KERNEL_DS, %ax
++ movw $__BOOT_DS, %ax
+ movw %ax, %ds
+ movw %ax, %es
+ movw %ax, %fs
+ movw %ax, %gs
+
+- movw $__KERNEL_DS, %ax
++ movw $__BOOT_DS, %ax
+ movw %ax, %ss
+
+ mov $(wakeup_stack - __START_KERNEL_map), %esp
+@@ -187,7 +187,7 @@ reach_compatibility_mode:
+
+ wakeup_jumpvector:
+ .long wakeup_long64 - __START_KERNEL_map
+- .word __KERNEL_CS
++ .word __BOOT_CS
+
+ .code64
+
+diff -upr linux-2.6.16.orig/arch/x86_64/kernel/apic.c linux-2.6.16-026test015/arch/x86_64/kernel/apic.c
+--- linux-2.6.16.orig/arch/x86_64/kernel/apic.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/x86_64/kernel/apic.c 2006-07-04 14:41:38.000000000 +0400
+@@ -941,6 +941,7 @@ void smp_local_timer_interrupt(struct pt
+ */
+ void smp_apic_timer_interrupt(struct pt_regs *regs)
+ {
++ struct ve_struct *ve;
+ /*
+ * the NMI deadlock-detector uses this.
+ */
+@@ -957,9 +958,11 @@ void smp_apic_timer_interrupt(struct pt_
+ * interrupt lock, which is the WrongThing (tm) to do.
+ */
+ exit_idle();
++ ve = set_exec_env(get_ve0());
+ irq_enter();
+ smp_local_timer_interrupt(regs);
+ irq_exit();
++ (void)set_exec_env(ve);
+ }
+
+ /*
+diff -upr linux-2.6.16.orig/arch/x86_64/kernel/entry.S linux-2.6.16-026test015/arch/x86_64/kernel/entry.S
+--- linux-2.6.16.orig/arch/x86_64/kernel/entry.S 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/x86_64/kernel/entry.S 2006-07-04 14:41:39.000000000 +0400
+@@ -180,6 +180,10 @@ rff_trace:
+ *
+ * XXX if we had a free scratch register we could save the RSP into the stack frame
+ * and report it properly in ps. Unfortunately we haven't.
++ *
++ * When user can change the frames always force IRET. That is because
++ * it deals with uncanonical addresses better. SYSRET has trouble
++ * with them due to bugs in both AMD and Intel CPUs.
+ */
+
+ ENTRY(system_call)
+@@ -244,7 +248,7 @@ sysret_careful:
+ /* Handle a signal */
+ sysret_signal:
+ sti
+- testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
++ testl $(_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
+ jz 1f
+
+ /* Really a signal */
+@@ -254,7 +258,10 @@ sysret_signal:
+ xorl %esi,%esi # oldset -> arg2
+ call ptregscall_common
+ 1: movl $_TIF_NEED_RESCHED,%edi
+- jmp sysret_check
++ /* Use IRET because user could have changed frame. This
++ works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
++ cli
++ jmp int_with_check
+
+ badsys:
+ movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
+@@ -274,13 +281,9 @@ tracesys:
+ ja 1f
+ movq %r10,%rcx /* fixup for C */
+ call *sys_call_table(,%rax,8)
+- movq %rax,RAX-ARGOFFSET(%rsp)
+-1: SAVE_REST
+- movq %rsp,%rdi
+- call syscall_trace_leave
+- RESTORE_TOP_OF_STACK %rbx
+- RESTORE_REST
+- jmp ret_from_sys_call
++1: movq %rax,RAX-ARGOFFSET(%rsp)
++ /* Use IRET because user could have changed frame */
++ jmp int_ret_from_sys_call
+ CFI_ENDPROC
+
+ /*
+@@ -350,7 +353,7 @@ int_very_careful:
+ jmp int_restore_rest
+
+ int_signal:
+- testl $(_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_SINGLESTEP),%edx
++ testl $(_TIF_NOTIFY_RESUME|_TIF_RESTORE_SIGMASK|_TIF_SIGPENDING|_TIF_SINGLESTEP),%edx
+ jz 1f
+ movq %rsp,%rdi # &ptregs -> arg1
+ xorl %esi,%esi # oldset -> arg2
+@@ -408,25 +411,9 @@ ENTRY(stub_execve)
+ CFI_ADJUST_CFA_OFFSET -8
+ CFI_REGISTER rip, r11
+ SAVE_REST
+- movq %r11, %r15
+- CFI_REGISTER rip, r15
+ FIXUP_TOP_OF_STACK %r11
+ call sys_execve
+- GET_THREAD_INFO(%rcx)
+- bt $TIF_IA32,threadinfo_flags(%rcx)
+- CFI_REMEMBER_STATE
+- jc exec_32bit
+ RESTORE_TOP_OF_STACK %r11
+- movq %r15, %r11
+- CFI_REGISTER rip, r11
+- RESTORE_REST
+- pushq %r11
+- CFI_ADJUST_CFA_OFFSET 8
+- CFI_REL_OFFSET rip, 0
+- ret
+-
+-exec_32bit:
+- CFI_RESTORE_STATE
+ movq %rax,RAX(%rsp)
+ RESTORE_REST
+ jmp int_ret_from_sys_call
+@@ -574,7 +561,7 @@ retint_careful:
+ jmp retint_check
+
+ retint_signal:
+- testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
++ testl $(_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
+ jz retint_swapgs
+ sti
+ SAVE_REST
+@@ -845,7 +832,7 @@ ENTRY(kernel_thread)
+ xorl %r9d,%r9d
+
+ # clone now
+- call do_fork
++ call do_fork_kthread
+ movq %rax,RAX(%rsp)
+ xorl %edi,%edi
+
+diff -upr linux-2.6.16.orig/arch/x86_64/kernel/head.S linux-2.6.16-026test015/arch/x86_64/kernel/head.S
+--- linux-2.6.16.orig/arch/x86_64/kernel/head.S 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/x86_64/kernel/head.S 2006-07-04 14:41:39.000000000 +0400
+@@ -40,7 +40,7 @@ startup_32:
+ */
+
+ /* Initialize the %ds segment register */
+- movl $__KERNEL_DS,%eax
++ movl $__BOOT_DS,%eax
+ movl %eax,%ds
+
+ /* Load new GDT with the 64bit segments using 32bit descriptor */
+@@ -183,7 +183,14 @@ startup_64:
+ /* esi is pointer to real mode structure with interesting info.
+ pass it to C */
+ movl %esi, %edi
+-
++
++ /* Switch to __KERNEL_CS. The segment is the same, but selector
++ * is different. */
++ pushq $__KERNEL_CS
++ pushq $switch_cs
++ lretq
++switch_cs:
++
+ /* Finally jump to run C code and to be on real kernel address
+ * Since we are running on identity-mapped space we have to jump
+ * to the full 64bit address , this is only possible as indirect
+@@ -243,7 +250,7 @@ pGDT32:
+ .org 0xf10
+ ljumpvector:
+ .long startup_64-__START_KERNEL_map
+- .word __KERNEL_CS
++ .word __BOOT_CS
+
+ ENTRY(stext)
+ ENTRY(_stext)
+@@ -355,21 +362,30 @@ gdt:
+ .align PAGE_SIZE
+
+ /* The TLS descriptors are currently at a different place compared to i386.
+- Hopefully nobody expects them at a fixed place (Wine?) */
++ Hopefully nobody expects them at a fixed place (Wine?)
++   Descriptors rearranged to place 32bit and TLS selectors in the same
++ places, because it is really necessary. sysret/exit mandates order
++ of kernel/user cs/ds, so we have to extend gdt.
++*/
+
+ ENTRY(cpu_gdt_table)
+- .quad 0x0000000000000000 /* NULL descriptor */
+- .quad 0x0 /* unused */
+- .quad 0x00af9a000000ffff /* __KERNEL_CS */
+- .quad 0x00cf92000000ffff /* __KERNEL_DS */
+- .quad 0x00cffa000000ffff /* __USER32_CS */
+- .quad 0x00cff2000000ffff /* __USER_DS, __USER32_DS */
+- .quad 0x00affa000000ffff /* __USER_CS */
+- .quad 0x00cf9a000000ffff /* __KERNEL32_CS */
+- .quad 0,0 /* TSS */
+- .quad 0,0 /* LDT */
+- .quad 0,0,0 /* three TLS descriptors */
+- .quad 0 /* unused */
++ .quad 0x0000000000000000 /* 0 NULL descriptor */
++ .quad 0x0 /* 1 unused */
++ .quad 0x00af9a000000ffff /* 2 __BOOT_CS */
++ .quad 0x00cf92000000ffff /* 3 __BOOT_DS */
++ .quad 0,0 /* 4,5 TSS */
++ .quad 0,0,0 /* 6-8 three TLS descriptors */
++ .quad 0,0 /* 9,10 LDT */
++ .quad 0x00cf9a000000ffff /* 11 __KERNEL32_CS */
++ .quad 0x00af9a000000ffff /* 12 __KERNEL_CS */
++ .quad 0x00cf92000000ffff /* 13 __KERNEL_DS */
++ .quad 0x00cffa000000ffff /* 14 __USER32_CS */
++ .quad 0x00cff2000000ffff /* 15 __USER_DS, __USER32_DS */
++ .quad 0x00affa000000ffff /* 16 __USER_CS */
++ .quad 0x0 /* 17 unused */
++ .quad 0,0,0,0,0,0
++ .quad 0,0,0,0,0,0,0,0
++
+ gdt_end:
+ /* asm/segment.h:GDT_ENTRIES must match this */
+ /* This should be a multiple of the cache line size */
+diff -upr linux-2.6.16.orig/arch/x86_64/kernel/irq.c linux-2.6.16-026test015/arch/x86_64/kernel/irq.c
+--- linux-2.6.16.orig/arch/x86_64/kernel/irq.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/x86_64/kernel/irq.c 2006-07-04 14:41:38.000000000 +0400
+@@ -98,12 +98,15 @@ asmlinkage unsigned int do_IRQ(struct pt
+ {
+ /* high bits used in ret_from_ code */
+ unsigned irq = regs->orig_rax & 0xff;
++ struct ve_struct *ve;
+
+ exit_idle();
++ ve = set_exec_env(get_ve0());
+ irq_enter();
+
+ __do_IRQ(irq, regs);
+ irq_exit();
++ (void)set_exec_env(ve);
+
+ return 1;
+ }
+diff -upr linux-2.6.16.orig/arch/x86_64/kernel/ldt.c linux-2.6.16-026test015/arch/x86_64/kernel/ldt.c
+--- linux-2.6.16.orig/arch/x86_64/kernel/ldt.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/x86_64/kernel/ldt.c 2006-07-04 14:41:39.000000000 +0400
+@@ -16,6 +16,7 @@
+ #include <linux/smp_lock.h>
+ #include <linux/vmalloc.h>
+ #include <linux/slab.h>
++#include <linux/module.h>
+
+ #include <asm/uaccess.h>
+ #include <asm/system.h>
+@@ -23,6 +24,8 @@
+ #include <asm/desc.h>
+ #include <asm/proto.h>
+
++#include <ub/ub_mem.h>
++
+ #ifdef CONFIG_SMP /* avoids "defined but not used" warnig */
+ static void flush_ldt(void *null)
+ {
+@@ -42,9 +45,9 @@ static int alloc_ldt(mm_context_t *pc, u
+ oldsize = pc->size;
+ mincount = (mincount+511)&(~511);
+ if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
+- newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
++ newldt = ub_vmalloc(mincount*LDT_ENTRY_SIZE);
+ else
+- newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
++ newldt = ub_kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
+
+ if (!newldt)
+ return -ENOMEM;
+@@ -109,6 +112,7 @@ int init_new_context(struct task_struct
+ }
+ return retval;
+ }
++EXPORT_SYMBOL_GPL(init_new_context);
+
+ /*
+ *
+diff -upr linux-2.6.16.orig/arch/x86_64/kernel/nmi.c linux-2.6.16-026test015/arch/x86_64/kernel/nmi.c
+--- linux-2.6.16.orig/arch/x86_64/kernel/nmi.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/x86_64/kernel/nmi.c 2006-07-04 14:41:37.000000000 +0400
+@@ -522,6 +522,7 @@ static __kprobes int dummy_nmi_callback(
+ }
+
+ static nmi_callback_t nmi_callback = dummy_nmi_callback;
++static nmi_callback_t nmi_ipi_callback = dummy_nmi_callback;
+
+ asmlinkage __kprobes void do_nmi(struct pt_regs * regs, long error_code)
+ {
+@@ -531,9 +532,21 @@ asmlinkage __kprobes void do_nmi(struct
+ add_pda(__nmi_count,1);
+ if (!rcu_dereference(nmi_callback)(regs, cpu))
+ default_do_nmi(regs);
++
++ nmi_ipi_callback(regs, cpu);
+ nmi_exit();
+ }
+
++void set_nmi_ipi_callback(nmi_callback_t callback)
++{
++ nmi_ipi_callback = callback;
++}
++
++void unset_nmi_ipi_callback(void)
++{
++ nmi_ipi_callback = dummy_nmi_callback;
++}
++
+ void set_nmi_callback(nmi_callback_t callback)
+ {
+ rcu_assign_pointer(nmi_callback, callback);
+diff -upr linux-2.6.16.orig/arch/x86_64/kernel/pci-gart.c linux-2.6.16-026test015/arch/x86_64/kernel/pci-gart.c
+--- linux-2.6.16.orig/arch/x86_64/kernel/pci-gart.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/x86_64/kernel/pci-gart.c 2006-07-04 14:41:36.000000000 +0400
+@@ -114,10 +114,6 @@ static unsigned long alloc_iommu(int siz
+ static void free_iommu(unsigned long offset, int size)
+ {
+ unsigned long flags;
+- if (size == 1) {
+- clear_bit(offset, iommu_gart_bitmap);
+- return;
+- }
+ spin_lock_irqsave(&iommu_bitmap_lock, flags);
+ __clear_bit_string(iommu_gart_bitmap, offset, size);
+ spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
+diff -upr linux-2.6.16.orig/arch/x86_64/kernel/process.c linux-2.6.16-026test015/arch/x86_64/kernel/process.c
+--- linux-2.6.16.orig/arch/x86_64/kernel/process.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/x86_64/kernel/process.c 2006-07-04 14:41:39.000000000 +0400
+@@ -54,6 +54,11 @@
+ #include <asm/idle.h>
+
+ asmlinkage extern void ret_from_fork(void);
++asmlinkage extern void int_ret_from_sys_call(void);
++asmlinkage extern void execve(void);
++EXPORT_SYMBOL_GPL(ret_from_fork);
++EXPORT_SYMBOL_GPL(int_ret_from_sys_call);
++EXPORT_SYMBOL_GPL(execve);
+
+ unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
+
+@@ -303,7 +308,8 @@ void __show_regs(struct pt_regs * regs)
+ (int)strcspn(system_utsname.version, " "),
+ system_utsname.version);
+ printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
+- printk_address(regs->rip);
++ if (decode_call_traces)
++ printk_address(regs->rip);
+ printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
+ regs->eflags);
+ printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
+@@ -340,11 +346,26 @@ void __show_regs(struct pt_regs * regs)
+
+ void show_regs(struct pt_regs *regs)
+ {
+- printk("CPU %d:", smp_processor_id());
++	printk("CPU %d, VCPU %d:%d", smp_processor_id(), task_vsched_id(current), task_cpu(current));
+ __show_regs(regs);
+ show_trace(&regs->rsp);
+ }
+
++void smp_show_regs(struct pt_regs *regs, void *data)
++{
++ static DEFINE_SPINLOCK(show_regs_lock);
++
++ if (regs == NULL)
++ return;
++
++ bust_spinlocks(1);
++ spin_lock(&show_regs_lock);
++ printk("----------- IPI show regs -----------\n");
++ show_regs(regs);
++ spin_unlock(&show_regs_lock);
++ bust_spinlocks(0);
++}
++
+ /*
+ * Free current thread data structures etc..
+ */
+@@ -527,8 +548,6 @@ __switch_to(struct task_struct *prev_p,
+ int cpu = smp_processor_id();
+ struct tss_struct *tss = &per_cpu(init_tss, cpu);
+
+- unlazy_fpu(prev_p);
+-
+ /*
+ * Reload esp0, LDT and the page table pointer:
+ */
+@@ -591,6 +610,12 @@ __switch_to(struct task_struct *prev_p,
+ prev->userrsp = read_pda(oldrsp);
+ write_pda(oldrsp, next->userrsp);
+ write_pda(pcurrent, next_p);
++
++ /* This must be here to ensure both math_state_restore() and
++ kernel_fpu_begin() work consistently.
++ And the AMD workaround requires it to be after DS reload. */
++ unlazy_fpu(prev_p);
++
+ write_pda(kernelstack,
+ task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
+
+@@ -841,3 +866,20 @@ unsigned long arch_align_stack(unsigned
+ sp -= get_random_int() % 8192;
+ return sp & ~0xf;
+ }
++
++long do_fork_kthread(unsigned long clone_flags,
++ unsigned long stack_start,
++ struct pt_regs *regs,
++ unsigned long stack_size,
++ int __user *parent_tidptr,
++ int __user *child_tidptr)
++{
++ if (ve_is_super(get_exec_env()))
++ return do_fork(clone_flags, stack_start, regs, stack_size,
++ parent_tidptr, child_tidptr);
++
++ /* Don't allow kernel_thread() inside VE */
++ printk("kernel_thread call inside VE\n");
++ dump_stack();
++ return -EPERM;
++}
+diff -upr linux-2.6.16.orig/arch/x86_64/kernel/ptrace.c linux-2.6.16-026test015/arch/x86_64/kernel/ptrace.c
+--- linux-2.6.16.orig/arch/x86_64/kernel/ptrace.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/x86_64/kernel/ptrace.c 2006-07-04 14:41:39.000000000 +0400
+@@ -300,6 +300,15 @@ static unsigned long getreg(struct task_
+ return child->thread.fs;
+ case offsetof(struct user_regs_struct, gs_base):
+ return child->thread.gs;
++ case offsetof(struct user_regs_struct, cs):
++ if (test_tsk_thread_flag(child, TIF_SYSCALL_TRACE)) {
++ val = get_stack_long(child, regno - sizeof(struct pt_regs));
++ if (val == __USER_CS)
++ return 0x33;
++ if (val == __USER32_CS)
++ return 0x23;
++ }
++ /* fall through */
+ default:
+ regno = regno - sizeof(struct pt_regs);
+ val = get_stack_long(child, regno);
+@@ -581,8 +590,10 @@ static void syscall_trace(struct pt_regs
+ current_thread_info()->flags, current->ptrace);
+ #endif
+
++ set_pn_state(current, (regs->rax != -ENOSYS) ? PN_STOP_LEAVE : PN_STOP_ENTRY);
+ ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
+ ? 0x80 : 0));
++ clear_pn_state(current);
+ /*
+ * this isn't the same as continuing with a signal, but it will do
+ * for normal use. strace only continues with a signal if the
+diff -upr linux-2.6.16.orig/arch/x86_64/kernel/setup.c linux-2.6.16-026test015/arch/x86_64/kernel/setup.c
+--- linux-2.6.16.orig/arch/x86_64/kernel/setup.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/x86_64/kernel/setup.c 2006-07-04 14:41:36.000000000 +0400
+@@ -909,6 +909,10 @@ static int __init init_amd(struct cpuinf
+ if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58))
+ set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
+
++ /* Enable workaround for FXSAVE leak */
++ if (c->x86 >= 6)
++ set_bit(X86_FEATURE_FXSAVE_LEAK, &c->x86_capability);
++
+ r = get_model_name(c);
+ if (!r) {
+ switch (c->x86) {
+diff -upr linux-2.6.16.orig/arch/x86_64/kernel/setup64.c linux-2.6.16-026test015/arch/x86_64/kernel/setup64.c
+--- linux-2.6.16.orig/arch/x86_64/kernel/setup64.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/x86_64/kernel/setup64.c 2006-07-04 14:41:39.000000000 +0400
+@@ -290,3 +290,5 @@ void __cpuinit cpu_init (void)
+
+ fpu_init();
+ }
++
++EXPORT_SYMBOL_GPL(cpu_gdt_descr);
+diff -upr linux-2.6.16.orig/arch/x86_64/kernel/signal.c linux-2.6.16-026test015/arch/x86_64/kernel/signal.c
+--- linux-2.6.16.orig/arch/x86_64/kernel/signal.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/x86_64/kernel/signal.c 2006-07-04 14:41:39.000000000 +0400
+@@ -40,37 +40,6 @@ int ia32_setup_frame(int sig, struct k_s
+ sigset_t *set, struct pt_regs * regs);
+
+ asmlinkage long
+-sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize, struct pt_regs *regs)
+-{
+- sigset_t saveset, newset;
+-
+- /* XXX: Don't preclude handling different sized sigset_t's. */
+- if (sigsetsize != sizeof(sigset_t))
+- return -EINVAL;
+-
+- if (copy_from_user(&newset, unewset, sizeof(newset)))
+- return -EFAULT;
+- sigdelsetmask(&newset, ~_BLOCKABLE);
+-
+- spin_lock_irq(&current->sighand->siglock);
+- saveset = current->blocked;
+- current->blocked = newset;
+- recalc_sigpending();
+- spin_unlock_irq(&current->sighand->siglock);
+-#ifdef DEBUG_SIG
+- printk("rt_sigsuspend savset(%lx) newset(%lx) regs(%p) rip(%lx)\n",
+- saveset, newset, regs, regs->rip);
+-#endif
+- regs->rax = -EINTR;
+- while (1) {
+- current->state = TASK_INTERRUPTIBLE;
+- schedule();
+- if (do_signal(regs, &saveset))
+- return -EINTR;
+- }
+-}
+-
+-asmlinkage long
+ sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
+ struct pt_regs *regs)
+ {
+@@ -344,11 +313,11 @@ static int setup_rt_frame(int sig, struc
+ current->comm, current->pid, frame, regs->rip, frame->pretcode);
+ #endif
+
+- return 1;
++ return 0;
+
+ give_sigsegv:
+ force_sigsegv(sig, current);
+- return 0;
++ return -EFAULT;
+ }
+
+ /*
+@@ -411,7 +380,7 @@ handle_signal(unsigned long sig, siginfo
+ #endif
+ ret = setup_rt_frame(sig, ka, info, oldset, regs);
+
+- if (ret) {
++ if (ret == 0) {
+ spin_lock_irq(&current->sighand->siglock);
+ sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask);
+ if (!(ka->sa.sa_flags & SA_NODEFER))
+@@ -428,9 +397,10 @@ handle_signal(unsigned long sig, siginfo
+ * want to handle. Thus you cannot kill init even with a SIGKILL even by
+ * mistake.
+ */
+-int do_signal(struct pt_regs *regs, sigset_t *oldset)
++static void do_signal(struct pt_regs *regs)
+ {
+ struct k_sigaction ka;
++ sigset_t *oldset;
+ siginfo_t info;
+ int signr;
+
+@@ -441,12 +411,14 @@ int do_signal(struct pt_regs *regs, sigs
+ * if so.
+ */
+ if (!user_mode(regs))
+- return 1;
++ return;
+
+- if (try_to_freeze())
++ if (try_to_freeze() && !signal_pending(current))
+ goto no_signal;
+
+- if (!oldset)
++ if (test_thread_flag(TIF_RESTORE_SIGMASK))
++ oldset = &current->saved_sigmask;
++ else
+ oldset = &current->blocked;
+
+ signr = get_signal_to_deliver(&info, &ka, regs, NULL);
+@@ -460,7 +432,15 @@ int do_signal(struct pt_regs *regs, sigs
+ set_debugreg(current->thread.debugreg7, 7);
+
+ /* Whee! Actually deliver the signal. */
+- return handle_signal(signr, &info, &ka, oldset, regs);
++ if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
++ /* a signal was successfully delivered; the saved
++ * sigmask will have been stored in the signal frame,
++ * and will be restored by sigreturn, so we can simply
++ * clear the TIF_RESTORE_SIGMASK flag */
++ if (test_thread_flag(TIF_RESTORE_SIGMASK))
++ clear_thread_flag(TIF_RESTORE_SIGMASK);
++ }
++ return;
+ }
+
+ no_signal:
+@@ -481,10 +461,16 @@ int do_signal(struct pt_regs *regs, sigs
+ regs->rip -= 2;
+ }
+ }
+- return 0;
++
++ /* if there's no signal to deliver, we just put the saved sigmask
++ * back */
++ if (test_thread_flag(TIF_RESTORE_SIGMASK)) {
++ clear_thread_flag(TIF_RESTORE_SIGMASK);
++ sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
++ }
+ }
+
+-void do_notify_resume(struct pt_regs *regs, sigset_t *oldset, __u32 thread_info_flags)
++void do_notify_resume(struct pt_regs *regs, sigset_t *unused, __u32 thread_info_flags)
+ {
+ #ifdef DEBUG_SIG
+ printk("do_notify_resume flags:%x rip:%lx rsp:%lx caller:%lx pending:%lx\n",
+@@ -498,8 +484,8 @@ void do_notify_resume(struct pt_regs *re
+ }
+
+ /* deal with pending signal delivery */
+- if (thread_info_flags & _TIF_SIGPENDING)
+- do_signal(regs,oldset);
++ if (thread_info_flags & (_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK))
++ do_signal(regs);
+ }
+
+ void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
+diff -upr linux-2.6.16.orig/arch/x86_64/kernel/smp.c linux-2.6.16-026test015/arch/x86_64/kernel/smp.c
+--- linux-2.6.16.orig/arch/x86_64/kernel/smp.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/x86_64/kernel/smp.c 2006-07-04 14:41:37.000000000 +0400
+@@ -28,6 +28,7 @@
+ #include <asm/proto.h>
+ #include <asm/apicdef.h>
+ #include <asm/idle.h>
++#include <asm/nmi.h>
+
+ /*
+ * Smarter SMP flushing macros.
+@@ -444,6 +445,84 @@ int smp_call_function (void (*func) (voi
+ return 0;
+ }
+
++static spinlock_t nmi_call_lock = SPIN_LOCK_UNLOCKED;
++static struct nmi_call_data_struct {
++ smp_nmi_function func;
++ void *info;
++ atomic_t started;
++ atomic_t finished;
++ cpumask_t cpus_called;
++ int wait;
++} *nmi_call_data;
++
++static int smp_nmi_callback(struct pt_regs * regs, int cpu)
++{
++ smp_nmi_function func;
++ void *info;
++ int wait;
++
++ func = nmi_call_data->func;
++ info = nmi_call_data->info;
++ wait = nmi_call_data->wait;
++ ack_APIC_irq();
++ /* prevent from calling func() multiple times */
++ if (cpu_test_and_set(cpu, nmi_call_data->cpus_called))
++ return 0;
++ /*
++ * notify initiating CPU that I've grabbed the data and am
++ * about to execute the function
++ */
++ mb();
++ atomic_inc(&nmi_call_data->started);
++ /* at this point the nmi_call_data structure is out of scope */
++ irq_enter();
++ func(regs, info);
++ irq_exit();
++ if (wait)
++ atomic_inc(&nmi_call_data->finished);
++
++ return 0;
++}
++
++int smp_nmi_call_function(smp_nmi_function func, void *info, int wait)
++{
++ struct nmi_call_data_struct data;
++ int cpus;
++
++ cpus = num_online_cpus() - 1;
++ if (!cpus)
++ return 0;
++
++ data.func = func;
++ data.info = info;
++ data.wait = wait;
++ atomic_set(&data.started, 0);
++ atomic_set(&data.finished, 0);
++ cpus_clear(data.cpus_called);
++ /* prevent this cpu from calling func if NMI happens */
++ cpu_set(smp_processor_id(), data.cpus_called);
++
++ if (!spin_trylock(&nmi_call_lock))
++ return -1;
++
++ nmi_call_data = &data;
++ set_nmi_ipi_callback(smp_nmi_callback);
++ mb();
++
++ /* Send a message to all other CPUs and wait for them to respond */
++ send_IPI_allbutself(APIC_DM_NMI);
++ while (atomic_read(&data.started) != cpus)
++ barrier();
++
++ unset_nmi_ipi_callback();
++ if (wait)
++ while (atomic_read(&data.finished) != cpus)
++ barrier();
++ spin_unlock(&nmi_call_lock);
++
++ return 0;
++}
++
+ void smp_stop_cpu(void)
+ {
+ unsigned long flags;
+diff -upr linux-2.6.16.orig/arch/x86_64/kernel/sys_x86_64.c linux-2.6.16-026test015/arch/x86_64/kernel/sys_x86_64.c
+--- linux-2.6.16.orig/arch/x86_64/kernel/sys_x86_64.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/x86_64/kernel/sys_x86_64.c 2006-07-04 14:41:38.000000000 +0400
+@@ -148,7 +148,7 @@ asmlinkage long sys_uname(struct new_uts
+ {
+ int err;
+ down_read(&uts_sem);
+- err = copy_to_user(name, &system_utsname, sizeof (*name));
++ err = copy_to_user(name, &ve_utsname, sizeof (*name));
+ up_read(&uts_sem);
+ if (personality(current->personality) == PER_LINUX32)
+ err |= copy_to_user(&name->machine, "i686", 5);
+diff -upr linux-2.6.16.orig/arch/x86_64/kernel/time.c linux-2.6.16-026test015/arch/x86_64/kernel/time.c
+--- linux-2.6.16.orig/arch/x86_64/kernel/time.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/x86_64/kernel/time.c 2006-07-04 14:41:38.000000000 +0400
+@@ -66,6 +66,8 @@ unsigned long vxtime_hz = PIT_TICK_RATE;
+ int report_lost_ticks; /* command line option */
+ unsigned long long monotonic_base;
+
++EXPORT_SYMBOL(cpu_khz);
++
+ struct vxtime_data __vxtime __section_vxtime; /* for vsyscalls */
+
+ volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
+diff -upr linux-2.6.16.orig/arch/x86_64/kernel/traps.c linux-2.6.16-026test015/arch/x86_64/kernel/traps.c
+--- linux-2.6.16.orig/arch/x86_64/kernel/traps.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/x86_64/kernel/traps.c 2006-07-04 14:41:38.000000000 +0400
+@@ -30,6 +30,7 @@
+ #include <linux/moduleparam.h>
+ #include <linux/nmi.h>
+ #include <linux/kprobes.h>
++#include <linux/kexec.h>
+
+ #include <asm/system.h>
+ #include <asm/uaccess.h>
+@@ -116,6 +117,9 @@ int printk_address(unsigned long address
+ char *delim = ":";
+ char namebuf[128];
+
++ if (!decode_call_traces)
++ return printk("[<%016lx>]", address);
++
+ symname = kallsyms_lookup(address, &symsize, &offset, &modname, namebuf);
+ if (!symname)
+ return printk("[<%016lx>]", address);
+@@ -208,7 +212,7 @@ void show_trace(unsigned long *stack)
+ do while (cond) { \
+ unsigned long addr = *stack++; \
+ if (kernel_text_address(addr)) { \
+- if (i > 50) { \
++ if (i > 50 && decode_call_traces) { \
+ printk("\n "); \
+ i = 0; \
+ } \
+@@ -290,7 +294,7 @@ void show_stack(struct task_struct *tsk,
+ if (((long) stack & (THREAD_SIZE-1)) == 0)
+ break;
+ }
+- if (i && ((i % 4) == 0))
++ if (i && ((i % 4) == 0) && decode_call_traces)
+ printk("\n ");
+ printk("%016lx ", *stack++);
+ touch_nmi_watchdog();
+@@ -319,10 +323,12 @@ void show_registers(struct pt_regs *regs
+
+ rsp = regs->rsp;
+
+- printk("CPU %d ", cpu);
++ printk("CPU: %d ", cpu);
+ __show_regs(regs);
+- printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
+- cur->comm, cur->pid, task_thread_info(cur), cur);
++ printk("Process %s (pid: %d, veid=%d, threadinfo %p, task %p)\n",
++ cur->comm, cur->pid,
++ VEID(VE_TASK_INFO(current)->owner_env),
++ task_thread_info(cur), cur);
+
+ /*
+ * When in-kernel, we also print out the stack and code at the
+@@ -434,6 +440,8 @@ void __kprobes __die(const char * str, s
+ printk(KERN_ALERT "RIP ");
+ printk_address(regs->rip);
+ printk(" RSP <%016lx>\n", regs->rsp);
++ if (kexec_should_crash(current))
++ crash_kexec(regs);
+ }
+
+ void die(const char * str, struct pt_regs * regs, long err)
+@@ -456,8 +464,11 @@ void __kprobes die_nmi(char *str, struct
+ */
+ printk(str, safe_smp_processor_id());
+ show_registers(regs);
++ if (kexec_should_crash(current))
++ crash_kexec(regs);
+ if (panic_on_timeout || panic_on_oops)
+ panic("nmi watchdog");
++ smp_nmi_call_function(smp_show_regs, NULL, 1);
+ printk("console shuts up ...\n");
+ oops_end(flags);
+ do_exit(SIGSEGV);
+diff -upr linux-2.6.16.orig/arch/x86_64/mm/fault.c linux-2.6.16-026test015/arch/x86_64/mm/fault.c
+--- linux-2.6.16.orig/arch/x86_64/mm/fault.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/x86_64/mm/fault.c 2006-07-04 14:41:38.000000000 +0400
+@@ -41,27 +41,6 @@
+ #define PF_RSVD (1<<3)
+ #define PF_INSTR (1<<4)
+
+-void bust_spinlocks(int yes)
+-{
+- int loglevel_save = console_loglevel;
+- if (yes) {
+- oops_in_progress = 1;
+- } else {
+-#ifdef CONFIG_VT
+- unblank_screen();
+-#endif
+- oops_in_progress = 0;
+- /*
+- * OK, the message is on the console. Now we call printk()
+- * without oops_in_progress set so that printk will give klogd
+- * a poke. Hold onto your hats...
+- */
+- console_loglevel = 15; /* NMI oopser may have shut the console up */
+- printk(" ");
+- console_loglevel = loglevel_save;
+- }
+-}
+-
+ /* Sometimes the CPU reports invalid exceptions on prefetch.
+ Check that here and ignore.
+ Opcode checker based on code by Richard Brunner */
+@@ -293,7 +272,7 @@ static int vmalloc_fault(unsigned long a
+ }
+
+ int page_fault_trace = 0;
+-int exception_trace = 1;
++int exception_trace = 0;
+
+ /*
+ * This routine handles page faults. It determines the address,
+@@ -322,7 +301,7 @@ asmlinkage void __kprobes do_page_fault(
+ local_irq_enable();
+
+ if (unlikely(page_fault_trace))
+- printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
++ ve_printk(VE_LOG, "pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
+ regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code);
+
+ tsk = current;
+@@ -372,7 +351,6 @@ asmlinkage void __kprobes do_page_fault(
+ if (unlikely(in_atomic() || !mm))
+ goto bad_area_nosemaphore;
+
+- again:
+ /* When running in the kernel we expect faults to occur only to
+ * addresses in user space. All other faults represent errors in the
+ * kernel and should generate an OOPS. Unfortunatly, in the case of an
+@@ -476,7 +454,7 @@ bad_area_nosemaphore:
+ return;
+
+ if (exception_trace && unhandled_signal(tsk, SIGSEGV)) {
+- printk(
++ ve_printk(VE_LOG,
+ "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
+ tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
+ tsk->comm, tsk->pid, address, regs->rip,
+@@ -526,8 +504,10 @@ no_context:
+ else
+ printk(KERN_ALERT "Unable to handle kernel paging request");
+ printk(" at %016lx RIP: \n" KERN_ALERT,address);
+- printk_address(regs->rip);
+- printk("\n");
++ if (decode_call_traces) {
++ printk_address(regs->rip);
++ printk("\n");
++ }
+ dump_pagetable(address);
+ tsk->thread.cr2 = address;
+ tsk->thread.trap_no = 14;
+@@ -544,13 +524,14 @@ no_context:
+ */
+ out_of_memory:
+ up_read(&mm->mmap_sem);
+- if (current->pid == 1) {
+- yield();
+- goto again;
+- }
+- printk("VM: killing process %s\n", tsk->comm);
+- if (error_code & 4)
+- do_exit(SIGKILL);
++ if (error_code & 4) {
++ /*
++		 * A 0-order allocation always succeeds unless something really
++		 * fatal happened: beancounter overdraft or OOM.
++ */
++ force_sig(SIGKILL, tsk);
++ return;
++ }
+ goto no_context;
+
+ do_sigbus:
+diff -upr linux-2.6.16.orig/arch/x86_64/mm/init.c linux-2.6.16-026test015/arch/x86_64/mm/init.c
+--- linux-2.6.16.orig/arch/x86_64/mm/init.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/x86_64/mm/init.c 2006-07-04 14:41:38.000000000 +0400
+@@ -89,6 +89,7 @@ void show_mem(void)
+ printk(KERN_INFO "%lu pages shared\n",shared);
+ printk(KERN_INFO "%lu pages swap cached\n",cached);
+ }
++EXPORT_SYMBOL(show_mem);
+
+ /* References to section boundaries */
+
+diff -upr linux-2.6.16.orig/block/elevator.c linux-2.6.16-026test015/block/elevator.c
+--- linux-2.6.16.orig/block/elevator.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/block/elevator.c 2006-07-04 14:41:38.000000000 +0400
+@@ -314,6 +314,7 @@ void elv_insert(request_queue_t *q, stru
+ {
+ struct list_head *pos;
+ unsigned ordseq;
++ int unplug_it = 1;
+
+ rq->q = q;
+
+@@ -378,6 +379,11 @@ void elv_insert(request_queue_t *q, stru
+ }
+
+ list_add_tail(&rq->queuelist, pos);
++ /*
++ * most requeues happen because of a busy condition, don't
++ * force unplug of the queue for that case.
++ */
++ unplug_it = 0;
+ break;
+
+ default:
+@@ -386,7 +392,7 @@ void elv_insert(request_queue_t *q, stru
+ BUG();
+ }
+
+- if (blk_queue_plugged(q)) {
++ if (unplug_it && blk_queue_plugged(q)) {
+ int nrq = q->rq.count[READ] + q->rq.count[WRITE]
+ - q->in_flight;
+
+@@ -676,7 +682,7 @@ void elv_unregister(struct elevator_type
+ * Iterate every thread in the process to remove the io contexts.
+ */
+ read_lock(&tasklist_lock);
+- do_each_thread(g, p) {
++ do_each_thread_all(g, p) {
+ struct io_context *ioc = p->io_context;
+ if (ioc && ioc->cic) {
+ ioc->cic->exit(ioc->cic);
+@@ -688,7 +694,7 @@ void elv_unregister(struct elevator_type
+ ioc->aic->dtor(ioc->aic);
+ ioc->aic = NULL;
+ }
+- } while_each_thread(g, p);
++ } while_each_thread_all(g, p);
+ read_unlock(&tasklist_lock);
+
+ spin_lock_irq(&elv_list_lock);
+diff -upr linux-2.6.16.orig/block/genhd.c linux-2.6.16-026test015/block/genhd.c
+--- linux-2.6.16.orig/block/genhd.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/block/genhd.c 2006-07-04 14:41:38.000000000 +0400
+@@ -16,9 +16,8 @@
+ #include <linux/kobj_map.h>
+ #include <linux/buffer_head.h>
+
+-#define MAX_PROBE_HASH 255 /* random */
+-
+-static struct subsystem block_subsys;
++struct subsystem block_subsys;
++EXPORT_SYMBOL(block_subsys);
+
+ static DECLARE_MUTEX(block_subsys_sem);
+
+@@ -30,108 +29,29 @@ static struct blk_major_name {
+ struct blk_major_name *next;
+ int major;
+ char name[16];
+-} *major_names[MAX_PROBE_HASH];
++} *major_names[BLKDEV_MAJOR_HASH_SIZE];
+
+ /* index in the above - for now: assume no multimajor ranges */
+ static inline int major_to_index(int major)
+ {
+- return major % MAX_PROBE_HASH;
+-}
+-
+-struct blkdev_info {
+- int index;
+- struct blk_major_name *bd;
+-};
+-
+-/*
+- * iterate over a list of blkdev_info structures. allows
+- * the major_names array to be iterated over from outside this file
+- * must be called with the block_subsys_sem held
+- */
+-void *get_next_blkdev(void *dev)
+-{
+- struct blkdev_info *info;
+-
+- if (dev == NULL) {
+- info = kmalloc(sizeof(*info), GFP_KERNEL);
+- if (!info)
+- goto out;
+- info->index=0;
+- info->bd = major_names[info->index];
+- if (info->bd)
+- goto out;
+- } else {
+- info = dev;
+- }
+-
+- while (info->index < ARRAY_SIZE(major_names)) {
+- if (info->bd)
+- info->bd = info->bd->next;
+- if (info->bd)
+- goto out;
+- /*
+- * No devices on this chain, move to the next
+- */
+- info->index++;
+- info->bd = (info->index < ARRAY_SIZE(major_names)) ?
+- major_names[info->index] : NULL;
+- if (info->bd)
+- goto out;
+- }
+-
+-out:
+- return info;
+-}
+-
+-void *acquire_blkdev_list(void)
+-{
+- down(&block_subsys_sem);
+- return get_next_blkdev(NULL);
+-}
+-
+-void release_blkdev_list(void *dev)
+-{
+- up(&block_subsys_sem);
+- kfree(dev);
++ return major % BLKDEV_MAJOR_HASH_SIZE;
+ }
+
++#ifdef CONFIG_PROC_FS
+
+-/*
+- * Count the number of records in the blkdev_list.
+- * must be called with the block_subsys_sem held
+- */
+-int count_blkdev_list(void)
++void blkdev_show(struct seq_file *f, off_t offset)
+ {
+- struct blk_major_name *n;
+- int i, count;
+-
+- count = 0;
++ struct blk_major_name *dp;
+
+- for (i = 0; i < ARRAY_SIZE(major_names); i++) {
+- for (n = major_names[i]; n; n = n->next)
+- count++;
++ if (offset < BLKDEV_MAJOR_HASH_SIZE) {
++ down(&block_subsys_sem);
++ for (dp = major_names[offset]; dp; dp = dp->next)
++ seq_printf(f, "%3d %s\n", dp->major, dp->name);
++ up(&block_subsys_sem);
+ }
+-
+- return count;
+-}
+-
+-/*
+- * extract the major and name values from a blkdev_info struct
+- * passed in as a void to *dev. Must be called with
+- * block_subsys_sem held
+- */
+-int get_blkdev_info(void *dev, int *major, char **name)
+-{
+- struct blkdev_info *info = dev;
+-
+- if (info->bd == NULL)
+- return 1;
+-
+- *major = info->bd->major;
+- *name = info->bd->name;
+- return 0;
+ }
+
++#endif /* CONFIG_PROC_FS */
+
+ int register_blkdev(unsigned int major, const char *name)
+ {
+@@ -592,7 +512,7 @@ static struct kset_uevent_ops block_ueve
+ };
+
+ /* declare block_subsys. */
+-static decl_subsys(block, &ktype_block, &block_uevent_ops);
++decl_subsys(block, &ktype_block, &block_uevent_ops);
+
+
+ /*
+diff -upr linux-2.6.16.orig/block/ll_rw_blk.c linux-2.6.16-026test015/block/ll_rw_blk.c
+--- linux-2.6.16.orig/block/ll_rw_blk.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/block/ll_rw_blk.c 2006-07-04 14:41:36.000000000 +0400
+@@ -1719,8 +1719,21 @@ void blk_run_queue(struct request_queue
+
+ spin_lock_irqsave(q->queue_lock, flags);
+ blk_remove_plug(q);
+- if (!elv_queue_empty(q))
+- q->request_fn(q);
++
++ /*
++ * Only recurse once to avoid overrunning the stack, let the unplug
++ * handling reinvoke the handler shortly if we already got there.
++ */
++ if (!elv_queue_empty(q)) {
++ if (!test_and_set_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) {
++ q->request_fn(q);
++ clear_bit(QUEUE_FLAG_REENTER, &q->queue_flags);
++ } else {
++ blk_plug_device(q);
++ kblockd_schedule_work(&q->unplug_work);
++ }
++ }
++
+ spin_unlock_irqrestore(q->queue_lock, flags);
+ }
+ EXPORT_SYMBOL(blk_run_queue);
+diff -upr linux-2.6.16.orig/drivers/acpi/processor_perflib.c linux-2.6.16-026test015/drivers/acpi/processor_perflib.c
+--- linux-2.6.16.orig/drivers/acpi/processor_perflib.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/acpi/processor_perflib.c 2006-07-04 14:41:36.000000000 +0400
+@@ -577,6 +577,8 @@ acpi_processor_register_performance(stru
+ return_VALUE(-EBUSY);
+ }
+
++ WARN_ON(!performance);
++
+ pr->performance = performance;
+
+ if (acpi_processor_get_performance_info(pr)) {
+@@ -609,7 +611,8 @@ acpi_processor_unregister_performance(st
+ return_VOID;
+ }
+
+- kfree(pr->performance->states);
++ if (pr->performance)
++ kfree(pr->performance->states);
+ pr->performance = NULL;
+
+ acpi_cpufreq_remove_file(pr);
+diff -upr linux-2.6.16.orig/drivers/base/class.c linux-2.6.16-026test015/drivers/base/class.c
+--- linux-2.6.16.orig/drivers/base/class.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/base/class.c 2006-07-04 14:41:38.000000000 +0400
+@@ -72,8 +72,13 @@ static struct kobj_type ktype_class = {
+ };
+
+ /* Hotplug events for classes go to the class_obj subsys */
+-static decl_subsys(class, &ktype_class, NULL);
++decl_subsys(class, &ktype_class, NULL);
+
++#ifndef CONFIG_VE
++#define visible_class_subsys class_subsys
++#else
++#define visible_class_subsys (*get_exec_env()->class_subsys)
++#endif
+
+ int class_create_file(struct class * cls, const struct class_attribute * attr)
+ {
+@@ -148,7 +153,7 @@ int class_register(struct class * cls)
+ if (error)
+ return error;
+
+- subsys_set_kset(cls, class_subsys);
++ subsys_set_kset(cls, visible_class_subsys);
+
+ error = subsystem_register(&cls->subsys);
+ if (!error) {
+@@ -420,8 +425,13 @@ static struct kset_uevent_ops class_ueve
+ .uevent = class_uevent,
+ };
+
+-static decl_subsys(class_obj, &ktype_class_device, &class_uevent_ops);
++decl_subsys(class_obj, &ktype_class_device, &class_uevent_ops);
+
++#ifndef CONFIG_VE
++#define visible_class_obj_subsys class_obj_subsys
++#else
++#define visible_class_obj_subsys (*get_exec_env()->class_obj_subsys)
++#endif
+
+ static int class_device_add_attrs(struct class_device * cd)
+ {
+@@ -470,7 +480,7 @@ static ssize_t store_uevent(struct class
+
+ void class_device_initialize(struct class_device *class_dev)
+ {
+- kobj_set_kset_s(class_dev, class_obj_subsys);
++ kobj_set_kset_s(class_dev, visible_class_obj_subsys);
+ kobject_init(&class_dev->kobj);
+ INIT_LIST_HEAD(&class_dev->node);
+ }
+@@ -805,12 +815,19 @@ void class_interface_unregister(struct c
+ class_put(parent);
+ }
+
+-
++void prepare_sysfs_classes(void)
++{
++#ifdef CONFIG_VE
++ get_ve0()->class_subsys = &class_subsys;
++ get_ve0()->class_obj_subsys = &class_obj_subsys;
++#endif
++}
+
+ int __init classes_init(void)
+ {
+ int retval;
+
++ prepare_sysfs_classes();
+ retval = subsystem_register(&class_subsys);
+ if (retval)
+ return retval;
+@@ -848,3 +865,6 @@ EXPORT_SYMBOL_GPL(class_device_remove_bi
+
+ EXPORT_SYMBOL_GPL(class_interface_register);
+ EXPORT_SYMBOL_GPL(class_interface_unregister);
++
++EXPORT_SYMBOL(class_subsys);
++EXPORT_SYMBOL(class_obj_subsys);
+diff -upr linux-2.6.16.orig/drivers/base/cpu.c linux-2.6.16-026test015/drivers/base/cpu.c
+--- linux-2.6.16.orig/drivers/base/cpu.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/base/cpu.c 2006-07-04 14:41:36.000000000 +0400
+@@ -141,7 +141,7 @@ int __devinit register_cpu(struct cpu *c
+ return error;
+ }
+
+-struct sys_device *get_cpu_sysdev(int cpu)
++struct sys_device *get_cpu_sysdev(unsigned cpu)
+ {
+ if (cpu < NR_CPUS)
+ return cpu_sys_devices[cpu];
+diff -upr linux-2.6.16.orig/drivers/base/firmware_class.c linux-2.6.16-026test015/drivers/base/firmware_class.c
+--- linux-2.6.16.orig/drivers/base/firmware_class.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/base/firmware_class.c 2006-07-04 14:41:36.000000000 +0400
+@@ -211,18 +211,20 @@ static int
+ fw_realloc_buffer(struct firmware_priv *fw_priv, int min_size)
+ {
+ u8 *new_data;
++ int new_size = fw_priv->alloc_size;
+
+ if (min_size <= fw_priv->alloc_size)
+ return 0;
+
+- new_data = vmalloc(fw_priv->alloc_size + PAGE_SIZE);
++ new_size = ALIGN(min_size, PAGE_SIZE);
++ new_data = vmalloc(new_size);
+ if (!new_data) {
+ printk(KERN_ERR "%s: unable to alloc buffer\n", __FUNCTION__);
+ /* Make sure that we don't keep incomplete data */
+ fw_load_abort(fw_priv);
+ return -ENOMEM;
+ }
+- fw_priv->alloc_size += PAGE_SIZE;
++ fw_priv->alloc_size = new_size;
+ if (fw_priv->fw->data) {
+ memcpy(new_data, fw_priv->fw->data, fw_priv->fw->size);
+ vfree(fw_priv->fw->data);
+diff -upr linux-2.6.16.orig/drivers/base/node.c linux-2.6.16-026test015/drivers/base/node.c
+--- linux-2.6.16.orig/drivers/base/node.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/base/node.c 2006-07-04 14:41:36.000000000 +0400
+@@ -106,7 +106,7 @@ static ssize_t node_read_numastat(struct
+ other_node = 0;
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ struct zone *z = &pg->node_zones[i];
+- for (cpu = 0; cpu < NR_CPUS; cpu++) {
++ for_each_online_cpu(cpu) {
+ struct per_cpu_pageset *ps = zone_pcp(z,cpu);
+ numa_hit += ps->numa_hit;
+ numa_miss += ps->numa_miss;
+diff -upr linux-2.6.16.orig/drivers/block/cciss.c linux-2.6.16-026test015/drivers/block/cciss.c
+--- linux-2.6.16.orig/drivers/block/cciss.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/block/cciss.c 2006-07-04 14:41:36.000000000 +0400
+@@ -1181,6 +1181,53 @@ static int revalidate_allvol(ctlr_info_t
+ return 0;
+ }
+
++static inline void complete_buffers(struct bio *bio, int status)
++{
++ while (bio) {
++ struct bio *xbh = bio->bi_next;
++ int nr_sectors = bio_sectors(bio);
++
++ bio->bi_next = NULL;
++ blk_finished_io(len);
++ bio_endio(bio, nr_sectors << 9, status ? 0 : -EIO);
++ bio = xbh;
++ }
++
++}
++
++static void cciss_softirq_done(struct request *rq)
++{
++ CommandList_struct *cmd = rq->completion_data;
++ ctlr_info_t *h = hba[cmd->ctlr];
++ unsigned long flags;
++ u64bit temp64;
++ int i, ddir;
++
++ if (cmd->Request.Type.Direction == XFER_READ)
++ ddir = PCI_DMA_FROMDEVICE;
++ else
++ ddir = PCI_DMA_TODEVICE;
++
++ /* command did not need to be retried */
++ /* unmap the DMA mapping for all the scatter gather elements */
++ for(i=0; i<cmd->Header.SGList; i++) {
++ temp64.val32.lower = cmd->SG[i].Addr.lower;
++ temp64.val32.upper = cmd->SG[i].Addr.upper;
++ pci_unmap_page(h->pdev, temp64.val, cmd->SG[i].Len, ddir);
++ }
++
++ complete_buffers(rq->bio, rq->errors);
++
++#ifdef CCISS_DEBUG
++ printk("Done with %p\n", rq);
++#endif /* CCISS_DEBUG */
++
++ spin_lock_irqsave(&h->lock, flags);
++ end_that_request_last(rq, rq->errors);
++ cmd_free(h, cmd,1);
++ spin_unlock_irqrestore(&h->lock, flags);
++}
++
+ /* This function will check the usage_count of the drive to be updated/added.
+ * If the usage_count is zero then the drive information will be updated and
+ * the disk will be re-registered with the kernel. If not then it will be
+@@ -1249,6 +1296,8 @@ static void cciss_update_drive_info(int
+
+ blk_queue_max_sectors(disk->queue, 512);
+
++ blk_queue_softirq_done(disk->queue, cciss_softirq_done);
++
+ disk->queue->queuedata = hba[ctlr];
+
+ blk_queue_hardsect_size(disk->queue,
+@@ -2148,20 +2197,6 @@ static void start_io( ctlr_info_t *h)
+ addQ (&(h->cmpQ), c);
+ }
+ }
+-
+-static inline void complete_buffers(struct bio *bio, int status)
+-{
+- while (bio) {
+- struct bio *xbh = bio->bi_next;
+- int nr_sectors = bio_sectors(bio);
+-
+- bio->bi_next = NULL;
+- blk_finished_io(len);
+- bio_endio(bio, nr_sectors << 9, status ? 0 : -EIO);
+- bio = xbh;
+- }
+-
+-}
+ /* Assumes that CCISS_LOCK(h->ctlr) is held. */
+ /* Zeros out the error record and then resends the command back */
+ /* to the controller */
+@@ -2179,39 +2214,6 @@ static inline void resend_cciss_cmd( ctl
+ start_io(h);
+ }
+
+-static void cciss_softirq_done(struct request *rq)
+-{
+- CommandList_struct *cmd = rq->completion_data;
+- ctlr_info_t *h = hba[cmd->ctlr];
+- unsigned long flags;
+- u64bit temp64;
+- int i, ddir;
+-
+- if (cmd->Request.Type.Direction == XFER_READ)
+- ddir = PCI_DMA_FROMDEVICE;
+- else
+- ddir = PCI_DMA_TODEVICE;
+-
+- /* command did not need to be retried */
+- /* unmap the DMA mapping for all the scatter gather elements */
+- for(i=0; i<cmd->Header.SGList; i++) {
+- temp64.val32.lower = cmd->SG[i].Addr.lower;
+- temp64.val32.upper = cmd->SG[i].Addr.upper;
+- pci_unmap_page(h->pdev, temp64.val, cmd->SG[i].Len, ddir);
+- }
+-
+- complete_buffers(rq->bio, rq->errors);
+-
+-#ifdef CCISS_DEBUG
+- printk("Done with %p\n", rq);
+-#endif /* CCISS_DEBUG */
+-
+- spin_lock_irqsave(&h->lock, flags);
+- end_that_request_last(rq, rq->errors);
+- cmd_free(h, cmd,1);
+- spin_unlock_irqrestore(&h->lock, flags);
+-}
+-
+ /* checks the status of the job and calls complete buffers to mark all
+ * buffers for the completed job. Note that this function does not need
+ * to hold the hba/queue lock.
+@@ -3269,8 +3271,8 @@ clean2:
+ unregister_blkdev(hba[i]->major, hba[i]->devname);
+ clean1:
+ release_io_mem(hba[i]);
+- free_hba(i);
+ hba[i]->busy_initializing = 0;
++ free_hba(i);
+ return(-1);
+ }
+
+diff -upr linux-2.6.16.orig/drivers/block/ub.c linux-2.6.16-026test015/drivers/block/ub.c
+--- linux-2.6.16.orig/drivers/block/ub.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/block/ub.c 2006-07-04 14:41:36.000000000 +0400
+@@ -704,6 +704,9 @@ static void ub_cleanup(struct ub_dev *sc
+ kfree(lun);
+ }
+
++ usb_set_intfdata(sc->intf, NULL);
++ usb_put_intf(sc->intf);
++ usb_put_dev(sc->dev);
+ kfree(sc);
+ }
+
+@@ -2428,7 +2431,12 @@ static int ub_probe(struct usb_interface
+ // sc->ifnum = intf->cur_altsetting->desc.bInterfaceNumber;
+ usb_set_intfdata(intf, sc);
+ usb_get_dev(sc->dev);
+- // usb_get_intf(sc->intf); /* Do we need this? */
++ /*
++ * Since we give the interface struct to the block level through
++ * disk->driverfs_dev, we have to pin it. Otherwise, block_uevent
++ * oopses on close after a disconnect (kernels 2.6.16 and up).
++ */
++ usb_get_intf(sc->intf);
+
+ snprintf(sc->name, 12, DRV_NAME "(%d.%d)",
+ sc->dev->bus->busnum, sc->dev->devnum);
+@@ -2509,7 +2517,7 @@ static int ub_probe(struct usb_interface
+ err_diag:
+ err_dev_desc:
+ usb_set_intfdata(intf, NULL);
+- // usb_put_intf(sc->intf);
++ usb_put_intf(sc->intf);
+ usb_put_dev(sc->dev);
+ kfree(sc);
+ err_core:
+@@ -2688,12 +2696,6 @@ static void ub_disconnect(struct usb_int
+ */
+
+ device_remove_file(&sc->intf->dev, &dev_attr_diag);
+- usb_set_intfdata(intf, NULL);
+- // usb_put_intf(sc->intf);
+- sc->intf = NULL;
+- usb_put_dev(sc->dev);
+- sc->dev = NULL;
+-
+ ub_put(sc);
+ }
+
+diff -upr linux-2.6.16.orig/drivers/char/Kconfig linux-2.6.16-026test015/drivers/char/Kconfig
+--- linux-2.6.16.orig/drivers/char/Kconfig 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/char/Kconfig 2006-07-04 14:41:36.000000000 +0400
+@@ -187,6 +187,7 @@ config MOXA_SMARTIO
+ config ISI
+ tristate "Multi-Tech multiport card support (EXPERIMENTAL)"
+ depends on SERIAL_NONSTANDARD
++ select FW_LOADER
+ help
+ This is a driver for the Multi-Tech cards which provide several
+ serial ports. The driver is experimental and can currently only be
+diff -upr linux-2.6.16.orig/drivers/char/agp/efficeon-agp.c linux-2.6.16-026test015/drivers/char/agp/efficeon-agp.c
+--- linux-2.6.16.orig/drivers/char/agp/efficeon-agp.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/char/agp/efficeon-agp.c 2006-07-04 14:41:36.000000000 +0400
+@@ -64,6 +64,12 @@ static struct gatt_mask efficeon_generic
+ {.mask = 0x00000001, .type = 0}
+ };
+
++/* This function does the same thing as mask_memory() for this chipset... */
++static inline unsigned long efficeon_mask_memory(unsigned long addr)
++{
++ return addr | 0x00000001;
++}
++
+ static struct aper_size_info_lvl2 efficeon_generic_sizes[4] =
+ {
+ {256, 65536, 0},
+@@ -251,7 +257,7 @@ static int efficeon_insert_memory(struct
+ last_page = NULL;
+ for (i = 0; i < count; i++) {
+ int index = pg_start + i;
+- unsigned long insert = mem->memory[i];
++ unsigned long insert = efficeon_mask_memory(mem->memory[i]);
+
+ page = (unsigned int *) efficeon_private.l1_table[index >> 10];
+
+diff -upr linux-2.6.16.orig/drivers/char/cs5535_gpio.c linux-2.6.16-026test015/drivers/char/cs5535_gpio.c
+--- linux-2.6.16.orig/drivers/char/cs5535_gpio.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/char/cs5535_gpio.c 2006-07-04 14:41:36.000000000 +0400
+@@ -241,9 +241,10 @@ static int __init cs5535_gpio_init(void)
+ static void __exit cs5535_gpio_cleanup(void)
+ {
+ dev_t dev_id = MKDEV(major, 0);
++
++ cdev_del(&cs5535_gpio_cdev);
+ unregister_chrdev_region(dev_id, CS5535_GPIO_COUNT);
+- if (gpio_base != 0)
+- release_region(gpio_base, CS5535_GPIO_SIZE);
++ release_region(gpio_base, CS5535_GPIO_SIZE);
+ }
+
+ module_init(cs5535_gpio_init);
+diff -upr linux-2.6.16.orig/drivers/char/ipmi/ipmi_bt_sm.c linux-2.6.16-026test015/drivers/char/ipmi/ipmi_bt_sm.c
+--- linux-2.6.16.orig/drivers/char/ipmi/ipmi_bt_sm.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/char/ipmi/ipmi_bt_sm.c 2006-07-04 14:41:36.000000000 +0400
+@@ -165,7 +165,7 @@ static int bt_start_transaction(struct s
+ {
+ unsigned int i;
+
+- if ((size < 2) || (size > IPMI_MAX_MSG_LENGTH))
++ if ((size < 2) || (size > (IPMI_MAX_MSG_LENGTH - 2)))
+ return -1;
+
+ if ((bt->state != BT_STATE_IDLE) && (bt->state != BT_STATE_HOSED))
+diff -upr linux-2.6.16.orig/drivers/char/pcmcia/cm4000_cs.c linux-2.6.16-026test015/drivers/char/pcmcia/cm4000_cs.c
+--- linux-2.6.16.orig/drivers/char/pcmcia/cm4000_cs.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/char/pcmcia/cm4000_cs.c 2006-07-04 14:41:36.000000000 +0400
+@@ -2010,10 +2010,6 @@ static int __init cmm_init(void)
+ if (!cmm_class)
+ return -1;
+
+- rc = pcmcia_register_driver(&cm4000_driver);
+- if (rc < 0)
+- return rc;
+-
+ major = register_chrdev(0, DEVICE_NAME, &cm4000_fops);
+ if (major < 0) {
+ printk(KERN_WARNING MODULE_NAME
+@@ -2021,6 +2017,12 @@ static int __init cmm_init(void)
+ return -1;
+ }
+
++ rc = pcmcia_register_driver(&cm4000_driver);
++ if (rc < 0) {
++ unregister_chrdev(major, DEVICE_NAME);
++ return rc;
++ }
++
+ return 0;
+ }
+
+diff -upr linux-2.6.16.orig/drivers/char/pcmcia/cm4040_cs.c linux-2.6.16-026test015/drivers/char/pcmcia/cm4040_cs.c
+--- linux-2.6.16.orig/drivers/char/pcmcia/cm4040_cs.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/char/pcmcia/cm4040_cs.c 2006-07-04 14:41:36.000000000 +0400
+@@ -769,16 +769,19 @@ static int __init cm4040_init(void)
+ if (!cmx_class)
+ return -1;
+
+- rc = pcmcia_register_driver(&reader_driver);
+- if (rc < 0)
+- return rc;
+-
+ major = register_chrdev(0, DEVICE_NAME, &reader_fops);
+ if (major < 0) {
+ printk(KERN_WARNING MODULE_NAME
+ ": could not get major number\n");
+ return -1;
+ }
++
++ rc = pcmcia_register_driver(&reader_driver);
++ if (rc < 0) {
++ unregister_chrdev(major, DEVICE_NAME);
++ return rc;
++ }
++
+ return 0;
+ }
+
+diff -upr linux-2.6.16.orig/drivers/char/pty.c linux-2.6.16-026test015/drivers/char/pty.c
+--- linux-2.6.16.orig/drivers/char/pty.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/char/pty.c 2006-07-04 14:41:38.000000000 +0400
+@@ -32,16 +32,30 @@
+ #include <linux/bitops.h>
+ #include <linux/devpts_fs.h>
+
++#include <ub/ub_misc.h>
++
+ /* These are global because they are accessed in tty_io.c */
+ #ifdef CONFIG_UNIX98_PTYS
+ struct tty_driver *ptm_driver;
+-static struct tty_driver *pts_driver;
++struct tty_driver *pts_driver;
++EXPORT_SYMBOL(ptm_driver);
++EXPORT_SYMBOL(pts_driver);
++
++void prepare_pty(void)
++{
++#ifdef CONFIG_VE
++ get_ve0()->ptm_driver = ptm_driver;
++ /* don't clean ptm_driver and co. here, they are used in vecalls.c */
++#endif
++}
+ #endif
+
+ static void pty_close(struct tty_struct * tty, struct file * filp)
+ {
+ if (!tty)
+ return;
++
++ ub_pty_uncharge(tty);
+ if (tty->driver->subtype == PTY_TYPE_MASTER) {
+ if (tty->count > 1)
+ printk("master pty_close: count = %d!!\n", tty->count);
+@@ -61,8 +75,12 @@ static void pty_close(struct tty_struct
+ if (tty->driver->subtype == PTY_TYPE_MASTER) {
+ set_bit(TTY_OTHER_CLOSED, &tty->flags);
+ #ifdef CONFIG_UNIX98_PTYS
+- if (tty->driver == ptm_driver)
++ if (tty->driver->flags & TTY_DRIVER_DEVPTS_MEM) {
++ struct ve_struct *old_env;
++ old_env = set_exec_env(VE_OWNER_TTY(tty));
+ devpts_pty_kill(tty->index);
++ (void)set_exec_env(old_env);
++ }
+ #endif
+ tty_vhangup(tty->link);
+ }
+@@ -212,6 +230,10 @@ static int pty_open(struct tty_struct *t
+ if (tty->link->count != 1)
+ goto out;
+
++ retval = -ENODEV;
++ if (ub_pty_charge(tty))
++ goto out;
++
+ clear_bit(TTY_OTHER_CLOSED, &tty->link->flags);
+ set_bit(TTY_THROTTLED, &tty->flags);
+ set_bit(TTY_DO_WRITE_WAKEUP, &tty->flags);
+@@ -239,7 +261,9 @@ static struct tty_operations pty_ops = {
+
+ /* Traditional BSD devices */
+ #ifdef CONFIG_LEGACY_PTYS
+-static struct tty_driver *pty_driver, *pty_slave_driver;
++struct tty_driver *pty_driver, *pty_slave_driver;
++EXPORT_SYMBOL(pty_driver);
++EXPORT_SYMBOL(pty_slave_driver);
+
+ static int pty_bsd_ioctl(struct tty_struct *tty, struct file *file,
+ unsigned int cmd, unsigned long arg)
+@@ -397,6 +421,7 @@ static void __init unix98_pty_init(void)
+ panic("Couldn't register Unix98 pts driver");
+
+ pty_table[1].data = &ptm_driver->refcount;
++ prepare_pty();
+ }
+ #else
+ static inline void unix98_pty_init(void) { }
+diff -upr linux-2.6.16.orig/drivers/char/snsc.c linux-2.6.16-026test015/drivers/char/snsc.c
+--- linux-2.6.16.orig/drivers/char/snsc.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/char/snsc.c 2006-07-04 14:41:36.000000000 +0400
+@@ -391,7 +391,8 @@ scdrv_init(void)
+ format_module_id(devnamep, geo_module(geoid),
+ MODULE_FORMAT_BRIEF);
+ devnamep = devname + strlen(devname);
+- sprintf(devnamep, "#%d", geo_slab(geoid));
++ sprintf(devnamep, "^%d#%d", geo_slot(geoid),
++ geo_slab(geoid));
+
+ /* allocate sysctl device data */
+ scd = kmalloc(sizeof (struct sysctl_data_s),
+diff -upr linux-2.6.16.orig/drivers/char/snsc_event.c linux-2.6.16-026test015/drivers/char/snsc_event.c
+--- linux-2.6.16.orig/drivers/char/snsc_event.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/char/snsc_event.c 2006-07-04 14:41:38.000000000 +0400
+@@ -206,7 +206,7 @@ scdrv_dispatch_event(char *event, int le
+
+ /* first find init's task */
+ read_lock(&tasklist_lock);
+- for_each_process(p) {
++ for_each_process_all(p) {
+ if (p->pid == 1)
+ break;
+ }
+diff -upr linux-2.6.16.orig/drivers/char/sonypi.c linux-2.6.16-026test015/drivers/char/sonypi.c
+--- linux-2.6.16.orig/drivers/char/sonypi.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/char/sonypi.c 2006-07-04 14:41:36.000000000 +0400
+@@ -1341,6 +1341,9 @@ static int __devinit sonypi_probe(struct
+ else if ((pcidev = pci_get_device(PCI_VENDOR_ID_INTEL,
+ PCI_DEVICE_ID_INTEL_ICH6_1, NULL)))
+ sonypi_device.model = SONYPI_DEVICE_MODEL_TYPE3;
++ else if ((pcidev = pci_get_device(PCI_VENDOR_ID_INTEL,
++ PCI_DEVICE_ID_INTEL_ICH7_1, NULL)))
++ sonypi_device.model = SONYPI_DEVICE_MODEL_TYPE3;
+ else
+ sonypi_device.model = SONYPI_DEVICE_MODEL_TYPE2;
+
+diff -upr linux-2.6.16.orig/drivers/char/sysrq.c linux-2.6.16-026test015/drivers/char/sysrq.c
+--- linux-2.6.16.orig/drivers/char/sysrq.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/char/sysrq.c 2006-07-04 14:41:39.000000000 +0400
+@@ -174,8 +174,13 @@ static struct sysrq_key_op sysrq_showloc
+ static void sysrq_handle_showregs(int key, struct pt_regs *pt_regs,
+ struct tty_struct *tty)
+ {
++ bust_spinlocks(1);
+ if (pt_regs)
+ show_regs(pt_regs);
++ bust_spinlocks(0);
++#if defined(__i386__) || defined(__x86_64__)
++ smp_nmi_call_function(smp_show_regs, NULL, 0);
++#endif
+ }
+ static struct sysrq_key_op sysrq_showregs_op = {
+ .handler = sysrq_handle_showregs,
+@@ -221,7 +226,7 @@ static void send_sig_all(int sig)
+ {
+ struct task_struct *p;
+
+- for_each_process(p) {
++ for_each_process_all(p) {
+ if (p->mm && p->pid != 1)
+ /* Not swapper, init nor kernel thread */
+ force_sig(sig, p);
+@@ -272,6 +277,19 @@ static struct sysrq_key_op sysrq_kill_op
+ .enable_mask = SYSRQ_ENABLE_SIGNAL,
+ };
+
++#ifdef CONFIG_SCHED_VCPU
++static void sysrq_handle_vschedstate(int key, struct pt_regs *pt_regs,
++ struct tty_struct *tty)
++{
++ show_vsched();
++}
++static struct sysrq_key_op sysrq_vschedstate_op = {
++ .handler = sysrq_handle_vschedstate,
++	.help_msg = "vsched_stAte",
++ .action_msg = "Show Vsched",
++};
++#endif
++
+ /* END SIGNAL SYSRQ HANDLERS BLOCK */
+
+ static void sysrq_handle_unrt(int key, struct pt_regs *pt_regs,
+@@ -300,9 +318,13 @@ static struct sysrq_key_op *sysrq_key_ta
+ /* 7 */ &sysrq_loglevel_op,
+ /* 8 */ &sysrq_loglevel_op,
+ /* 9 */ &sysrq_loglevel_op,
++#ifdef CONFIG_SCHED_VCPU
++/* a */ &sysrq_vschedstate_op,
++#else
+ /* a */ NULL, /* Don't use for system provided sysrqs,
+ it is handled specially on the sparc
+ and will never arrive */
++#endif
+ /* b */ &sysrq_reboot_op,
+ #ifdef CONFIG_KEXEC
+ /* c */ &sysrq_crashdump_op,
+diff -upr linux-2.6.16.orig/drivers/char/tipar.c linux-2.6.16-026test015/drivers/char/tipar.c
+--- linux-2.6.16.orig/drivers/char/tipar.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/char/tipar.c 2006-07-04 14:41:36.000000000 +0400
+@@ -515,7 +515,7 @@ tipar_init_module(void)
+ err = PTR_ERR(tipar_class);
+ goto out_chrdev;
+ }
+- if (parport_register_driver(&tipar_driver) || tp_count == 0) {
++ if (parport_register_driver(&tipar_driver)) {
+ printk(KERN_ERR "tipar: unable to register with parport\n");
+ err = -EIO;
+ goto out_class;
+diff -upr linux-2.6.16.orig/drivers/char/tlclk.c linux-2.6.16-026test015/drivers/char/tlclk.c
+--- linux-2.6.16.orig/drivers/char/tlclk.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/char/tlclk.c 2006-07-04 14:41:36.000000000 +0400
+@@ -327,7 +327,7 @@ static ssize_t store_received_ref_clk3a(
+ return strnlen(buf, count);
+ }
+
+-static DEVICE_ATTR(received_ref_clk3a, S_IWUGO, NULL,
++static DEVICE_ATTR(received_ref_clk3a, (S_IWUSR|S_IWGRP), NULL,
+ store_received_ref_clk3a);
+
+
+@@ -349,7 +349,7 @@ static ssize_t store_received_ref_clk3b(
+ return strnlen(buf, count);
+ }
+
+-static DEVICE_ATTR(received_ref_clk3b, S_IWUGO, NULL,
++static DEVICE_ATTR(received_ref_clk3b, (S_IWUSR|S_IWGRP), NULL,
+ store_received_ref_clk3b);
+
+
+@@ -371,7 +371,7 @@ static ssize_t store_enable_clk3b_output
+ return strnlen(buf, count);
+ }
+
+-static DEVICE_ATTR(enable_clk3b_output, S_IWUGO, NULL,
++static DEVICE_ATTR(enable_clk3b_output, (S_IWUSR|S_IWGRP), NULL,
+ store_enable_clk3b_output);
+
+ static ssize_t store_enable_clk3a_output(struct device *d,
+@@ -392,7 +392,7 @@ static ssize_t store_enable_clk3a_output
+ return strnlen(buf, count);
+ }
+
+-static DEVICE_ATTR(enable_clk3a_output, S_IWUGO, NULL,
++static DEVICE_ATTR(enable_clk3a_output, (S_IWUSR|S_IWGRP), NULL,
+ store_enable_clk3a_output);
+
+ static ssize_t store_enable_clkb1_output(struct device *d,
+@@ -413,7 +413,7 @@ static ssize_t store_enable_clkb1_output
+ return strnlen(buf, count);
+ }
+
+-static DEVICE_ATTR(enable_clkb1_output, S_IWUGO, NULL,
++static DEVICE_ATTR(enable_clkb1_output, (S_IWUSR|S_IWGRP), NULL,
+ store_enable_clkb1_output);
+
+
+@@ -435,7 +435,7 @@ static ssize_t store_enable_clka1_output
+ return strnlen(buf, count);
+ }
+
+-static DEVICE_ATTR(enable_clka1_output, S_IWUGO, NULL,
++static DEVICE_ATTR(enable_clka1_output, (S_IWUSR|S_IWGRP), NULL,
+ store_enable_clka1_output);
+
+ static ssize_t store_enable_clkb0_output(struct device *d,
+@@ -456,7 +456,7 @@ static ssize_t store_enable_clkb0_output
+ return strnlen(buf, count);
+ }
+
+-static DEVICE_ATTR(enable_clkb0_output, S_IWUGO, NULL,
++static DEVICE_ATTR(enable_clkb0_output, (S_IWUSR|S_IWGRP), NULL,
+ store_enable_clkb0_output);
+
+ static ssize_t store_enable_clka0_output(struct device *d,
+@@ -477,7 +477,7 @@ static ssize_t store_enable_clka0_output
+ return strnlen(buf, count);
+ }
+
+-static DEVICE_ATTR(enable_clka0_output, S_IWUGO, NULL,
++static DEVICE_ATTR(enable_clka0_output, (S_IWUSR|S_IWGRP), NULL,
+ store_enable_clka0_output);
+
+ static ssize_t store_select_amcb2_transmit_clock(struct device *d,
+@@ -519,7 +519,7 @@ static ssize_t store_select_amcb2_transm
+ return strnlen(buf, count);
+ }
+
+-static DEVICE_ATTR(select_amcb2_transmit_clock, S_IWUGO, NULL,
++static DEVICE_ATTR(select_amcb2_transmit_clock, (S_IWUSR|S_IWGRP), NULL,
+ store_select_amcb2_transmit_clock);
+
+ static ssize_t store_select_amcb1_transmit_clock(struct device *d,
+@@ -560,7 +560,7 @@ static ssize_t store_select_amcb1_transm
+ return strnlen(buf, count);
+ }
+
+-static DEVICE_ATTR(select_amcb1_transmit_clock, S_IWUGO, NULL,
++static DEVICE_ATTR(select_amcb1_transmit_clock, (S_IWUSR|S_IWGRP), NULL,
+ store_select_amcb1_transmit_clock);
+
+ static ssize_t store_select_redundant_clock(struct device *d,
+@@ -581,7 +581,7 @@ static ssize_t store_select_redundant_cl
+ return strnlen(buf, count);
+ }
+
+-static DEVICE_ATTR(select_redundant_clock, S_IWUGO, NULL,
++static DEVICE_ATTR(select_redundant_clock, (S_IWUSR|S_IWGRP), NULL,
+ store_select_redundant_clock);
+
+ static ssize_t store_select_ref_frequency(struct device *d,
+@@ -602,7 +602,7 @@ static ssize_t store_select_ref_frequenc
+ return strnlen(buf, count);
+ }
+
+-static DEVICE_ATTR(select_ref_frequency, S_IWUGO, NULL,
++static DEVICE_ATTR(select_ref_frequency, (S_IWUSR|S_IWGRP), NULL,
+ store_select_ref_frequency);
+
+ static ssize_t store_filter_select(struct device *d,
+@@ -623,7 +623,7 @@ static ssize_t store_filter_select(struc
+ return strnlen(buf, count);
+ }
+
+-static DEVICE_ATTR(filter_select, S_IWUGO, NULL, store_filter_select);
++static DEVICE_ATTR(filter_select, (S_IWUSR|S_IWGRP), NULL, store_filter_select);
+
+ static ssize_t store_hardware_switching_mode(struct device *d,
+ struct device_attribute *attr, const char *buf, size_t count)
+@@ -643,7 +643,7 @@ static ssize_t store_hardware_switching_
+ return strnlen(buf, count);
+ }
+
+-static DEVICE_ATTR(hardware_switching_mode, S_IWUGO, NULL,
++static DEVICE_ATTR(hardware_switching_mode, (S_IWUSR|S_IWGRP), NULL,
+ store_hardware_switching_mode);
+
+ static ssize_t store_hardware_switching(struct device *d,
+@@ -664,7 +664,7 @@ static ssize_t store_hardware_switching(
+ return strnlen(buf, count);
+ }
+
+-static DEVICE_ATTR(hardware_switching, S_IWUGO, NULL,
++static DEVICE_ATTR(hardware_switching, (S_IWUSR|S_IWGRP), NULL,
+ store_hardware_switching);
+
+ static ssize_t store_refalign (struct device *d,
+@@ -684,7 +684,7 @@ static ssize_t store_refalign (struct de
+ return strnlen(buf, count);
+ }
+
+-static DEVICE_ATTR(refalign, S_IWUGO, NULL, store_refalign);
++static DEVICE_ATTR(refalign, (S_IWUSR|S_IWGRP), NULL, store_refalign);
+
+ static ssize_t store_mode_select (struct device *d,
+ struct device_attribute *attr, const char *buf, size_t count)
+@@ -704,7 +704,7 @@ static ssize_t store_mode_select (struct
+ return strnlen(buf, count);
+ }
+
+-static DEVICE_ATTR(mode_select, S_IWUGO, NULL, store_mode_select);
++static DEVICE_ATTR(mode_select, (S_IWUSR|S_IWGRP), NULL, store_mode_select);
+
+ static ssize_t store_reset (struct device *d,
+ struct device_attribute *attr, const char *buf, size_t count)
+@@ -724,7 +724,7 @@ static ssize_t store_reset (struct devic
+ return strnlen(buf, count);
+ }
+
+-static DEVICE_ATTR(reset, S_IWUGO, NULL, store_reset);
++static DEVICE_ATTR(reset, (S_IWUSR|S_IWGRP), NULL, store_reset);
+
+ static struct attribute *tlclk_sysfs_entries[] = {
+ &dev_attr_current_ref.attr,
+@@ -767,6 +767,7 @@ static int __init tlclk_init(void)
+ printk(KERN_ERR "tlclk: can't get major %d.\n", tlclk_major);
+ return ret;
+ }
++ tlclk_major = ret;
+ alarm_events = kzalloc( sizeof(struct tlclk_alarms), GFP_KERNEL);
+ if (!alarm_events)
+ goto out1;
+diff -upr linux-2.6.16.orig/drivers/char/tty_io.c linux-2.6.16-026test015/drivers/char/tty_io.c
+--- linux-2.6.16.orig/drivers/char/tty_io.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/char/tty_io.c 2006-07-04 14:41:38.000000000 +0400
+@@ -86,6 +86,7 @@
+ #include <linux/string.h>
+ #include <linux/slab.h>
+ #include <linux/poll.h>
++#include <linux/ve_owner.h>
+ #include <linux/proc_fs.h>
+ #include <linux/init.h>
+ #include <linux/module.h>
+@@ -105,6 +106,7 @@
+ #include <linux/devfs_fs_kernel.h>
+
+ #include <linux/kmod.h>
++#include <ub/ub_mem.h>
+
+ #undef TTY_DEBUG_HANGUP
+
+@@ -122,11 +124,16 @@ struct termios tty_std_termios = { /* fo
+
+ EXPORT_SYMBOL(tty_std_termios);
+
++/* this lock protects the tty_drivers list; these pretty guys do no locking */
++rwlock_t tty_driver_guard = RW_LOCK_UNLOCKED;
++EXPORT_SYMBOL(tty_driver_guard);
++
+ /* This list gets poked at by procfs and various bits of boot up code. This
+ could do with some rationalisation such as pulling the tty proc function
+ into this file */
+
+ LIST_HEAD(tty_drivers); /* linked list of tty drivers */
++EXPORT_SYMBOL(tty_drivers);
+
+ /* Semaphore to protect creating and releasing a tty. This is shared with
+ vt.c for deeply disgusting hack reasons */
+@@ -136,6 +143,15 @@ DECLARE_MUTEX(tty_sem);
+ extern struct tty_driver *ptm_driver; /* Unix98 pty masters; for /dev/ptmx */
+ extern int pty_limit; /* Config limit on Unix98 ptys */
+ static DEFINE_IDR(allocated_ptys);
++#ifdef CONFIG_VE
++#define __ve_allocated_ptys(ve) (*((ve)->allocated_ptys))
++#define ve_allocated_ptys __ve_allocated_ptys(get_exec_env())
++#define ve_ptm_driver (get_exec_env()->ptm_driver)
++#else
++#define __ve_allocated_ptys(ve) allocated_ptys
++#define ve_allocated_ptys allocated_ptys
++#define ve_ptm_driver ptm_driver
++#endif
+ static DECLARE_MUTEX(allocated_ptys_lock);
+ static int ptmx_open(struct inode *, struct file *);
+ #endif
+@@ -156,11 +172,25 @@ static int tty_fasync(int fd, struct fil
+ static void release_mem(struct tty_struct *tty, int idx);
+
+
++DCL_VE_OWNER(TTYDRV, struct tty_driver, owner_env)
++DCL_VE_OWNER(TTY, struct tty_struct, owner_env)
++
++void prepare_tty(void)
++{
++#ifdef CONFIG_VE
++ get_ve0()->allocated_ptys = &allocated_ptys;
++ /*
++	 * in this case, tty_register_driver() sets up
++	 * owner_env correctly right from bootup
++ */
++#endif
++}
++
+ static struct tty_struct *alloc_tty_struct(void)
+ {
+ struct tty_struct *tty;
+
+- tty = kmalloc(sizeof(struct tty_struct), GFP_KERNEL);
++ tty = ub_kmalloc(sizeof(struct tty_struct), GFP_KERNEL);
+ if (tty)
+ memset(tty, 0, sizeof(struct tty_struct));
+ return tty;
+@@ -857,14 +887,37 @@ static struct tty_driver *get_tty_driver
+ {
+ struct tty_driver *p;
+
++ read_lock(&tty_driver_guard);
+ list_for_each_entry(p, &tty_drivers, tty_drivers) {
+ dev_t base = MKDEV(p->major, p->minor_start);
+ if (device < base || device >= base + p->num)
+ continue;
+ *index = device - base;
+- return p;
++#ifdef CONFIG_VE
++ if (in_interrupt())
++ goto found;
++ if (p->major!=PTY_MASTER_MAJOR && p->major!=PTY_SLAVE_MAJOR
++#ifdef CONFIG_UNIX98_PTYS
++ && (p->major<UNIX98_PTY_MASTER_MAJOR ||
++ p->major>UNIX98_PTY_MASTER_MAJOR+UNIX98_PTY_MAJOR_COUNT-1) &&
++ (p->major<UNIX98_PTY_SLAVE_MAJOR ||
++ p->major>UNIX98_PTY_SLAVE_MAJOR+UNIX98_PTY_MAJOR_COUNT-1)
++#endif
++ ) goto found;
++ if (ve_is_super(VE_OWNER_TTYDRV(p)) &&
++ ve_is_super(get_exec_env()))
++ goto found;
++ if (!ve_accessible_strict(VE_OWNER_TTYDRV(p), get_exec_env()))
++ continue;
++#endif
++ goto found;
+ }
++ read_unlock(&tty_driver_guard);
+ return NULL;
++
++found:
++ read_unlock(&tty_driver_guard);
++ return p;
+ }
+
+ /*
+@@ -1092,7 +1145,7 @@ static void do_tty_hangup(void *data)
+
+ read_lock(&tasklist_lock);
+ if (tty->session > 0) {
+- do_each_task_pid(tty->session, PIDTYPE_SID, p) {
++ do_each_task_pid_all(tty->session, PIDTYPE_SID, p) {
+ if (p->signal->tty == tty)
+ p->signal->tty = NULL;
+ if (!p->signal->leader)
+@@ -1101,7 +1154,7 @@ static void do_tty_hangup(void *data)
+ send_group_sig_info(SIGCONT, SEND_SIG_PRIV, p);
+ if (tty->pgrp > 0)
+ p->signal->tty_old_pgrp = tty->pgrp;
+- } while_each_task_pid(tty->session, PIDTYPE_SID, p);
++ } while_each_task_pid_all(tty->session, PIDTYPE_SID, p);
+ }
+ read_unlock(&tasklist_lock);
+
+@@ -1218,9 +1271,9 @@ void disassociate_ctty(int on_exit)
+
+ /* Now clear signal->tty under the lock */
+ read_lock(&tasklist_lock);
+- do_each_task_pid(current->signal->session, PIDTYPE_SID, p) {
++ do_each_task_pid_all(current->signal->session, PIDTYPE_SID, p) {
+ p->signal->tty = NULL;
+- } while_each_task_pid(current->signal->session, PIDTYPE_SID, p);
++ } while_each_task_pid_all(current->signal->session, PIDTYPE_SID, p);
+ read_unlock(&tasklist_lock);
+ up(&tty_sem);
+ unlock_kernel();
+@@ -1446,21 +1499,28 @@ static inline void tty_line_name(struct
+ * really quite straightforward. The semaphore locking can probably be
+ * relaxed for the (most common) case of reopening a tty.
+ */
+-static int init_dev(struct tty_driver *driver, int idx,
+- struct tty_struct **ret_tty)
++static int init_dev(struct tty_driver *driver, int idx,
++ struct tty_struct *i_tty, struct tty_struct **ret_tty)
+ {
+ struct tty_struct *tty, *o_tty;
+ struct termios *tp, **tp_loc, *o_tp, **o_tp_loc;
+ struct termios *ltp, **ltp_loc, *o_ltp, **o_ltp_loc;
++ struct ve_struct * owner;
+ int retval=0;
+
+- /* check whether we're reopening an existing tty */
+- if (driver->flags & TTY_DRIVER_DEVPTS_MEM) {
+- tty = devpts_get_tty(idx);
+- if (tty && driver->subtype == PTY_TYPE_MASTER)
+- tty = tty->link;
+- } else {
+- tty = driver->ttys[idx];
++ owner = VE_OWNER_TTYDRV(driver);
++
++ if (i_tty)
++ tty = i_tty;
++ else {
++ /* check whether we're reopening an existing tty */
++ if (driver->flags & TTY_DRIVER_DEVPTS_MEM) {
++ tty = devpts_get_tty(idx);
++ if (tty && driver->subtype == PTY_TYPE_MASTER)
++ tty = tty->link;
++ } else {
++ tty = driver->ttys[idx];
++ }
+ }
+ if (tty) goto fast_track;
+
+@@ -1488,6 +1548,7 @@ static int init_dev(struct tty_driver *d
+ tty->driver = driver;
+ tty->index = idx;
+ tty_line_name(driver, idx, tty->name);
++ SET_VE_OWNER_TTY(tty, owner);
+
+ if (driver->flags & TTY_DRIVER_DEVPTS_MEM) {
+ tp_loc = &tty->termios;
+@@ -1498,7 +1559,7 @@ static int init_dev(struct tty_driver *d
+ }
+
+ if (!*tp_loc) {
+- tp = (struct termios *) kmalloc(sizeof(struct termios),
++ tp = (struct termios *) ub_kmalloc(sizeof(struct termios),
+ GFP_KERNEL);
+ if (!tp)
+ goto free_mem_out;
+@@ -1506,7 +1567,7 @@ static int init_dev(struct tty_driver *d
+ }
+
+ if (!*ltp_loc) {
+- ltp = (struct termios *) kmalloc(sizeof(struct termios),
++ ltp = (struct termios *) ub_kmalloc(sizeof(struct termios),
+ GFP_KERNEL);
+ if (!ltp)
+ goto free_mem_out;
+@@ -1521,6 +1582,7 @@ static int init_dev(struct tty_driver *d
+ o_tty->driver = driver->other;
+ o_tty->index = idx;
+ tty_line_name(driver->other, idx, o_tty->name);
++ SET_VE_OWNER_TTY(o_tty, owner);
+
+ if (driver->flags & TTY_DRIVER_DEVPTS_MEM) {
+ o_tp_loc = &o_tty->termios;
+@@ -1532,7 +1594,7 @@ static int init_dev(struct tty_driver *d
+
+ if (!*o_tp_loc) {
+ o_tp = (struct termios *)
+- kmalloc(sizeof(struct termios), GFP_KERNEL);
++ ub_kmalloc(sizeof(struct termios), GFP_KERNEL);
+ if (!o_tp)
+ goto free_mem_out;
+ *o_tp = driver->other->init_termios;
+@@ -1540,7 +1602,7 @@ static int init_dev(struct tty_driver *d
+
+ if (!*o_ltp_loc) {
+ o_ltp = (struct termios *)
+- kmalloc(sizeof(struct termios), GFP_KERNEL);
++ ub_kmalloc(sizeof(struct termios), GFP_KERNEL);
+ if (!o_ltp)
+ goto free_mem_out;
+ memset(o_ltp, 0, sizeof(struct termios));
+@@ -1558,6 +1620,10 @@ static int init_dev(struct tty_driver *d
+ *o_ltp_loc = o_ltp;
+ o_tty->termios = *o_tp_loc;
+ o_tty->termios_locked = *o_ltp_loc;
++#ifdef CONFIG_VE
++ if (driver->other->refcount == 0)
++ (void)get_ve(owner);
++#endif
+ driver->other->refcount++;
+ if (driver->subtype == PTY_TYPE_MASTER)
+ o_tty->count++;
+@@ -1582,6 +1648,10 @@ static int init_dev(struct tty_driver *d
+ *ltp_loc = ltp;
+ tty->termios = *tp_loc;
+ tty->termios_locked = *ltp_loc;
++#ifdef CONFIG_VE
++ if (driver->refcount == 0)
++ (void)get_ve(owner);
++#endif
+ driver->refcount++;
+ tty->count++;
+
+@@ -1692,6 +1762,10 @@ static void release_mem(struct tty_struc
+ }
+ o_tty->magic = 0;
+ o_tty->driver->refcount--;
++#ifdef CONFIG_VE
++ if (o_tty->driver->refcount == 0)
++ put_ve(VE_OWNER_TTY(o_tty));
++#endif
+ file_list_lock();
+ list_del_init(&o_tty->tty_files);
+ file_list_unlock();
+@@ -1714,6 +1788,10 @@ static void release_mem(struct tty_struc
+
+ tty->magic = 0;
+ tty->driver->refcount--;
++#ifdef CONFIG_VE
++ if (tty->driver->refcount == 0)
++ put_ve(VE_OWNER_TTY(tty));
++#endif
+ file_list_lock();
+ list_del_init(&tty->tty_files);
+ file_list_unlock();
+@@ -1737,7 +1815,10 @@ static void release_dev(struct file * fi
+ int idx;
+ char buf[64];
+ unsigned long flags;
+-
++#ifdef CONFIG_UNIX98_PTYS
++ struct idr *idr_alloced;
++#endif
++
+ tty = (struct tty_struct *)filp->private_data;
+ if (tty_paranoia_check(tty, filp->f_dentry->d_inode, "release_dev"))
+ return;
+@@ -1752,6 +1833,9 @@ static void release_dev(struct file * fi
+ devpts = (tty->driver->flags & TTY_DRIVER_DEVPTS_MEM) != 0;
+ devpts_master = pty_master && devpts;
+ o_tty = tty->link;
++#ifdef CONFIG_UNIX98_PTYS
++ idr_alloced = &__ve_allocated_ptys(tty->owner_env);
++#endif
+
+ #ifdef TTY_PARANOIA_CHECK
+ if (idx < 0 || idx >= tty->driver->num) {
+@@ -1924,13 +2008,13 @@ static void release_dev(struct file * fi
+ struct task_struct *p;
+
+ read_lock(&tasklist_lock);
+- do_each_task_pid(tty->session, PIDTYPE_SID, p) {
++ do_each_task_pid_all(tty->session, PIDTYPE_SID, p) {
+ p->signal->tty = NULL;
+- } while_each_task_pid(tty->session, PIDTYPE_SID, p);
++ } while_each_task_pid_all(tty->session, PIDTYPE_SID, p);
+ if (o_tty)
+- do_each_task_pid(o_tty->session, PIDTYPE_SID, p) {
++ do_each_task_pid_all(o_tty->session, PIDTYPE_SID, p) {
+ p->signal->tty = NULL;
+- } while_each_task_pid(o_tty->session, PIDTYPE_SID, p);
++ } while_each_task_pid_all(o_tty->session, PIDTYPE_SID, p);
+ read_unlock(&tasklist_lock);
+ }
+
+@@ -2005,7 +2089,7 @@ static void release_dev(struct file * fi
+ /* Make this pty number available for reallocation */
+ if (devpts) {
+ down(&allocated_ptys_lock);
+- idr_remove(&allocated_ptys, idx);
++ idr_remove(idr_alloced, idx);
+ up(&allocated_ptys_lock);
+ }
+ #endif
+@@ -2026,7 +2110,7 @@ static void release_dev(struct file * fi
+ */
+ static int tty_open(struct inode * inode, struct file * filp)
+ {
+- struct tty_struct *tty;
++ struct tty_struct *tty, *c_tty;
+ int noctty, retval;
+ struct tty_driver *driver;
+ int index;
+@@ -2039,6 +2123,7 @@ retry_open:
+ noctty = filp->f_flags & O_NOCTTY;
+ index = -1;
+ retval = 0;
++ c_tty = NULL;
+
+ down(&tty_sem);
+
+@@ -2049,6 +2134,7 @@ retry_open:
+ }
+ driver = current->signal->tty->driver;
+ index = current->signal->tty->index;
++ c_tty = current->signal->tty;
+ filp->f_flags |= O_NONBLOCK; /* Don't let /dev/tty block */
+ /* noctty = 1; */
+ goto got_driver;
+@@ -2056,6 +2142,12 @@ retry_open:
+ #ifdef CONFIG_VT
+ if (device == MKDEV(TTY_MAJOR,0)) {
+ extern struct tty_driver *console_driver;
++#ifdef CONFIG_VE
++ if (!ve_is_super(get_exec_env())) {
++ up(&tty_sem);
++ return -ENODEV;
++ }
++#endif
+ driver = console_driver;
+ index = fg_console;
+ noctty = 1;
+@@ -2063,6 +2155,12 @@ retry_open:
+ }
+ #endif
+ if (device == MKDEV(TTYAUX_MAJOR,1)) {
++#ifdef CONFIG_VE
++ if (!ve_is_super(get_exec_env())) {
++ up(&tty_sem);
++ return -ENODEV;
++ }
++#endif
+ driver = console_device(&index);
+ if (driver) {
+ /* Don't let /dev/console block */
+@@ -2080,7 +2178,7 @@ retry_open:
+ return -ENODEV;
+ }
+ got_driver:
+- retval = init_dev(driver, index, &tty);
++ retval = init_dev(driver, index, c_tty, &tty);
+ up(&tty_sem);
+ if (retval)
+ return retval;
+@@ -2149,11 +2247,11 @@ static int ptmx_open(struct inode * inod
+
+ /* find a device that is not in use. */
+ down(&allocated_ptys_lock);
+- if (!idr_pre_get(&allocated_ptys, GFP_KERNEL)) {
++ if (!idr_pre_get(&ve_allocated_ptys, GFP_KERNEL)) {
+ up(&allocated_ptys_lock);
+ return -ENOMEM;
+ }
+- idr_ret = idr_get_new(&allocated_ptys, NULL, &index);
++ idr_ret = idr_get_new(&ve_allocated_ptys, NULL, &index);
+ if (idr_ret < 0) {
+ up(&allocated_ptys_lock);
+ if (idr_ret == -EAGAIN)
+@@ -2161,14 +2259,14 @@ static int ptmx_open(struct inode * inod
+ return -EIO;
+ }
+ if (index >= pty_limit) {
+- idr_remove(&allocated_ptys, index);
++ idr_remove(&ve_allocated_ptys, index);
+ up(&allocated_ptys_lock);
+ return -EIO;
+ }
+ up(&allocated_ptys_lock);
+
+ down(&tty_sem);
+- retval = init_dev(ptm_driver, index, &tty);
++ retval = init_dev(ve_ptm_driver, index, NULL, &tty);
+ up(&tty_sem);
+
+ if (retval)
+@@ -2183,14 +2281,14 @@ static int ptmx_open(struct inode * inod
+ goto out1;
+
+ check_tty_count(tty, "tty_open");
+- retval = ptm_driver->open(tty, filp);
++ retval = ve_ptm_driver->open(tty, filp);
+ if (!retval)
+ return 0;
+ out1:
+ release_dev(filp);
+ out:
+ down(&allocated_ptys_lock);
+- idr_remove(&allocated_ptys, index);
++ idr_remove(&ve_allocated_ptys, index);
+ up(&allocated_ptys_lock);
+ return retval;
+ }
+@@ -2303,6 +2401,8 @@ static int tioccons(struct file *file)
+ {
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
++ if (!ve_is_super(get_exec_env()))
++ return -EACCES;
+ if (file->f_op->write == redirected_tty_write) {
+ struct file *f;
+ spin_lock(&redirect_lock);
+@@ -2363,9 +2463,9 @@ static int tiocsctty(struct tty_struct *
+ */
+
+ read_lock(&tasklist_lock);
+- do_each_task_pid(tty->session, PIDTYPE_SID, p) {
++ do_each_task_pid_all(tty->session, PIDTYPE_SID, p) {
+ p->signal->tty = NULL;
+- } while_each_task_pid(tty->session, PIDTYPE_SID, p);
++ } while_each_task_pid_all(tty->session, PIDTYPE_SID, p);
+ read_unlock(&tasklist_lock);
+ } else
+ return -EPERM;
+@@ -2387,7 +2487,7 @@ static int tiocgpgrp(struct tty_struct *
+ */
+ if (tty == real_tty && current->signal->tty != real_tty)
+ return -ENOTTY;
+- return put_user(real_tty->pgrp, p);
++ return put_user(pid_type_to_vpid(PIDTYPE_PGID, real_tty->pgrp), p);
+ }
+
+ static int tiocspgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t __user *p)
+@@ -2407,6 +2507,9 @@ static int tiocspgrp(struct tty_struct *
+ return -EFAULT;
+ if (pgrp < 0)
+ return -EINVAL;
++ pgrp = vpid_to_pid(pgrp);
++ if (pgrp < 0)
++ return -EPERM;
+ if (session_of_pgrp(pgrp) != current->signal->session)
+ return -EPERM;
+ real_tty->pgrp = pgrp;
+@@ -2423,7 +2526,7 @@ static int tiocgsid(struct tty_struct *t
+ return -ENOTTY;
+ if (real_tty->session <= 0)
+ return -ENOTTY;
+- return put_user(real_tty->session, p);
++ return put_user(pid_type_to_vpid(PIDTYPE_SID, real_tty->session), p);
+ }
+
+ static int tiocsetd(struct tty_struct *tty, int __user *p)
+@@ -2696,7 +2799,7 @@ static void __do_SAK(void *arg)
+ tty->driver->flush_buffer(tty);
+
+ read_lock(&tasklist_lock);
+- do_each_task_pid(session, PIDTYPE_SID, p) {
++ do_each_task_pid_all(session, PIDTYPE_SID, p) {
+ if (p->signal->tty == tty || session > 0) {
+ printk(KERN_NOTICE "SAK: killed process %d"
+ " (%s): p->signal->session==tty->session\n",
+@@ -2706,7 +2809,11 @@ static void __do_SAK(void *arg)
+ }
+ task_lock(p);
+ if (p->files) {
+- rcu_read_lock();
++ /*
++ * We don't take a ref to the file, so we must
++ * hold ->file_lock instead.
++ */
++ spin_lock(&p->files->file_lock);
+ fdt = files_fdtable(p->files);
+ for (i=0; i < fdt->max_fds; i++) {
+ filp = fcheck_files(p->files, i);
+@@ -2721,10 +2828,10 @@ static void __do_SAK(void *arg)
+ break;
+ }
+ }
+- rcu_read_unlock();
++ spin_unlock(&p->files->file_lock);
+ }
+ task_unlock(p);
+- } while_each_task_pid(session, PIDTYPE_SID, p);
++ } while_each_task_pid_all(session, PIDTYPE_SID, p);
+ read_unlock(&tasklist_lock);
+ #endif
+ }
+@@ -3095,8 +3202,11 @@ int tty_register_driver(struct tty_drive
+
+ if (!driver->put_char)
+ driver->put_char = tty_default_put_char;
+-
++
++ SET_VE_OWNER_TTYDRV(driver, get_exec_env());
++ write_lock_irq(&tty_driver_guard);
+ list_add(&driver->tty_drivers, &tty_drivers);
++ write_unlock_irq(&tty_driver_guard);
+
+ if ( !(driver->flags & TTY_DRIVER_NO_DEVFS) ) {
+ for(i = 0; i < driver->num; i++)
+@@ -3123,7 +3233,9 @@ int tty_unregister_driver(struct tty_dri
+ unregister_chrdev_region(MKDEV(driver->major, driver->minor_start),
+ driver->num);
+
++ write_lock_irq(&tty_driver_guard);
+ list_del(&driver->tty_drivers);
++ write_unlock_irq(&tty_driver_guard);
+
+ /*
+ * Free the termios and termios_locked structures because
+@@ -3246,6 +3358,7 @@ static int __init tty_init(void)
+
+ vty_init();
+ #endif
++ prepare_tty();
+ return 0;
+ }
+ module_init(tty_init);
+diff -upr linux-2.6.16.orig/drivers/edac/Kconfig linux-2.6.16-026test015/drivers/edac/Kconfig
+--- linux-2.6.16.orig/drivers/edac/Kconfig 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/edac/Kconfig 2006-07-04 14:41:36.000000000 +0400
+@@ -71,7 +71,7 @@ config EDAC_E7XXX
+
+ config EDAC_E752X
+ tristate "Intel e752x (e7520, e7525, e7320)"
+- depends on EDAC_MM_EDAC && PCI
++ depends on EDAC_MM_EDAC && PCI && HOTPLUG
+ help
+ Support for error detection and correction on the Intel
+ E7520, E7525, E7320 server chipsets.
+diff -upr linux-2.6.16.orig/drivers/i2c/busses/i2c-i801.c linux-2.6.16-026test015/drivers/i2c/busses/i2c-i801.c
+--- linux-2.6.16.orig/drivers/i2c/busses/i2c-i801.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/i2c/busses/i2c-i801.c 2006-07-04 14:41:36.000000000 +0400
+@@ -478,6 +478,11 @@ static s32 i801_access(struct i2c_adapte
+ ret = i801_transaction();
+ }
+
++ /* Some BIOSes don't like it when PEC is enabled at reboot or resume
++ time, so we forcibly disable it after every transaction. */
++ if (hwpec)
++ outb_p(0, SMBAUXCTL);
++
+ if(block)
+ return ret;
+ if(ret)
+diff -upr linux-2.6.16.orig/drivers/i2c/busses/scx200_acb.c linux-2.6.16-026test015/drivers/i2c/busses/scx200_acb.c
+--- linux-2.6.16.orig/drivers/i2c/busses/scx200_acb.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/i2c/busses/scx200_acb.c 2006-07-04 14:41:36.000000000 +0400
+@@ -440,7 +440,6 @@ static int __init scx200_acb_create(int
+ struct scx200_acb_iface *iface;
+ struct i2c_adapter *adapter;
+ int rc = 0;
+- char description[64];
+
+ iface = kzalloc(sizeof(*iface), GFP_KERNEL);
+ if (!iface) {
+@@ -459,8 +458,7 @@ static int __init scx200_acb_create(int
+
+ init_MUTEX(&iface->sem);
+
+- snprintf(description, sizeof(description), "NatSemi SCx200 ACCESS.bus [%s]", adapter->name);
+- if (request_region(base, 8, description) == 0) {
++ if (!request_region(base, 8, adapter->name)) {
+ dev_err(&adapter->dev, "can't allocate io 0x%x-0x%x\n",
+ base, base + 8-1);
+ rc = -EBUSY;
+diff -upr linux-2.6.16.orig/drivers/i2c/chips/m41t00.c linux-2.6.16-026test015/drivers/i2c/chips/m41t00.c
+--- linux-2.6.16.orig/drivers/i2c/chips/m41t00.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/i2c/chips/m41t00.c 2006-07-04 14:41:36.000000000 +0400
+@@ -129,13 +129,13 @@ m41t00_set_tlet(ulong arg)
+ if ((i2c_smbus_write_byte_data(save_client, 0, tm.tm_sec & 0x7f) < 0)
+ || (i2c_smbus_write_byte_data(save_client, 1, tm.tm_min & 0x7f)
+ < 0)
+- || (i2c_smbus_write_byte_data(save_client, 2, tm.tm_hour & 0x7f)
++ || (i2c_smbus_write_byte_data(save_client, 2, tm.tm_hour & 0x3f)
+ < 0)
+- || (i2c_smbus_write_byte_data(save_client, 4, tm.tm_mday & 0x7f)
++ || (i2c_smbus_write_byte_data(save_client, 4, tm.tm_mday & 0x3f)
+ < 0)
+- || (i2c_smbus_write_byte_data(save_client, 5, tm.tm_mon & 0x7f)
++ || (i2c_smbus_write_byte_data(save_client, 5, tm.tm_mon & 0x1f)
+ < 0)
+- || (i2c_smbus_write_byte_data(save_client, 6, tm.tm_year & 0x7f)
++ || (i2c_smbus_write_byte_data(save_client, 6, tm.tm_year & 0xff)
+ < 0))
+
+ dev_warn(&save_client->dev,"m41t00: can't write to rtc chip\n");
+diff -upr linux-2.6.16.orig/drivers/ide/pci/alim15x3.c linux-2.6.16-026test015/drivers/ide/pci/alim15x3.c
+--- linux-2.6.16.orig/drivers/ide/pci/alim15x3.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/ide/pci/alim15x3.c 2006-07-04 14:41:36.000000000 +0400
+@@ -731,6 +731,8 @@ static unsigned int __devinit ata66_ali1
+
+ if(m5229_revision <= 0x20)
+ tmpbyte = (tmpbyte & (~0x02)) | 0x01;
++ else if (m5229_revision == 0xc7)
++ tmpbyte |= 0x03;
+ else
+ tmpbyte |= 0x01;
+
+diff -upr linux-2.6.16.orig/drivers/ieee1394/ohci1394.c linux-2.6.16-026test015/drivers/ieee1394/ohci1394.c
+--- linux-2.6.16.orig/drivers/ieee1394/ohci1394.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/ieee1394/ohci1394.c 2006-07-04 14:41:36.000000000 +0400
+@@ -2525,7 +2525,7 @@ static irqreturn_t ohci_irq_handler(int
+ if (phys_dma) {
+ reg_write(ohci,OHCI1394_PhyReqFilterHiSet, 0xffffffff);
+ reg_write(ohci,OHCI1394_PhyReqFilterLoSet, 0xffffffff);
+- reg_write(ohci,OHCI1394_PhyUpperBound, 0xffff0000);
++ reg_write(ohci,OHCI1394_PhyUpperBound, 0x01000000);
+ } else {
+ reg_write(ohci,OHCI1394_PhyReqFilterHiSet, 0x00000000);
+ reg_write(ohci,OHCI1394_PhyReqFilterLoSet, 0x00000000);
+diff -upr linux-2.6.16.orig/drivers/ieee1394/sbp2.c linux-2.6.16-026test015/drivers/ieee1394/sbp2.c
+--- linux-2.6.16.orig/drivers/ieee1394/sbp2.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/ieee1394/sbp2.c 2006-07-04 14:41:36.000000000 +0400
+@@ -495,22 +495,17 @@ static struct sbp2_command_info *sbp2uti
+ /*
+ * This function finds the sbp2_command for a given outstanding SCpnt.
+ * Only looks at the inuse list.
++ * Must be called with scsi_id->sbp2_command_orb_lock held.
+ */
+-static struct sbp2_command_info *sbp2util_find_command_for_SCpnt(struct scsi_id_instance_data *scsi_id, void *SCpnt)
++static struct sbp2_command_info *sbp2util_find_command_for_SCpnt(
++ struct scsi_id_instance_data *scsi_id, void *SCpnt)
+ {
+ struct sbp2_command_info *command;
+- unsigned long flags;
+
+- spin_lock_irqsave(&scsi_id->sbp2_command_orb_lock, flags);
+- if (!list_empty(&scsi_id->sbp2_command_orb_inuse)) {
+- list_for_each_entry(command, &scsi_id->sbp2_command_orb_inuse, list) {
+- if (command->Current_SCpnt == SCpnt) {
+- spin_unlock_irqrestore(&scsi_id->sbp2_command_orb_lock, flags);
++ if (!list_empty(&scsi_id->sbp2_command_orb_inuse))
++ list_for_each_entry(command, &scsi_id->sbp2_command_orb_inuse, list)
++ if (command->Current_SCpnt == SCpnt)
+ return command;
+- }
+- }
+- }
+- spin_unlock_irqrestore(&scsi_id->sbp2_command_orb_lock, flags);
+ return NULL;
+ }
+
+@@ -579,17 +574,15 @@ static void sbp2util_free_command_dma(st
+
+ /*
+ * This function moves a command to the completed orb list.
++ * Must be called with scsi_id->sbp2_command_orb_lock held.
+ */
+-static void sbp2util_mark_command_completed(struct scsi_id_instance_data *scsi_id,
+- struct sbp2_command_info *command)
++static void sbp2util_mark_command_completed(
++ struct scsi_id_instance_data *scsi_id,
++ struct sbp2_command_info *command)
+ {
+- unsigned long flags;
+-
+- spin_lock_irqsave(&scsi_id->sbp2_command_orb_lock, flags);
+ list_del(&command->list);
+ sbp2util_free_command_dma(command);
+ list_add_tail(&command->list, &scsi_id->sbp2_command_orb_completed);
+- spin_unlock_irqrestore(&scsi_id->sbp2_command_orb_lock, flags);
+ }
+
+ /*
+@@ -761,12 +754,17 @@ static struct scsi_id_instance_data *sbp
+
+ /* Register the status FIFO address range. We could use the same FIFO
+ * for targets at different nodes. However we need different FIFOs per
+- * target in order to support multi-unit devices. */
++ * target in order to support multi-unit devices.
++ * The FIFO is located out of the local host controller's physical range
++ * but, if possible, within the posted write area. Status writes will
++ * then be performed as unified transactions. This slightly reduces
++ * bandwidth usage, and some Prolific based devices seem to require it.
++ */
+ scsi_id->status_fifo_addr = hpsb_allocate_and_register_addrspace(
+ &sbp2_highlevel, ud->ne->host, &sbp2_ops,
+ sizeof(struct sbp2_status_block), sizeof(quadlet_t),
+- ~0ULL, ~0ULL);
+- if (!scsi_id->status_fifo_addr) {
++ 0x010000000000ULL, CSR1212_ALL_SPACE_END);
++ if (scsi_id->status_fifo_addr == ~0ULL) {
+ SBP2_ERR("failed to allocate status FIFO address range");
+ goto failed_alloc;
+ }
+@@ -2177,7 +2175,9 @@ static int sbp2_handle_status_write(stru
+ * Matched status with command, now grab scsi command pointers and check status
+ */
+ SCpnt = command->Current_SCpnt;
++ spin_lock_irqsave(&scsi_id->sbp2_command_orb_lock, flags);
+ sbp2util_mark_command_completed(scsi_id, command);
++ spin_unlock_irqrestore(&scsi_id->sbp2_command_orb_lock, flags);
+
+ if (SCpnt) {
+
+@@ -2491,9 +2491,20 @@ static int sbp2scsi_slave_alloc(struct s
+
+ static int sbp2scsi_slave_configure(struct scsi_device *sdev)
+ {
++ struct scsi_id_instance_data *scsi_id =
++ (struct scsi_id_instance_data *)sdev->host->hostdata[0];
++
+ blk_queue_dma_alignment(sdev->request_queue, (512 - 1));
+ sdev->use_10_for_rw = 1;
+ sdev->use_10_for_ms = 1;
++
++ if ((scsi_id->sbp2_firmware_revision & 0xffff00) == 0x0a2700 &&
++ (scsi_id->ud->model_id == 0x000021 /* gen.4 iPod */ ||
++ scsi_id->ud->model_id == 0x000023 /* iPod mini */ ||
++ scsi_id->ud->model_id == 0x00007e /* iPod Photo */ )) {
++ SBP2_INFO("enabling iPod workaround: decrement disk capacity");
++ sdev->fix_capacity = 1;
++ }
+ return 0;
+ }
+
+@@ -2513,6 +2524,7 @@ static int sbp2scsi_abort(struct scsi_cm
+ (struct scsi_id_instance_data *)SCpnt->device->host->hostdata[0];
+ struct sbp2scsi_host_info *hi = scsi_id->hi;
+ struct sbp2_command_info *command;
++ unsigned long flags;
+
+ SBP2_ERR("aborting sbp2 command");
+ scsi_print_command(SCpnt);
+@@ -2523,6 +2535,7 @@ static int sbp2scsi_abort(struct scsi_cm
+ * Right now, just return any matching command structures
+ * to the free pool.
+ */
++ spin_lock_irqsave(&scsi_id->sbp2_command_orb_lock, flags);
+ command = sbp2util_find_command_for_SCpnt(scsi_id, SCpnt);
+ if (command) {
+ SBP2_DEBUG("Found command to abort");
+@@ -2540,6 +2553,7 @@ static int sbp2scsi_abort(struct scsi_cm
+ command->Current_done(command->Current_SCpnt);
+ }
+ }
++ spin_unlock_irqrestore(&scsi_id->sbp2_command_orb_lock, flags);
+
+ /*
+ * Initiate a fetch agent reset.
+diff -upr linux-2.6.16.orig/drivers/input/mouse/psmouse-base.c linux-2.6.16-026test015/drivers/input/mouse/psmouse-base.c
+--- linux-2.6.16.orig/drivers/input/mouse/psmouse-base.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/input/mouse/psmouse-base.c 2006-07-04 14:41:36.000000000 +0400
+@@ -300,8 +300,10 @@ static irqreturn_t psmouse_interrupt(str
+ * Check if this is a new device announcement (0xAA 0x00)
+ */
+ if (unlikely(psmouse->packet[0] == PSMOUSE_RET_BAT && psmouse->pktcnt <= 2)) {
+- if (psmouse->pktcnt == 1)
++ if (psmouse->pktcnt == 1) {
++ psmouse->last = jiffies;
+ goto out;
++ }
+
+ if (psmouse->packet[1] == PSMOUSE_RET_ID) {
+ __psmouse_set_state(psmouse, PSMOUSE_IGNORE);
+diff -upr linux-2.6.16.orig/drivers/macintosh/therm_adt746x.c linux-2.6.16-026test015/drivers/macintosh/therm_adt746x.c
+--- linux-2.6.16.orig/drivers/macintosh/therm_adt746x.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/macintosh/therm_adt746x.c 2006-07-04 14:41:36.000000000 +0400
+@@ -627,8 +627,8 @@ thermostat_init(void)
+ if(therm_type == ADT7460)
+ device_create_file(&of_dev->dev, &dev_attr_sensor2_fan_speed);
+
+-#ifndef CONFIG_I2C_KEYWEST
+- request_module("i2c-keywest");
++#ifndef CONFIG_I2C_POWERMAC
++ request_module("i2c-powermac");
+ #endif
+
+ return i2c_add_driver(&thermostat_driver);
+diff -upr linux-2.6.16.orig/drivers/md/dm-snap.c linux-2.6.16-026test015/drivers/md/dm-snap.c
+--- linux-2.6.16.orig/drivers/md/dm-snap.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/md/dm-snap.c 2006-07-04 14:41:36.000000000 +0400
+@@ -542,8 +542,12 @@ static void snapshot_dtr(struct dm_targe
+ {
+ struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
+
++ /* Prevent further origin writes from using this snapshot. */
++ /* After this returns there can be no new kcopyd jobs. */
+ unregister_snapshot(s);
+
++ kcopyd_client_destroy(s->kcopyd_client);
++
+ exit_exception_table(&s->pending, pending_cache);
+ exit_exception_table(&s->complete, exception_cache);
+
+@@ -552,7 +556,7 @@ static void snapshot_dtr(struct dm_targe
+
+ dm_put_device(ti, s->origin);
+ dm_put_device(ti, s->cow);
+- kcopyd_client_destroy(s->kcopyd_client);
++
+ kfree(s);
+ }
+
+diff -upr linux-2.6.16.orig/drivers/md/dm.c linux-2.6.16-026test015/drivers/md/dm.c
+--- linux-2.6.16.orig/drivers/md/dm.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/md/dm.c 2006-07-04 14:41:36.000000000 +0400
+@@ -533,30 +533,35 @@ static void __clone_and_map(struct clone
+
+ } else {
+ /*
+- * Create two copy bios to deal with io that has
+- * been split across a target.
++ * Handle a bvec that must be split between two or more targets.
+ */
+ struct bio_vec *bv = bio->bi_io_vec + ci->idx;
++ sector_t remaining = to_sector(bv->bv_len);
++ unsigned int offset = 0;
+
+- clone = split_bvec(bio, ci->sector, ci->idx,
+- bv->bv_offset, max);
+- __map_bio(ti, clone, tio);
+-
+- ci->sector += max;
+- ci->sector_count -= max;
+- ti = dm_table_find_target(ci->map, ci->sector);
+-
+- len = to_sector(bv->bv_len) - max;
+- clone = split_bvec(bio, ci->sector, ci->idx,
+- bv->bv_offset + to_bytes(max), len);
+- tio = alloc_tio(ci->md);
+- tio->io = ci->io;
+- tio->ti = ti;
+- memset(&tio->info, 0, sizeof(tio->info));
+- __map_bio(ti, clone, tio);
++ do {
++ if (offset) {
++ ti = dm_table_find_target(ci->map, ci->sector);
++ max = max_io_len(ci->md, ci->sector, ti);
++
++ tio = alloc_tio(ci->md);
++ tio->io = ci->io;
++ tio->ti = ti;
++ memset(&tio->info, 0, sizeof(tio->info));
++ }
++
++ len = min(remaining, max);
++
++ clone = split_bvec(bio, ci->sector, ci->idx,
++ bv->bv_offset + offset, len);
++
++ __map_bio(ti, clone, tio);
++
++ ci->sector += len;
++ ci->sector_count -= len;
++ offset += to_bytes(len);
++ } while (remaining -= len);
+
+- ci->sector += len;
+- ci->sector_count -= len;
+ ci->idx++;
+ }
+ }
+@@ -1093,6 +1098,7 @@ int dm_suspend(struct mapped_device *md,
+ {
+ struct dm_table *map = NULL;
+ DECLARE_WAITQUEUE(wait, current);
++ struct bio *def;
+ int r = -EINVAL;
+
+ down(&md->suspend_lock);
+@@ -1152,9 +1158,11 @@ int dm_suspend(struct mapped_device *md,
+ /* were we interrupted ? */
+ r = -EINTR;
+ if (atomic_read(&md->pending)) {
++ clear_bit(DMF_BLOCK_IO, &md->flags);
++ def = bio_list_get(&md->deferred);
++ __flush_deferred_io(md, def);
+ up_write(&md->io_lock);
+ unlock_fs(md);
+- clear_bit(DMF_BLOCK_IO, &md->flags);
+ goto out;
+ }
+ up_write(&md->io_lock);
+diff -upr linux-2.6.16.orig/drivers/md/kcopyd.c linux-2.6.16-026test015/drivers/md/kcopyd.c
+--- linux-2.6.16.orig/drivers/md/kcopyd.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/md/kcopyd.c 2006-07-04 14:41:36.000000000 +0400
+@@ -44,6 +44,9 @@ struct kcopyd_client {
+ struct page_list *pages;
+ unsigned int nr_pages;
+ unsigned int nr_free_pages;
++
++ wait_queue_head_t destroyq;
++ atomic_t nr_jobs;
+ };
+
+ static struct page_list *alloc_pl(void)
+@@ -293,10 +296,15 @@ static int run_complete_job(struct kcopy
+ int read_err = job->read_err;
+ unsigned int write_err = job->write_err;
+ kcopyd_notify_fn fn = job->fn;
++ struct kcopyd_client *kc = job->kc;
+
+- kcopyd_put_pages(job->kc, job->pages);
++ kcopyd_put_pages(kc, job->pages);
+ mempool_free(job, _job_pool);
+ fn(read_err, write_err, context);
++
++ if (atomic_dec_and_test(&kc->nr_jobs))
++ wake_up(&kc->destroyq);
++
+ return 0;
+ }
+
+@@ -431,6 +439,7 @@ static void do_work(void *ignored)
+ */
+ static void dispatch_job(struct kcopyd_job *job)
+ {
++ atomic_inc(&job->kc->nr_jobs);
+ push(&_pages_jobs, job);
+ wake();
+ }
+@@ -670,6 +679,9 @@ int kcopyd_client_create(unsigned int nr
+ return r;
+ }
+
++ init_waitqueue_head(&kc->destroyq);
++ atomic_set(&kc->nr_jobs, 0);
++
+ client_add(kc);
+ *result = kc;
+ return 0;
+@@ -677,6 +689,9 @@ int kcopyd_client_create(unsigned int nr
+
+ void kcopyd_client_destroy(struct kcopyd_client *kc)
+ {
++ /* Wait for completion of all jobs submitted by this client. */
++ wait_event(kc->destroyq, !atomic_read(&kc->nr_jobs));
++
+ dm_io_put(kc->nr_pages);
+ client_free_pages(kc);
+ client_del(kc);
+diff -upr linux-2.6.16.orig/drivers/md/raid10.c linux-2.6.16-026test015/drivers/md/raid10.c
+--- linux-2.6.16.orig/drivers/md/raid10.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/md/raid10.c 2006-07-04 14:41:36.000000000 +0400
+@@ -1436,9 +1436,9 @@ static void raid10d(mddev_t *mddev)
+ sl--;
+ d = r10_bio->devs[sl].devnum;
+ rdev = conf->mirrors[d].rdev;
+- atomic_add(s, &rdev->corrected_errors);
+ if (rdev &&
+ test_bit(In_sync, &rdev->flags)) {
++ atomic_add(s, &rdev->corrected_errors);
+ if (sync_page_io(rdev->bdev,
+ r10_bio->devs[sl].addr +
+ sect + rdev->data_offset,
+diff -upr linux-2.6.16.orig/drivers/media/dvb/dvb-usb/cxusb.c linux-2.6.16-026test015/drivers/media/dvb/dvb-usb/cxusb.c
+--- linux-2.6.16.orig/drivers/media/dvb/dvb-usb/cxusb.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/media/dvb/dvb-usb/cxusb.c 2006-07-04 14:41:36.000000000 +0400
+@@ -149,6 +149,15 @@ static int cxusb_power_ctrl(struct dvb_u
+ return cxusb_ctrl_msg(d, CMD_POWER_OFF, &b, 1, NULL, 0);
+ }
+
++static int cxusb_bluebird_power_ctrl(struct dvb_usb_device *d, int onoff)
++{
++ u8 b = 0;
++ if (onoff)
++ return cxusb_ctrl_msg(d, CMD_POWER_ON, &b, 1, NULL, 0);
++ else
++ return 0;
++}
++
+ static int cxusb_streaming_ctrl(struct dvb_usb_device *d, int onoff)
+ {
+ u8 buf[2] = { 0x03, 0x00 };
+@@ -505,7 +514,7 @@ static struct dvb_usb_properties cxusb_b
+ .size_of_priv = sizeof(struct cxusb_state),
+
+ .streaming_ctrl = cxusb_streaming_ctrl,
+- .power_ctrl = cxusb_power_ctrl,
++ .power_ctrl = cxusb_bluebird_power_ctrl,
+ .frontend_attach = cxusb_lgdt3303_frontend_attach,
+ .tuner_attach = cxusb_lgh064f_tuner_attach,
+
+@@ -545,7 +554,7 @@ static struct dvb_usb_properties cxusb_b
+ .size_of_priv = sizeof(struct cxusb_state),
+
+ .streaming_ctrl = cxusb_streaming_ctrl,
+- .power_ctrl = cxusb_power_ctrl,
++ .power_ctrl = cxusb_bluebird_power_ctrl,
+ .frontend_attach = cxusb_dee1601_frontend_attach,
+ .tuner_attach = cxusb_dee1601_tuner_attach,
+
+@@ -594,7 +603,7 @@ static struct dvb_usb_properties cxusb_b
+ .size_of_priv = sizeof(struct cxusb_state),
+
+ .streaming_ctrl = cxusb_streaming_ctrl,
+- .power_ctrl = cxusb_power_ctrl,
++ .power_ctrl = cxusb_bluebird_power_ctrl,
+ .frontend_attach = cxusb_mt352_frontend_attach,
+ .tuner_attach = cxusb_lgz201_tuner_attach,
+
+@@ -634,7 +643,7 @@ static struct dvb_usb_properties cxusb_b
+ .size_of_priv = sizeof(struct cxusb_state),
+
+ .streaming_ctrl = cxusb_streaming_ctrl,
+- .power_ctrl = cxusb_power_ctrl,
++ .power_ctrl = cxusb_bluebird_power_ctrl,
+ .frontend_attach = cxusb_mt352_frontend_attach,
+ .tuner_attach = cxusb_dtt7579_tuner_attach,
+
+diff -upr linux-2.6.16.orig/drivers/media/video/Kconfig linux-2.6.16-026test015/drivers/media/video/Kconfig
+--- linux-2.6.16.orig/drivers/media/video/Kconfig 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/media/video/Kconfig 2006-07-04 14:41:36.000000000 +0400
+@@ -349,6 +349,7 @@ config VIDEO_AUDIO_DECODER
+ config VIDEO_DECODER
+ tristate "Add support for additional video chipsets"
+ depends on VIDEO_DEV && I2C && EXPERIMENTAL
++ select FW_LOADER
+ ---help---
+ Say Y here to compile drivers for SAA7115, SAA7127 and CX25840
+ video decoders.
+diff -upr linux-2.6.16.orig/drivers/media/video/saa7127.c linux-2.6.16-026test015/drivers/media/video/saa7127.c
+--- linux-2.6.16.orig/drivers/media/video/saa7127.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/media/video/saa7127.c 2006-07-04 14:41:36.000000000 +0400
+@@ -141,6 +141,7 @@ struct i2c_reg_value {
+ static const struct i2c_reg_value saa7129_init_config_extra[] = {
+ { SAA7127_REG_OUTPUT_PORT_CONTROL, 0x38 },
+ { SAA7127_REG_VTRIG, 0xfa },
++ { 0, 0 }
+ };
+
+ static const struct i2c_reg_value saa7127_init_config_common[] = {
+diff -upr linux-2.6.16.orig/drivers/media/video/tuner-types.c linux-2.6.16-026test015/drivers/media/video/tuner-types.c
+--- linux-2.6.16.orig/drivers/media/video/tuner-types.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/media/video/tuner-types.c 2006-07-04 14:41:36.000000000 +0400
+@@ -1087,8 +1087,8 @@ static struct tuner_params tuner_tnf_533
+ /* ------------ TUNER_SAMSUNG_TCPN_2121P30A - Samsung NTSC ------------ */
+
+ static struct tuner_range tuner_samsung_tcpn_2121p30a_ntsc_ranges[] = {
+- { 16 * 175.75 /*MHz*/, 0x01, },
+- { 16 * 410.25 /*MHz*/, 0x02, },
++ { 16 * 130.00 /*MHz*/, 0x01, },
++ { 16 * 364.50 /*MHz*/, 0x02, },
+ { 16 * 999.99 , 0x08, },
+ };
+
+diff -upr linux-2.6.16.orig/drivers/message/i2o/exec-osm.c linux-2.6.16-026test015/drivers/message/i2o/exec-osm.c
+--- linux-2.6.16.orig/drivers/message/i2o/exec-osm.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/message/i2o/exec-osm.c 2006-07-04 14:41:36.000000000 +0400
+@@ -55,6 +55,7 @@ struct i2o_exec_wait {
+ u32 m; /* message id */
+ struct i2o_message *msg; /* pointer to the reply message */
+ struct list_head list; /* node in global wait list */
++ spinlock_t lock; /* lock before modifying */
+ };
+
+ /* Exec OSM class handling definition */
+@@ -80,6 +81,7 @@ static struct i2o_exec_wait *i2o_exec_wa
+ return NULL;
+
+ INIT_LIST_HEAD(&wait->list);
++ spin_lock_init(&wait->lock);
+
+ return wait;
+ };
+@@ -118,6 +120,7 @@ int i2o_msg_post_wait_mem(struct i2o_con
+ DECLARE_WAIT_QUEUE_HEAD(wq);
+ struct i2o_exec_wait *wait;
+ static u32 tcntxt = 0x80000000;
++ long flags;
+ int rc = 0;
+
+ wait = i2o_exec_wait_alloc();
+@@ -139,33 +142,28 @@ int i2o_msg_post_wait_mem(struct i2o_con
+ wait->tcntxt = tcntxt++;
+ msg->u.s.tcntxt = cpu_to_le32(wait->tcntxt);
+
++ wait->wq = &wq;
++ /*
++ * we add elements to the head, because if an entry in the list will
++ * never be removed, we have to iterate over it every time
++ */
++ list_add(&wait->list, &i2o_exec_wait_list);
++
+ /*
+ * Post the message to the controller. At some point later it will
+ * return. If we time out before it returns then complete will be zero.
+ */
+ i2o_msg_post(c, msg);
+
+- if (!wait->complete) {
+- wait->wq = &wq;
+- /*
+- * we add elements add the head, because if a entry in the list
+- * will never be removed, we have to iterate over it every time
+- */
+- list_add(&wait->list, &i2o_exec_wait_list);
+-
+- wait_event_interruptible_timeout(wq, wait->complete,
+- timeout * HZ);
++ wait_event_interruptible_timeout(wq, wait->complete, timeout * HZ);
+
+- wait->wq = NULL;
+- }
++ spin_lock_irqsave(&wait->lock, flags);
+
+- barrier();
++ wait->wq = NULL;
+
+- if (wait->complete) {
++ if (wait->complete)
+ rc = le32_to_cpu(wait->msg->body[0]) >> 24;
+- i2o_flush_reply(c, wait->m);
+- i2o_exec_wait_free(wait);
+- } else {
++ else {
+ /*
+ * We cannot remove it now. This is important. When it does
+ * terminate (which it must do if the controller has not
+@@ -179,6 +177,13 @@ int i2o_msg_post_wait_mem(struct i2o_con
+ rc = -ETIMEDOUT;
+ }
+
++ spin_unlock_irqrestore(&wait->lock, flags);
++
++ if (rc != -ETIMEDOUT) {
++ i2o_flush_reply(c, wait->m);
++ i2o_exec_wait_free(wait);
++ }
++
+ return rc;
+ };
+
+@@ -206,7 +211,6 @@ static int i2o_msg_post_wait_complete(st
+ {
+ struct i2o_exec_wait *wait, *tmp;
+ unsigned long flags;
+- static spinlock_t lock = SPIN_LOCK_UNLOCKED;
+ int rc = 1;
+
+ /*
+@@ -216,23 +220,24 @@ static int i2o_msg_post_wait_complete(st
+ * already expired. Not much we can do about that except log it for
+ * debug purposes, increase timeout, and recompile.
+ */
+- spin_lock_irqsave(&lock, flags);
+ list_for_each_entry_safe(wait, tmp, &i2o_exec_wait_list, list) {
+ if (wait->tcntxt == context) {
+- list_del(&wait->list);
++ spin_lock_irqsave(&wait->lock, flags);
+
+- spin_unlock_irqrestore(&lock, flags);
++ list_del(&wait->list);
+
+ wait->m = m;
+ wait->msg = msg;
+ wait->complete = 1;
+
+- barrier();
+-
+- if (wait->wq) {
+- wake_up_interruptible(wait->wq);
++ if (wait->wq)
+ rc = 0;
+- } else {
++ else
++ rc = -1;
++
++ spin_unlock_irqrestore(&wait->lock, flags);
++
++ if (rc) {
+ struct device *dev;
+
+ dev = &c->pdev->dev;
+@@ -241,15 +246,13 @@ static int i2o_msg_post_wait_complete(st
+ c->name);
+ i2o_dma_free(dev, &wait->dma);
+ i2o_exec_wait_free(wait);
+- rc = -1;
+- }
++ } else
++ wake_up_interruptible(wait->wq);
+
+ return rc;
+ }
+ }
+
+- spin_unlock_irqrestore(&lock, flags);
+-
+ osm_warn("%s: Bogus reply in POST WAIT (tr-context: %08x)!\n", c->name,
+ context);
+
+@@ -315,14 +318,9 @@ static DEVICE_ATTR(product_id, S_IRUGO,
+ static int i2o_exec_probe(struct device *dev)
+ {
+ struct i2o_device *i2o_dev = to_i2o_device(dev);
+- struct i2o_controller *c = i2o_dev->iop;
+
+ i2o_event_register(i2o_dev, &i2o_exec_driver, 0, 0xffffffff);
+
+- c->exec = i2o_dev;
+-
+- i2o_exec_lct_notify(c, c->lct->change_ind + 1);
+-
+ device_create_file(dev, &dev_attr_vendor_id);
+ device_create_file(dev, &dev_attr_product_id);
+
+@@ -510,6 +508,8 @@ static int i2o_exec_lct_notify(struct i2
+ struct device *dev;
+ struct i2o_message *msg;
+
++ down(&c->lct_lock);
++
+ dev = &c->pdev->dev;
+
+ if (i2o_dma_realloc
+@@ -532,6 +532,8 @@ static int i2o_exec_lct_notify(struct i2
+
+ i2o_msg_post(c, msg);
+
++ up(&c->lct_lock);
++
+ return 0;
+ };
+
+diff -upr linux-2.6.16.orig/drivers/message/i2o/iop.c linux-2.6.16-026test015/drivers/message/i2o/iop.c
+--- linux-2.6.16.orig/drivers/message/i2o/iop.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/message/i2o/iop.c 2006-07-04 14:41:36.000000000 +0400
+@@ -804,8 +804,6 @@ void i2o_iop_remove(struct i2o_controlle
+
+ /* Ask the IOP to switch to RESET state */
+ i2o_iop_reset(c);
+-
+- put_device(&c->device);
+ }
+
+ /**
+@@ -1059,7 +1057,7 @@ struct i2o_controller *i2o_iop_alloc(voi
+
+ snprintf(poolname, sizeof(poolname), "i2o_%s_msg_inpool", c->name);
+ if (i2o_pool_alloc
+- (&c->in_msg, poolname, I2O_INBOUND_MSG_FRAME_SIZE * 4,
++ (&c->in_msg, poolname, I2O_INBOUND_MSG_FRAME_SIZE * 4 + sizeof(u32),
+ I2O_MSG_INPOOL_MIN)) {
+ kfree(c);
+ return ERR_PTR(-ENOMEM);
+diff -upr linux-2.6.16.orig/drivers/mtd/nand/Kconfig linux-2.6.16-026test015/drivers/mtd/nand/Kconfig
+--- linux-2.6.16.orig/drivers/mtd/nand/Kconfig 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/mtd/nand/Kconfig 2006-07-04 14:41:36.000000000 +0400
+@@ -178,17 +178,16 @@ config MTD_NAND_DISKONCHIP_BBTWRITE
+ Even if you leave this disabled, you can enable BBT writes at module
+ load time (assuming you build diskonchip as a module) with the module
+ parameter "inftl_bbt_write=1".
+-
+- config MTD_NAND_SHARPSL
+- bool "Support for NAND Flash on Sharp SL Series (C7xx + others)"
+- depends on MTD_NAND && ARCH_PXA
+-
+- config MTD_NAND_NANDSIM
+- bool "Support for NAND Flash Simulator"
+- depends on MTD_NAND && MTD_PARTITIONS
+
++config MTD_NAND_SHARPSL
++ tristate "Support for NAND Flash on Sharp SL Series (C7xx + others)"
++ depends on MTD_NAND && ARCH_PXA
++
++config MTD_NAND_NANDSIM
++ tristate "Support for NAND Flash Simulator"
++ depends on MTD_NAND && MTD_PARTITIONS
+ help
+ The simulator may simulate various NAND flash chips for the
+ MTD nand layer.
+-
++
+ endmenu
+diff -upr linux-2.6.16.orig/drivers/net/Makefile linux-2.6.16-026test015/drivers/net/Makefile
+--- linux-2.6.16.orig/drivers/net/Makefile 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/net/Makefile 2006-07-04 14:41:39.000000000 +0400
+@@ -18,6 +18,12 @@ gianfar_driver-objs := gianfar.o \
+ gianfar_mii.o \
+ gianfar_sysfs.o
+
++obj-$(CONFIG_VE_NETDEV) += vznetdev.o
++vznetdev-objs := open_vznet.o venet_core.o
++
++obj-$(CONFIG_VE_ETHDEV) += vzethdev.o
++vzethdev-objs := veth.o
++
+ #
+ # link order important here
+ #
+diff -upr linux-2.6.16.orig/drivers/net/e1000/e1000_main.c linux-2.6.16-026test015/drivers/net/e1000/e1000_main.c
+--- linux-2.6.16.orig/drivers/net/e1000/e1000_main.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/net/e1000/e1000_main.c 2006-07-04 14:41:36.000000000 +0400
+@@ -3851,6 +3851,7 @@ e1000_clean_rx_irq_ps(struct e1000_adapt
+ skb_shinfo(skb)->nr_frags++;
+ skb->len += length;
+ skb->data_len += length;
++ skb->truesize += length;
+ }
+
+ e1000_rx_checksum(adapter, staterr,
+diff -upr linux-2.6.16.orig/drivers/net/irda/irda-usb.c linux-2.6.16-026test015/drivers/net/irda/irda-usb.c
+--- linux-2.6.16.orig/drivers/net/irda/irda-usb.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/net/irda/irda-usb.c 2006-07-04 14:41:36.000000000 +0400
+@@ -740,7 +740,7 @@ static void irda_usb_receive(struct urb
+ struct sk_buff *newskb;
+ struct sk_buff *dataskb;
+ struct urb *next_urb;
+- int docopy;
++ unsigned int len, docopy;
+
+ IRDA_DEBUG(2, "%s(), len=%d\n", __FUNCTION__, urb->actual_length);
+
+@@ -851,10 +851,11 @@ static void irda_usb_receive(struct urb
+ dataskb->dev = self->netdev;
+ dataskb->mac.raw = dataskb->data;
+ dataskb->protocol = htons(ETH_P_IRDA);
++ len = dataskb->len;
+ netif_rx(dataskb);
+
+ /* Keep stats up to date */
+- self->stats.rx_bytes += dataskb->len;
++ self->stats.rx_bytes += len;
+ self->stats.rx_packets++;
+ self->netdev->last_rx = jiffies;
+
+diff -upr linux-2.6.16.orig/drivers/net/loopback.c linux-2.6.16-026test015/drivers/net/loopback.c
+--- linux-2.6.16.orig/drivers/net/loopback.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/net/loopback.c 2006-07-04 14:41:39.000000000 +0400
+@@ -130,6 +130,11 @@ static int loopback_xmit(struct sk_buff
+ {
+ struct net_device_stats *lb_stats;
+
++ if (unlikely(get_exec_env()->disable_net)) {
++ kfree_skb(skb);
++ return 0;
++ }
++
+ skb_orphan(skb);
+
+ skb->protocol = eth_type_trans(skb,dev);
+@@ -198,6 +203,34 @@ static struct ethtool_ops loopback_ethto
+ .set_tso = ethtool_op_set_tso,
+ };
+
++static void loopback_destructor(struct net_device *dev)
++{
++ kfree(dev->priv);
++ dev->priv = NULL;
++}
++
++struct net_device templ_loopback_dev = {
++ .name = "lo",
++ .mtu = (16 * 1024) + 20 + 20 + 12,
++ .hard_start_xmit = loopback_xmit,
++ .hard_header = eth_header,
++ .hard_header_cache = eth_header_cache,
++ .header_cache_update = eth_header_cache_update,
++ .hard_header_len = ETH_HLEN, /* 14 */
++ .addr_len = ETH_ALEN, /* 6 */
++ .tx_queue_len = 0,
++ .type = ARPHRD_LOOPBACK, /* 0x0001*/
++ .rebuild_header = eth_rebuild_header,
++ .flags = IFF_LOOPBACK,
++ .features = NETIF_F_SG|NETIF_F_FRAGLIST
++ |NETIF_F_NO_CSUM|NETIF_F_HIGHDMA
++ |NETIF_F_LLTX|NETIF_F_VIRTUAL,
++};
++
++#ifdef loopback_dev
++#undef loopback_dev
++#endif
++
+ struct net_device loopback_dev = {
+ .name = "lo",
+ .mtu = (16 * 1024) + 20 + 20 + 12,
+@@ -231,9 +264,13 @@ int __init loopback_init(void)
+ memset(stats, 0, sizeof(struct net_device_stats));
+ loopback_dev.priv = stats;
+ loopback_dev.get_stats = &get_stats;
++ loopback_dev.destructor = &loopback_destructor;
+ }
+-
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++ get_ve0()->_loopback_dev = &loopback_dev;
++#endif
+ return register_netdev(&loopback_dev);
+ };
+
+ EXPORT_SYMBOL(loopback_dev);
++EXPORT_SYMBOL(templ_loopback_dev);
+diff -upr linux-2.6.16.orig/drivers/net/open_vznet.c linux-2.6.16-026test015/drivers/net/open_vznet.c
+--- linux-2.6.16.orig/drivers/net/open_vznet.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/drivers/net/open_vznet.c 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,227 @@
++/*
++ * open_vznet.c
++ *
++ * Copyright (C) 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++/*
++ * Virtual Networking device used to change VE ownership on packets
++ */
++
++#include <linux/kernel.h>
++#include <linux/module.h>
++#include <linux/seq_file.h>
++
++#include <linux/inet.h>
++#include <net/ip.h>
++#include <linux/skbuff.h>
++#include <linux/venet.h>
++
++void veip_stop(struct ve_struct *ve)
++{
++ struct list_head *p, *tmp;
++
++ write_lock_irq(&veip_hash_lock);
++ if (ve->veip == NULL)
++ goto unlock;
++ list_for_each_safe(p, tmp, &ve->veip->ip_lh) {
++ struct ip_entry_struct *ptr;
++ ptr = list_entry(p, struct ip_entry_struct, ve_list);
++ ptr->active_env = NULL;
++ list_del(&ptr->ve_list);
++ list_del(&ptr->ip_hash);
++ kfree(ptr);
++ }
++ veip_put(ve->veip);
++ ve->veip = NULL;
++unlock:
++ write_unlock_irq(&veip_hash_lock);
++}
++
++int veip_start(struct ve_struct *ve)
++{
++ int err;
++
++ err = 0;
++ write_lock_irq(&veip_hash_lock);
++ ve->veip = veip_findcreate(ve->veid);
++ if (ve->veip == NULL)
++ err = -ENOMEM;
++ write_unlock_irq(&veip_hash_lock);
++ return err;
++}
++
++int veip_entry_add(struct ve_struct *ve, struct sockaddr *addr)
++{
++ struct ip_entry_struct *entry, *found;
++ int err;
++
++ entry = kmalloc(sizeof(struct ip_entry_struct), GFP_KERNEL);
++ if (entry == NULL)
++ return -ENOMEM;
++
++ memset(entry, 0, sizeof(struct ip_entry_struct));
++ entry->family = addr->sa_family;
++ if (addr->sa_family == AF_INET) {
++ entry->key[3] = ((struct sockaddr_in*)addr)->sin_addr.s_addr;
++ } else if (addr->sa_family == AF_INET6) {
++ memcpy(entry->key, &((struct sockaddr_in6*)addr)->sin6_addr, 16);
++ } else {
++ kfree(entry);
++ return -EAFNOSUPPORT;
++ }
++
++ write_lock_irq(&veip_hash_lock);
++ err = -EADDRINUSE;
++ found = venet_entry_lookup(entry->key, entry->family);
++ if (found != NULL)
++ goto out_unlock;
++ else {
++ ip_entry_hash(entry, ve->veip);
++ found = entry;
++ entry = NULL;
++ }
++ err = 0;
++ found->active_env = ve;
++out_unlock:
++ write_unlock_irq(&veip_hash_lock);
++ if (entry != NULL)
++ kfree(entry);
++ return err;
++}
++
++int veip_entry_del(envid_t veid, struct sockaddr *addr)
++{
++ struct ip_entry_struct *found;
++ u32 key[4];
++ int err;
++
++ if (addr->sa_family == AF_INET) {
++ memset(key, 0, sizeof(key));
++ key[3] = ((struct sockaddr_in*)addr)->sin_addr.s_addr;
++ } else if (addr->sa_family == AF_INET6) {
++ memcpy(key, &((struct sockaddr_in6*)addr)->sin6_addr, 16);
++ } else {
++ return -EAFNOSUPPORT;
++ }
++
++ err = -EADDRNOTAVAIL;
++ write_lock_irq(&veip_hash_lock);
++ found = venet_entry_lookup(key, addr->sa_family);
++ if (found == NULL)
++ goto out;
++ if (found->active_env->veid != veid)
++ goto out;
++
++ err = 0;
++ found->active_env = NULL;
++
++ list_del(&found->ip_hash);
++ list_del(&found->ve_list);
++ kfree(found);
++out:
++ write_unlock_irq(&veip_hash_lock);
++ return err;
++}
++
++static struct ve_struct *venet_find_ve(struct sk_buff *skb, int dir)
++{
++ struct ip_entry_struct *entry;
++
++ if (skb->protocol == __constant_htons(ETH_P_IP)) {
++ entry = ip_entry_lookup(dir ? skb->nh.iph->daddr :
++ skb->nh.iph->saddr);
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ } else if (skb->protocol == __constant_htons(ETH_P_IPV6)) {
++ entry = venet_entry_lookup(dir ? skb->nh.ipv6h->daddr.s6_addr32 :
++ skb->nh.ipv6h->saddr.s6_addr32, AF_INET6);
++#endif
++ } else {
++ return NULL;
++ }
++ if (entry == NULL)
++ return NULL;
++
++ return entry->active_env;
++}
++
++int venet_change_skb_owner(struct sk_buff *skb)
++{
++ struct ve_struct *ve, *ve_old;
++
++ ve_old = skb->owner_env;
++
++ read_lock(&veip_hash_lock);
++ if (!ve_is_super(ve_old)) {
++ /* from VE to host */
++ ve = venet_find_ve(skb, 0);
++ if (ve == NULL)
++ goto out_drop;
++ if (!ve_accessible_strict(ve, ve_old))
++ goto out_source;
++ skb->owner_env = get_ve0();
++ } else {
++ /* from host to VE */
++ ve = venet_find_ve(skb, 1);
++ if (ve == NULL)
++ goto out_drop;
++ skb->owner_env = ve;
++ }
++ read_unlock(&veip_hash_lock);
++
++ return 0;
++
++out_drop:
++ read_unlock(&veip_hash_lock);
++ return -ESRCH;
++
++out_source:
++ read_unlock(&veip_hash_lock);
++ if (net_ratelimit() && skb->protocol == __constant_htons(ETH_P_IP)) {
++ printk(KERN_WARNING "Dropped packet, source wrong "
++ "veid=%u src-IP=%u.%u.%u.%u "
++ "dst-IP=%u.%u.%u.%u\n",
++ skb->owner_env->veid,
++ NIPQUAD(skb->nh.iph->saddr),
++ NIPQUAD(skb->nh.iph->daddr));
++ }
++ return -EACCES;
++}
++
++#ifdef CONFIG_PROC_FS
++int veip_seq_show(struct seq_file *m, void *v)
++{
++ struct list_head *p;
++ struct ip_entry_struct *entry;
++ char s[40];
++
++ p = (struct list_head *)v;
++ if (p == ip_entry_hash_table) {
++ seq_puts(m, "Version: 2.5\n");
++ return 0;
++ }
++ entry = list_entry(p, struct ip_entry_struct, ip_hash);
++ if (entry->family == AF_INET)
++ sprintf(s, "%u.%u.%u.%u", NIPQUAD(entry->key[3]));
++ else
++ sprintf(s, "%x:%x:%x:%x:%x:%x:%x:%x",
++ ntohl(entry->key[0])>>16,
++ ntohl(entry->key[0])&0xFFFF,
++ ntohl(entry->key[1])>>16,
++ ntohl(entry->key[1])&0xFFFF,
++ ntohl(entry->key[2])>>16,
++ ntohl(entry->key[2])&0xFFFF,
++ ntohl(entry->key[3])>>16,
++ ntohl(entry->key[3])&0xFFFF);
++ seq_printf(m, "%39s %10u\n", s, 0);
++ return 0;
++}
++#endif
++
++MODULE_AUTHOR("SWsoft <info@sw-soft.com>");
++MODULE_DESCRIPTION("Virtuozzo Virtual Network Device");
++MODULE_LICENSE("GPL v2");
+diff -upr linux-2.6.16.orig/drivers/net/sky2.c linux-2.6.16-026test015/drivers/net/sky2.c
+--- linux-2.6.16.orig/drivers/net/sky2.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/net/sky2.c 2006-07-04 14:41:36.000000000 +0400
+@@ -579,8 +579,8 @@ static void sky2_mac_init(struct sky2_hw
+ reg = gma_read16(hw, port, GM_PHY_ADDR);
+ gma_write16(hw, port, GM_PHY_ADDR, reg | GM_PAR_MIB_CLR);
+
+- for (i = 0; i < GM_MIB_CNT_SIZE; i++)
+- gma_read16(hw, port, GM_MIB_CNT_BASE + 8 * i);
++ for (i = GM_MIB_CNT_BASE; i <= GM_MIB_CNT_END; i += 4)
++ gma_read16(hw, port, i);
+ gma_write16(hw, port, GM_PHY_ADDR, reg);
+
+ /* transmit control */
+diff -upr linux-2.6.16.orig/drivers/net/sky2.h linux-2.6.16-026test015/drivers/net/sky2.h
+--- linux-2.6.16.orig/drivers/net/sky2.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/net/sky2.h 2006-07-04 14:41:36.000000000 +0400
+@@ -1380,6 +1380,7 @@ enum {
+ /* MIB Counters */
+ #define GM_MIB_CNT_BASE 0x0100 /* Base Address of MIB Counters */
+ #define GM_MIB_CNT_SIZE 44 /* Number of MIB Counters */
++#define GM_MIB_CNT_END 0x025C /* Last MIB counter */
+
+ /*
+ * MIB Counters base address definitions (low word) -
+diff -upr linux-2.6.16.orig/drivers/net/tg3.c linux-2.6.16-026test015/drivers/net/tg3.c
+--- linux-2.6.16.orig/drivers/net/tg3.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/net/tg3.c 2006-07-04 14:41:36.000000000 +0400
+@@ -7368,21 +7368,23 @@ static int tg3_get_settings(struct net_d
+ cmd->supported |= (SUPPORTED_1000baseT_Half |
+ SUPPORTED_1000baseT_Full);
+
+- if (!(tp->tg3_flags2 & TG3_FLG2_ANY_SERDES))
++ if (!(tp->tg3_flags2 & TG3_FLG2_ANY_SERDES)) {
+ cmd->supported |= (SUPPORTED_100baseT_Half |
+ SUPPORTED_100baseT_Full |
+ SUPPORTED_10baseT_Half |
+ SUPPORTED_10baseT_Full |
+ SUPPORTED_MII);
+- else
++ cmd->port = PORT_TP;
++ } else {
+ cmd->supported |= SUPPORTED_FIBRE;
++ cmd->port = PORT_FIBRE;
++ }
+
+ cmd->advertising = tp->link_config.advertising;
+ if (netif_running(dev)) {
+ cmd->speed = tp->link_config.active_speed;
+ cmd->duplex = tp->link_config.active_duplex;
+ }
+- cmd->port = 0;
+ cmd->phy_address = PHY_ADDR;
+ cmd->transceiver = 0;
+ cmd->autoneg = tp->link_config.autoneg;
+diff -upr linux-2.6.16.orig/drivers/net/tun.c linux-2.6.16-026test015/drivers/net/tun.c
+--- linux-2.6.16.orig/drivers/net/tun.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/net/tun.c 2006-07-04 14:41:38.000000000 +0400
+@@ -62,6 +62,7 @@
+
+ #include <asm/system.h>
+ #include <asm/uaccess.h>
++#include <ub/beancounter.h>
+
+ #ifdef TUN_DEBUG
+ static int debug;
+@@ -90,6 +91,7 @@ static int tun_net_close(struct net_devi
+ static int tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
+ {
+ struct tun_struct *tun = netdev_priv(dev);
++ struct user_beancounter *ub;
+
+ DBG(KERN_INFO "%s: tun_net_xmit %d\n", tun->dev->name, skb->len);
+
+@@ -114,6 +116,18 @@ static int tun_net_xmit(struct sk_buff *
+ }
+ }
+
++ ub = netdev_bc(dev)->exec_ub;
++ if (ub && (skb_bc(skb)->charged == 0)) {
++ unsigned long charge;
++ charge = skb_charge_fullsize(skb);
++ if (charge_beancounter(ub, UB_OTHERSOCKBUF, charge, 1))
++ goto drop;
++ get_beancounter(ub);
++ skb_bc(skb)->ub = ub;
++ skb_bc(skb)->charged = charge;
++ skb_bc(skb)->resource = UB_OTHERSOCKBUF;
++ }
++
+ /* Queue packet */
+ skb_queue_tail(&tun->readq, skb);
+ dev->trans_start = jiffies;
+@@ -410,12 +424,14 @@ static ssize_t tun_chr_readv(struct file
+ tun->dev->name, addr[0], addr[1], addr[2],
+ addr[3], addr[4], addr[5]);
+ ret = tun_put_user(tun, skb, (struct iovec *) iv, len);
++ /* skb will be uncharged in kfree_skb() */
+ kfree_skb(skb);
+ break;
+ } else {
+ DBG(KERN_DEBUG "%s: tun_chr_readv: rejected: %x:%x:%x:%x:%x:%x\n",
+ tun->dev->name, addr[0], addr[1], addr[2],
+ addr[3], addr[4], addr[5]);
++ /* skb will be uncharged in kfree_skb() */
+ kfree_skb(skb);
+ continue;
+ }
+@@ -451,6 +467,7 @@ static void tun_setup(struct net_device
+ dev->get_stats = tun_net_stats;
+ dev->ethtool_ops = &tun_ethtool_ops;
+ dev->destructor = free_netdev;
++ dev->features |= NETIF_F_VIRTUAL;
+ }
+
+ static struct tun_struct *tun_get_by_name(const char *name)
+@@ -459,8 +476,9 @@ static struct tun_struct *tun_get_by_nam
+
+ ASSERT_RTNL();
+ list_for_each_entry(tun, &tun_dev_list, list) {
+- if (!strncmp(tun->dev->name, name, IFNAMSIZ))
+- return tun;
++ if (ve_accessible_strict(tun->dev->owner_env, get_exec_env()) &&
++ !strncmp(tun->dev->name, name, IFNAMSIZ))
++ return tun;
+ }
+
+ return NULL;
+@@ -479,7 +497,8 @@ static int tun_set_iff(struct file *file
+
+ /* Check permissions */
+ if (tun->owner != -1 &&
+- current->euid != tun->owner && !capable(CAP_NET_ADMIN))
++ current->euid != tun->owner &&
++ !capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN))
+ return -EPERM;
+ }
+ else if (__dev_get_by_name(ifr->ifr_name))
+diff -upr linux-2.6.16.orig/drivers/net/venet_core.c linux-2.6.16-026test015/drivers/net/venet_core.c
+--- linux-2.6.16.orig/drivers/net/venet_core.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/drivers/net/venet_core.c 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,675 @@
++/*
++ * venet_core.c
++ *
++ * Copyright (C) 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++/*
++ * Common part for Virtuozzo virtual network devices
++ */
++
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/interrupt.h>
++#include <linux/fs.h>
++#include <linux/types.h>
++#include <linux/string.h>
++#include <linux/socket.h>
++#include <linux/errno.h>
++#include <linux/fcntl.h>
++#include <linux/in.h>
++#include <linux/init.h>
++#include <linux/module.h>
++#include <linux/tcp.h>
++#include <linux/proc_fs.h>
++#include <linux/seq_file.h>
++#include <net/addrconf.h>
++
++#include <asm/system.h>
++#include <asm/uaccess.h>
++#include <asm/io.h>
++#include <asm/unistd.h>
++
++#include <linux/inet.h>
++#include <linux/netdevice.h>
++#include <linux/etherdevice.h>
++#include <net/ip.h>
++#include <linux/skbuff.h>
++#include <net/sock.h>
++#include <linux/if_ether.h> /* For the statistics structure. */
++#include <linux/if_arp.h> /* For ARPHRD_ETHER */
++#include <linux/venet.h>
++#include <linux/ve_proto.h>
++#include <linux/vzctl.h>
++#include <linux/vzctl_venet.h>
++
++struct list_head ip_entry_hash_table[VEIP_HASH_SZ];
++rwlock_t veip_hash_lock = RW_LOCK_UNLOCKED;
++LIST_HEAD(veip_lh);
++
++#define ip_entry_hash_function(ip) (ntohl(ip) & (VEIP_HASH_SZ - 1))
++
++void ip_entry_hash(struct ip_entry_struct *entry, struct veip_struct *veip)
++{
++ list_add(&entry->ip_hash,
++ ip_entry_hash_table + ip_entry_hash_function(entry->key[3]));
++ list_add(&entry->ve_list, &veip->ip_lh);
++}
++
++void veip_put(struct veip_struct *veip)
++{
++ if (!list_empty(&veip->ip_lh))
++ return;
++ if (!list_empty(&veip->src_lh))
++ return;
++ if (!list_empty(&veip->dst_lh))
++ return;
++
++ list_del(&veip->list);
++ kfree(veip);
++}
++
++struct ip_entry_struct *ip_entry_lookup(u32 addr)
++{
++ struct ip_entry_struct *entry;
++ struct list_head *tmp;
++
++ list_for_each(tmp, ip_entry_hash_table + ip_entry_hash_function(addr)) {
++ entry = list_entry(tmp, struct ip_entry_struct, ip_hash);
++ if (entry->key[3] != addr || entry->family != AF_INET)
++ continue;
++ return entry;
++ }
++ return NULL;
++}
++
++struct ip_entry_struct *venet_entry_lookup(u32 *addr, int family)
++{
++ struct ip_entry_struct *entry;
++ struct list_head *tmp;
++
++ list_for_each(tmp, ip_entry_hash_table + ip_entry_hash_function(addr[3])) {
++ entry = list_entry(tmp, struct ip_entry_struct, ip_hash);
++ if (memcmp(entry->key, addr, 16) != 0
++ || entry->family != family)
++ continue;
++ return entry;
++ }
++ return NULL;
++}
++
++struct veip_struct *veip_find(envid_t veid)
++{
++ struct veip_struct *ptr;
++ list_for_each_entry(ptr, &veip_lh, list) {
++ if (ptr->veid != veid)
++ continue;
++ return ptr;
++ }
++ return NULL;
++}
++
++struct veip_struct *veip_findcreate(envid_t veid)
++{
++ struct veip_struct *ptr;
++
++ ptr = veip_find(veid);
++ if (ptr != NULL)
++ return ptr;
++
++ ptr = kmalloc(sizeof(struct veip_struct), GFP_ATOMIC);
++ if (ptr == NULL)
++ return NULL;
++ memset(ptr, 0, sizeof(struct veip_struct));
++ INIT_LIST_HEAD(&ptr->ip_lh);
++ INIT_LIST_HEAD(&ptr->src_lh);
++ INIT_LIST_HEAD(&ptr->dst_lh);
++ list_add(&ptr->list, &veip_lh);
++ ptr->veid = veid;
++ return ptr;
++}
++
++/*
++ * Device functions
++ */
++
++static int venet_open(struct net_device *dev)
++{
++ if (!try_module_get(THIS_MODULE))
++ return -EBUSY;
++ return 0;
++}
++
++static int venet_close(struct net_device *master)
++{
++ module_put(THIS_MODULE);
++ return 0;
++}
++
++static void venet_destructor(struct net_device *dev)
++{
++ kfree(dev->priv);
++ dev->priv = NULL;
++}
++
++/*
++ * The higher levels take care of making this non-reentrant (it's
++ * called with bh's disabled).
++ */
++static int venet_xmit(struct sk_buff *skb, struct net_device *dev)
++{
++ struct net_device_stats *stats = (struct net_device_stats *)dev->priv;
++ struct net_device *rcv = NULL;
++ int length;
++
++ if (unlikely(get_exec_env()->disable_net))
++ goto outf;
++
++ /*
++ * Avoid copying: if the skb is not shared we simply orphan it and
++ * lob it from the tx queue to the rx queue; otherwise clone it first
++ */
++ if (atomic_read(&skb->users) != 1) {
++ struct sk_buff *skb2 = skb;
++ skb = skb_clone(skb, GFP_ATOMIC); /* Clone the buffer */
++ if (skb == NULL) {
++ kfree_skb(skb2);
++ goto out;
++ }
++ kfree_skb(skb2);
++ } else
++ skb_orphan(skb);
++
++ if (skb->protocol == __constant_htons(ETH_P_IP)) {
++ struct iphdr *iph;
++ iph = skb->nh.iph;
++ if (MULTICAST(iph->daddr))
++ goto outf;
++ } else if (skb->protocol == __constant_htons(ETH_P_IPV6)) {
++ struct ipv6hdr *ip6h;
++ ip6h = skb->nh.ipv6h;
++ if (ipv6_addr_is_multicast(&ip6h->daddr))
++ goto outf;
++ } else {
++ goto outf;
++ }
++
++ if (venet_change_skb_owner(skb) < 0)
++ goto outf;
++
++ if (unlikely(VE_OWNER_SKB(skb)->disable_net))
++ goto outf;
++
++ rcv = VE_OWNER_SKB(skb)->_venet_dev;
++ if (!rcv)
++ /* VE going down */
++ goto outf;
++
++ dev_hold(rcv);
++
++ if (!(rcv->flags & IFF_UP)) {
++ /* Target VE does not want to receive packets */
++ dev_put(rcv);
++ goto outf;
++ }
++
++ skb->pkt_type = PACKET_HOST;
++ skb->dev = rcv;
++
++ skb->mac.raw = skb->data;
++ memset(skb->data - dev->hard_header_len, 0, dev->hard_header_len);
++
++ dst_release(skb->dst);
++ skb->dst = NULL;
++#ifdef CONFIG_NETFILTER
++ nf_conntrack_put(skb->nfct);
++ skb->nfct = NULL;
++#ifdef CONFIG_NETFILTER_DEBUG
++ skb->nf_debug = 0;
++#endif
++#endif
++ length = skb->len;
++
++ netif_rx(skb);
++
++ stats->tx_bytes += length;
++ stats->tx_packets++;
++ if (rcv) {
++ struct net_device_stats *rcv_stats =
++ (struct net_device_stats *)rcv->priv;
++ rcv_stats->rx_bytes += length;
++ rcv_stats->rx_packets++;
++ dev_put(rcv);
++ }
++
++ return 0;
++
++outf:
++ kfree_skb(skb);
++ ++stats->tx_dropped;
++out:
++ return 0;
++}
++
++static struct net_device_stats *get_stats(struct net_device *dev)
++{
++ return (struct net_device_stats *)dev->priv;
++}
++
++/* Initialize the rest of the venet device. */
++int venet_init_dev(struct net_device *dev)
++{
++ dev->hard_start_xmit = venet_xmit;
++ dev->priv = kmalloc(sizeof(struct net_device_stats), GFP_KERNEL);
++ if (dev->priv == NULL)
++ return -ENOMEM;
++ memset(dev->priv, 0, sizeof(struct net_device_stats));
++ dev->get_stats = get_stats;
++ dev->open = venet_open;
++ dev->stop = venet_close;
++ dev->destructor = venet_destructor;
++
++ /*
++ * Fill in the generic fields of the device structure.
++ */
++ dev->type = ARPHRD_VOID;
++ dev->hard_header_len = ETH_HLEN;
++ dev->mtu = 1500; /* eth_mtu */
++ dev->tx_queue_len = 0;
++
++ memset(dev->broadcast, 0xFF, ETH_ALEN);
++
++ /* New-style flags. */
++ dev->flags = IFF_BROADCAST|IFF_NOARP|IFF_POINTOPOINT;
++ return 0;
++}
++
++static void venet_setup(struct net_device *dev)
++{
++ dev->init = venet_init_dev;
++ /*
++ * No other features are enabled, because:
++ * - checksumming is required, and nobody else will do our job
++ */
++ dev->features |= NETIF_F_VENET | NETIF_F_VIRTUAL;
++}
++
++#ifdef CONFIG_PROC_FS
++static int veinfo_seq_show(struct seq_file *m, void *v)
++{
++ struct ve_struct *ve = (struct ve_struct *)v;
++ struct list_head *tmp;
++
++ seq_printf(m, "%10u %5u %5u", ve->veid,
++ ve->class_id, atomic_read(&ve->pcounter));
++ read_lock(&veip_hash_lock);
++ if (ve->veip == NULL)
++ goto unlock;
++ list_for_each(tmp, &ve->veip->ip_lh) {
++ char ip[40];
++ struct ip_entry_struct *entry;
++
++ entry = list_entry(tmp, struct ip_entry_struct, ve_list);
++ if (entry->active_env == NULL)
++ continue;
++
++ if (entry->family == AF_INET)
++ sprintf(ip, "%u.%u.%u.%u", NIPQUAD(entry->key[3]));
++ else
++ sprintf(ip, "%x:%x:%x:%x:%x:%x:%x:%x",
++ ntohl(entry->key[0])>>16,
++ ntohl(entry->key[0])&0xFFFF,
++ ntohl(entry->key[1])>>16,
++ ntohl(entry->key[1])&0xFFFF,
++ ntohl(entry->key[2])>>16,
++ ntohl(entry->key[2])&0xFFFF,
++ ntohl(entry->key[3])>>16,
++ ntohl(entry->key[3])&0xFFFF);
++ seq_printf(m, " %39s", ip);
++ }
++unlock:
++ read_unlock(&veip_hash_lock);
++ seq_putc(m, '\n');
++ return 0;
++}
++
++static void *ve_seq_start(struct seq_file *m, loff_t *pos)
++{
++ struct ve_struct *ve, *curve;
++ loff_t l;
++
++ curve = get_exec_env();
++ read_lock(&ve_list_guard);
++ if (!ve_is_super(curve)) {
++ if (*pos != 0)
++ return NULL;
++ return curve;
++ }
++ for (ve = ve_list_head, l = *pos;
++ ve != NULL && l > 0;
++ ve = ve->next, l--);
++ return ve;
++}
++
++static void *ve_seq_next(struct seq_file *m, void *v, loff_t *pos)
++{
++ struct ve_struct *ve = (struct ve_struct *)v;
++
++ if (!ve_is_super(get_exec_env()))
++ return NULL;
++ (*pos)++;
++ return ve->next;
++}
++
++static void ve_seq_stop(struct seq_file *m, void *v)
++{
++ read_unlock(&ve_list_guard);
++}
++
++
++static struct seq_operations veinfo_seq_op = {
++ start: ve_seq_start,
++ next: ve_seq_next,
++ stop: ve_seq_stop,
++ show: veinfo_seq_show
++};
++
++static int veinfo_open(struct inode *inode, struct file *file)
++{
++ return seq_open(file, &veinfo_seq_op);
++}
++
++static struct file_operations proc_veinfo_operations = {
++ open: veinfo_open,
++ read: seq_read,
++ llseek: seq_lseek,
++ release: seq_release
++};
++
++static void *veip_seq_start(struct seq_file *m, loff_t *pos)
++{
++ loff_t l;
++ struct list_head *p;
++ int i;
++
++ l = *pos;
++ write_lock_irq(&veip_hash_lock);
++ if (l == 0)
++ return ip_entry_hash_table;
++ for (i = 0; i < VEIP_HASH_SZ; i++) {
++ list_for_each(p, ip_entry_hash_table + i) {
++ if (--l == 0)
++ return p;
++ }
++ }
++ return NULL;
++}
++
++static void *veip_seq_next(struct seq_file *m, void *v, loff_t *pos)
++{
++ struct list_head *p;
++
++ p = (struct list_head *)v;
++ while (1) {
++ p = p->next;
++ if (p < ip_entry_hash_table ||
++ p >= ip_entry_hash_table + VEIP_HASH_SZ) {
++ (*pos)++;
++ return p;
++ }
++ if (++p >= ip_entry_hash_table + VEIP_HASH_SZ)
++ return NULL;
++ }
++ return NULL;
++}
++
++static void veip_seq_stop(struct seq_file *m, void *v)
++{
++ write_unlock_irq(&veip_hash_lock);
++}
++
++static struct seq_operations veip_seq_op = {
++ start: veip_seq_start,
++ next: veip_seq_next,
++ stop: veip_seq_stop,
++ show: veip_seq_show
++};
++
++static int veip_open(struct inode *inode, struct file *file)
++{
++ return seq_open(file, &veip_seq_op);
++}
++
++static struct file_operations proc_veip_operations = {
++ open: veip_open,
++ read: seq_read,
++ llseek: seq_lseek,
++ release: seq_release
++};
++#endif
++
++int real_ve_ip_map(envid_t veid, int op, struct sockaddr *uservaddr, int addrlen)
++{
++ int err;
++ union {
++ struct sockaddr g;
++ struct sockaddr_in a4;
++ struct sockaddr_in6 a6;
++ } addr;
++ struct ve_struct *ve;
++
++ err = -EPERM;
++ if (!capable(CAP_SETVEID))
++ goto out;
++
++ err = -EINVAL;
++ if (addrlen > sizeof(addr) || addrlen < sizeof(struct sockaddr_in))
++ goto out;
++
++ err = move_addr_to_kernel(uservaddr, addrlen, &addr);
++ if (err < 0)
++ goto out;
++
++ err = -EINVAL;
++ if (addr.g.sa_family == AF_INET) {
++ if (addrlen != sizeof(struct sockaddr_in))
++ goto out;
++ } else if (addr.g.sa_family == AF_INET6) {
++ if (addrlen != sizeof(struct sockaddr_in6))
++ goto out;
++ } else {
++ err = -EAFNOSUPPORT;
++ goto out;
++ }
++
++ switch (op)
++ {
++ case VE_IP_ADD:
++ ve = get_ve_by_id(veid);
++ err = -ESRCH;
++ if (!ve)
++ goto out;
++
++ down_read(&ve->op_sem);
++ if (ve->is_running)
++ err = veip_entry_add(ve, &addr.g);
++ up_read(&ve->op_sem);
++ put_ve(ve);
++ break;
++
++ case VE_IP_DEL:
++ err = veip_entry_del(veid, &addr.g);
++ break;
++ default:
++ err = -EINVAL;
++ }
++
++out:
++ return err;
++}
++
++int venet_ioctl(struct inode *ino, struct file *file, unsigned int cmd,
++ unsigned long arg)
++{
++ int err;
++
++ err = -ENOTTY;
++ switch(cmd) {
++ case VENETCTL_VE_IP_MAP: {
++ struct vzctl_ve_ip_map s;
++ err = -EFAULT;
++ if (copy_from_user(&s, (void *)arg, sizeof(s)))
++ break;
++ err = real_ve_ip_map(s.veid, s.op, s.addr, s.addrlen);
++ }
++ break;
++ }
++ return err;
++}
++
++static struct vzioctlinfo venetcalls = {
++ type: VENETCTLTYPE,
++ func: venet_ioctl,
++ owner: THIS_MODULE,
++};
++
++int venet_dev_start(struct ve_struct *env)
++{
++ struct net_device *dev_venet;
++ int err;
++
++ dev_venet = alloc_netdev(0, "venet%d", venet_setup);
++ if (!dev_venet)
++ return -ENOMEM;
++ err = dev_alloc_name(dev_venet, dev_venet->name);
++ if (err<0)
++ goto err;
++ if ((err = register_netdev(dev_venet)) != 0)
++ goto err;
++ env->_venet_dev = dev_venet;
++ return 0;
++err:
++ free_netdev(dev_venet);
++ printk(KERN_ERR "VENET initialization error err=%d\n", err);
++ return err;
++}
++
++static int venet_start(unsigned int hooknum, void *data)
++{
++ struct ve_struct *env;
++ int err;
++
++ env = (struct ve_struct *)data;
++ if (env->veip)
++ return -EEXIST;
++ if (!ve_is_super(env) && !try_module_get(THIS_MODULE))
++ return 0;
++
++ err = veip_start(env);
++ if (err)
++ goto err;
++
++ err = venet_dev_start(env);
++ if (err)
++ goto err_free;
++ return 0;
++
++err_free:
++ veip_stop(env);
++err:
++ if (!ve_is_super(env))
++ module_put(THIS_MODULE);
++ return err;
++}
++
++static int venet_stop(unsigned int hooknum, void *data)
++{
++ struct ve_struct *env;
++
++ env = (struct ve_struct *)data;
++ veip_stop(env);
++ if (!ve_is_super(env))
++ module_put(THIS_MODULE);
++ return 0;
++}
++
++#define VE_HOOK_PRI_NET 0
++
++static struct ve_hook venet_ve_hook_init = {
++ hook: venet_start,
++ undo: venet_stop,
++ hooknum: VE_HOOK_INIT,
++ priority: VE_HOOK_PRI_NET
++};
++
++static struct ve_hook venet_ve_hook_fini = {
++ hook: venet_stop,
++ hooknum: VE_HOOK_FINI,
++ priority: VE_HOOK_PRI_NET
++};
++
++__init int venet_init(void)
++{
++#ifdef CONFIG_PROC_FS
++ struct proc_dir_entry *de;
++#endif
++ int i, err;
++
++ if (get_ve0()->_venet_dev != NULL)
++ return -EEXIST;
++
++ for (i = 0; i < VEIP_HASH_SZ; i++)
++ INIT_LIST_HEAD(ip_entry_hash_table + i);
++
++ err = venet_start(VE_HOOK_INIT, (void *)get_ve0());
++ if (err)
++ return err;
++
++#ifdef CONFIG_PROC_FS
++ de = create_proc_glob_entry("vz/veinfo",
++ S_IFREG|S_IRUSR, NULL);
++ if (de)
++ de->proc_fops = &proc_veinfo_operations;
++ else
++ printk(KERN_WARNING "venet: can't make veinfo proc entry\n");
++
++ de = create_proc_entry("vz/veip", S_IFREG|S_IRUSR, NULL);
++ if (de)
++ de->proc_fops = &proc_veip_operations;
++ else
++ printk(KERN_WARNING "venet: can't make veip proc entry\n");
++#endif
++
++ ve_hook_register(&venet_ve_hook_init);
++ ve_hook_register(&venet_ve_hook_fini);
++ vzioctl_register(&venetcalls);
++ return 0;
++}
++
++__exit void venet_exit(void)
++{
++ struct net_device *dev_venet;
++
++ vzioctl_unregister(&venetcalls);
++ ve_hook_unregister(&venet_ve_hook_fini);
++ ve_hook_unregister(&venet_ve_hook_init);
++#ifdef CONFIG_PROC_FS
++ remove_proc_entry("vz/veip", NULL);
++ remove_proc_entry("vz/veinfo", NULL);
++#endif
++
++ dev_venet = get_ve0()->_venet_dev;
++ if (dev_venet != NULL) {
++ get_ve0()->_venet_dev = NULL;
++ unregister_netdev(dev_venet);
++ free_netdev(dev_venet);
++ }
++ veip_stop(get_ve0());
++}
++
++module_init(venet_init);
++module_exit(venet_exit);
+diff -upr linux-2.6.16.orig/drivers/net/veth.c linux-2.6.16-026test015/drivers/net/veth.c
+--- linux-2.6.16.orig/drivers/net/veth.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/drivers/net/veth.c 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,582 @@
++/*
++ * veth.c
++ *
++ * Copyright (C) 2006 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++/*
++ * Virtual ethernet device used to change VE ownership on packets
++ */
++
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/interrupt.h>
++#include <linux/fs.h>
++#include <linux/types.h>
++#include <linux/string.h>
++#include <linux/socket.h>
++#include <linux/errno.h>
++#include <linux/fcntl.h>
++#include <linux/in.h>
++#include <linux/init.h>
++#include <linux/module.h>
++#include <linux/tcp.h>
++#include <linux/proc_fs.h>
++#include <linux/seq_file.h>
++
++#include <asm/system.h>
++#include <asm/uaccess.h>
++#include <asm/io.h>
++#include <asm/unistd.h>
++
++#include <linux/inet.h>
++#include <linux/netdevice.h>
++#include <linux/etherdevice.h>
++#include <net/ip.h>
++#include <linux/skbuff.h>
++#include <net/sock.h>
++#include <linux/if_ether.h> /* For the statistics structure. */
++#include <linux/if_arp.h> /* For ARPHRD_ETHER */
++#include <linux/ve_proto.h>
++#include <linux/vzctl.h>
++#include <linux/vzctl_veth.h>
++
++#include <linux/list.h>
++#include <linux/spinlock.h>
++#include <linux/vzcalluser.h>
++
++struct veth_struct
++{
++ struct net_device_stats stats;
++ struct net_device *pair;
++ struct list_head hwaddr_list;
++};
++
++struct list_head veth_hwaddr_list;
++rwlock_t ve_hwaddr_lock = RW_LOCK_UNLOCKED;
++DECLARE_MUTEX(hwaddr_sem);
++
++#define veth_from_netdev(dev) \
++ ((struct veth_struct *)(netdev_priv(dev)))
++#define veth_to_netdev(veth) \
++ ((struct net_device*)((char*)veth - \
++ (unsigned long)netdev_priv(NULL)))
++
++struct net_device * veth_dev_start(char *dev_addr, char *name);
++
++struct veth_struct *hwaddr_entry_lookup(char *name)
++{
++ struct veth_struct *entry;
++ struct list_head *tmp;
++
++ list_for_each(tmp, &veth_hwaddr_list) {
++ entry = list_entry(tmp, struct veth_struct, hwaddr_list);
++ BUG_ON(entry->pair == NULL);
++ if (strncmp(name, entry->pair->name, IFNAMSIZ) == 0)
++ return entry;
++ }
++ return NULL;
++}
++
++int veth_entry_add(struct ve_struct *ve, char *dev_addr, char *name,
++ char *dev_addr_ve, char *name_ve)
++{
++ struct net_device *dev_ve;
++ struct net_device *dev_ve0;
++ struct ve_struct *old_env;
++ char dev_name[IFNAMSIZ];
++ int err;
++
++ down(&hwaddr_sem);
++
++ if (name[0] == '\0')
++ snprintf(dev_name, sizeof(dev_name), "vz%d.%%d", ve->veid);
++ else {
++ memcpy(dev_name, name, IFNAMSIZ - 1);
++ dev_name[IFNAMSIZ - 1] = '\0';
++ }
++ dev_ve0 = veth_dev_start(dev_addr, dev_name);
++ if (IS_ERR(dev_ve0)) {
++ err = PTR_ERR(dev_ve0);
++ goto err;
++ }
++
++ old_env = set_exec_env(ve);
++ if (name_ve[0] == '\0')
++ sprintf(dev_name, "eth%%d");
++ else {
++ memcpy(dev_name, name_ve, IFNAMSIZ - 1);
++ dev_name[IFNAMSIZ - 1] = '\0';
++ }
++ dev_ve = veth_dev_start(dev_addr_ve, dev_name);
++ if (IS_ERR(dev_ve)) {
++ err = PTR_ERR(dev_ve);
++ goto err_ve;
++ }
++ set_exec_env(old_env);
++ veth_from_netdev(dev_ve)->pair = dev_ve0;
++ veth_from_netdev(dev_ve0)->pair = dev_ve;
++
++ write_lock(&ve_hwaddr_lock);
++ list_add(&(veth_from_netdev(dev_ve)->hwaddr_list), &veth_hwaddr_list);
++ write_unlock(&ve_hwaddr_lock);
++
++ up(&hwaddr_sem);
++ return 0;
++
++err_ve:
++ set_exec_env(old_env);
++ unregister_netdev(dev_ve0);
++err:
++ up(&hwaddr_sem);
++ return err;
++}
++
++int veth_entry_del(struct ve_struct *ve, char *name)
++{
++ struct veth_struct *found;
++ struct ve_struct *old_env;
++ struct net_device *dev;
++ int err;
++
++ err = -ENODEV;
++ down(&hwaddr_sem);
++ found = hwaddr_entry_lookup(name);
++ if (found == NULL)
++ goto out;
++ if (veth_to_netdev(found)->owner_env != ve)
++ goto out;
++
++ write_lock(&ve_hwaddr_lock);
++ list_del(&found->hwaddr_list);
++ write_unlock(&ve_hwaddr_lock);
++ err = 0;
++ dev = found->pair;
++ BUG_ON(found->pair == NULL);
++
++ old_env = get_exec_env();
++ set_exec_env(ve);
++ unregister_netdev(veth_to_netdev(found));
++ set_exec_env(old_env);
++
++ unregister_netdev(dev);
++
++out:
++ up(&hwaddr_sem);
++ return err;
++}
++
++/*
++ * Device functions
++ */
++
++static int veth_open(struct net_device *dev)
++{
++ return 0;
++}
++
++static int veth_close(struct net_device *master)
++{
++ return 0;
++}
++
++static void veth_destructor(struct net_device *dev)
++{
++ free_netdev(dev);
++}
++
++static struct net_device_stats *get_stats(struct net_device *dev)
++{
++ return &veth_from_netdev(dev)->stats;
++}
++
++/*
++ * The higher levels take care of making this non-reentrant (it's
++ * called with bh's disabled).
++ */
++static int veth_xmit(struct sk_buff *skb, struct net_device *dev)
++{
++ struct net_device_stats *stats = get_stats(dev);
++ struct net_device *rcv = NULL;
++ struct veth_struct *entry;
++ int length;
++
++ if (unlikely(get_exec_env()->disable_net))
++ goto outf;
++
++ skb_orphan(skb);
++
++ entry = veth_from_netdev(dev);
++ rcv = entry->pair;
++ if (!rcv)
++ /* VE going down */
++ goto outf;
++
++ if (unlikely(rcv->owner_env->disable_net))
++ goto outf;
++
++ skb->owner_env = rcv->owner_env;
++
++ if (!(rcv->flags & IFF_UP)) {
++ /* Target VE does not want to receive packets */
++ goto outf;
++ }
++
++ skb->dev = rcv;
++ skb->pkt_type = PACKET_HOST;
++ skb->protocol = eth_type_trans(skb, rcv);
++
++ dst_release(skb->dst);
++ skb->dst = NULL;
++#ifdef CONFIG_NETFILTER
++ nf_conntrack_put(skb->nfct);
++ skb->nfct = NULL;
++#ifdef CONFIG_NETFILTER_DEBUG
++ skb->nf_debug = 0;
++#endif
++#endif
++ length = skb->len;
++
++ netif_rx(skb);
++
++ stats->tx_bytes += length;
++ stats->tx_packets++;
++ if (rcv) {
++ struct net_device_stats *rcv_stats = get_stats(rcv);
++ rcv_stats->rx_bytes += length;
++ rcv_stats->rx_packets++;
++ }
++
++ return 0;
++
++outf:
++ kfree_skb(skb);
++ stats->tx_dropped++;
++ return 0;
++}
++
++int veth_init_dev(struct net_device *dev)
++{
++ dev->hard_start_xmit = veth_xmit;
++ dev->get_stats = get_stats;
++ dev->open = veth_open;
++ dev->stop = veth_close;
++ dev->destructor = veth_destructor;
++
++ ether_setup(dev);
++
++ dev->tx_queue_len = 0;
++ return 0;
++}
++
++static void veth_setup(struct net_device *dev)
++{
++ dev->init = veth_init_dev;
++ /*
++ * No other features are enabled, because:
++ * - checksumming is required, and nobody else will do our job
++ */
++ dev->features |= NETIF_F_VENET | NETIF_F_VIRTUAL;
++}
++
++#ifdef CONFIG_PROC_FS
++#define ADDR_FMT "%02x:%02x:%02x:%02x:%02x:%02x"
++#define ADDR(x) (x)[0],(x)[1],(x)[2],(x)[3],(x)[4],(x)[5]
++static int vehwaddr_seq_show(struct seq_file *m, void *v)
++{
++ struct list_head *p;
++ struct veth_struct *entry;
++
++ p = (struct list_head *)v;
++ if (p == &veth_hwaddr_list) {
++ seq_puts(m, "Version: 1.0\n");
++ return 0;
++ }
++ entry = list_entry(p, struct veth_struct, hwaddr_list);
++ seq_printf(m, ADDR_FMT " %16s ",
++ ADDR(entry->pair->dev_addr), entry->pair->name);
++ seq_printf(m, ADDR_FMT " %16s %10u\n",
++ ADDR(veth_to_netdev(entry)->dev_addr),
++ veth_to_netdev(entry)->name,
++ VEID(veth_to_netdev(entry)->owner_env));
++ return 0;
++}
++
++static void *vehwaddr_seq_start(struct seq_file *m, loff_t *pos)
++{
++ loff_t l;
++ struct list_head *p;
++
++ l = *pos;
++ read_lock(&ve_hwaddr_lock);
++ if (l == 0)
++ return &veth_hwaddr_list;
++ list_for_each(p, &veth_hwaddr_list) {
++ if (--l == 0)
++ return p;
++ }
++ return NULL;
++}
++
++static void *vehwaddr_seq_next(struct seq_file *m, void *v, loff_t *pos)
++{
++ struct list_head *p;
++
++ p = (struct list_head *)v;
++ (*pos)++;
++ return p->next == &veth_hwaddr_list ? NULL : p->next;
++}
++
++static void vehwaddr_seq_stop(struct seq_file *m, void *v)
++{
++ read_unlock(&ve_hwaddr_lock);
++}
++
++static struct seq_operations vehwaddr_seq_op = {
++ .start = vehwaddr_seq_start,
++ .next = vehwaddr_seq_next,
++ .stop = vehwaddr_seq_stop,
++ .show = vehwaddr_seq_show
++};
++
++static int vehwaddr_open(struct inode *inode, struct file *file)
++{
++ return seq_open(file, &vehwaddr_seq_op);
++}
++
++static struct file_operations proc_vehwaddr_operations = {
++ .open = vehwaddr_open,
++ .read = seq_read,
++ .llseek = seq_lseek,
++ .release = seq_release
++};
++#endif
++
++int real_ve_hwaddr(envid_t veid, int op,
++ unsigned char *dev_addr, int addrlen, char *name,
++ unsigned char *dev_addr_ve, int addrlen_ve, char *name_ve)
++{
++ int err;
++ struct ve_struct *ve;
++ char ve_addr[ETH_ALEN];
++
++ err = -EPERM;
++ if (!capable(CAP_NET_ADMIN))
++ goto out;
++
++ err = -EINVAL;
++ switch (op)
++ {
++ case VE_ETH_ADD:
++ if (addrlen != ETH_ALEN)
++ goto out;
++ if (addrlen_ve != ETH_ALEN && addrlen_ve != 0)
++ goto out;
++ /* If no VE address is given, derive it from dev_addr by setting bit 0x80 in byte 3 */
++ if (addrlen_ve == 0 && (dev_addr[3] & 0x80))
++ goto out;
++ if (addrlen_ve == 0) {
++ memcpy(ve_addr, dev_addr, ETH_ALEN);
++ ve_addr[3] |= 0x80;
++ } else {
++ memcpy(ve_addr, dev_addr_ve, ETH_ALEN);
++ }
++
++ ve = get_ve_by_id(veid);
++ err = -ESRCH;
++ if (!ve)
++ goto out;
++
++ down_read(&ve->op_sem);
++ if (ve->is_running)
++ err = veth_entry_add(ve, dev_addr, name,
++ ve_addr, name_ve);
++ up_read(&ve->op_sem);
++ put_ve(ve);
++ break;
++
++ case VE_ETH_DEL:
++ if (name[0] == '\0')
++ goto out;
++ ve = get_ve_by_id(veid);
++ err = -ESRCH;
++ if (!ve)
++ goto out;
++
++ down_read(&ve->op_sem);
++ if (ve->is_running)
++ err = veth_entry_del(ve, name);
++ up_read(&ve->op_sem);
++ put_ve(ve);
++ break;
++ }
++
++out:
++ return err;
++}
++
++int veth_ioctl(struct inode *ino, struct file *file, unsigned int cmd,
++ unsigned long arg)
++{
++ int err;
++
++ err = -ENOTTY;
++ switch(cmd) {
++ case VETHCTL_VE_HWADDR: {
++ struct vzctl_ve_hwaddr s;
++ err = -EFAULT;
++ if (copy_from_user(&s, (void *)arg, sizeof(s)))
++ break;
++ err = real_ve_hwaddr(s.veid, s.op,
++ s.dev_addr, s.addrlen, s.dev_name,
++ s.dev_addr_ve, s.addrlen_ve, s.dev_name_ve);
++ }
++ break;
++ }
++ return err;
++}
++
++static struct vzioctlinfo vethcalls = {
++ .type = VETHCTLTYPE,
++ .func = veth_ioctl,
++ .owner = THIS_MODULE,
++};
++
++struct net_device * veth_dev_start(char *dev_addr, char *name)
++{
++ struct net_device *dev;
++ int err;
++
++ dev = alloc_netdev(sizeof(struct veth_struct), name, veth_setup);
++ if (!dev)
++ return ERR_PTR(-ENOMEM);
++ if (strchr(dev->name, '%')) {
++ err = dev_alloc_name(dev, dev->name);
++ if (err < 0)
++ goto err;
++ }
++ if ((err = register_netdev(dev)) != 0)
++ goto err;
++
++ memcpy(dev->dev_addr, dev_addr, ETH_ALEN);
++ dev->addr_len = ETH_ALEN;
++
++ return dev;
++err:
++ free_netdev(dev);
++ printk(KERN_ERR "%s initialization error err=%d\n", name, err);
++ return ERR_PTR(err);
++}
++
++static int veth_stop(unsigned int hooknum, void *data)
++{
++ struct ve_struct *old_env;
++ struct ve_struct *env;
++ struct list_head *tmp, *n;
++
++ env = (struct ve_struct *)data;
++ down(&hwaddr_sem);
++ list_for_each_safe(tmp, n, &veth_hwaddr_list) {
++ struct veth_struct *entry;
++ struct net_device *dev;
++ entry = list_entry(tmp, struct veth_struct, hwaddr_list);
++ if (VEID(env) != VEID(veth_to_netdev(entry)->owner_env))
++ continue;
++
++ write_lock(&ve_hwaddr_lock);
++ list_del(&entry->hwaddr_list);
++ write_unlock(&ve_hwaddr_lock);
++
++ dev = entry->pair;
++ BUG_ON(entry->pair == NULL);
++ old_env = set_exec_env(env);
++ unregister_netdev(veth_to_netdev(entry));
++ set_exec_env(old_env);
++
++ old_env = set_exec_env(get_ve0());
++ unregister_netdev(dev);
++ set_exec_env(old_env);
++ }
++ up(&hwaddr_sem);
++ return 0;
++}
++
++#define VE_HOOK_PRI_NET 0
++
++static struct ve_hook veth_ve_hook_fini = {
++ .hook = veth_stop,
++ .hooknum = VE_HOOK_FINI,
++ .priority = VE_HOOK_PRI_NET,
++ .owner = THIS_MODULE,
++};
++
++__init int veth_init(void)
++{
++#ifdef CONFIG_PROC_FS
++ struct proc_dir_entry *de;
++#endif
++
++ INIT_LIST_HEAD(&veth_hwaddr_list);
++
++#ifdef CONFIG_PROC_FS
++ de = create_proc_glob_entry("vz/veth",
++ S_IFREG|S_IRUSR, NULL);
++ if (de)
++ de->proc_fops = &proc_vehwaddr_operations;
++ else
++ printk(KERN_WARNING "veth: can't make vehwaddr proc entry\n");
++
++#endif
++
++ ve_hook_register(&veth_ve_hook_fini);
++ vzioctl_register(&vethcalls);
++ return 0;
++}
++
++__exit void veth_exit(void)
++{
++ struct veth_struct *entry;
++ struct list_head *tmp, *n;
++ struct ve_struct *ve;
++ struct ve_struct *old_env;
++
++ vzioctl_unregister(&vethcalls);
++ ve_hook_unregister(&veth_ve_hook_fini);
++#ifdef CONFIG_PROC_FS
++ remove_proc_entry("vz/veth", NULL);
++#endif
++
++ down(&hwaddr_sem);
++ list_for_each_safe(tmp, n, &veth_hwaddr_list) {
++ struct net_device *dev;
++ entry = list_entry(tmp, struct veth_struct, hwaddr_list);
++ ve = get_ve(veth_to_netdev(entry)->owner_env);
++
++ write_lock(&ve_hwaddr_lock);
++ list_del(&entry->hwaddr_list);
++ write_unlock(&ve_hwaddr_lock);
++
++ dev = entry->pair;
++ BUG_ON(entry->pair == NULL);
++ old_env = set_exec_env(ve);
++ unregister_netdev(veth_to_netdev(entry));
++ set_exec_env(old_env);
++
++ unregister_netdev(dev);
++
++ put_ve(ve);
++ }
++ up(&hwaddr_sem);
++}
++
++module_init(veth_init);
++module_exit(veth_exit);
++
++MODULE_AUTHOR("Andrey Mirkin <amirkin@sw.ru>");
++MODULE_DESCRIPTION("Virtuozzo Virtual Ethernet Device");
++MODULE_LICENSE("GPL v2");
++
+diff -upr linux-2.6.16.orig/drivers/net/via-rhine.c linux-2.6.16-026test015/drivers/net/via-rhine.c
+--- linux-2.6.16.orig/drivers/net/via-rhine.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/net/via-rhine.c 2006-07-04 14:41:36.000000000 +0400
+@@ -129,6 +129,7 @@
+ - Massive clean-up
+ - Rewrite PHY, media handling (remove options, full_duplex, backoff)
+ - Fix Tx engine race for good
++ - Craig Brind: Zero padded aligned buffers for short packets.
+
+ */
+
+@@ -1306,7 +1307,12 @@ static int rhine_start_tx(struct sk_buff
+ rp->stats.tx_dropped++;
+ return 0;
+ }
++
++ /* Padding is not copied and so must be redone. */
+ skb_copy_and_csum_dev(skb, rp->tx_buf[entry]);
++ if (skb->len < ETH_ZLEN)
++ memset(rp->tx_buf[entry] + skb->len, 0,
++ ETH_ZLEN - skb->len);
+ rp->tx_skbuff_dma[entry] = 0;
+ rp->tx_ring[entry].addr = cpu_to_le32(rp->tx_bufs_dma +
+ (rp->tx_buf[entry] -
+diff -upr linux-2.6.16.orig/drivers/net/wireless/Kconfig linux-2.6.16-026test015/drivers/net/wireless/Kconfig
+--- linux-2.6.16.orig/drivers/net/wireless/Kconfig 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/net/wireless/Kconfig 2006-07-04 14:41:36.000000000 +0400
+@@ -239,7 +239,8 @@ config IPW2200_DEBUG
+
+ config AIRO
+ tristate "Cisco/Aironet 34X/35X/4500/4800 ISA and PCI cards"
+- depends on NET_RADIO && ISA_DMA_API && CRYPTO && (PCI || BROKEN)
++ depends on NET_RADIO && ISA_DMA_API && (PCI || BROKEN)
++ select CRYPTO
+ ---help---
+ This is the standard Linux driver to support Cisco/Aironet ISA and
+ PCI 802.11 wireless cards.
+@@ -374,6 +375,7 @@ config PCMCIA_HERMES
+ config PCMCIA_SPECTRUM
+ tristate "Symbol Spectrum24 Trilogy PCMCIA card support"
+ depends on NET_RADIO && PCMCIA && HERMES
++ select FW_LOADER
+ ---help---
+
+ This is a driver for 802.11b cards using RAM-loadable Symbol
+@@ -387,6 +389,7 @@ config PCMCIA_SPECTRUM
+ config AIRO_CS
+ tristate "Cisco/Aironet 34X/35X/4500/4800 PCMCIA cards"
+ depends on NET_RADIO && PCMCIA && (BROKEN || !M32R)
++ select CRYPTO
+ ---help---
+ This is the standard Linux driver to support Cisco/Aironet PCMCIA
+ 802.11 wireless cards. This driver is the same as the Aironet
+diff -upr linux-2.6.16.orig/drivers/net/wireless/hostap/hostap_80211_tx.c linux-2.6.16-026test015/drivers/net/wireless/hostap/hostap_80211_tx.c
+--- linux-2.6.16.orig/drivers/net/wireless/hostap/hostap_80211_tx.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/net/wireless/hostap/hostap_80211_tx.c 2006-07-04 14:41:36.000000000 +0400
+@@ -469,7 +469,7 @@ int hostap_master_start_xmit(struct sk_b
+ }
+
+ if (local->ieee_802_1x && meta->ethertype == ETH_P_PAE && tx.crypt &&
+- !(fc & IEEE80211_FCTL_VERS)) {
++ !(fc & IEEE80211_FCTL_PROTECTED)) {
+ no_encrypt = 1;
+ PDEBUG(DEBUG_EXTRA2, "%s: TX: IEEE 802.1X - passing "
+ "unencrypted EAPOL frame\n", dev->name);
+diff -upr linux-2.6.16.orig/drivers/net/wireless/ipw2200.c linux-2.6.16-026test015/drivers/net/wireless/ipw2200.c
+--- linux-2.6.16.orig/drivers/net/wireless/ipw2200.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/net/wireless/ipw2200.c 2006-07-04 14:41:36.000000000 +0400
+@@ -8391,20 +8391,28 @@ static int ipw_wx_get_range(struct net_d
+
+ i = 0;
+ if (priv->ieee->mode & (IEEE_B | IEEE_G)) {
+- for (j = 0; j < geo->bg_channels && i < IW_MAX_FREQUENCIES;
+- i++, j++) {
++ for (j = 0; j < geo->bg_channels && i < IW_MAX_FREQUENCIES; j++) {
++ if ((priv->ieee->iw_mode == IW_MODE_ADHOC) &&
++ (geo->bg[j].flags & IEEE80211_CH_PASSIVE_ONLY))
++ continue;
++
+ range->freq[i].i = geo->bg[j].channel;
+ range->freq[i].m = geo->bg[j].freq * 100000;
+ range->freq[i].e = 1;
++ i++;
+ }
+ }
+
+ if (priv->ieee->mode & IEEE_A) {
+- for (j = 0; j < geo->a_channels && i < IW_MAX_FREQUENCIES;
+- i++, j++) {
++ for (j = 0; j < geo->a_channels && i < IW_MAX_FREQUENCIES; j++) {
++ if ((priv->ieee->iw_mode == IW_MODE_ADHOC) &&
++ (geo->a[j].flags & IEEE80211_CH_PASSIVE_ONLY))
++ continue;
++
+ range->freq[i].i = geo->a[j].channel;
+ range->freq[i].m = geo->a[j].freq * 100000;
+ range->freq[i].e = 1;
++ i++;
+ }
+ }
+
+@@ -9956,9 +9964,8 @@ static int ipw_ethtool_set_eeprom(struct
+ return -EINVAL;
+ down(&p->sem);
+ memcpy(&p->eeprom[eeprom->offset], bytes, eeprom->len);
+- for (i = IPW_EEPROM_DATA;
+- i < IPW_EEPROM_DATA + IPW_EEPROM_IMAGE_SIZE; i++)
+- ipw_write8(p, i, p->eeprom[i]);
++ for (i = 0; i < IPW_EEPROM_IMAGE_SIZE; i++)
++ ipw_write8(p, i + IPW_EEPROM_DATA, p->eeprom[i]);
+ up(&p->sem);
+ return 0;
+ }
+diff -upr linux-2.6.16.orig/drivers/pci/pci-acpi.c linux-2.6.16-026test015/drivers/pci/pci-acpi.c
+--- linux-2.6.16.orig/drivers/pci/pci-acpi.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/pci/pci-acpi.c 2006-07-04 14:41:36.000000000 +0400
+@@ -33,13 +33,10 @@ acpi_query_osc (
+ acpi_status status;
+ struct acpi_object_list input;
+ union acpi_object in_params[4];
+- struct acpi_buffer output;
+- union acpi_object out_obj;
++ struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL};
++ union acpi_object *out_obj;
+ u32 osc_dw0;
+
+- /* Setting up output buffer */
+- output.length = sizeof(out_obj) + 3*sizeof(u32);
+- output.pointer = &out_obj;
+
+ /* Setting up input parameters */
+ input.count = 4;
+@@ -61,12 +58,15 @@ acpi_query_osc (
+ "Evaluate _OSC Set fails. Status = 0x%04x\n", status);
+ return status;
+ }
+- if (out_obj.type != ACPI_TYPE_BUFFER) {
++ out_obj = output.pointer;
++
++ if (out_obj->type != ACPI_TYPE_BUFFER) {
+ printk(KERN_DEBUG
+ "Evaluate _OSC returns wrong type\n");
+- return AE_TYPE;
++ status = AE_TYPE;
++ goto query_osc_out;
+ }
+- osc_dw0 = *((u32 *) out_obj.buffer.pointer);
++ osc_dw0 = *((u32 *) out_obj->buffer.pointer);
+ if (osc_dw0) {
+ if (osc_dw0 & OSC_REQUEST_ERROR)
+ printk(KERN_DEBUG "_OSC request fails\n");
+@@ -76,15 +76,21 @@ acpi_query_osc (
+ printk(KERN_DEBUG "_OSC invalid revision\n");
+ if (osc_dw0 & OSC_CAPABILITIES_MASK_ERROR) {
+ /* Update Global Control Set */
+- global_ctrlsets = *((u32 *)(out_obj.buffer.pointer+8));
+- return AE_OK;
++ global_ctrlsets = *((u32 *)(out_obj->buffer.pointer+8));
++ status = AE_OK;
++ goto query_osc_out;
+ }
+- return AE_ERROR;
++ status = AE_ERROR;
++ goto query_osc_out;
+ }
+
+ /* Update Global Control Set */
+- global_ctrlsets = *((u32 *)(out_obj.buffer.pointer + 8));
+- return AE_OK;
++ global_ctrlsets = *((u32 *)(out_obj->buffer.pointer + 8));
++ status = AE_OK;
++
++query_osc_out:
++ kfree(output.pointer);
++ return status;
+ }
+
+
+@@ -96,14 +102,10 @@ acpi_run_osc (
+ acpi_status status;
+ struct acpi_object_list input;
+ union acpi_object in_params[4];
+- struct acpi_buffer output;
+- union acpi_object out_obj;
++ struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL};
++ union acpi_object *out_obj;
+ u32 osc_dw0;
+
+- /* Setting up output buffer */
+- output.length = sizeof(out_obj) + 3*sizeof(u32);
+- output.pointer = &out_obj;
+-
+ /* Setting up input parameters */
+ input.count = 4;
+ input.pointer = in_params;
+@@ -124,12 +126,14 @@ acpi_run_osc (
+ "Evaluate _OSC Set fails. Status = 0x%04x\n", status);
+ return status;
+ }
+- if (out_obj.type != ACPI_TYPE_BUFFER) {
++ out_obj = output.pointer;
++ if (out_obj->type != ACPI_TYPE_BUFFER) {
+ printk(KERN_DEBUG
+ "Evaluate _OSC returns wrong type\n");
+- return AE_TYPE;
++ status = AE_TYPE;
++ goto run_osc_out;
+ }
+- osc_dw0 = *((u32 *) out_obj.buffer.pointer);
++ osc_dw0 = *((u32 *) out_obj->buffer.pointer);
+ if (osc_dw0) {
+ if (osc_dw0 & OSC_REQUEST_ERROR)
+ printk(KERN_DEBUG "_OSC request fails\n");
+@@ -139,11 +143,17 @@ acpi_run_osc (
+ printk(KERN_DEBUG "_OSC invalid revision\n");
+ if (osc_dw0 & OSC_CAPABILITIES_MASK_ERROR) {
+ printk(KERN_DEBUG "_OSC FW not grant req. control\n");
+- return AE_SUPPORT;
++ status = AE_SUPPORT;
++ goto run_osc_out;
+ }
+- return AE_ERROR;
++ status = AE_ERROR;
++ goto run_osc_out;
+ }
+- return AE_OK;
++ status = AE_OK;
++
++run_osc_out:
++ kfree(output.pointer);
++ return status;
+ }
+
+ /**
+diff -upr linux-2.6.16.orig/drivers/pci/probe.c linux-2.6.16-026test015/drivers/pci/probe.c
+--- linux-2.6.16.orig/drivers/pci/probe.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/pci/probe.c 2006-07-04 14:41:38.000000000 +0400
+@@ -21,6 +21,7 @@ LIST_HEAD(pci_root_buses);
+ EXPORT_SYMBOL(pci_root_buses);
+
+ LIST_HEAD(pci_devices);
++EXPORT_SYMBOL(pci_devices);
+
+ #ifdef HAVE_PCI_LEGACY
+ /**
+diff -upr linux-2.6.16.orig/drivers/pci/quirks.c linux-2.6.16-026test015/drivers/pci/quirks.c
+--- linux-2.6.16.orig/drivers/pci/quirks.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/pci/quirks.c 2006-07-04 14:41:36.000000000 +0400
+@@ -631,6 +631,9 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_V
+ * non-x86 architectures (yes Via exists on PPC among other places),
+ * we must mask the PCI_INTERRUPT_LINE value versus 0xf to get
+ * interrupts delivered properly.
++ *
++ * Some of the on-chip devices are actually '586 devices' so they are
++ * listed here.
+ */
+ static void quirk_via_irq(struct pci_dev *dev)
+ {
+@@ -639,13 +642,19 @@ static void quirk_via_irq(struct pci_dev
+ new_irq = dev->irq & 0xf;
+ pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &irq);
+ if (new_irq != irq) {
+- printk(KERN_INFO "PCI: Via IRQ fixup for %s, from %d to %d\n",
++ printk(KERN_INFO "PCI: VIA IRQ fixup for %s, from %d to %d\n",
+ pci_name(dev), irq, new_irq);
+ udelay(15); /* unknown if delay really needed */
+ pci_write_config_byte(dev, PCI_INTERRUPT_LINE, new_irq);
+ }
+ }
+-DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_VIA, PCI_ANY_ID, quirk_via_irq);
++DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_82C586_0, quirk_via_irq);
++DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_82C586_1, quirk_via_irq);
++DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_82C586_2, quirk_via_irq);
++DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_82C586_3, quirk_via_irq);
++DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_82C686, quirk_via_irq);
++DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_82C686_4, quirk_via_irq);
++DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_82C686_5, quirk_via_irq);
+
+ /*
+ * VIA VT82C598 has its device ID settable and many BIOSes
+@@ -861,6 +870,7 @@ static void __init quirk_eisa_bridge(str
+ }
+ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82375, quirk_eisa_bridge );
+
++#ifndef CONFIG_ACPI_SLEEP
+ /*
+ * On ASUS P4B boards, the SMBus PCI Device within the ICH2/4 southbridge
+ * is not activated. The myth is that Asus said that they do not want the
+@@ -872,8 +882,12 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_I
+ * bridge. Unfortunately, this device has no subvendor/subdevice ID. So it
+ * becomes necessary to do this tweak in two steps -- I've chosen the Host
+ * bridge as trigger.
++ *
++ * Actually, leaving it unhidden and not redoing the quirk over suspend2ram
++ * will cause thermal management to break down, causing the machine to
++ * overheat.
+ */
+-static int __initdata asus_hides_smbus = 0;
++static int __initdata asus_hides_smbus;
+
+ static void __init asus_hides_smbus_hostbridge(struct pci_dev *dev)
+ {
+@@ -1008,6 +1022,8 @@ static void __init asus_hides_smbus_lpc_
+ }
+ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_1, asus_hides_smbus_lpc_ich6 );
+
++#endif
++
+ /*
+ * SiS 96x south bridge: BIOS typically hides SMBus device...
+ */
+diff -upr linux-2.6.16.orig/drivers/pcmcia/ds.c linux-2.6.16-026test015/drivers/pcmcia/ds.c
+--- linux-2.6.16.orig/drivers/pcmcia/ds.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/pcmcia/ds.c 2006-07-04 14:41:36.000000000 +0400
+@@ -546,7 +546,7 @@ static int pcmcia_device_query(struct pc
+ tmp = vers1->str + vers1->ofs[i];
+
+ length = strlen(tmp) + 1;
+- if ((length < 3) || (length > 255))
++ if ((length < 2) || (length > 255))
+ continue;
+
+ p_dev->prod_id[i] = kmalloc(sizeof(char) * length,
+diff -upr linux-2.6.16.orig/drivers/s390/cio/cio.c linux-2.6.16-026test015/drivers/s390/cio/cio.c
+--- linux-2.6.16.orig/drivers/s390/cio/cio.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/s390/cio/cio.c 2006-07-04 14:41:38.000000000 +0400
+@@ -610,7 +610,11 @@ do_IRQ (struct pt_regs *regs)
+ struct tpi_info *tpi_info;
+ struct subchannel *sch;
+ struct irb *irb;
++ struct ve_struct *ve;
++ struct user_beancounter *ub;
+
++ ve = set_exec_env(get_ve0());
++ ub = set_exec_ub(get_ub0());
+ irq_enter ();
+ asm volatile ("mc 0,0");
+ if (S390_lowcore.int_clock >= S390_lowcore.jiffy_timer)
+@@ -657,6 +661,8 @@ do_IRQ (struct pt_regs *regs)
+ */
+ } while (!MACHINE_IS_VM && tpi (NULL) != 0);
+ irq_exit ();
++ (void)set_exec_ub(ub);
++ (void)set_exec_env(ve);
+ }
+
+ #ifdef CONFIG_CCW_CONSOLE
+diff -upr linux-2.6.16.orig/drivers/scsi/3w-9xxx.c linux-2.6.16-026test015/drivers/scsi/3w-9xxx.c
+--- linux-2.6.16.orig/drivers/scsi/3w-9xxx.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/scsi/3w-9xxx.c 2006-07-04 14:41:36.000000000 +0400
+@@ -85,7 +85,7 @@
+ #include "3w-9xxx.h"
+
+ /* Globals */
+-#define TW_DRIVER_VERSION "2.26.02.005"
++#define TW_DRIVER_VERSION "2.26.02.007"
+ static TW_Device_Extension *twa_device_extension_list[TW_MAX_SLOT];
+ static unsigned int twa_device_extension_count;
+ static int twa_major = -1;
+@@ -1944,9 +1944,13 @@ static void twa_scsiop_execute_scsi_comp
+ }
+ if (tw_dev->srb[request_id]->use_sg == 1) {
+ struct scatterlist *sg = (struct scatterlist *)tw_dev->srb[request_id]->request_buffer;
+- char *buf = kmap_atomic(sg->page, KM_IRQ0) + sg->offset;
++ char *buf;
++ unsigned long flags = 0;
++ local_irq_save(flags);
++ buf = kmap_atomic(sg->page, KM_IRQ0) + sg->offset;
+ memcpy(buf, tw_dev->generic_buffer_virt[request_id], sg->length);
+ kunmap_atomic(buf - sg->offset, KM_IRQ0);
++ local_irq_restore(flags);
+ }
+ }
+ } /* End twa_scsiop_execute_scsi_complete() */
+diff -upr linux-2.6.16.orig/drivers/scsi/3w-xxxx.c linux-2.6.16-026test015/drivers/scsi/3w-xxxx.c
+--- linux-2.6.16.orig/drivers/scsi/3w-xxxx.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/scsi/3w-xxxx.c 2006-07-04 14:41:36.000000000 +0400
+@@ -1508,10 +1508,12 @@ static void tw_transfer_internal(TW_Devi
+ struct scsi_cmnd *cmd = tw_dev->srb[request_id];
+ void *buf;
+ unsigned int transfer_len;
++ unsigned long flags = 0;
+
+ if (cmd->use_sg) {
+ struct scatterlist *sg =
+ (struct scatterlist *)cmd->request_buffer;
++ local_irq_save(flags);
+ buf = kmap_atomic(sg->page, KM_IRQ0) + sg->offset;
+ transfer_len = min(sg->length, len);
+ } else {
+@@ -1526,6 +1528,7 @@ static void tw_transfer_internal(TW_Devi
+
+ sg = (struct scatterlist *)cmd->request_buffer;
+ kunmap_atomic(buf - sg->offset, KM_IRQ0);
++ local_irq_restore(flags);
+ }
+ }
+
+diff -upr linux-2.6.16.orig/drivers/scsi/libata-core.c linux-2.6.16-026test015/drivers/scsi/libata-core.c
+--- linux-2.6.16.orig/drivers/scsi/libata-core.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/scsi/libata-core.c 2006-07-04 14:41:36.000000000 +0400
+@@ -4293,6 +4293,7 @@ static int ata_start_drive(struct ata_po
+ int ata_device_resume(struct ata_port *ap, struct ata_device *dev)
+ {
+ if (ap->flags & ATA_FLAG_SUSPENDED) {
++ ata_busy_wait(ap, ATA_BUSY | ATA_DRQ, 200000);
+ ap->flags &= ~ATA_FLAG_SUSPENDED;
+ ata_set_mode(ap);
+ }
+diff -upr linux-2.6.16.orig/drivers/scsi/sata_mv.c linux-2.6.16-026test015/drivers/scsi/sata_mv.c
+--- linux-2.6.16.orig/drivers/scsi/sata_mv.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/scsi/sata_mv.c 2006-07-04 14:41:36.000000000 +0400
+@@ -1102,6 +1102,7 @@ static u8 mv_get_crpb_status(struct ata_
+ void __iomem *port_mmio = mv_ap_base(ap);
+ struct mv_port_priv *pp = ap->private_data;
+ u32 out_ptr;
++ u8 ata_status;
+
+ out_ptr = readl(port_mmio + EDMA_RSP_Q_OUT_PTR_OFS);
+
+@@ -1109,6 +1110,8 @@ static u8 mv_get_crpb_status(struct ata_
+ assert(((out_ptr >> EDMA_RSP_Q_PTR_SHIFT) & MV_MAX_Q_DEPTH_MASK) ==
+ pp->rsp_consumer);
+
++ ata_status = pp->crpb[pp->rsp_consumer].flags >> CRPB_FLAG_STATUS_SHIFT;
++
+ /* increment our consumer index... */
+ pp->rsp_consumer = mv_inc_q_index(&pp->rsp_consumer);
+
+@@ -1123,7 +1126,7 @@ static u8 mv_get_crpb_status(struct ata_
+ writelfl(out_ptr, port_mmio + EDMA_RSP_Q_OUT_PTR_OFS);
+
+ /* Return ATA status register for completed CRPB */
+- return (pp->crpb[pp->rsp_consumer].flags >> CRPB_FLAG_STATUS_SHIFT);
++ return ata_status;
+ }
+
+ /**
+@@ -1192,7 +1195,6 @@ static void mv_host_intr(struct ata_host
+ u32 hc_irq_cause;
+ int shift, port, port0, hard_port, handled;
+ unsigned int err_mask;
+- u8 ata_status = 0;
+
+ if (hc == 0) {
+ port0 = 0;
+@@ -1210,6 +1212,7 @@ static void mv_host_intr(struct ata_host
+ hc,relevant,hc_irq_cause);
+
+ for (port = port0; port < port0 + MV_PORTS_PER_HC; port++) {
++ u8 ata_status = 0;
+ ap = host_set->ports[port];
+ hard_port = port & MV_PORT_MASK; /* range 0-3 */
+ handled = 0; /* ensure ata_status is set if handled++ */
+diff -upr linux-2.6.16.orig/drivers/scsi/scsi_lib.c linux-2.6.16-026test015/drivers/scsi/scsi_lib.c
+--- linux-2.6.16.orig/drivers/scsi/scsi_lib.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/scsi/scsi_lib.c 2006-07-04 14:41:36.000000000 +0400
+@@ -368,7 +368,7 @@ static int scsi_req_map_sg(struct reques
+ int nsegs, unsigned bufflen, gfp_t gfp)
+ {
+ struct request_queue *q = rq->q;
+- int nr_pages = (bufflen + PAGE_SIZE - 1) >> PAGE_SHIFT;
++ int nr_pages = (bufflen + sgl[0].offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ unsigned int data_len = 0, len, bytes, off;
+ struct page *page;
+ struct bio *bio = NULL;
+diff -upr linux-2.6.16.orig/drivers/sn/ioc3.c linux-2.6.16-026test015/drivers/sn/ioc3.c
+--- linux-2.6.16.orig/drivers/sn/ioc3.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/sn/ioc3.c 2006-07-04 14:41:36.000000000 +0400
+@@ -677,7 +677,7 @@ static int ioc3_probe(struct pci_dev *pd
+ /* Track PCI-device specific data */
+ pci_set_drvdata(pdev, idd);
+ down_write(&ioc3_devices_rwsem);
+- list_add(&idd->list, &ioc3_devices);
++ list_add_tail(&idd->list, &ioc3_devices);
+ idd->id = ioc3_counter++;
+ up_write(&ioc3_devices_rwsem);
+
+diff -upr linux-2.6.16.orig/drivers/sn/ioc4.c linux-2.6.16-026test015/drivers/sn/ioc4.c
+--- linux-2.6.16.orig/drivers/sn/ioc4.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/sn/ioc4.c 2006-07-04 14:41:36.000000000 +0400
+@@ -313,7 +313,7 @@ ioc4_probe(struct pci_dev *pdev, const s
+ idd->idd_serial_data = NULL;
+ pci_set_drvdata(idd->idd_pdev, idd);
+ down_write(&ioc4_devices_rwsem);
+- list_add(&idd->idd_list, &ioc4_devices);
++ list_add_tail(&idd->idd_list, &ioc4_devices);
+ up_write(&ioc4_devices_rwsem);
+
+ /* Add this IOC4 to all submodules */
+diff -upr linux-2.6.16.orig/drivers/usb/core/message.c linux-2.6.16-026test015/drivers/usb/core/message.c
+--- linux-2.6.16.orig/drivers/usb/core/message.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/usb/core/message.c 2006-07-04 14:41:36.000000000 +0400
+@@ -1388,11 +1388,13 @@ free_interfaces:
+ if (dev->state != USB_STATE_ADDRESS)
+ usb_disable_device (dev, 1); // Skip ep0
+
+- i = dev->bus_mA - cp->desc.bMaxPower * 2;
+- if (i < 0)
+- dev_warn(&dev->dev, "new config #%d exceeds power "
+- "limit by %dmA\n",
+- configuration, -i);
++ if (cp) {
++ i = dev->bus_mA - cp->desc.bMaxPower * 2;
++ if (i < 0)
++ dev_warn(&dev->dev, "new config #%d exceeds power "
++ "limit by %dmA\n",
++ configuration, -i);
++ }
+
+ if ((ret = usb_control_msg(dev, usb_sndctrlpipe(dev, 0),
+ USB_REQ_SET_CONFIGURATION, 0, configuration, 0,
+diff -upr linux-2.6.16.orig/drivers/usb/host/ehci-sched.c linux-2.6.16-026test015/drivers/usb/host/ehci-sched.c
+--- linux-2.6.16.orig/drivers/usb/host/ehci-sched.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/usb/host/ehci-sched.c 2006-07-04 14:41:36.000000000 +0400
+@@ -707,6 +707,7 @@ iso_stream_init (
+ } else {
+ u32 addr;
+ int think_time;
++ int hs_transfers;
+
+ addr = dev->ttport << 24;
+ if (!ehci_is_TDI(ehci)
+@@ -719,6 +720,7 @@ iso_stream_init (
+ think_time = dev->tt ? dev->tt->think_time : 0;
+ stream->tt_usecs = NS_TO_US (think_time + usb_calc_bus_time (
+ dev->speed, is_input, 1, maxp));
++ hs_transfers = max (1u, (maxp + 187) / 188);
+ if (is_input) {
+ u32 tmp;
+
+@@ -727,12 +729,11 @@ iso_stream_init (
+ stream->usecs = HS_USECS_ISO (1);
+ stream->raw_mask = 1;
+
+- /* pessimistic c-mask */
+- tmp = usb_calc_bus_time (USB_SPEED_FULL, 1, 0, maxp)
+- / (125 * 1000);
+- stream->raw_mask |= 3 << (tmp + 9);
++ /* c-mask as specified in USB 2.0 11.18.4 3.c */
++ tmp = (1 << (hs_transfers + 2)) - 1;
++ stream->raw_mask |= tmp << (8 + 2);
+ } else
+- stream->raw_mask = smask_out [maxp / 188];
++ stream->raw_mask = smask_out [hs_transfers - 1];
+ bandwidth = stream->usecs + stream->c_usecs;
+ bandwidth /= 1 << (interval + 2);
+
+diff -upr linux-2.6.16.orig/drivers/usb/serial/console.c linux-2.6.16-026test015/drivers/usb/serial/console.c
+--- linux-2.6.16.orig/drivers/usb/serial/console.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/usb/serial/console.c 2006-07-04 14:41:36.000000000 +0400
+@@ -54,7 +54,7 @@ static struct console usbcons;
+ * serial.c code, except that the specifier is "ttyUSB" instead
+ * of "ttyS".
+ */
+-static int __init usb_console_setup(struct console *co, char *options)
++static int usb_console_setup(struct console *co, char *options)
+ {
+ struct usbcons_info *info = &usbcons_info;
+ int baud = 9600;
+diff -upr linux-2.6.16.orig/drivers/usb/serial/option.c linux-2.6.16-026test015/drivers/usb/serial/option.c
+--- linux-2.6.16.orig/drivers/usb/serial/option.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/usb/serial/option.c 2006-07-04 14:41:36.000000000 +0400
+@@ -582,14 +582,14 @@ static void option_setup_urbs(struct usb
+ portdata = usb_get_serial_port_data(port);
+
+ /* Do indat endpoints first */
+- for (j = 0; j <= N_IN_URB; ++j) {
++ for (j = 0; j < N_IN_URB; ++j) {
+ portdata->in_urbs[j] = option_setup_urb (serial,
+ port->bulk_in_endpointAddress, USB_DIR_IN, port,
+ portdata->in_buffer[j], IN_BUFLEN, option_indat_callback);
+ }
+
+ /* outdat endpoints */
+- for (j = 0; j <= N_OUT_URB; ++j) {
++ for (j = 0; j < N_OUT_URB; ++j) {
+ portdata->out_urbs[j] = option_setup_urb (serial,
+ port->bulk_out_endpointAddress, USB_DIR_OUT, port,
+ portdata->out_buffer[j], OUT_BUFLEN, option_outdat_callback);
+diff -upr linux-2.6.16.orig/drivers/usb/serial/whiteheat.c linux-2.6.16-026test015/drivers/usb/serial/whiteheat.c
+--- linux-2.6.16.orig/drivers/usb/serial/whiteheat.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/usb/serial/whiteheat.c 2006-07-04 14:41:36.000000000 +0400
+@@ -388,7 +388,7 @@ static int whiteheat_attach (struct usb_
+ if (ret) {
+ err("%s: Couldn't send command [%d]", serial->type->description, ret);
+ goto no_firmware;
+- } else if (alen != sizeof(command)) {
++ } else if (alen != 2) {
+ err("%s: Send command incomplete [%d]", serial->type->description, alen);
+ goto no_firmware;
+ }
+@@ -400,7 +400,7 @@ static int whiteheat_attach (struct usb_
+ if (ret) {
+ err("%s: Couldn't get results [%d]", serial->type->description, ret);
+ goto no_firmware;
+- } else if (alen != sizeof(result)) {
++ } else if (alen != sizeof(*hw_info) + 1) {
+ err("%s: Get results incomplete [%d]", serial->type->description, alen);
+ goto no_firmware;
+ } else if (result[0] != command[0]) {
+diff -upr linux-2.6.16.orig/drivers/usb/storage/Kconfig linux-2.6.16-026test015/drivers/usb/storage/Kconfig
+--- linux-2.6.16.orig/drivers/usb/storage/Kconfig 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/usb/storage/Kconfig 2006-07-04 14:41:36.000000000 +0400
+@@ -48,7 +48,8 @@ config USB_STORAGE_FREECOM
+
+ config USB_STORAGE_ISD200
+ bool "ISD-200 USB/ATA Bridge support"
+- depends on USB_STORAGE && BLK_DEV_IDE
++ depends on USB_STORAGE
++ depends on BLK_DEV_IDE=y || BLK_DEV_IDE=USB_STORAGE
+ ---help---
+ Say Y here if you want to use USB Mass Store devices based
+ on the In-Systems Design ISD-200 USB/ATA bridge.
+diff -upr linux-2.6.16.orig/drivers/video/cfbimgblt.c linux-2.6.16-026test015/drivers/video/cfbimgblt.c
+--- linux-2.6.16.orig/drivers/video/cfbimgblt.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/video/cfbimgblt.c 2006-07-04 14:41:36.000000000 +0400
+@@ -169,7 +169,7 @@ static inline void slow_imageblit(const
+
+ while (j--) {
+ l--;
+- color = (*s & 1 << (FB_BIT_NR(l))) ? fgcolor : bgcolor;
++ color = (*s & (1 << l)) ? fgcolor : bgcolor;
+ val |= FB_SHIFT_HIGH(color, shift);
+
+ /* Did the bitshift spill bits to the next long? */
+diff -upr linux-2.6.16.orig/drivers/video/fbmem.c linux-2.6.16-026test015/drivers/video/fbmem.c
+--- linux-2.6.16.orig/drivers/video/fbmem.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/video/fbmem.c 2006-07-04 14:41:36.000000000 +0400
+@@ -669,13 +669,19 @@ fb_write(struct file *file, const char _
+ total_size = info->fix.smem_len;
+
+ if (p > total_size)
+- return 0;
++ return -EFBIG;
+
+- if (count >= total_size)
++ if (count > total_size) {
++ err = -EFBIG;
+ count = total_size;
++ }
++
++ if (count + p > total_size) {
++ if (!err)
++ err = -ENOSPC;
+
+- if (count + p > total_size)
+ count = total_size - p;
++ }
+
+ buffer = kmalloc((count > PAGE_SIZE) ? PAGE_SIZE : count,
+ GFP_KERNEL);
+@@ -717,7 +723,7 @@ fb_write(struct file *file, const char _
+
+ kfree(buffer);
+
+- return (err) ? err : cnt;
++ return (cnt) ? cnt : err;
+ }
+
+ #ifdef CONFIG_KMOD
+diff -upr linux-2.6.16.orig/drivers/video/i810/i810_main.c linux-2.6.16-026test015/drivers/video/i810/i810_main.c
+--- linux-2.6.16.orig/drivers/video/i810/i810_main.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/drivers/video/i810/i810_main.c 2006-07-04 14:41:36.000000000 +0400
+@@ -1508,7 +1508,7 @@ static int i810fb_cursor(struct fb_info
+ int size = ((cursor->image.width + 7) >> 3) *
+ cursor->image.height;
+ int i;
+- u8 *data = kmalloc(64 * 8, GFP_KERNEL);
++ u8 *data = kmalloc(64 * 8, GFP_ATOMIC);
+
+ if (data == NULL)
+ return -ENOMEM;
+diff -upr linux-2.6.16.orig/fs/9p/vfs_inode.c linux-2.6.16-026test015/fs/9p/vfs_inode.c
+--- linux-2.6.16.orig/fs/9p/vfs_inode.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/9p/vfs_inode.c 2006-07-04 14:41:36.000000000 +0400
+@@ -614,6 +614,7 @@ static struct dentry *v9fs_vfs_lookup(st
+
+ sb = dir->i_sb;
+ v9ses = v9fs_inode2v9ses(dir);
++ dentry->d_op = &v9fs_dentry_operations;
+ dirfid = v9fs_fid_lookup(dentry->d_parent);
+
+ if (!dirfid) {
+@@ -681,8 +682,6 @@ static struct dentry *v9fs_vfs_lookup(st
+ goto FreeFcall;
+
+ fid->qid = fcall->params.rstat.stat.qid;
+-
+- dentry->d_op = &v9fs_dentry_operations;
+ v9fs_stat2inode(&fcall->params.rstat.stat, inode, inode->i_sb);
+
+ d_add(dentry, inode);
+diff -upr linux-2.6.16.orig/fs/Kconfig linux-2.6.16-026test015/fs/Kconfig
+--- linux-2.6.16.orig/fs/Kconfig 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/Kconfig 2006-07-04 14:41:39.000000000 +0400
+@@ -418,6 +418,15 @@ config QUOTA
+ with the quota tools. Probably the quota support is only useful for
+ multi user systems. If unsure, say N.
+
++config QUOTA_COMPAT
++ bool "Compatibility with older quotactl interface"
++ depends on QUOTA
++ help
++ This option enables a compatibility layer for an older version
++ of the quotactl interface with byte granularity (QUOTAON at 0x0100,
++ GETQUOTA at 0x0D00). Interface versions older than that one and
++ with block granularity are still not supported.
++
+ config QFMT_V1
+ tristate "Old quota format support"
+ depends on QUOTA
+@@ -433,6 +442,38 @@ config QFMT_V2
+ This quota format allows using quotas with 32-bit UIDs/GIDs. If you
+ need this functionality say Y here.
+
++config SIM_FS
++ tristate "VPS filesystem"
++ depends on VZ_QUOTA
++ default m
++ help
++ This file system is a part of Virtuozzo. It introduces a fake
++ superblock and blockdev to a VE to hide the real device and show
++ statfs results taken from quota.
++
++config VZ_QUOTA
++ tristate "Virtuozzo Disk Quota support"
++ depends on QUOTA
++ default m
++ help
++ Virtuozzo Disk Quota imposes disk quota on directories with their
++ files and subdirectories in total. Such disk quota is used to
++ account and limit disk usage by Virtuozzo VPS, but also may be used
++ separately.
++
++config VZ_QUOTA_UNLOAD
++ bool "Unloadable Virtuozzo Disk Quota module"
++ depends on VZ_QUOTA=m
++ default n
++ help
++ Make Virtuozzo Disk Quota module unloadable.
++ Doesn't work reliably now.
++
++config VZ_QUOTA_UGID
++ bool "Per-user and per-group quota in Virtuozzo quota partitions"
++ depends on VZ_QUOTA!=n
++ default y
++
+ config QUOTACTL
+ bool
+ depends on XFS_QUOTA || QUOTA
+diff -upr linux-2.6.16.orig/fs/Makefile linux-2.6.16-026test015/fs/Makefile
+--- linux-2.6.16.orig/fs/Makefile 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/Makefile 2006-07-04 14:41:39.000000000 +0400
+@@ -39,9 +39,15 @@ obj-$(CONFIG_QUOTA) += dquot.o
+ obj-$(CONFIG_QFMT_V1) += quota_v1.o
+ obj-$(CONFIG_QFMT_V2) += quota_v2.o
+ obj-$(CONFIG_QUOTACTL) += quota.o
++obj-$(CONFIG_VZ_QUOTA) += vzdquota.o
++vzdquota-y += vzdquot.o vzdq_mgmt.o vzdq_ops.o vzdq_tree.o
++vzdquota-$(CONFIG_VZ_QUOTA_UGID) += vzdq_ugid.o
++vzdquota-$(CONFIG_VZ_QUOTA_UGID) += vzdq_file.o
+
+ obj-$(CONFIG_DNOTIFY) += dnotify.o
+
++obj-$(CONFIG_SIM_FS) += simfs.o
++
+ obj-$(CONFIG_PROC_FS) += proc/
+ obj-y += partitions/
+ obj-$(CONFIG_SYSFS) += sysfs/
+diff -upr linux-2.6.16.orig/fs/aio.c linux-2.6.16-026test015/fs/aio.c
+--- linux-2.6.16.orig/fs/aio.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/aio.c 2006-07-04 14:41:39.000000000 +0400
+@@ -41,13 +41,16 @@
+ #endif
+
+ /*------ sysctl variables----*/
+-static DEFINE_SPINLOCK(aio_nr_lock);
++DEFINE_SPINLOCK(aio_nr_lock);
+ unsigned long aio_nr; /* current system wide number of aio requests */
+ unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */
++EXPORT_SYMBOL_GPL(aio_nr_lock);
++EXPORT_SYMBOL_GPL(aio_nr);
+ /*----end sysctl variables---*/
+
+ static kmem_cache_t *kiocb_cachep;
+-static kmem_cache_t *kioctx_cachep;
++kmem_cache_t *kioctx_cachep;
++EXPORT_SYMBOL_GPL(kioctx_cachep);
+
+ static struct workqueue_struct *aio_wq;
+
+@@ -58,7 +61,7 @@ static DECLARE_WORK(fput_work, aio_fput_
+ static DEFINE_SPINLOCK(fput_lock);
+ static LIST_HEAD(fput_head);
+
+-static void aio_kick_handler(void *);
++void aio_kick_handler(void *);
+ static void aio_queue_work(struct kioctx *);
+
+ /* aio_setup
+@@ -293,7 +296,7 @@ static void aio_cancel_all(struct kioctx
+ spin_unlock_irq(&ctx->ctx_lock);
+ }
+
+-static void wait_for_all_aios(struct kioctx *ctx)
++void wait_for_all_aios(struct kioctx *ctx)
+ {
+ struct task_struct *tsk = current;
+ DECLARE_WAITQUEUE(wait, tsk);
+@@ -310,6 +313,7 @@ static void wait_for_all_aios(struct kio
+ __set_task_state(tsk, TASK_RUNNING);
+ remove_wait_queue(&ctx->wait, &wait);
+ }
++EXPORT_SYMBOL_GPL(wait_for_all_aios);
+
+ /* wait_on_sync_kiocb:
+ * Waits on the given sync kiocb to complete.
+@@ -856,7 +860,7 @@ static inline void aio_run_all_iocbs(str
+ * space.
+ * Run on aiod's context.
+ */
+-static void aio_kick_handler(void *data)
++void aio_kick_handler(void *data)
+ {
+ struct kioctx *ctx = data;
+ mm_segment_t oldfs = get_fs();
+@@ -875,6 +879,7 @@ static void aio_kick_handler(void *data)
+ if (requeue)
+ queue_work(aio_wq, &ctx->wq);
+ }
++EXPORT_SYMBOL_GPL(aio_kick_handler);
+
+
+ /*
+diff -upr linux-2.6.16.orig/fs/autofs/autofs_i.h linux-2.6.16-026test015/fs/autofs/autofs_i.h
+--- linux-2.6.16.orig/fs/autofs/autofs_i.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/autofs/autofs_i.h 2006-07-04 14:41:38.000000000 +0400
+@@ -124,7 +124,7 @@ static inline struct autofs_sb_info *aut
+ filesystem without "magic".) */
+
+ static inline int autofs_oz_mode(struct autofs_sb_info *sbi) {
+- return sbi->catatonic || process_group(current) == sbi->oz_pgrp;
++ return sbi->catatonic || virt_pgid(current) == sbi->oz_pgrp;
+ }
+
+ /* Hash operations */
+diff -upr linux-2.6.16.orig/fs/autofs/init.c linux-2.6.16-026test015/fs/autofs/init.c
+--- linux-2.6.16.orig/fs/autofs/init.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/autofs/init.c 2006-07-04 14:41:38.000000000 +0400
+@@ -25,6 +25,7 @@ static struct file_system_type autofs_fs
+ .name = "autofs",
+ .get_sb = autofs_get_sb,
+ .kill_sb = kill_anon_super,
++ .fs_flags = FS_VIRTUALIZED,
+ };
+
+ static int __init init_autofs_fs(void)
+diff -upr linux-2.6.16.orig/fs/autofs/inode.c linux-2.6.16-026test015/fs/autofs/inode.c
+--- linux-2.6.16.orig/fs/autofs/inode.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/autofs/inode.c 2006-07-04 14:41:38.000000000 +0400
+@@ -66,7 +66,7 @@ static int parse_options(char *options,
+
+ *uid = current->uid;
+ *gid = current->gid;
+- *pgrp = process_group(current);
++ *pgrp = virt_pgid(current);
+
+ *minproto = *maxproto = AUTOFS_PROTO_VERSION;
+
+@@ -138,7 +138,7 @@ int autofs_fill_super(struct super_block
+ sbi->magic = AUTOFS_SBI_MAGIC;
+ sbi->catatonic = 0;
+ sbi->exp_timeout = 0;
+- sbi->oz_pgrp = process_group(current);
++ sbi->oz_pgrp = virt_pgid(current);
+ autofs_initialize_hash(&sbi->dirhash);
+ sbi->queues = NULL;
+ memset(sbi->symlink_bitmap, 0, sizeof(long)*AUTOFS_SYMLINK_BITMAP_LEN);
+diff -upr linux-2.6.16.orig/fs/autofs/root.c linux-2.6.16-026test015/fs/autofs/root.c
+--- linux-2.6.16.orig/fs/autofs/root.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/autofs/root.c 2006-07-04 14:41:38.000000000 +0400
+@@ -354,7 +354,7 @@ static int autofs_root_unlink(struct ino
+
+ /* This allows root to remove symlinks */
+ lock_kernel();
+- if ( !autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) ) {
++ if ( !autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_SYS_ADMIN) ) {
+ unlock_kernel();
+ return -EACCES;
+ }
+@@ -541,7 +541,7 @@ static int autofs_root_ioctl(struct inod
+ _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT )
+ return -ENOTTY;
+
+- if ( !autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) )
++ if ( !autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_SYS_ADMIN) )
+ return -EPERM;
+
+ switch(cmd) {
+diff -upr linux-2.6.16.orig/fs/autofs4/autofs_i.h linux-2.6.16-026test015/fs/autofs4/autofs_i.h
+--- linux-2.6.16.orig/fs/autofs4/autofs_i.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/autofs4/autofs_i.h 2006-07-04 14:41:38.000000000 +0400
+@@ -122,7 +122,7 @@ static inline struct autofs_info *autofs
+ filesystem without "magic".) */
+
+ static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) {
+- return sbi->catatonic || process_group(current) == sbi->oz_pgrp;
++ return sbi->catatonic || virt_pgid(current) == sbi->oz_pgrp;
+ }
+
+ /* Does a dentry have some pending activity? */
+diff -upr linux-2.6.16.orig/fs/autofs4/init.c linux-2.6.16-026test015/fs/autofs4/init.c
+--- linux-2.6.16.orig/fs/autofs4/init.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/autofs4/init.c 2006-07-04 14:41:38.000000000 +0400
+@@ -25,6 +25,7 @@ static struct file_system_type autofs_fs
+ .name = "autofs",
+ .get_sb = autofs_get_sb,
+ .kill_sb = kill_anon_super,
++ .fs_flags = FS_VIRTUALIZED,
+ };
+
+ static int __init init_autofs4_fs(void)
+diff -upr linux-2.6.16.orig/fs/autofs4/inode.c linux-2.6.16-026test015/fs/autofs4/inode.c
+--- linux-2.6.16.orig/fs/autofs4/inode.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/autofs4/inode.c 2006-07-04 14:41:38.000000000 +0400
+@@ -179,7 +179,7 @@ static int parse_options(char *options,
+
+ *uid = current->uid;
+ *gid = current->gid;
+- *pgrp = process_group(current);
++ *pgrp = virt_pgid(current);
+
+ *minproto = AUTOFS_MIN_PROTO_VERSION;
+ *maxproto = AUTOFS_MAX_PROTO_VERSION;
+@@ -265,7 +265,7 @@ int autofs4_fill_super(struct super_bloc
+ sbi->root = NULL;
+ sbi->catatonic = 0;
+ sbi->exp_timeout = 0;
+- sbi->oz_pgrp = process_group(current);
++ sbi->oz_pgrp = virt_pgid(current);
+ sbi->sb = s;
+ sbi->version = 0;
+ sbi->sub_version = 0;
+diff -upr linux-2.6.16.orig/fs/autofs4/root.c linux-2.6.16-026test015/fs/autofs4/root.c
+--- linux-2.6.16.orig/fs/autofs4/root.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/autofs4/root.c 2006-07-04 14:41:38.000000000 +0400
+@@ -592,7 +592,7 @@ static int autofs4_dir_unlink(struct ino
+ struct autofs_info *ino = autofs4_dentry_ino(dentry);
+
+ /* This allows root to remove symlinks */
+- if ( !autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) )
++ if ( !autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_SYS_ADMIN) )
+ return -EACCES;
+
+ dput(ino->dentry);
+@@ -784,7 +784,7 @@ static int autofs4_root_ioctl(struct ino
+ _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT )
+ return -ENOTTY;
+
+- if ( !autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) )
++ if ( !autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_SYS_ADMIN) )
+ return -EPERM;
+
+ switch(cmd) {
+diff -upr linux-2.6.16.orig/fs/binfmt_aout.c linux-2.6.16-026test015/fs/binfmt_aout.c
+--- linux-2.6.16.orig/fs/binfmt_aout.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/binfmt_aout.c 2006-07-04 14:41:39.000000000 +0400
+@@ -446,9 +446,11 @@ beyond_if:
+ #endif
+ start_thread(regs, ex.a_entry, current->mm->start_stack);
+ if (unlikely(current->ptrace & PT_PTRACED)) {
+- if (current->ptrace & PT_TRACE_EXEC)
++ if (current->ptrace & PT_TRACE_EXEC) {
++ set_pn_state(current, PN_STOP_EXEC);
+ ptrace_notify ((PTRACE_EVENT_EXEC << 8) | SIGTRAP);
+- else
++ clear_pn_state(current);
++ } else
+ send_sig(SIGTRAP, current, 0);
+ }
+ return 0;
+diff -upr linux-2.6.16.orig/fs/binfmt_elf.c linux-2.6.16-026test015/fs/binfmt_elf.c
+--- linux-2.6.16.orig/fs/binfmt_elf.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/binfmt_elf.c 2006-07-04 14:41:39.000000000 +0400
+@@ -361,7 +361,7 @@ static unsigned long load_elf_interp(str
+ eppnt = elf_phdata;
+ for (i=0; i<interp_elf_ex->e_phnum; i++, eppnt++) {
+ if (eppnt->p_type == PT_LOAD) {
+- int elf_type = MAP_PRIVATE | MAP_DENYWRITE;
++ int elf_type = MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECPRIO;
+ int elf_prot = 0;
+ unsigned long vaddr = 0;
+ unsigned long k, map_addr;
+@@ -669,7 +669,7 @@ static int load_elf_binary(struct linux_
+ */
+ SET_PERSONALITY(loc->elf_ex, ibcs2_interpreter);
+
+- interpreter = open_exec(elf_interpreter);
++ interpreter = open_exec(elf_interpreter, NULL);
+ retval = PTR_ERR(interpreter);
+ if (IS_ERR(interpreter))
+ goto out_free_interp;
+@@ -834,7 +834,7 @@ static int load_elf_binary(struct linux_
+ if (elf_ppnt->p_flags & PF_W) elf_prot |= PROT_WRITE;
+ if (elf_ppnt->p_flags & PF_X) elf_prot |= PROT_EXEC;
+
+- elf_flags = MAP_PRIVATE|MAP_DENYWRITE|MAP_EXECUTABLE;
++ elf_flags = MAP_PRIVATE|MAP_DENYWRITE|MAP_EXECUTABLE|MAP_EXECPRIO;
+
+ vaddr = elf_ppnt->p_vaddr;
+ if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) {
+@@ -1000,9 +1000,11 @@ static int load_elf_binary(struct linux_
+
+ start_thread(regs, elf_entry, bprm->p);
+ if (unlikely(current->ptrace & PT_PTRACED)) {
+- if (current->ptrace & PT_TRACE_EXEC)
++ if (current->ptrace & PT_TRACE_EXEC) {
++ set_pn_state(current, PN_STOP_EXEC);
+ ptrace_notify ((PTRACE_EVENT_EXEC << 8) | SIGTRAP);
+- else
++ clear_pn_state(current);
++ } else
+ send_sig(SIGTRAP, current, 0);
+ }
+ retval = 0;
+@@ -1022,8 +1024,13 @@ out_free_file:
+ sys_close(elf_exec_fileno);
+ out_free_fh:
+ if (files) {
+- put_files_struct(current->files);
++ struct files_struct *old;
++
++ old = current->files;
++ task_lock(current);
+ current->files = files;
++ task_unlock(current);
++ put_files_struct(old);
+ }
+ out_free_ph:
+ kfree(elf_phdata);
+@@ -1281,10 +1288,10 @@ static void fill_prstatus(struct elf_prs
+ prstatus->pr_info.si_signo = prstatus->pr_cursig = signr;
+ prstatus->pr_sigpend = p->pending.signal.sig[0];
+ prstatus->pr_sighold = p->blocked.sig[0];
+- prstatus->pr_pid = p->pid;
+- prstatus->pr_ppid = p->parent->pid;
+- prstatus->pr_pgrp = process_group(p);
+- prstatus->pr_sid = p->signal->session;
++ prstatus->pr_pid = virt_pid(p);
++ prstatus->pr_ppid = virt_pid(p->parent);
++ prstatus->pr_pgrp = virt_pgid(p);
++ prstatus->pr_sid = virt_sid(p);
+ if (thread_group_leader(p)) {
+ /*
+ * This is the record for the group leader. Add in the
+@@ -1327,10 +1334,10 @@ static int fill_psinfo(struct elf_prpsin
+ psinfo->pr_psargs[i] = ' ';
+ psinfo->pr_psargs[len] = 0;
+
+- psinfo->pr_pid = p->pid;
+- psinfo->pr_ppid = p->parent->pid;
+- psinfo->pr_pgrp = process_group(p);
+- psinfo->pr_sid = p->signal->session;
++ psinfo->pr_pid = virt_pid(p);
++ psinfo->pr_ppid = virt_pid(p->parent);
++ psinfo->pr_pgrp = virt_pgid(p);
++ psinfo->pr_sid = virt_sid(p);
+
+ i = p->state ? ffz(~p->state) + 1 : 0;
+ psinfo->pr_state = i;
+@@ -1463,7 +1470,7 @@ static int elf_core_dump(long signr, str
+ if (signr) {
+ struct elf_thread_status *tmp;
+ read_lock(&tasklist_lock);
+- do_each_thread(g,p)
++ do_each_thread_ve(g,p)
+ if (current->mm == p->mm && current != p) {
+ tmp = kmalloc(sizeof(*tmp), GFP_ATOMIC);
+ if (!tmp) {
+@@ -1475,7 +1482,7 @@ static int elf_core_dump(long signr, str
+ tmp->thread = p;
+ list_add(&tmp->list, &thread_list);
+ }
+- while_each_thread(g,p);
++ while_each_thread_ve(g,p);
+ read_unlock(&tasklist_lock);
+ list_for_each(t, &thread_list) {
+ struct elf_thread_status *tmp;
+diff -upr linux-2.6.16.orig/fs/binfmt_elf_fdpic.c linux-2.6.16-026test015/fs/binfmt_elf_fdpic.c
+--- linux-2.6.16.orig/fs/binfmt_elf_fdpic.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/binfmt_elf_fdpic.c 2006-07-04 14:41:37.000000000 +0400
+@@ -205,7 +205,7 @@ static int load_elf_fdpic_binary(struct
+ kdebug("Using ELF interpreter %s", interpreter_name);
+
+ /* replace the program with the interpreter */
+- interpreter = open_exec(interpreter_name);
++ interpreter = open_exec(interpreter_name, bprm);
+ retval = PTR_ERR(interpreter);
+ if (IS_ERR(interpreter)) {
+ interpreter = NULL;
+diff -upr linux-2.6.16.orig/fs/binfmt_em86.c linux-2.6.16-026test015/fs/binfmt_em86.c
+--- linux-2.6.16.orig/fs/binfmt_em86.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/binfmt_em86.c 2006-07-04 14:41:37.000000000 +0400
+@@ -82,7 +82,7 @@ static int load_em86(struct linux_binprm
+ * Note that we use open_exec() as the name is now in kernel
+ * space, and we don't need to copy it.
+ */
+- file = open_exec(interp);
++ file = open_exec(interp, bprm);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+diff -upr linux-2.6.16.orig/fs/binfmt_flat.c linux-2.6.16-026test015/fs/binfmt_flat.c
+--- linux-2.6.16.orig/fs/binfmt_flat.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/binfmt_flat.c 2006-07-04 14:41:37.000000000 +0400
+@@ -774,7 +774,7 @@ static int load_flat_shared_library(int
+
+ /* Open the file up */
+ bprm.filename = buf;
+- bprm.file = open_exec(bprm.filename);
++ bprm.file = open_exec(bprm.filename, bprm);
+ res = PTR_ERR(bprm.file);
+ if (IS_ERR(bprm.file))
+ return res;
+diff -upr linux-2.6.16.orig/fs/binfmt_misc.c linux-2.6.16-026test015/fs/binfmt_misc.c
+--- linux-2.6.16.orig/fs/binfmt_misc.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/binfmt_misc.c 2006-07-04 14:41:37.000000000 +0400
+@@ -179,7 +179,7 @@ static int load_misc_binary(struct linux
+
+ bprm->interp = iname; /* for binfmt_script */
+
+- interp_file = open_exec (iname);
++ interp_file = open_exec (iname, bprm);
+ retval = PTR_ERR (interp_file);
+ if (IS_ERR (interp_file))
+ goto _error;
+@@ -216,8 +216,13 @@ _error:
+ bprm->interp_data = 0;
+ _unshare:
+ if (files) {
+- put_files_struct(current->files);
++ struct files_struct *old;
++
++ old = current->files;
++ task_lock(current);
+ current->files = files;
++ task_unlock(current);
++ put_files_struct(old);
+ }
+ goto _ret;
+ }
+diff -upr linux-2.6.16.orig/fs/binfmt_script.c linux-2.6.16-026test015/fs/binfmt_script.c
+--- linux-2.6.16.orig/fs/binfmt_script.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/binfmt_script.c 2006-07-04 14:41:37.000000000 +0400
+@@ -85,7 +85,7 @@ static int load_script(struct linux_binp
+ /*
+ * OK, now restart the process with the interpreter's dentry.
+ */
+- file = open_exec(interp);
++ file = open_exec(interp, bprm);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+diff -upr linux-2.6.16.orig/fs/block_dev.c linux-2.6.16-026test015/fs/block_dev.c
+--- linux-2.6.16.orig/fs/block_dev.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/block_dev.c 2006-07-04 14:41:37.000000000 +0400
+@@ -561,9 +561,16 @@ static int do_open(struct block_device *
+ {
+ struct module *owner = NULL;
+ struct gendisk *disk;
+- int ret = -ENXIO;
++ int ret;
+ int part;
+
++#ifdef CONFIG_VE
++ ret = get_device_perms_ve(S_IFBLK, bdev->bd_dev,
++ file->f_mode&(FMODE_READ|FMODE_WRITE));
++ if (ret)
++ return ret;
++#endif
++ ret = -ENXIO;
+ file->f_mapping = bdev->bd_inode->i_mapping;
+ lock_kernel();
+ disk = get_gendisk(bdev->bd_dev, &part);
+@@ -832,7 +839,7 @@ EXPORT_SYMBOL(ioctl_by_bdev);
+ * namespace if possible and return it. Return ERR_PTR(error)
+ * otherwise.
+ */
+-struct block_device *lookup_bdev(const char *path)
++struct block_device *lookup_bdev(const char *path, int mode)
+ {
+ struct block_device *bdev;
+ struct inode *inode;
+@@ -850,6 +857,11 @@ struct block_device *lookup_bdev(const c
+ error = -ENOTBLK;
+ if (!S_ISBLK(inode->i_mode))
+ goto fail;
++#ifdef CONFIG_VE
++ error = get_device_perms_ve(S_IFBLK, inode->i_rdev, mode);
++ if (error)
++ goto fail;
++#endif
+ error = -EACCES;
+ if (nd.mnt->mnt_flags & MNT_NODEV)
+ goto fail;
+@@ -881,12 +893,13 @@ struct block_device *open_bdev_excl(cons
+ mode_t mode = FMODE_READ;
+ int error = 0;
+
+- bdev = lookup_bdev(path);
++ if (!(flags & MS_RDONLY))
++ mode |= FMODE_WRITE;
++
++ bdev = lookup_bdev(path, mode);
+ if (IS_ERR(bdev))
+ return bdev;
+
+- if (!(flags & MS_RDONLY))
+- mode |= FMODE_WRITE;
+ error = blkdev_get(bdev, mode, 0);
+ if (error)
+ return ERR_PTR(error);
+diff -upr linux-2.6.16.orig/fs/buffer.c linux-2.6.16-026test015/fs/buffer.c
+--- linux-2.6.16.orig/fs/buffer.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/buffer.c 2006-07-04 14:41:37.000000000 +0400
+@@ -1942,8 +1942,9 @@ static int __block_prepare_write(struct
+ if (err)
+ break;
+ if (buffer_new(bh)) {
+- unmap_underlying_metadata(bh->b_bdev,
+- bh->b_blocknr);
++ if (buffer_mapped(bh))
++ unmap_underlying_metadata(bh->b_bdev,
++ bh->b_blocknr);
+ if (PageUptodate(page)) {
+ set_buffer_uptodate(bh);
+ continue;
+diff -upr linux-2.6.16.orig/fs/char_dev.c linux-2.6.16-026test015/fs/char_dev.c
+--- linux-2.6.16.orig/fs/char_dev.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/char_dev.c 2006-07-04 14:41:37.000000000 +0400
+@@ -15,6 +15,7 @@
+ #include <linux/module.h>
+ #include <linux/smp_lock.h>
+ #include <linux/devfs_fs_kernel.h>
++#include <linux/seq_file.h>
+
+ #include <linux/kobject.h>
+ #include <linux/kobj_map.h>
+@@ -26,8 +27,6 @@
+
+ static struct kobj_map *cdev_map;
+
+-#define MAX_PROBE_HASH 255 /* random */
+-
+ static DECLARE_MUTEX(chrdevs_lock);
+
+ static struct char_device_struct {
+@@ -38,93 +37,29 @@ static struct char_device_struct {
+ char name[64];
+ struct file_operations *fops;
+ struct cdev *cdev; /* will die */
+-} *chrdevs[MAX_PROBE_HASH];
++} *chrdevs[CHRDEV_MAJOR_HASH_SIZE];
+
+ /* index in the above */
+ static inline int major_to_index(int major)
+ {
+- return major % MAX_PROBE_HASH;
+-}
+-
+-struct chrdev_info {
+- int index;
+- struct char_device_struct *cd;
+-};
+-
+-void *get_next_chrdev(void *dev)
+-{
+- struct chrdev_info *info;
+-
+- if (dev == NULL) {
+- info = kmalloc(sizeof(*info), GFP_KERNEL);
+- if (!info)
+- goto out;
+- info->index=0;
+- info->cd = chrdevs[info->index];
+- if (info->cd)
+- goto out;
+- } else {
+- info = dev;
+- }
+-
+- while (info->index < ARRAY_SIZE(chrdevs)) {
+- if (info->cd)
+- info->cd = info->cd->next;
+- if (info->cd)
+- goto out;
+- /*
+- * No devices on this chain, move to the next
+- */
+- info->index++;
+- info->cd = (info->index < ARRAY_SIZE(chrdevs)) ?
+- chrdevs[info->index] : NULL;
+- if (info->cd)
+- goto out;
+- }
+-
+-out:
+- return info;
+-}
+-
+-void *acquire_chrdev_list(void)
+-{
+- down(&chrdevs_lock);
+- return get_next_chrdev(NULL);
+-}
+-
+-void release_chrdev_list(void *dev)
+-{
+- up(&chrdevs_lock);
+- kfree(dev);
++ return major % CHRDEV_MAJOR_HASH_SIZE;
+ }
+
++#ifdef CONFIG_PROC_FS
+
+-int count_chrdev_list(void)
++void chrdev_show(struct seq_file *f, off_t offset)
+ {
+ struct char_device_struct *cd;
+- int i, count;
+
+- count = 0;
+-
+- for (i = 0; i < ARRAY_SIZE(chrdevs) ; i++) {
+- for (cd = chrdevs[i]; cd; cd = cd->next)
+- count++;
++ if (offset < CHRDEV_MAJOR_HASH_SIZE) {
++ down(&chrdevs_lock);
++ for (cd = chrdevs[offset]; cd; cd = cd->next)
++ seq_printf(f, "%3d %s\n", cd->major, cd->name);
++ up(&chrdevs_lock);
+ }
+-
+- return count;
+ }
+
+-int get_chrdev_info(void *dev, int *major, char **name)
+-{
+- struct chrdev_info *info = dev;
+-
+- if (info->cd == NULL)
+- return 1;
+-
+- *major = info->cd->major;
+- *name = info->cd->name;
+- return 0;
+-}
++#endif /* CONFIG_PROC_FS */
+
+ /*
+ * Register a single major with a specified minor range.
+@@ -342,6 +277,13 @@ int chrdev_open(struct inode * inode, st
+ struct cdev *new = NULL;
+ int ret = 0;
+
++#ifdef CONFIG_VE
++ ret = get_device_perms_ve(S_IFCHR, inode->i_rdev,
++ filp->f_mode&(FMODE_READ|FMODE_WRITE));
++ if (ret)
++ return ret;
++#endif
++
+ spin_lock(&cdev_lock);
+ p = inode->i_cdev;
+ if (!p) {
+diff -upr linux-2.6.16.orig/fs/cifs/cifsencrypt.c linux-2.6.16-026test015/fs/cifs/cifsencrypt.c
+--- linux-2.6.16.orig/fs/cifs/cifsencrypt.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/cifs/cifsencrypt.c 2006-07-04 14:41:36.000000000 +0400
+@@ -56,9 +56,6 @@ int cifs_sign_smb(struct smb_hdr * cifs_
+ int rc = 0;
+ char smb_signature[20];
+
+- /* BB remember to initialize sequence number elsewhere and initialize mac_signing key elsewhere BB */
+- /* BB remember to add code to save expected sequence number in midQ entry BB */
+-
+ if((cifs_pdu == NULL) || (server == NULL))
+ return -EINVAL;
+
+@@ -85,20 +82,33 @@ int cifs_sign_smb(struct smb_hdr * cifs_
+ static int cifs_calc_signature2(const struct kvec * iov, int n_vec,
+ const char * key, char * signature)
+ {
+- struct MD5Context context;
+-
+- if((iov == NULL) || (signature == NULL))
+- return -EINVAL;
++ struct MD5Context context;
++ int i;
+
+- MD5Init(&context);
+- MD5Update(&context,key,CIFS_SESSION_KEY_SIZE+16);
++ if((iov == NULL) || (signature == NULL))
++ return -EINVAL;
+
+-/* MD5Update(&context,cifs_pdu->Protocol,cifs_pdu->smb_buf_length); */ /* BB FIXME BB */
++ MD5Init(&context);
++ MD5Update(&context,key,CIFS_SESSION_KEY_SIZE+16);
++ for(i=0;i<n_vec;i++) {
++ if(iov[i].iov_base == NULL) {
++ cERROR(1,("null iovec entry"));
++ return -EIO;
++ } else if(iov[i].iov_len == 0)
++ break; /* bail out if we are sent nothing to sign */
++ /* The first entry includes a length field (which does not get
++ signed) that occupies the first 4 bytes before the header */
++ if(i==0) {
++ if (iov[0].iov_len <= 8 ) /* cmd field at offset 9 */
++ break; /* nothing to sign or corrupt header */
++ MD5Update(&context,iov[0].iov_base+4, iov[0].iov_len-4);
++ } else
++ MD5Update(&context,iov[i].iov_base, iov[i].iov_len);
++ }
+
+- MD5Final(signature,&context);
++ MD5Final(signature,&context);
+
+- return -EOPNOTSUPP;
+-/* return 0; */
++ return 0;
+ }
+
+
+diff -upr linux-2.6.16.orig/fs/cifs/cifsfs.c linux-2.6.16-026test015/fs/cifs/cifsfs.c
+--- linux-2.6.16.orig/fs/cifs/cifsfs.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/cifs/cifsfs.c 2006-07-04 14:41:37.000000000 +0400
+@@ -220,7 +220,8 @@ cifs_statfs(struct super_block *sb, stru
+ longer available? */
+ }
+
+-static int cifs_permission(struct inode * inode, int mask, struct nameidata *nd)
++static int cifs_permission(struct inode * inode, int mask, struct nameidata *nd,
++ struct exec_perm *perm)
+ {
+ struct cifs_sb_info *cifs_sb;
+
+@@ -232,7 +233,7 @@ static int cifs_permission(struct inode
+ on the client (above and beyond ACL on servers) for
+ servers which do not support setting and viewing mode bits,
+ so allowing client to check permissions is useful */
+- return generic_permission(inode, mask, NULL);
++ return generic_permission(inode, mask, NULL, perm);
+ }
+
+ static kmem_cache_t *cifs_inode_cachep;
+diff -upr linux-2.6.16.orig/fs/cifs/dir.c linux-2.6.16-026test015/fs/cifs/dir.c
+--- linux-2.6.16.orig/fs/cifs/dir.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/cifs/dir.c 2006-07-04 14:41:36.000000000 +0400
+@@ -441,6 +441,20 @@ cifs_lookup(struct inode *parent_dir_ino
+ cifs_sb = CIFS_SB(parent_dir_inode->i_sb);
+ pTcon = cifs_sb->tcon;
+
++ /*
++ * Don't allow the separator character in a path component.
++ * The VFS will not allow "/", but "\" is allowed by posix.
++ */
++ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)) {
++ int i;
++ for (i = 0; i < direntry->d_name.len; i++)
++ if (direntry->d_name.name[i] == '\\') {
++ cFYI(1, ("Invalid file name"));
++ FreeXid(xid);
++ return ERR_PTR(-EINVAL);
++ }
++ }
++
+ /* can not grab the rename sem here since it would
+ deadlock in the cases (beginning of sys_rename itself)
+ in which we already have the sb rename sem */
+diff -upr linux-2.6.16.orig/fs/coda/dir.c linux-2.6.16-026test015/fs/coda/dir.c
+--- linux-2.6.16.orig/fs/coda/dir.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/coda/dir.c 2006-07-04 14:41:37.000000000 +0400
+@@ -151,7 +151,8 @@ exit:
+ }
+
+
+-int coda_permission(struct inode *inode, int mask, struct nameidata *nd)
++int coda_permission(struct inode *inode, int mask, struct nameidata *nd,
++ struct exec_perm *perm)
+ {
+ int error = 0;
+
+diff -upr linux-2.6.16.orig/fs/coda/pioctl.c linux-2.6.16-026test015/fs/coda/pioctl.c
+--- linux-2.6.16.orig/fs/coda/pioctl.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/coda/pioctl.c 2006-07-04 14:41:37.000000000 +0400
+@@ -25,7 +25,7 @@
+
+ /* pioctl ops */
+ static int coda_ioctl_permission(struct inode *inode, int mask,
+- struct nameidata *nd);
++ struct nameidata *nd, struct exec_perm *perm);
+ static int coda_pioctl(struct inode * inode, struct file * filp,
+ unsigned int cmd, unsigned long user_data);
+
+@@ -43,7 +43,7 @@ struct file_operations coda_ioctl_operat
+
+ /* the coda pioctl inode ops */
+ static int coda_ioctl_permission(struct inode *inode, int mask,
+- struct nameidata *nd)
++ struct nameidata *nd, struct exec_perm *perm)
+ {
+ return 0;
+ }
+diff -upr linux-2.6.16.orig/fs/compat.c linux-2.6.16-026test015/fs/compat.c
+--- linux-2.6.16.orig/fs/compat.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/compat.c 2006-07-04 14:41:39.000000000 +0400
+@@ -197,6 +197,8 @@ asmlinkage long compat_sys_statfs(const
+ struct kstatfs tmp;
+ error = vfs_statfs(nd.dentry->d_inode->i_sb, &tmp);
+ if (!error)
++ error = faudit_statfs(nd.mnt->mnt_sb, &tmp);
++ if (!error)
+ error = put_compat_statfs(buf, &tmp);
+ path_release(&nd);
+ }
+@@ -215,6 +217,8 @@ asmlinkage long compat_sys_fstatfs(unsig
+ goto out;
+ error = vfs_statfs(file->f_dentry->d_inode->i_sb, &tmp);
+ if (!error)
++ error = faudit_statfs(file->f_vfsmnt->mnt_sb, &tmp);
++ if (!error)
+ error = put_compat_statfs(buf, &tmp);
+ fput(file);
+ out:
+@@ -265,6 +269,8 @@ asmlinkage long compat_sys_statfs64(cons
+ struct kstatfs tmp;
+ error = vfs_statfs(nd.dentry->d_inode->i_sb, &tmp);
+ if (!error)
++ error = faudit_statfs(nd.mnt->mnt_sb, &tmp);
++ if (!error)
+ error = put_compat_statfs64(buf, &tmp);
+ path_release(&nd);
+ }
+@@ -286,6 +292,8 @@ asmlinkage long compat_sys_fstatfs64(uns
+ goto out;
+ error = vfs_statfs(file->f_dentry->d_inode->i_sb, &tmp);
+ if (!error)
++ error = faudit_statfs(file->f_vfsmnt->mnt_sb, &tmp);
++ if (!error)
+ error = put_compat_statfs64(buf, &tmp);
+ fput(file);
+ out:
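Each statfs variant above gains the same pattern: a second check is chained behind vfs_statfs() so the copy-out to user space only happens if both succeed. faudit_statfs() is an OpenVZ-specific hook; the stubbed sketch below only illustrates the control flow, with made-up numbers.

#include <stdio.h>

struct kstatfs { unsigned long f_blocks, f_bfree; };

/* Stand-ins for vfs_statfs() and the faudit_statfs() hook added above. */
static int vfs_statfs_stub(struct kstatfs *buf)
{
        buf->f_blocks = 1000;
        buf->f_bfree = 250;
        return 0;
}

static int faudit_statfs_stub(struct kstatfs *buf)
{
        /* a virtualization hook could rewrite or veto the numbers here */
        return 0;
}

static int do_statfs(struct kstatfs *buf)
{
        int error;

        error = vfs_statfs_stub(buf);
        if (!error)
                error = faudit_statfs_stub(buf);        /* the added step */
        if (!error)
                printf("blocks=%lu free=%lu\n", buf->f_blocks, buf->f_bfree);
        return error;
}

int main(void)
{
        struct kstatfs tmp;

        return do_statfs(&tmp);
}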
+@@ -1215,6 +1223,10 @@ static ssize_t compat_do_readv_writev(in
+ if (ret < 0)
+ goto out;
+
++ ret = security_file_permission(file, type == READ ? MAY_READ:MAY_WRITE);
++ if (ret)
++ goto out;
++
+ fnv = NULL;
+ if (type == READ) {
+ fn = file->f_op->read;
+@@ -1479,7 +1491,7 @@ int compat_do_execve(char * filename,
+ goto out_ret;
+ memset(bprm, 0, sizeof(*bprm));
+
+- file = open_exec(filename);
++ file = open_exec(filename, bprm);
+ retval = PTR_ERR(file);
+ if (IS_ERR(file))
+ goto out_kfree;
+@@ -1897,7 +1909,7 @@ asmlinkage long compat_sys_ppoll(struct
+ }
+
+ if (sigmask) {
+- if (sigsetsize |= sizeof(compat_sigset_t))
++ if (sigsetsize != sizeof(compat_sigset_t))
+ return -EINVAL;
+ if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
+ return -EFAULT;
+diff -upr linux-2.6.16.orig/fs/dcache.c linux-2.6.16-026test015/fs/dcache.c
+--- linux-2.6.16.orig/fs/dcache.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/dcache.c 2006-07-04 14:41:38.000000000 +0400
+@@ -28,11 +28,16 @@
+ #include <linux/module.h>
+ #include <linux/mount.h>
+ #include <linux/file.h>
++#include <linux/namei.h>
+ #include <asm/uaccess.h>
+ #include <linux/security.h>
+ #include <linux/seqlock.h>
+ #include <linux/swap.h>
+ #include <linux/bootmem.h>
++#include <linux/kernel_stat.h>
++#include <net/inet_sock.h>
++
++#include <ub/ub_dcache.h>
+
+ /* #define DCACHE_DEBUG 1 */
+
+@@ -44,7 +49,7 @@ static seqlock_t rename_lock __cacheline
+
+ EXPORT_SYMBOL(dcache_lock);
+
+-static kmem_cache_t *dentry_cache;
++kmem_cache_t *dentry_cache;
+
+ #define DNAME_INLINE_LEN (sizeof(struct dentry)-offsetof(struct dentry,d_iname))
+
+@@ -143,11 +148,8 @@ static void dentry_iput(struct dentry *
+ * no dcache lock, please.
+ */
+
+-void dput(struct dentry *dentry)
++static void dput_recursive(struct dentry *dentry)
+ {
+- if (!dentry)
+- return;
+-
+ repeat:
+ if (atomic_read(&dentry->d_count) == 1)
+ might_sleep();
+@@ -206,6 +208,17 @@ kill_it: {
+ }
+ }
+
++void dput(struct dentry *dentry)
++{
++ if (!dentry)
++ return;
++
++ spin_lock(&dcache_lock);
++ ub_dentry_uncharge(dentry);
++ spin_unlock(&dcache_lock);
++ dput_recursive(dentry);
++}
++
+ /**
+ * d_invalidate - invalidate a dentry
+ * @dentry: dentry to invalidate
+@@ -272,6 +285,8 @@ static inline struct dentry * __dget_loc
+ dentry_stat.nr_unused--;
+ list_del_init(&dentry->d_lru);
+ }
++
++ ub_dentry_charge_nofail(dentry);
+ return dentry;
+ }
+
+@@ -373,13 +388,19 @@ static inline void prune_one_dentry(stru
+ parent = dentry->d_parent;
+ d_free(dentry);
+ if (parent != dentry)
+- dput(parent);
++ /*
++ * dentry is not in use, only child (not outside)
++ * references change, so parent->d_inuse does not change
++ */
++ dput_recursive(parent);
+ spin_lock(&dcache_lock);
+ }
+
+ /**
+ * prune_dcache - shrink the dcache
+ * @count: number of entries to try and free
++ * @sb: if given, ignore dentries for other superblocks
++ * which are being unmounted.
+ *
+ * Shrink the dcache. This is done when we need
+ * more memory, or simply when we need to unmount
+@@ -390,16 +411,29 @@ static inline void prune_one_dentry(stru
+ * all the dentries are in use.
+ */
+
+-static void prune_dcache(int count)
++static void prune_dcache(int count, struct super_block *sb)
+ {
+ spin_lock(&dcache_lock);
+ for (; count ; count--) {
+ struct dentry *dentry;
+ struct list_head *tmp;
++ struct rw_semaphore *s_umount;
+
+ cond_resched_lock(&dcache_lock);
+
+ tmp = dentry_unused.prev;
++ if (unlikely(sb)) {
++ /* Try to find a dentry for this sb, but don't try
++ * too hard; if they aren't near the tail they will
++ * be moved down again soon
++ */
++ int skip = count;
++ while (skip && tmp != &dentry_unused &&
++ list_entry(tmp, struct dentry, d_lru)->d_sb != sb) {
++ skip--;
++ tmp = tmp->prev;
++ }
++ }
+ if (tmp == &dentry_unused)
+ break;
+ list_del_init(tmp);
+@@ -425,7 +459,45 @@ static void prune_dcache(int count)
+ spin_unlock(&dentry->d_lock);
+ continue;
+ }
+- prune_one_dentry(dentry);
++ /*
++ * If the dentry is not DCACHE_REFERENCED, it is time
++ * to remove it from the dcache, provided the super block is
++ * NULL (which means we are trying to reclaim memory)
++ * or this dentry belongs to the same super block that
++ * we want to shrink.
++ */
++ /*
++ * If this dentry is for "my" filesystem, then I can prune it
++ * without taking the s_umount lock (I already hold it).
++ */
++ if (sb && dentry->d_sb == sb) {
++ prune_one_dentry(dentry);
++ continue;
++ }
++ /*
++ * ...otherwise we need to be sure this filesystem isn't being
++ * unmounted, otherwise we could race with
++ * generic_shutdown_super(), and end up holding a reference to
++ * an inode while the filesystem is unmounted.
++ * So we try to get s_umount, and make sure s_root isn't NULL.
++ * (Take a local copy of s_umount to avoid a use-after-free of
++ * `dentry').
++ */
++ s_umount = &dentry->d_sb->s_umount;
++ if (down_read_trylock(s_umount)) {
++ if (dentry->d_sb->s_root != NULL) {
++ prune_one_dentry(dentry);
++ up_read(s_umount);
++ continue;
++ }
++ up_read(s_umount);
++ }
++ spin_unlock(&dentry->d_lock);
++ /* Cannot remove the first dentry, and it isn't appropriate
++ * to move it to the head of the list, so give up, and try
++ * later
++ */
++ break;
+ }
+ spin_unlock(&dcache_lock);
+ }
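prune_dcache() now takes the superblock being shrunk and, instead of blindly taking the tail of the unused list, scans backwards a bounded number of steps looking for a victim from that superblock before giving up and taking whatever is at hand. The list walk can be exercised in user space; the node layout, add_tail()/del() helpers and the sb ids below are simplified stand-ins for the kernel structures.

#include <stdio.h>
#include <stdlib.h>

struct node {
        struct node *prev, *next;
        int sb;                 /* which "superblock" owns this entry */
};

static struct node lru = { &lru, &lru, -1 };    /* sentinel, like dentry_unused */

static void add_tail(struct node *n, int sb)
{
        n->sb = sb;
        n->prev = lru.prev;
        n->next = &lru;
        lru.prev->next = n;
        lru.prev = n;
}

static void del(struct node *n)
{
        n->prev->next = n->next;
        n->next->prev = n->prev;
}

/* Free up to 'count' entries, preferring entries that belong to 'sb':
 * scan at most 'count' steps back from the tail, as in the hunk above. */
static void prune(int count, int sb)
{
        for (; count; count--) {
                struct node *tmp = lru.prev;
                int skip = count;

                while (skip && tmp != &lru && tmp->sb != sb) {
                        skip--;
                        tmp = tmp->prev;
                }
                if (tmp == &lru)
                        break;
                printf("pruning entry of sb %d\n", tmp->sb);
                del(tmp);
                free(tmp);
        }
}

int main(void)
{
        int i;

        for (i = 0; i < 8; i++) {
                struct node *n = malloc(sizeof(*n));
                add_tail(n, i % 3);     /* interleave three superblocks */
        }
        prune(4, 1);                    /* shrink only "superblock" 1 */
        return 0;
}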
+@@ -486,6 +558,7 @@ repeat:
+ continue;
+ }
+ prune_one_dentry(dentry);
++ cond_resched_lock(&dcache_lock);
+ goto repeat;
+ }
+ spin_unlock(&dcache_lock);
+@@ -635,7 +708,7 @@ void shrink_dcache_parent(struct dentry
+ int found;
+
+ while ((found = select_parent(parent)) != 0)
+- prune_dcache(found);
++ prune_dcache(found, parent->d_sb);
+ }
+
+ /**
+@@ -648,9 +721,10 @@ void shrink_dcache_parent(struct dentry
+ * done under dcache_lock.
+ *
+ */
+-void shrink_dcache_anon(struct hlist_head *head)
++void shrink_dcache_anon(struct super_block *sb)
+ {
+ struct hlist_node *lp;
++ struct hlist_head *head = &sb->s_anon;
+ int found;
+ do {
+ found = 0;
+@@ -673,7 +747,7 @@ void shrink_dcache_anon(struct hlist_hea
+ }
+ }
+ spin_unlock(&dcache_lock);
+- prune_dcache(found);
++ prune_dcache(found, sb);
+ } while(found);
+ }
+
+@@ -691,12 +765,18 @@ void shrink_dcache_anon(struct hlist_hea
+ */
+ static int shrink_dcache_memory(int nr, gfp_t gfp_mask)
+ {
++ int res = -1;
++
++ KSTAT_PERF_ENTER(shrink_dcache)
+ if (nr) {
+ if (!(gfp_mask & __GFP_FS))
+- return -1;
+- prune_dcache(nr);
++ goto out;
++ prune_dcache(nr, NULL);
+ }
+- return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
++ res = (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
++out:
++ KSTAT_PERF_LEAVE(shrink_dcache)
++ return res;
+ }
+
+ /**
+@@ -716,19 +796,20 @@ struct dentry *d_alloc(struct dentry * p
+
+ dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL);
+ if (!dentry)
+- return NULL;
++ goto err_alloc;
+
+ if (name->len > DNAME_INLINE_LEN-1) {
+ dname = kmalloc(name->len + 1, GFP_KERNEL);
+- if (!dname) {
+- kmem_cache_free(dentry_cache, dentry);
+- return NULL;
+- }
++ if (!dname)
++ goto err_name;
+ } else {
+ dname = dentry->d_iname;
+ }
+ dentry->d_name.name = dname;
+
++ if (ub_dentry_alloc(dentry))
++ goto err_charge;
++
+ dentry->d_name.len = name->len;
+ dentry->d_name.hash = name->hash;
+ memcpy(dname, name->name, name->len);
+@@ -759,12 +840,23 @@ struct dentry *d_alloc(struct dentry * p
+ }
+
+ spin_lock(&dcache_lock);
+- if (parent)
++ if (parent) {
+ list_add(&dentry->d_u.d_child, &parent->d_subdirs);
++ if (parent->d_flags & DCACHE_VIRTUAL)
++ dentry->d_flags |= DCACHE_VIRTUAL;
++ }
+ dentry_stat.nr_dentry++;
+ spin_unlock(&dcache_lock);
+
+ return dentry;
++
++err_charge:
++ if (name->len > DNAME_INLINE_LEN - 1)
++ kfree(dname);
++err_name:
++ kmem_cache_free(dentry_cache, dentry);
++err_alloc:
++ return NULL;
+ }
+
+ struct dentry *d_alloc_name(struct dentry *parent, const char *name)
+@@ -1048,7 +1140,6 @@ struct dentry * __d_lookup(struct dentry
+ unsigned int hash = name->hash;
+ const unsigned char *str = name->name;
+ struct hlist_head *head = d_hash(parent,hash);
+- struct dentry *found = NULL;
+ struct hlist_node *node;
+ struct dentry *dentry;
+
+@@ -1089,7 +1180,7 @@ struct dentry * __d_lookup(struct dentry
+
+ if (!d_unhashed(dentry)) {
+ atomic_inc(&dentry->d_count);
+- found = dentry;
++ goto found;
+ }
+ spin_unlock(&dentry->d_lock);
+ break;
+@@ -1098,7 +1189,18 @@ next:
+ }
+ rcu_read_unlock();
+
+- return found;
++ return NULL;
++
++found:
++ /*
++ * d_lock and rcu_read_lock
++ * are dropped in ub_dentry_charge()
++ */
++ if (ub_dentry_charge(dentry)) {
++ dput(dentry);
++ dentry = NULL;
++ }
++ return dentry;
+ }
+
+ /**
+@@ -1345,6 +1447,32 @@ already_unhashed:
+ }
+
+ /**
++ * __d_path_add_deleted - prepend "(deleted) " text
++ * @end: a pointer to the character after free space at the beginning of the
++ * buffer
++ * @buflen: remaining free space
++ */
++static inline char * __d_path_add_deleted(char * end, int buflen)
++{
++ buflen -= 10;
++ if (buflen < 0)
++ return ERR_PTR(-ENAMETOOLONG);
++ end -= 10;
++ memcpy(end, "(deleted) ", 10);
++ return end;
++}
++
++/**
++ * d_root_check - checks if dentry is accessible from current's fs root
++ * @dentry: dentry to be verified
++ * @vfsmnt: vfsmnt to which the dentry belongs
++ */
++int d_root_check(struct dentry *dentry, struct vfsmount *vfsmnt)
++{
++ return PTR_ERR(d_path(dentry, vfsmnt, NULL, 0));
++}
++
++/**
+ * d_path - return the path of a dentry
+ * @dentry: dentry to report
+ * @vfsmnt: vfsmnt to which the dentry belongs
+@@ -1365,36 +1493,35 @@ static char * __d_path( struct dentry *d
+ char *buffer, int buflen)
+ {
+ char * end = buffer+buflen;
+- char * retval;
++ char * retval = NULL;
+ int namelen;
++ int deleted;
++ struct vfsmount *oldvfsmnt;
+
+- *--end = '\0';
+- buflen--;
+- if (!IS_ROOT(dentry) && d_unhashed(dentry)) {
+- buflen -= 10;
+- end -= 10;
+- if (buflen < 0)
++ oldvfsmnt = vfsmnt;
++ deleted = (!IS_ROOT(dentry) && d_unhashed(dentry));
++ if (buffer != NULL) {
++ *--end = '\0';
++ buflen--;
++
++ if (buflen < 1)
+ goto Elong;
+- memcpy(end, " (deleted)", 10);
++ /* Get '/' right */
++ retval = end-1;
++ *retval = '/';
+ }
+
+- if (buflen < 1)
+- goto Elong;
+- /* Get '/' right */
+- retval = end-1;
+- *retval = '/';
+-
+ for (;;) {
+ struct dentry * parent;
+
+ if (dentry == root && vfsmnt == rootmnt)
+ break;
+ if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) {
+- /* Global root? */
++ /* root of a tree? */
+ spin_lock(&vfsmount_lock);
+ if (vfsmnt->mnt_parent == vfsmnt) {
+ spin_unlock(&vfsmount_lock);
+- goto global_root;
++ goto other_root;
+ }
+ dentry = vfsmnt->mnt_mountpoint;
+ vfsmnt = vfsmnt->mnt_parent;
+@@ -1403,27 +1530,51 @@ static char * __d_path( struct dentry *d
+ }
+ parent = dentry->d_parent;
+ prefetch(parent);
++ if (buffer != NULL) {
++ namelen = dentry->d_name.len;
++ buflen -= namelen + 1;
++ if (buflen < 0)
++ goto Elong;
++ end -= namelen;
++ memcpy(end, dentry->d_name.name, namelen);
++ *--end = '/';
++ retval = end;
++ }
++ dentry = parent;
++ }
++ /* the given root point is reached */
++finish:
++ if (buffer != NULL && deleted)
++ retval = __d_path_add_deleted(end, buflen);
++ return retval;
++
++other_root:
++ /*
++ * We traversed the tree upward and reached a root, but the given
++ * lookup terminal point wasn't encountered. It means either that the
++ * dentry is out of our scope or belongs to an abstract space like
++ * sock_mnt or pipe_mnt. Check for it.
++ *
++ * There are different options to check it.
++ * We may assume that any dentry tree is unreachable unless it's
++ * connected to `root' (defined as fs root of init aka child reaper)
++ * and expose all paths that are not connected to it.
++ * The other option is to allow exposing of known abstract spaces
++ * explicitly and hide the path information for other cases.
++ * This approach is safer, so let's take it. 2001/04/22 SAW
++ */
++ if (!(oldvfsmnt->mnt_sb->s_flags & MS_NOUSER))
++ return ERR_PTR(-EINVAL);
++ if (buffer != NULL) {
+ namelen = dentry->d_name.len;
+- buflen -= namelen + 1;
++ buflen -= namelen;
+ if (buflen < 0)
+ goto Elong;
+- end -= namelen;
+- memcpy(end, dentry->d_name.name, namelen);
+- *--end = '/';
+- retval = end;
+- dentry = parent;
++ retval -= namelen-1; /* hit the slash */
++ memcpy(retval, dentry->d_name.name, namelen);
+ }
++ goto finish;
+
+- return retval;
+-
+-global_root:
+- namelen = dentry->d_name.len;
+- buflen -= namelen;
+- if (buflen < 0)
+- goto Elong;
+- retval -= namelen-1; /* hit the slash */
+- memcpy(retval, dentry->d_name.name, namelen);
+- return retval;
+ Elong:
+ return ERR_PTR(-ENAMETOOLONG);
+ }
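__d_path() is reworked so it can be called with a NULL buffer purely to test whether a dentry is reachable from the caller's root; d_root_check() above is exactly that call. A user-space analogue using a parent-pointer tree is sketched below; the struct, names and the -EINVAL convention are illustrative.

#include <errno.h>
#include <stdio.h>

struct entry {
        struct entry *parent;   /* self-parented at the top, like IS_ROOT() */
        const char *name;
};

/* Return 0 if 'e' is reachable from 'root', -EINVAL otherwise,
 * analogous to d_root_check() walking dentries/mounts upward. */
static int root_check(struct entry *e, struct entry *root)
{
        while (e != root) {
                if (e->parent == e)     /* reached a top that is not our root */
                        return -EINVAL;
                e = e->parent;
        }
        return 0;
}

int main(void)
{
        struct entry fsroot = { &fsroot, "/" };
        struct entry home = { &fsroot, "home" };
        struct entry other = { &other, "(detached)" };
        struct entry leaf = { &other, "secret" };

        printf("home   -> %d\n", root_check(&home, &fsroot));
        printf("secret -> %d\n", root_check(&leaf, &fsroot));
        return 0;
}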
+@@ -1448,6 +1599,229 @@ char * d_path(struct dentry *dentry, str
+ return res;
+ }
+
++#ifdef CONFIG_VE
++#include <net/sock.h>
++#include <linux/ip.h>
++#include <linux/file.h>
++#include <linux/namespace.h>
++#include <linux/vzratelimit.h>
++
++static void mark_sub_tree_virtual(struct dentry *d)
++{
++ struct dentry *orig_root;
++
++ orig_root = d;
++ while (1) {
++ spin_lock(&d->d_lock);
++ d->d_flags |= DCACHE_VIRTUAL;
++ spin_unlock(&d->d_lock);
++
++ if (!list_empty(&d->d_subdirs)) {
++ d = list_entry(d->d_subdirs.next,
++ struct dentry, d_u.d_child);
++ continue;
++ }
++ if (d == orig_root)
++ break;
++ while (d == list_entry(d->d_parent->d_subdirs.prev,
++ struct dentry, d_u.d_child)) {
++ d = d->d_parent;
++ if (d == orig_root)
++ goto out;
++ }
++ d = list_entry(d->d_u.d_child.next,
++ struct dentry, d_u.d_child);
++ }
++out:
++ return;
++}
++
++void mark_tree_virtual(struct vfsmount *m, struct dentry *d)
++{
++ struct vfsmount *orig_rootmnt;
++
++ spin_lock(&dcache_lock);
++ spin_lock(&vfsmount_lock);
++ orig_rootmnt = m;
++ while (1) {
++ mark_sub_tree_virtual(d);
++ if (!list_empty(&m->mnt_mounts)) {
++ m = list_entry(m->mnt_mounts.next,
++ struct vfsmount, mnt_child);
++ d = m->mnt_root;
++ continue;
++ }
++ if (m == orig_rootmnt)
++ break;
++ while (m == list_entry(m->mnt_parent->mnt_mounts.prev,
++ struct vfsmount, mnt_child)) {
++ m = m->mnt_parent;
++ if (m == orig_rootmnt)
++ goto out;
++ }
++ m = list_entry(m->mnt_child.next,
++ struct vfsmount, mnt_child);
++ d = m->mnt_root;
++ }
++out:
++ spin_unlock(&vfsmount_lock);
++ spin_unlock(&dcache_lock);
++}
++EXPORT_SYMBOL(mark_tree_virtual);
++
++static struct vz_rate_info area_ri = { 20, 10*HZ };
++#define VE_AREA_ACC_CHECK 0x0001
++#define VE_AREA_ACC_DENY 0x0002
++#define VE_AREA_EXEC_CHECK 0x0010
++#define VE_AREA_EXEC_DENY 0x0020
++#define VE0_AREA_ACC_CHECK 0x0100
++#define VE0_AREA_ACC_DENY 0x0200
++#define VE0_AREA_EXEC_CHECK 0x1000
++#define VE0_AREA_EXEC_DENY 0x2000
++int ve_area_access_check = 0;
++
++static void print_connection_info(struct task_struct *tsk)
++{
++ struct files_struct *files;
++ struct fdtable *fdt;
++ int fd;
++
++ files = get_files_struct(tsk);
++ if (!files)
++ return;
++
++ spin_lock(&files->file_lock);
++ fdt = files_fdtable(files);
++ for (fd = 0; fd < fdt->max_fds; fd++) {
++ struct file *file;
++ struct inode *inode;
++ struct socket *socket;
++ struct sock *sk;
++ struct inet_sock *inet;
++
++ file = fdt->fd[fd];
++ if (file == NULL)
++ continue;
++
++ inode = file->f_dentry->d_inode;
++ if (!S_ISSOCK(inode->i_mode))
++ continue;
++
++ socket = SOCKET_I(inode);
++ if (socket == NULL)
++ continue;
++
++ sk = socket->sk;
++ if ((sk->sk_family != PF_INET && sk->sk_family != PF_INET6)
++ || sk->sk_type != SOCK_STREAM)
++ continue;
++
++ inet = inet_sk(sk);
++ printk(KERN_ALERT "connection from %u.%u.%u.%u:%u to port %u\n",
++ NIPQUAD(inet->daddr), ntohs(inet->dport),
++ inet->num);
++ }
++ spin_unlock(&files->file_lock);
++ put_files_struct(files);
++}
++
++static void check_alert(struct vfsmount *vfsmnt, struct dentry *dentry,
++ char *str)
++{
++ struct task_struct *tsk;
++ unsigned long page;
++ struct super_block *sb;
++ char *p;
++
++ if (!vz_ratelimit(&area_ri))
++ return;
++
++ tsk = current;
++ p = ERR_PTR(-ENOMEM);
++ page = __get_free_page(GFP_KERNEL);
++ if (page) {
++ spin_lock(&dcache_lock);
++ p = __d_path(dentry, vfsmnt, tsk->fs->root, tsk->fs->rootmnt,
++ (char *)page, PAGE_SIZE);
++ spin_unlock(&dcache_lock);
++ }
++ if (IS_ERR(p))
++ p = "(undefined)";
++
++ sb = dentry->d_sb;
++ printk(KERN_ALERT "%s check alert! file:[%s] from %d/%s, dev%x\n"
++ "Task %d/%d[%s] from VE%d, execenv %d\n",
++ str, p, VE_OWNER_FSTYPE(sb->s_type)->veid,
++ sb->s_type->name, sb->s_dev,
++ tsk->pid, virt_pid(tsk), tsk->comm,
++ VE_TASK_INFO(tsk)->owner_env->veid,
++ get_exec_env()->veid);
++
++ free_page(page);
++
++ print_connection_info(tsk);
++
++ read_lock(&tasklist_lock);
++ tsk = tsk->real_parent;
++ get_task_struct(tsk);
++ read_unlock(&tasklist_lock);
++
++ printk(KERN_ALERT "Parent %d/%d[%s] from VE%d\n",
++ tsk->pid, virt_pid(tsk), tsk->comm,
++ VE_TASK_INFO(tsk)->owner_env->veid);
++
++ print_connection_info(tsk);
++ put_task_struct(tsk);
++ dump_stack();
++}
++#endif
++
++int check_area_access_ve(struct dentry *dentry, struct vfsmount *mnt)
++{
++#ifdef CONFIG_VE
++ int check, alert, deny;
++
++ if (ve_is_super(get_exec_env())) {
++ check = ve_area_access_check & VE0_AREA_ACC_CHECK;
++ alert = dentry->d_flags & DCACHE_VIRTUAL;
++ deny = ve_area_access_check & VE0_AREA_ACC_DENY;
++ } else {
++ check = ve_area_access_check & VE_AREA_ACC_CHECK;
++ alert = !(dentry->d_flags & DCACHE_VIRTUAL);
++ deny = ve_area_access_check & VE_AREA_ACC_DENY;
++ }
++
++ if (check && alert)
++ check_alert(mnt, dentry, "Access");
++ if (deny && alert)
++ return -EACCES;
++#endif
++ return 0;
++}
++
++int check_area_execute_ve(struct dentry *dentry, struct vfsmount *mnt)
++{
++#ifdef CONFIG_VE
++ int check, alert, deny;
++
++ if (ve_is_super(get_exec_env())) {
++ check = ve_area_access_check & VE0_AREA_EXEC_CHECK;
++ alert = dentry->d_flags & DCACHE_VIRTUAL;
++ deny = ve_area_access_check & VE0_AREA_EXEC_DENY;
++ } else {
++ check = ve_area_access_check & VE_AREA_EXEC_CHECK;
++ alert = !(dentry->d_flags & DCACHE_VIRTUAL);
++ deny = ve_area_access_check & VE_AREA_EXEC_DENY;
++ }
++
++ if (check && alert)
++ check_alert(mnt, dentry, "Exec");
++ if (deny && alert)
++ return -EACCES;
++#endif
++ return 0;
++}
++
+ /*
+ * NOTE! The user-level library version returns a
+ * character pointer. The kernel system call just
+@@ -1584,10 +1958,12 @@ resume:
+ goto repeat;
+ }
+ atomic_dec(&dentry->d_count);
++ ub_dentry_uncharge(dentry);
+ }
+ if (this_parent != root) {
+ next = this_parent->d_u.d_child.next;
+ atomic_dec(&this_parent->d_count);
++ ub_dentry_uncharge(this_parent);
+ this_parent = this_parent->d_parent;
+ goto resume;
+ }
+@@ -1736,7 +2112,8 @@ void __init vfs_caches_init(unsigned lon
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+
+ filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
+- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
++ SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_UBC,
++ NULL, NULL);
+
+ dcache_init(mempages);
+ inode_init(mempages);
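check_area_access_ve()/check_area_execute_ve() added above combine two inputs: whether the dentry carries DCACHE_VIRTUAL, and which CHECK/DENY bits are set in ve_area_access_check for the current context (host VE0 or a container). A compact user-space model of that decision follows; the flag values mirror the definitions above, while is_super and the printed message are stand-ins.

#include <errno.h>
#include <stdio.h>

#define VE_AREA_ACC_CHECK       0x0001
#define VE_AREA_ACC_DENY        0x0002
#define VE0_AREA_ACC_CHECK      0x0100
#define VE0_AREA_ACC_DENY       0x0200

/* Decide what to do for an access from either the host (is_super)
 * or a container, given whether the dentry is marked "virtual". */
static int area_access(int flags, int is_super, int dentry_virtual)
{
        int check, alert, deny;

        if (is_super) {
                check = flags & VE0_AREA_ACC_CHECK;
                alert = dentry_virtual;         /* host touching container files */
                deny  = flags & VE0_AREA_ACC_DENY;
        } else {
                check = flags & VE_AREA_ACC_CHECK;
                alert = !dentry_virtual;        /* container leaving its own tree */
                deny  = flags & VE_AREA_ACC_DENY;
        }

        if (check && alert)
                printf("alert: suspicious access\n");
        return (deny && alert) ? -EACCES : 0;
}

int main(void)
{
        int flags = VE_AREA_ACC_CHECK | VE_AREA_ACC_DENY;

        printf("container, own file : %d\n", area_access(flags, 0, 1));
        printf("container, host file: %d\n", area_access(flags, 0, 0));
        return 0;
}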
+diff -upr linux-2.6.16.orig/fs/devpts/inode.c linux-2.6.16-026test015/fs/devpts/inode.c
+--- linux-2.6.16.orig/fs/devpts/inode.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/devpts/inode.c 2006-07-04 14:41:38.000000000 +0400
+@@ -12,6 +12,7 @@
+
+ #include <linux/module.h>
+ #include <linux/init.h>
++#include <linux/ve.h>
+ #include <linux/fs.h>
+ #include <linux/sched.h>
+ #include <linux/namei.h>
+@@ -21,16 +22,17 @@
+
+ #define DEVPTS_SUPER_MAGIC 0x1cd1
+
++struct devpts_config devpts_config = {.mode = 0600};
++
++#ifndef CONFIG_VE
+ static struct vfsmount *devpts_mnt;
+ static struct dentry *devpts_root;
+-
+-static struct {
+- int setuid;
+- int setgid;
+- uid_t uid;
+- gid_t gid;
+- umode_t mode;
+-} config = {.mode = 0600};
++#define config devpts_config
++#else
++#define devpts_mnt (get_exec_env()->devpts_mnt)
++#define devpts_root (get_exec_env()->devpts_root)
++#define config (*(get_exec_env()->devpts_config))
++#endif
+
+ static int devpts_remount(struct super_block *sb, int *flags, char *data)
+ {
+@@ -56,7 +58,8 @@ static int devpts_remount(struct super_b
+ } else if (sscanf(this_char, "mode=%o%c", &n, &dummy) == 1)
+ mode = n & ~S_IFMT;
+ else {
+- printk("devpts: called with bogus options\n");
++ ve_printk(VE_LOG,
++ "devpts: called with bogus options\n");
+ return -EINVAL;
+ }
+ }
+@@ -114,13 +117,15 @@ static struct super_block *devpts_get_sb
+ return get_sb_single(fs_type, flags, data, devpts_fill_super);
+ }
+
+-static struct file_system_type devpts_fs_type = {
++struct file_system_type devpts_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "devpts",
+ .get_sb = devpts_get_sb,
+ .kill_sb = kill_anon_super,
+ };
+
++EXPORT_SYMBOL(devpts_fs_type);
++
+ /*
+ * The normal naming convention is simply /dev/pts/<number>; this conforms
+ * to the System V naming convention
+@@ -212,6 +217,7 @@ static int __init init_devpts_fs(void)
+
+ static void __exit exit_devpts_fs(void)
+ {
++ /* the code is never called, the argument is irrelevant */
+ unregister_filesystem(&devpts_fs_type);
+ mntput(devpts_mnt);
+ }
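With CONFIG_VE the formerly static devpts_mnt/devpts_root/config become per-environment fields reached through get_exec_env(), while the !CONFIG_VE build keeps the old globals behind the same identifiers. A small sketch of that accessor-macro pattern is given below; struct env, get_exec_env() and the field name are placeholders for the OpenVZ structures.

#include <stdio.h>

struct pts_config { int mode; };

struct env {
        struct pts_config *devpts_config;
};

static struct pts_config global_config = { 0600 };
static struct env host_env = { &global_config };

/* in the kernel this would return the current execution environment */
static struct env *get_exec_env(void)
{
        return &host_env;
}

/* Same trick as the patch: code keeps saying "config", but the macro
 * resolves it to the per-environment instance. */
#define config (*(get_exec_env()->devpts_config))

int main(void)
{
        printf("mode=%o\n", config.mode);
        config.mode = 0620;
        printf("mode=%o\n", config.mode);
        return 0;
}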
+diff -upr linux-2.6.16.orig/fs/eventpoll.c linux-2.6.16-026test015/fs/eventpoll.c
+--- linux-2.6.16.orig/fs/eventpoll.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/eventpoll.c 2006-07-04 14:41:39.000000000 +0400
+@@ -105,11 +105,6 @@
+ #define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ)
+
+
+-struct epoll_filefd {
+- struct file *file;
+- int fd;
+-};
+-
+ /*
+ * Node that is linked into the "wake_task_list" member of the "struct poll_safewake".
+ * It is used to keep track on all tasks that are currently inside the wake_up() code
+@@ -132,36 +127,6 @@ struct poll_safewake {
+ spinlock_t lock;
+ };
+
+-/*
+- * This structure is stored inside the "private_data" member of the file
+- * structure and rapresent the main data sructure for the eventpoll
+- * interface.
+- */
+-struct eventpoll {
+- /* Protect the this structure access */
+- rwlock_t lock;
+-
+- /*
+- * This semaphore is used to ensure that files are not removed
+- * while epoll is using them. This is read-held during the event
+- * collection loop and it is write-held during the file cleanup
+- * path, the epoll file exit code and the ctl operations.
+- */
+- struct rw_semaphore sem;
+-
+- /* Wait queue used by sys_epoll_wait() */
+- wait_queue_head_t wq;
+-
+- /* Wait queue used by file->poll() */
+- wait_queue_head_t poll_wait;
+-
+- /* List of ready file descriptors */
+- struct list_head rdllist;
+-
+- /* RB-Tree root used to store monitored fd structs */
+- struct rb_root rbr;
+-};
+-
+ /* Wait structure used by the poll hooks */
+ struct eppoll_entry {
+ /* List header used to link this structure to the "struct epitem" */
+@@ -180,51 +145,6 @@ struct eppoll_entry {
+ wait_queue_head_t *whead;
+ };
+
+-/*
+- * Each file descriptor added to the eventpoll interface will
+- * have an entry of this type linked to the hash.
+- */
+-struct epitem {
+- /* RB-Tree node used to link this structure to the eventpoll rb-tree */
+- struct rb_node rbn;
+-
+- /* List header used to link this structure to the eventpoll ready list */
+- struct list_head rdllink;
+-
+- /* The file descriptor information this item refers to */
+- struct epoll_filefd ffd;
+-
+- /* Number of active wait queue attached to poll operations */
+- int nwait;
+-
+- /* List containing poll wait queues */
+- struct list_head pwqlist;
+-
+- /* The "container" of this item */
+- struct eventpoll *ep;
+-
+- /* The structure that describe the interested events and the source fd */
+- struct epoll_event event;
+-
+- /*
+- * Used to keep track of the usage count of the structure. This avoids
+- * that the structure will desappear from underneath our processing.
+- */
+- atomic_t usecnt;
+-
+- /* List header used to link this item to the "struct file" items list */
+- struct list_head fllink;
+-
+- /* List header used to link the item to the transfer list */
+- struct list_head txlink;
+-
+- /*
+- * This is used during the collection/transfer of events to userspace
+- * to pin items empty events set.
+- */
+- unsigned int revents;
+-};
+-
+ /* Wrapper struct used by poll queueing */
+ struct ep_pqueue {
+ poll_table pt;
+@@ -239,14 +159,10 @@ static int ep_getfd(int *efd, struct ino
+ struct eventpoll *ep);
+ static int ep_alloc(struct eventpoll **pep);
+ static void ep_free(struct eventpoll *ep);
+-static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd);
+ static void ep_use_epitem(struct epitem *epi);
+-static void ep_release_epitem(struct epitem *epi);
+ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
+ poll_table *pt);
+ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi);
+-static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
+- struct file *tfile, int fd);
+ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
+ struct epoll_event *event);
+ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi);
+@@ -274,7 +190,8 @@ static struct super_block *eventpollfs_g
+ /*
+ * This semaphore is used to serialize ep_free() and eventpoll_release_file().
+ */
+-static struct semaphore epsem;
++struct semaphore epsem;
++EXPORT_SYMBOL_GPL(epsem);
+
+ /* Safe wake up implementation */
+ static struct poll_safewake psw;
+@@ -289,10 +206,11 @@ static kmem_cache_t *pwq_cache;
+ static struct vfsmount *eventpoll_mnt;
+
+ /* File callbacks that implement the eventpoll file behaviour */
+-static struct file_operations eventpoll_fops = {
++struct file_operations eventpoll_fops = {
+ .release = ep_eventpoll_close,
+ .poll = ep_eventpoll_poll
+ };
++EXPORT_SYMBOL_GPL(eventpoll_fops);
+
+ /*
+ * This is used to register the virtual file system from where
+@@ -542,7 +460,7 @@ eexit_1:
+ current, size, error));
+ return error;
+ }
+-
++EXPORT_SYMBOL_GPL(sys_epoll_create);
+
+ /*
+ * The following function implements the controller interface for
+@@ -852,7 +770,7 @@ static void ep_free(struct eventpoll *ep
+ * the returned item, so the caller must call ep_release_epitem()
+ * after finished using the "struct epitem".
+ */
+-static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
++struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
+ {
+ int kcmp;
+ unsigned long flags;
+@@ -882,6 +800,7 @@ static struct epitem *ep_find(struct eve
+
+ return epir;
+ }
++EXPORT_SYMBOL_GPL(ep_find);
+
+
+ /*
+@@ -900,13 +819,13 @@ static void ep_use_epitem(struct epitem
+ * has finished using the structure. It might lead to freeing the
+ * structure itself if the count goes to zero.
+ */
+-static void ep_release_epitem(struct epitem *epi)
++void ep_release_epitem(struct epitem *epi)
+ {
+
+ if (atomic_dec_and_test(&epi->usecnt))
+ kmem_cache_free(epi_cache, epi);
+ }
+-
++EXPORT_SYMBOL_GPL(ep_release_epitem);
+
+ /*
+ * This is the callback that is used to add our wait queue to the
+@@ -952,7 +871,7 @@ static void ep_rbtree_insert(struct even
+ }
+
+
+-static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
++int ep_insert(struct eventpoll *ep, struct epoll_event *event,
+ struct file *tfile, int fd)
+ {
+ int error, revents, pwake = 0;
+@@ -1044,6 +963,7 @@ eexit_2:
+ eexit_1:
+ return error;
+ }
++EXPORT_SYMBOL_GPL(ep_insert);
+
+
+ /*
+diff -upr linux-2.6.16.orig/fs/exec.c linux-2.6.16-026test015/fs/exec.c
+--- linux-2.6.16.orig/fs/exec.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/exec.c 2006-07-04 14:41:39.000000000 +0400
+@@ -53,6 +53,8 @@
+ #include <asm/uaccess.h>
+ #include <asm/mmu_context.h>
+
++#include <ub/ub_vmpages.h>
++
+ #ifdef CONFIG_KMOD
+ #include <linux/kmod.h>
+ #endif
+@@ -64,6 +66,8 @@ int suid_dumpable = 0;
+ EXPORT_SYMBOL(suid_dumpable);
+ /* The maximal length of core_pattern is also specified in sysctl.c */
+
++int sysctl_at_vsyscall;
++
+ static struct linux_binfmt *formats;
+ static DEFINE_RWLOCK(binfmt_lock);
+
+@@ -135,7 +139,7 @@ asmlinkage long sys_uselib(const char __
+ if (!S_ISREG(nd.dentry->d_inode->i_mode))
+ goto exit;
+
+- error = vfs_permission(&nd, MAY_READ | MAY_EXEC);
++ error = vfs_permission(&nd, MAY_READ | MAY_EXEC, NULL);
+ if (error)
+ goto exit;
+
+@@ -308,6 +312,10 @@ void install_arg_page(struct vm_area_str
+ struct mm_struct *mm = vma->vm_mm;
+ pte_t * pte;
+ spinlock_t *ptl;
++ struct page_beancounter *pb;
++
++ if (unlikely(pb_alloc(&pb)))
++ goto out_nopb;
+
+ if (unlikely(anon_vma_prepare(vma)))
+ goto out;
+@@ -321,15 +329,21 @@ void install_arg_page(struct vm_area_str
+ goto out;
+ }
+ inc_mm_counter(mm, anon_rss);
++ inc_vma_rss(vma);
+ lru_cache_add_active(page);
+ set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte(
+ page, vma->vm_page_prot))));
++ pb_add_ref(page, mm, &pb);
++ ub_unused_privvm_dec(mm, vma);
++ pb_free(&pb);
+ page_add_new_anon_rmap(page, vma, address);
+ pte_unmap_unlock(pte, ptl);
+
+ /* no need for flush_tlb */
+ return;
+ out:
++ pb_free(&pb);
++out_nopb:
+ __free_page(page);
+ force_sig(SIGKILL, current);
+ }
+@@ -404,9 +418,13 @@ int setup_arg_pages(struct linux_binprm
+ bprm->loader += stack_base;
+ bprm->exec += stack_base;
+
+- mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
++ if (ub_memory_charge(mm, arg_size, VM_STACK_FLAGS | mm->def_flags,
++ NULL, UB_SOFT))
++ goto fail_charge;
++
++ mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL | __GFP_SOFT_UBC);
+ if (!mpnt)
+- return -ENOMEM;
++ goto fail_alloc;
+
+ memset(mpnt, 0, sizeof(*mpnt));
+
+@@ -450,6 +468,11 @@ int setup_arg_pages(struct linux_binprm
+ up_write(&mm->mmap_sem);
+
+ return 0;
++
++fail_alloc:
++ ub_memory_uncharge(mm, arg_size, VM_STACK_FLAGS | mm->def_flags, NULL);
++fail_charge:
++ return -ENOMEM;
+ }
+
+ EXPORT_SYMBOL(setup_arg_pages);
+@@ -471,7 +494,7 @@ static inline void free_arg_pages(struct
+
+ #endif /* CONFIG_MMU */
+
+-struct file *open_exec(const char *name)
++struct file *open_exec(const char *name, struct linux_binprm *bprm)
+ {
+ struct nameidata nd;
+ int err;
+@@ -485,9 +508,16 @@ struct file *open_exec(const char *name)
+ file = ERR_PTR(-EACCES);
+ if (!(nd.mnt->mnt_flags & MNT_NOEXEC) &&
+ S_ISREG(inode->i_mode)) {
+- int err = vfs_permission(&nd, MAY_EXEC);
+- if (!err && !(inode->i_mode & 0111))
+- err = -EACCES;
++ int err;
++ struct exec_perm *perm;
++
++ if (bprm != NULL) {
++ perm = &bprm->perm;
++ perm->set = 0;
++ } else
++ perm = NULL;
++
++ err = vfs_permission(&nd, MAY_EXEC, perm);
+ file = ERR_PTR(err);
+ if (!err) {
+ file = nameidata_to_filp(&nd, O_RDONLY);
+@@ -657,7 +687,7 @@ static int de_thread(struct task_struct
+ */
+ if (!thread_group_leader(current)) {
+ struct task_struct *parent;
+- struct dentry *proc_dentry1, *proc_dentry2;
++ struct dentry *proc_dentry1[2], *proc_dentry2[2];
+ unsigned long ptrace;
+
+ /*
+@@ -671,8 +701,8 @@ static int de_thread(struct task_struct
+
+ spin_lock(&leader->proc_lock);
+ spin_lock(&current->proc_lock);
+- proc_dentry1 = proc_pid_unhash(current);
+- proc_dentry2 = proc_pid_unhash(leader);
++ proc_pid_unhash(current, proc_dentry1);
++ proc_pid_unhash(leader, proc_dentry2);
+ write_lock_irq(&tasklist_lock);
+
+ BUG_ON(leader->tgid != current->tgid);
+@@ -829,7 +859,7 @@ int flush_old_exec(struct linux_binprm *
+ {
+ char * name;
+ int i, ch, retval;
+- struct files_struct *files;
++ struct files_struct *files, *old;
+ char tcomm[sizeof(current->comm)];
+
+ /*
+@@ -897,6 +927,7 @@ int flush_old_exec(struct linux_binprm *
+ suid_keys(current);
+ current->mm->dumpable = suid_dumpable;
+ }
++ current->mm->vps_dumpable = 1;
+
+ /* An exec changes our domain. We are no longer part of the thread
+ group */
+@@ -909,8 +940,11 @@ int flush_old_exec(struct linux_binprm *
+ return 0;
+
+ mmap_failed:
+- put_files_struct(current->files);
++ old = current->files;
++ task_lock(current);
+ current->files = files;
++ task_unlock(current);
++ put_files_struct(old);
+ out:
+ return retval;
+ }
+@@ -927,13 +961,6 @@ int prepare_binprm(struct linux_binprm *
+ struct inode * inode = bprm->file->f_dentry->d_inode;
+ int retval;
+
+- mode = inode->i_mode;
+- /*
+- * Check execute perms again - if the caller has CAP_DAC_OVERRIDE,
+- * generic_permission lets a non-executable through
+- */
+- if (!(mode & 0111)) /* with at least _one_ execute bit set */
+- return -EACCES;
+ if (bprm->file->f_op == NULL)
+ return -EACCES;
+
+@@ -941,10 +968,24 @@ int prepare_binprm(struct linux_binprm *
+ bprm->e_gid = current->egid;
+
+ if(!(bprm->file->f_vfsmnt->mnt_flags & MNT_NOSUID)) {
++ if (!bprm->perm.set) {
++ /*
++ * This piece of code creates a time window between
++ * MAY_EXEC permission check and setuid/setgid
++ * operations and may be considered a security hole.
++ * This code is here for compatibility reasons,
++ * in case the filesystem is unable to return the info now.
++ */
++ bprm->perm.mode = inode->i_mode;
++ bprm->perm.uid = inode->i_uid;
++ bprm->perm.gid = inode->i_gid;
++ }
++ mode = bprm->perm.mode;
++
+ /* Set-uid? */
+ if (mode & S_ISUID) {
+ current->personality &= ~PER_CLEAR_ON_SETID;
+- bprm->e_uid = inode->i_uid;
++ bprm->e_uid = bprm->perm.uid;
+ }
+
+ /* Set-gid? */
+@@ -955,7 +996,7 @@ int prepare_binprm(struct linux_binprm *
+ */
+ if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
+ current->personality &= ~PER_CLEAR_ON_SETID;
+- bprm->e_gid = inode->i_gid;
++ bprm->e_gid = bprm->perm.gid;
+ }
+ }
+
+@@ -1054,7 +1095,7 @@ int search_binary_handler(struct linux_b
+
+ loader = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *);
+
+- file = open_exec("/sbin/loader");
++ file = open_exec("/sbin/loader", bprm);
+ retval = PTR_ERR(file);
+ if (IS_ERR(file))
+ return retval;
+@@ -1148,7 +1189,7 @@ int do_execve(char * filename,
+ goto out_ret;
+ memset(bprm, 0, sizeof(*bprm));
+
+- file = open_exec(filename);
++ file = open_exec(filename, bprm);
+ retval = PTR_ERR(file);
+ if (IS_ERR(file))
+ goto out_kfree;
+@@ -1288,7 +1329,7 @@ static void format_corename(char *corena
+ case 'p':
+ pid_in_pattern = 1;
+ rc = snprintf(out_ptr, out_end - out_ptr,
+- "%d", current->tgid);
++ "%d", virt_tgid(current));
+ if (rc > out_end - out_ptr)
+ goto out;
+ out_ptr += rc;
+@@ -1332,7 +1373,7 @@ static void format_corename(char *corena
+ case 'h':
+ down_read(&uts_sem);
+ rc = snprintf(out_ptr, out_end - out_ptr,
+- "%s", system_utsname.nodename);
++ "%s", ve_utsname.nodename);
+ up_read(&uts_sem);
+ if (rc > out_end - out_ptr)
+ goto out;
+@@ -1360,7 +1401,7 @@ static void format_corename(char *corena
+ if (!pid_in_pattern
+ && (core_uses_pid || atomic_read(&current->mm->mm_users) != 1)) {
+ rc = snprintf(out_ptr, out_end - out_ptr,
+- ".%d", current->tgid);
++ ".%d", virt_tgid(current));
+ if (rc > out_end - out_ptr)
+ goto out;
+ out_ptr += rc;
+@@ -1386,7 +1427,7 @@ static void zap_threads (struct mm_struc
+ }
+
+ read_lock(&tasklist_lock);
+- do_each_thread(g,p)
++ do_each_thread_ve(g,p)
+ if (mm == p->mm && p != tsk) {
+ force_sig_specific(SIGKILL, p);
+ mm->core_waiters++;
+@@ -1394,7 +1435,7 @@ static void zap_threads (struct mm_struc
+ unlikely(p->parent->mm == mm))
+ traced = 1;
+ }
+- while_each_thread(g,p);
++ while_each_thread_ve(g,p);
+
+ read_unlock(&tasklist_lock);
+
+@@ -1406,12 +1447,12 @@ static void zap_threads (struct mm_struc
+ * coredump to finish. Detach them so they can both die.
+ */
+ write_lock_irq(&tasklist_lock);
+- do_each_thread(g,p) {
++ do_each_thread_ve(g,p) {
+ if (mm == p->mm && p != tsk &&
+ p->ptrace && p->parent->mm == mm) {
+ __ptrace_detach(p, 0);
+ }
+- } while_each_thread(g,p);
++ } while_each_thread_ve(g,p);
+ write_unlock_irq(&tasklist_lock);
+ }
+ }
+@@ -1447,7 +1488,8 @@ int do_coredump(long signr, int exit_cod
+ if (!binfmt || !binfmt->core_dump)
+ goto fail;
+ down_write(&mm->mmap_sem);
+- if (!mm->dumpable) {
++ if (!mm->dumpable ||
++ (!mm->vps_dumpable && !ve_is_super(get_exec_env()))) {
+ up_write(&mm->mmap_sem);
+ goto fail;
+ }
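The exec path above asks the filesystem to snapshot mode/uid/gid into bprm->perm while MAY_EXEC is being checked, so prepare_binprm() no longer re-reads the inode and races with a concurrent chmod/chown. A user-space sketch of the capture-then-use idea follows; struct exec_perm copies the patch's layout, while the stub inode and helper names are illustrative.

#include <stdio.h>
#include <sys/types.h>

struct stub_inode { mode_t i_mode; uid_t i_uid; gid_t i_gid; };

struct exec_perm {
        int set;
        mode_t mode;
        uid_t uid;
        gid_t gid;
};

/* Permission check: decide MAY_EXEC *and* record the attributes the
 * decision was based on, so later code uses the same snapshot. */
static int permission_check(struct stub_inode *inode, struct exec_perm *perm)
{
        if (!(inode->i_mode & 0111))
                return -1;              /* not executable */
        perm->mode = inode->i_mode;
        perm->uid = inode->i_uid;
        perm->gid = inode->i_gid;
        perm->set = 1;
        return 0;
}

/* prepare_binprm() analogue: trust the snapshot, not the live inode. */
static void apply_setid(const struct exec_perm *perm, uid_t *e_uid)
{
        if (perm->mode & 04000)         /* S_ISUID */
                *e_uid = perm->uid;
}

int main(void)
{
        struct stub_inode inode = { 0104755, 0, 0 };    /* setuid-root binary */
        struct exec_perm perm = { 0 };
        uid_t euid = 1000;

        if (permission_check(&inode, &perm) == 0) {
                inode.i_uid = 1000;     /* a racing chown no longer matters */
                apply_setid(&perm, &euid);
        }
        printf("resulting euid: %d\n", (int)euid);
        return 0;
}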
+diff -upr linux-2.6.16.orig/fs/ext2/acl.c linux-2.6.16-026test015/fs/ext2/acl.c
+--- linux-2.6.16.orig/fs/ext2/acl.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/ext2/acl.c 2006-07-04 14:41:37.000000000 +0400
+@@ -294,9 +294,10 @@ ext2_check_acl(struct inode *inode, int
+ }
+
+ int
+-ext2_permission(struct inode *inode, int mask, struct nameidata *nd)
++ext2_permission(struct inode *inode, int mask, struct nameidata *nd,
++ struct exec_perm *perm)
+ {
+- return generic_permission(inode, mask, ext2_check_acl);
++ return generic_permission(inode, mask, ext2_check_acl, perm);
+ }
+
+ /*
+diff -upr linux-2.6.16.orig/fs/ext2/acl.h linux-2.6.16-026test015/fs/ext2/acl.h
+--- linux-2.6.16.orig/fs/ext2/acl.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/ext2/acl.h 2006-07-04 14:41:37.000000000 +0400
+@@ -58,7 +58,8 @@ static inline int ext2_acl_count(size_t
+ #define EXT2_ACL_NOT_CACHED ((void *)-1)
+
+ /* acl.c */
+-extern int ext2_permission (struct inode *, int, struct nameidata *);
++extern int ext2_permission (struct inode *, int, struct nameidata *,
++ struct exec_perm *);
+ extern int ext2_acl_chmod (struct inode *);
+ extern int ext2_init_acl (struct inode *, struct inode *);
+
+diff -upr linux-2.6.16.orig/fs/ext2/namei.c linux-2.6.16-026test015/fs/ext2/namei.c
+--- linux-2.6.16.orig/fs/ext2/namei.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/ext2/namei.c 2006-07-04 14:41:39.000000000 +0400
+@@ -31,6 +31,7 @@
+ */
+
+ #include <linux/pagemap.h>
++#include <linux/quotaops.h>
+ #include "ext2.h"
+ #include "xattr.h"
+ #include "acl.h"
+@@ -273,6 +274,8 @@ static int ext2_unlink(struct inode * di
+ struct page * page;
+ int err = -ENOENT;
+
++ DQUOT_INIT(inode);
++
+ de = ext2_find_entry (dir, dentry, &page);
+ if (!de)
+ goto out;
+@@ -315,6 +318,9 @@ static int ext2_rename (struct inode * o
+ struct ext2_dir_entry_2 * old_de;
+ int err = -ENOENT;
+
++ if (new_inode)
++ DQUOT_INIT(new_inode);
++
+ old_de = ext2_find_entry (old_dir, old_dentry, &old_page);
+ if (!old_de)
+ goto out;
+diff -upr linux-2.6.16.orig/fs/ext2/super.c linux-2.6.16-026test015/fs/ext2/super.c
+--- linux-2.6.16.orig/fs/ext2/super.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/ext2/super.c 2006-07-04 14:41:38.000000000 +0400
+@@ -996,7 +996,7 @@ static int ext2_remount (struct super_bl
+ es = sbi->s_es;
+ if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) !=
+ (old_mount_opt & EXT2_MOUNT_XIP)) &&
+- invalidate_inodes(sb))
++ invalidate_inodes(sb, 0))
+ ext2_warning(sb, __FUNCTION__, "busy inodes while remounting "\
+ "xip remain in cache (no functional problem)");
+ if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
+@@ -1205,7 +1205,7 @@ static struct file_system_type ext2_fs_t
+ .name = "ext2",
+ .get_sb = ext2_get_sb,
+ .kill_sb = kill_block_super,
+- .fs_flags = FS_REQUIRES_DEV,
++ .fs_flags = FS_REQUIRES_DEV | FS_VIRTUALIZED,
+ };
+
+ static int __init init_ext2_fs(void)
+diff -upr linux-2.6.16.orig/fs/ext3/acl.c linux-2.6.16-026test015/fs/ext3/acl.c
+--- linux-2.6.16.orig/fs/ext3/acl.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/ext3/acl.c 2006-07-04 14:41:37.000000000 +0400
+@@ -299,9 +299,10 @@ ext3_check_acl(struct inode *inode, int
+ }
+
+ int
+-ext3_permission(struct inode *inode, int mask, struct nameidata *nd)
++ext3_permission(struct inode *inode, int mask, struct nameidata *nd,
++ struct exec_perm *perm)
+ {
+- return generic_permission(inode, mask, ext3_check_acl);
++ return generic_permission(inode, mask, ext3_check_acl, perm);
+ }
+
+ /*
+diff -upr linux-2.6.16.orig/fs/ext3/acl.h linux-2.6.16-026test015/fs/ext3/acl.h
+--- linux-2.6.16.orig/fs/ext3/acl.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/ext3/acl.h 2006-07-04 14:41:37.000000000 +0400
+@@ -58,7 +58,8 @@ static inline int ext3_acl_count(size_t
+ #define EXT3_ACL_NOT_CACHED ((void *)-1)
+
+ /* acl.c */
+-extern int ext3_permission (struct inode *, int, struct nameidata *);
++extern int ext3_permission (struct inode *, int, struct nameidata *,
++ struct exec_perm *);
+ extern int ext3_acl_chmod (struct inode *);
+ extern int ext3_init_acl (handle_t *, struct inode *, struct inode *);
+
+diff -upr linux-2.6.16.orig/fs/ext3/inode.c linux-2.6.16-026test015/fs/ext3/inode.c
+--- linux-2.6.16.orig/fs/ext3/inode.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/ext3/inode.c 2006-07-04 14:41:37.000000000 +0400
+@@ -771,6 +771,7 @@ ext3_get_block_handle(handle_t *handle,
+
+ set_buffer_new(bh_result);
+ got_it:
++ clear_buffer_delay(bh_result);
+ map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
+ if (boundary)
+ set_buffer_boundary(bh_result);
+@@ -964,11 +965,13 @@ static int walk_page_buffers( handle_t *
+ * and the commit_write(). So doing the journal_start at the start of
+ * prepare_write() is the right place.
+ *
+- * Also, this function can nest inside ext3_writepage() ->
+- * block_write_full_page(). In that case, we *know* that ext3_writepage()
+- * has generated enough buffer credits to do the whole page. So we won't
+- * block on the journal in that case, which is good, because the caller may
+- * be PF_MEMALLOC.
++ * [2004/09/04 SAW] journal_start() in prepare_write() causes different ranking
++ * violations if copy_from_user() triggers a page fault (mmap_sem, may be page
++ * lock, plus __GFP_FS allocations).
++ * Now we read in not up-to-date buffers in prepare_write(), and do the rest
++ * including hole instantiation and inode extension in commit_write().
++ *
++ * Other notes.
+ *
+ * By accident, ext3 can be reentered when a transaction is open via
+ * quota file writes. If we were to commit the transaction while thus
+@@ -983,6 +986,27 @@ static int walk_page_buffers( handle_t *
+ * write.
+ */
+
++static int ext3_get_block_delay(struct inode *inode, sector_t iblock,
++ struct buffer_head *bh, int create)
++{
++ int ret;
++
++ ret = ext3_get_block_handle(NULL, inode, iblock, bh, 0, 0);
++ if (ret)
++ return ret;
++ if (!buffer_mapped(bh)) {
++ set_buffer_delay(bh);
++ set_buffer_new(bh);
++ }
++ return ret;
++}
++
++static int ext3_prepare_write(struct file *file, struct page *page,
++ unsigned from, unsigned to)
++{
++ return block_prepare_write(page, from, to, ext3_get_block_delay);
++}
++
+ static int do_journal_get_write_access(handle_t *handle,
+ struct buffer_head *bh)
+ {
+@@ -991,8 +1015,52 @@ static int do_journal_get_write_access(h
+ return ext3_journal_get_write_access(handle, bh);
+ }
+
+-static int ext3_prepare_write(struct file *file, struct page *page,
+- unsigned from, unsigned to)
++/*
++ * This function zeroes buffers not mapped to disk.
++ * We do it similarly to the error path in __block_prepare_write() to avoid
++ * keeping garbage in the page cache.
++ * Here we check BH_delay state. We know that if the buffer appears
++ * !buffer_mapped then
++ * - it was !buffer_mapped at the moment of ext3_prepare_write, and
++ * - ext3_get_block failed to map this buffer (e.g., ENOSPC).
++ * If this !mapped buffer is not up to date (it can be up to date if
++ * PageUptodate), then we zero its content.
++ */
++static void ext3_clear_delayed_buffers(struct page *page,
++ unsigned from, unsigned to)
++{
++ struct buffer_head *bh, *head, *next;
++ unsigned block_start, block_end;
++ unsigned blocksize;
++ void *kaddr;
++
++ head = page_buffers(page);
++ blocksize = head->b_size;
++ for ( bh = head, block_start = 0;
++ bh != head || !block_start;
++ block_start = block_end, bh = next)
++ {
++ next = bh->b_this_page;
++ block_end = block_start + blocksize;
++ if (block_end <= from || block_start >= to)
++ continue;
++ if (!buffer_delay(bh))
++ continue;
++ J_ASSERT_BH(bh, !buffer_mapped(bh));
++ clear_buffer_new(bh);
++ clear_buffer_delay(bh);
++ if (!buffer_uptodate(bh)) {
++ kaddr = kmap_atomic(page, KM_USER0);
++ memset(kaddr + block_start, 0, bh->b_size);
++ kunmap_atomic(kaddr, KM_USER0);
++ set_buffer_uptodate(bh);
++ mark_buffer_dirty(bh);
++ }
++ }
++}
++
++static int ext3_map_write(struct file *file, struct page *page,
++ unsigned from, unsigned to)
+ {
+ struct inode *inode = page->mapping->host;
+ int ret, needed_blocks = ext3_writepage_trans_blocks(inode);
+@@ -1009,18 +1077,17 @@ retry:
+ ret = nobh_prepare_write(page, from, to, ext3_get_block);
+ else
+ ret = block_prepare_write(page, from, to, ext3_get_block);
+- if (ret)
+- goto prepare_write_failed;
+-
+- if (ext3_should_journal_data(inode)) {
++ if (!ret && ext3_should_journal_data(inode)) {
+ ret = walk_page_buffers(handle, page_buffers(page),
+ from, to, NULL, do_journal_get_write_access);
+ }
+-prepare_write_failed:
+- if (ret)
+- ext3_journal_stop(handle);
++ if (!ret)
++ goto out;
++
++ ext3_journal_stop(handle);
+ if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
++ ext3_clear_delayed_buffers(page, from, to);
+ out:
+ return ret;
+ }
+@@ -1055,10 +1122,15 @@ static int commit_write_fn(handle_t *han
+ static int ext3_ordered_commit_write(struct file *file, struct page *page,
+ unsigned from, unsigned to)
+ {
+- handle_t *handle = ext3_journal_current_handle();
++ handle_t *handle;
+ struct inode *inode = page->mapping->host;
+ int ret = 0, ret2;
+
++ ret = ext3_map_write(file, page, from, to);
++ if (ret)
++ return ret;
++ handle = ext3_journal_current_handle();
++
+ ret = walk_page_buffers(handle, page_buffers(page),
+ from, to, NULL, ext3_journal_dirty_data);
+
+@@ -1084,11 +1156,15 @@ static int ext3_ordered_commit_write(str
+ static int ext3_writeback_commit_write(struct file *file, struct page *page,
+ unsigned from, unsigned to)
+ {
+- handle_t *handle = ext3_journal_current_handle();
++ handle_t *handle;
+ struct inode *inode = page->mapping->host;
+ int ret = 0, ret2;
+ loff_t new_i_size;
+
++ ret = ext3_map_write(file, page, from, to);
++ if (ret)
++ return ret;
++ handle = ext3_journal_current_handle();
+ new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+ if (new_i_size > EXT3_I(inode)->i_disksize)
+ EXT3_I(inode)->i_disksize = new_i_size;
+@@ -1107,12 +1183,17 @@ static int ext3_writeback_commit_write(s
+ static int ext3_journalled_commit_write(struct file *file,
+ struct page *page, unsigned from, unsigned to)
+ {
+- handle_t *handle = ext3_journal_current_handle();
++ handle_t *handle;
+ struct inode *inode = page->mapping->host;
+ int ret = 0, ret2;
+ int partial = 0;
+ loff_t pos;
+
++ ret = ext3_map_write(file, page, from, to);
++ if (ret)
++ return ret;
++ handle = ext3_journal_current_handle();
++
+ /*
+ * Here we duplicate the generic_commit_write() functionality
+ */
+diff -upr linux-2.6.16.orig/fs/ext3/ioctl.c linux-2.6.16-026test015/fs/ext3/ioctl.c
+--- linux-2.6.16.orig/fs/ext3/ioctl.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/ext3/ioctl.c 2006-07-04 14:41:37.000000000 +0400
+@@ -69,7 +69,7 @@ int ext3_ioctl (struct inode * inode, st
+ * the relevant capability.
+ */
+ if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) {
+- if (!capable(CAP_SYS_RESOURCE))
++ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ }
+
+diff -upr linux-2.6.16.orig/fs/ext3/resize.c linux-2.6.16-026test015/fs/ext3/resize.c
+--- linux-2.6.16.orig/fs/ext3/resize.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/ext3/resize.c 2006-07-04 14:41:36.000000000 +0400
+@@ -974,6 +974,7 @@ int ext3_group_extend(struct super_block
+ if (o_blocks_count != le32_to_cpu(es->s_blocks_count)) {
+ ext3_warning(sb, __FUNCTION__,
+ "multiple resizers run on filesystem!");
++ unlock_super(sb);
+ err = -EBUSY;
+ goto exit_put;
+ }
+diff -upr linux-2.6.16.orig/fs/ext3/super.c linux-2.6.16-026test015/fs/ext3/super.c
+--- linux-2.6.16.orig/fs/ext3/super.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/ext3/super.c 2006-07-04 14:41:38.000000000 +0400
+@@ -2661,7 +2661,7 @@ static struct file_system_type ext3_fs_t
+ .name = "ext3",
+ .get_sb = ext3_get_sb,
+ .kill_sb = kill_block_super,
+- .fs_flags = FS_REQUIRES_DEV,
++ .fs_flags = FS_REQUIRES_DEV | FS_VIRTUALIZED,
+ };
+
+ static int __init init_ext3_fs(void)
+diff -upr linux-2.6.16.orig/fs/fcntl.c linux-2.6.16-026test015/fs/fcntl.c
+--- linux-2.6.16.orig/fs/fcntl.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/fcntl.c 2006-07-04 14:41:39.000000000 +0400
+@@ -18,6 +18,7 @@
+ #include <linux/ptrace.h>
+ #include <linux/signal.h>
+ #include <linux/rcupdate.h>
++#include <linux/ve_owner.h>
+
+ #include <asm/poll.h>
+ #include <asm/siginfo.h>
+@@ -190,6 +191,7 @@ out_fput:
+ fput(file);
+ goto out;
+ }
++EXPORT_SYMBOL_GPL(sys_dup2);
+
+ asmlinkage long sys_dup(unsigned int fildes)
+ {
+@@ -254,6 +256,7 @@ static int setfl(int fd, struct file * f
+ static void f_modown(struct file *filp, unsigned long pid,
+ uid_t uid, uid_t euid, int force)
+ {
++ pid = comb_vpid_to_pid(pid);
+ write_lock_irq(&filp->f_owner.lock);
+ if (force || !filp->f_owner.pid) {
+ filp->f_owner.pid = pid;
+@@ -320,7 +323,7 @@ static long do_fcntl(int fd, unsigned in
+ * current syscall conventions, the only way
+ * to fix this will be in libc.
+ */
+- err = filp->f_owner.pid;
++ err = comb_pid_to_vpid(filp->f_owner.pid);
+ force_successful_syscall_return();
+ break;
+ case F_SETOWN:
+@@ -472,23 +475,29 @@ static void send_sigio_to_task(struct ta
+ void send_sigio(struct fown_struct *fown, int fd, int band)
+ {
+ struct task_struct *p;
++ struct file *f;
++ struct ve_struct *ve;
+ int pid;
+
+ read_lock(&fown->lock);
+ pid = fown->pid;
+ if (!pid)
+ goto out_unlock_fown;
++
++ /* hack: fown's are always embedded in struct file */
++ f = container_of(fown, struct file, f_owner);
++ ve = VE_OWNER_FILP(f);
+
+ read_lock(&tasklist_lock);
+ if (pid > 0) {
+- p = find_task_by_pid(pid);
+- if (p) {
++ p = find_task_by_pid_all(pid);
++ if (p && ve_accessible(VE_TASK_INFO(p)->owner_env, ve)) {
+ send_sigio_to_task(p, fown, fd, band);
+ }
+ } else {
+- do_each_task_pid(-pid, PIDTYPE_PGID, p) {
++ __do_each_task_pid_ve(-pid, PIDTYPE_PGID, p, ve) {
+ send_sigio_to_task(p, fown, fd, band);
+- } while_each_task_pid(-pid, PIDTYPE_PGID, p);
++ } __while_each_task_pid_ve(-pid, PIDTYPE_PGID, p, ve);
+ }
+ read_unlock(&tasklist_lock);
+ out_unlock_fown:
+@@ -505,6 +514,8 @@ static void send_sigurg_to_task(struct t
+ int send_sigurg(struct fown_struct *fown)
+ {
+ struct task_struct *p;
++ struct file *f;
++ struct ve_struct *ve;
+ int pid, ret = 0;
+
+ read_lock(&fown->lock);
+@@ -513,17 +524,19 @@ int send_sigurg(struct fown_struct *fown
+ goto out_unlock_fown;
+
+ ret = 1;
++ f = container_of(fown, struct file, f_owner);
++ ve = VE_OWNER_FILP(f);
+
+ read_lock(&tasklist_lock);
+ if (pid > 0) {
+- p = find_task_by_pid(pid);
+- if (p) {
++ p = find_task_by_pid_all(pid);
++ if (p && ve_accessible(VE_TASK_INFO(p)->owner_env, ve)) {
+ send_sigurg_to_task(p, fown);
+ }
+ } else {
+- do_each_task_pid(-pid, PIDTYPE_PGID, p) {
++ __do_each_task_pid_ve(-pid, PIDTYPE_PGID, p, ve) {
+ send_sigurg_to_task(p, fown);
+- } while_each_task_pid(-pid, PIDTYPE_PGID, p);
++ } __while_each_task_pid_ve(-pid, PIDTYPE_PGID, p, ve);
+ }
+ read_unlock(&tasklist_lock);
+ out_unlock_fown:
+diff -upr linux-2.6.16.orig/fs/file.c linux-2.6.16-026test015/fs/file.c
+--- linux-2.6.16.orig/fs/file.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/file.c 2006-07-04 14:41:39.000000000 +0400
+@@ -8,6 +8,7 @@
+
+ #include <linux/fs.h>
+ #include <linux/mm.h>
++#include <linux/module.h>
+ #include <linux/time.h>
+ #include <linux/slab.h>
+ #include <linux/vmalloc.h>
+@@ -18,6 +19,8 @@
+ #include <linux/rcupdate.h>
+ #include <linux/workqueue.h>
+
++#include <ub/ub_mem.h>
++
+ struct fdtable_defer {
+ spinlock_t lock;
+ struct work_struct wq;
+@@ -44,9 +47,9 @@ struct file ** alloc_fd_array(int num)
+ int size = num * sizeof(struct file *);
+
+ if (size <= PAGE_SIZE)
+- new_fds = (struct file **) kmalloc(size, GFP_KERNEL);
++ new_fds = (struct file **) ub_kmalloc(size, GFP_KERNEL);
+ else
+- new_fds = (struct file **) vmalloc(size);
++ new_fds = (struct file **) ub_vmalloc(size);
+ return new_fds;
+ }
+
+@@ -212,9 +215,9 @@ fd_set * alloc_fdset(int num)
+ int size = num / 8;
+
+ if (size <= PAGE_SIZE)
+- new_fdset = (fd_set *) kmalloc(size, GFP_KERNEL);
++ new_fdset = (fd_set *) ub_kmalloc(size, GFP_KERNEL);
+ else
+- new_fdset = (fd_set *) vmalloc(size);
++ new_fdset = (fd_set *) ub_vmalloc(size);
+ return new_fdset;
+ }
+
+@@ -302,7 +305,7 @@ out:
+ * both fd array and fdset. It is expected to be called with the
+ * files_lock held.
+ */
+-static int expand_fdtable(struct files_struct *files, int nr)
++int expand_fdtable(struct files_struct *files, int nr)
+ __releases(files->file_lock)
+ __acquires(files->file_lock)
+ {
+@@ -338,6 +341,7 @@ static int expand_fdtable(struct files_s
+ out:
+ return error;
+ }
++EXPORT_SYMBOL_GPL(expand_fdtable);
+
+ /*
+ * Expand files.
+diff -upr linux-2.6.16.orig/fs/file_table.c linux-2.6.16-026test015/fs/file_table.c
+--- linux-2.6.16.orig/fs/file_table.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/file_table.c 2006-07-04 14:41:38.000000000 +0400
+@@ -9,6 +9,7 @@
+ #include <linux/string.h>
+ #include <linux/slab.h>
+ #include <linux/file.h>
++#include <linux/ve_owner.h>
+ #include <linux/init.h>
+ #include <linux/module.h>
+ #include <linux/smp_lock.h>
+@@ -25,6 +26,8 @@
+
+ #include <asm/atomic.h>
+
++#include <ub/ub_misc.h>
++
+ /* sysctl tunables... */
+ struct files_stat_struct files_stat = {
+ .max_files = NR_FILE
+@@ -38,6 +41,8 @@ static struct percpu_counter nr_files __
+ static inline void file_free_rcu(struct rcu_head *head)
+ {
+ struct file *f = container_of(head, struct file, f_u.fu_rcuhead);
++ ub_file_uncharge(f);
++ put_ve(VE_OWNER_FILP(f));
+ kmem_cache_free(filp_cachep, f);
+ }
+
+@@ -109,6 +114,12 @@ struct file *get_empty_filp(void)
+
+ percpu_counter_inc(&nr_files);
+ memset(f, 0, sizeof(*f));
++
++ if (ub_file_charge(f))
++ goto fail_ch;
++
++ SET_VE_OWNER_FILP(f, get_ve(get_exec_env()));
++
+ if (security_file_alloc(f))
+ goto fail_sec;
+
+@@ -134,6 +145,10 @@ fail_sec:
+ file_free(f);
+ fail:
+ return NULL;
++
++fail_ch:
++ kmem_cache_free(filp_cachep, f);
++ return NULL;
+ }
+
+ EXPORT_SYMBOL(get_empty_filp);
+diff -upr linux-2.6.16.orig/fs/filesystems.c linux-2.6.16-026test015/fs/filesystems.c
+--- linux-2.6.16.orig/fs/filesystems.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/filesystems.c 2006-07-04 14:41:38.000000000 +0400
+@@ -13,6 +13,7 @@
+ #include <linux/init.h>
+ #include <linux/module.h>
+ #include <linux/sched.h> /* for 'current' */
++#include <linux/ve_owner.h>
+ #include <asm/uaccess.h>
+
+ /*
+@@ -22,8 +23,8 @@
+ * During the unload module must call unregister_filesystem().
+ * We can access the fields of list element if:
+ * 1) spinlock is held or
+- * 2) we hold the reference to the module.
+- * The latter can be guaranteed by call of try_module_get(); if it
++ * 2) we hold the reference to the element.
++ * The latter can be guaranteed by call of try_filesystem(); if it
+ * returned 0 we must skip the element, otherwise we got the reference.
+ * Once the reference is obtained we can drop the spinlock.
+ */
+@@ -31,23 +32,51 @@
+ static struct file_system_type *file_systems;
+ static DEFINE_RWLOCK(file_systems_lock);
+
++int try_get_filesystem(struct file_system_type *fs)
++{
++ if (try_module_get(fs->owner)) {
++#ifdef CONFIG_VE
++ get_ve(VE_OWNER_FSTYPE(fs));
++#endif
++ return 1;
++ }
++ return 0;
++}
++
+ /* WARNING: This can be used only if we _already_ own a reference */
+ void get_filesystem(struct file_system_type *fs)
+ {
++#ifdef CONFIG_VE
++ get_ve(VE_OWNER_FSTYPE(fs));
++#endif
+ __module_get(fs->owner);
+ }
+
+ void put_filesystem(struct file_system_type *fs)
+ {
+ module_put(fs->owner);
++#ifdef CONFIG_VE
++ put_ve(VE_OWNER_FSTYPE(fs));
++#endif
++}
++
++static inline int check_ve_fstype(struct file_system_type *p,
++ struct ve_struct *env)
++{
++ return ((p->fs_flags & FS_VIRTUALIZED) ||
++ ve_accessible_strict(VE_OWNER_FSTYPE(p), env));
+ }
+
+-static struct file_system_type **find_filesystem(const char *name)
++static struct file_system_type **find_filesystem(const char *name,
++ struct ve_struct *env)
+ {
+ struct file_system_type **p;
+- for (p=&file_systems; *p; p=&(*p)->next)
++ for (p=&file_systems; *p; p=&(*p)->next) {
++ if (!check_ve_fstype(*p, env))
++ continue;
+ if (strcmp((*p)->name,name) == 0)
+ break;
++ }
+ return p;
+ }
+
+@@ -74,8 +103,10 @@ int register_filesystem(struct file_syst
+ if (fs->next)
+ return -EBUSY;
+ INIT_LIST_HEAD(&fs->fs_supers);
++ if (VE_OWNER_FSTYPE(fs) == NULL)
++ SET_VE_OWNER_FSTYPE(fs, get_ve0());
+ write_lock(&file_systems_lock);
+- p = find_filesystem(fs->name);
++ p = find_filesystem(fs->name, VE_OWNER_FSTYPE(fs));
+ if (*p)
+ res = -EBUSY;
+ else
+@@ -132,11 +163,14 @@ static int fs_index(const char __user *
+
+ err = -EINVAL;
+ read_lock(&file_systems_lock);
+- for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next, index++) {
++ for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next) {
++ if (!check_ve_fstype(tmp, get_exec_env()))
++ continue;
+ if (strcmp(tmp->name,name) == 0) {
+ err = index;
+ break;
+ }
++ index++;
+ }
+ read_unlock(&file_systems_lock);
+ putname(name);
+@@ -149,9 +183,15 @@ static int fs_name(unsigned int index, c
+ int len, res;
+
+ read_lock(&file_systems_lock);
+- for (tmp = file_systems; tmp; tmp = tmp->next, index--)
+- if (index <= 0 && try_module_get(tmp->owner))
+- break;
++ for (tmp = file_systems; tmp; tmp = tmp->next) {
++ if (!check_ve_fstype(tmp, get_exec_env()))
++ continue;
++ if (!index) {
++ if (try_get_filesystem(tmp))
++ break;
++ } else
++ index--;
++ }
+ read_unlock(&file_systems_lock);
+ if (!tmp)
+ return -EINVAL;
+@@ -169,8 +209,9 @@ static int fs_maxindex(void)
+ int index;
+
+ read_lock(&file_systems_lock);
+- for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next, index++)
+- ;
++ for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next)
++ if (check_ve_fstype(tmp, get_exec_env()))
++ index++;
+ read_unlock(&file_systems_lock);
+ return index;
+ }
+@@ -206,9 +247,10 @@ int get_filesystem_list(char * buf)
+ read_lock(&file_systems_lock);
+ tmp = file_systems;
+ while (tmp && len < PAGE_SIZE - 80) {
+- len += sprintf(buf+len, "%s\t%s\n",
+- (tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev",
+- tmp->name);
++ if (check_ve_fstype(tmp, get_exec_env()))
++ len += sprintf(buf+len, "%s\t%s\n",
++ (tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev",
++ tmp->name);
+ tmp = tmp->next;
+ }
+ read_unlock(&file_systems_lock);
+@@ -220,14 +262,14 @@ struct file_system_type *get_fs_type(con
+ struct file_system_type *fs;
+
+ read_lock(&file_systems_lock);
+- fs = *(find_filesystem(name));
+- if (fs && !try_module_get(fs->owner))
++ fs = *(find_filesystem(name, get_exec_env()));
++ if (fs && !try_get_filesystem(fs))
+ fs = NULL;
+ read_unlock(&file_systems_lock);
+ if (!fs && (request_module("%s", name) == 0)) {
+ read_lock(&file_systems_lock);
+- fs = *(find_filesystem(name));
+- if (fs && !try_module_get(fs->owner))
++ fs = *(find_filesystem(name, get_exec_env()));
++ if (fs && !try_get_filesystem(fs))
+ fs = NULL;
+ read_unlock(&file_systems_lock);
+ }
+@@ -235,3 +277,5 @@ struct file_system_type *get_fs_type(con
+ }
+
+ EXPORT_SYMBOL(get_fs_type);
++EXPORT_SYMBOL(get_filesystem);
++EXPORT_SYMBOL(put_filesystem);
+diff -upr linux-2.6.16.orig/fs/fuse/dir.c linux-2.6.16-026test015/fs/fuse/dir.c
+--- linux-2.6.16.orig/fs/fuse/dir.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/fuse/dir.c 2006-07-04 14:41:37.000000000 +0400
+@@ -708,14 +708,15 @@ static int fuse_access(struct inode *ino
+ * access request is sent. Execute permission is still checked
+ * locally based on file mode.
+ */
+-static int fuse_permission(struct inode *inode, int mask, struct nameidata *nd)
++static int fuse_permission(struct inode *inode, int mask, struct nameidata *nd,
++ struct exec_perm *perm)
+ {
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ if (!fuse_allow_task(fc, current))
+ return -EACCES;
+ else if (fc->flags & FUSE_DEFAULT_PERMISSIONS) {
+- int err = generic_permission(inode, mask, NULL);
++ int err = generic_permission(inode, mask, NULL, perm);
+
+ /* If permission is denied, try to refresh file
+ attributes. This is also needed, because the root
+@@ -723,7 +724,7 @@ static int fuse_permission(struct inode
+ if (err == -EACCES) {
+ err = fuse_do_getattr(inode);
+ if (!err)
+- err = generic_permission(inode, mask, NULL);
++ err = generic_permission(inode, mask, NULL, perm);
+ }
+
+ /* Note: the opposite of the above test does not
+diff -upr linux-2.6.16.orig/fs/fuse/file.c linux-2.6.16-026test015/fs/fuse/file.c
+--- linux-2.6.16.orig/fs/fuse/file.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/fuse/file.c 2006-07-04 14:41:36.000000000 +0400
+@@ -397,8 +397,12 @@ static int fuse_readpages(struct file *f
+ return -EINTR;
+
+ err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data);
+- if (!err)
+- fuse_send_readpages(data.req, file, inode);
++ if (!err) {
++ if (data.req->num_pages)
++ fuse_send_readpages(data.req, file, inode);
++ else
++ fuse_put_request(fc, data.req);
++ }
+ return err;
+ }
+
+diff -upr linux-2.6.16.orig/fs/hfs/inode.c linux-2.6.16-026test015/fs/hfs/inode.c
+--- linux-2.6.16.orig/fs/hfs/inode.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/hfs/inode.c 2006-07-04 14:41:37.000000000 +0400
+@@ -520,11 +520,11 @@ void hfs_clear_inode(struct inode *inode
+ }
+
+ static int hfs_permission(struct inode *inode, int mask,
+- struct nameidata *nd)
++ struct nameidata *nd, struct exec_perm *perm)
+ {
+ if (S_ISREG(inode->i_mode) && mask & MAY_EXEC)
+ return 0;
+- return generic_permission(inode, mask, NULL);
++ return generic_permission(inode, mask, NULL, perm);
+ }
+
+ static int hfs_file_open(struct inode *inode, struct file *file)
+diff -upr linux-2.6.16.orig/fs/hfsplus/inode.c linux-2.6.16-026test015/fs/hfsplus/inode.c
+--- linux-2.6.16.orig/fs/hfsplus/inode.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/hfsplus/inode.c 2006-07-04 14:41:37.000000000 +0400
+@@ -237,7 +237,8 @@ static void hfsplus_set_perms(struct ino
+ perms->dev = cpu_to_be32(HFSPLUS_I(inode).dev);
+ }
+
+-static int hfsplus_permission(struct inode *inode, int mask, struct nameidata *nd)
++static int hfsplus_permission(struct inode *inode, int mask, struct nameidata *nd,
++ struct exec_perm *perm)
+ {
+ /* MAY_EXEC is also used for lookup, if no x bit is set allow lookup,
+ * open_exec has the same test, so it's still not executable, if a x bit
+@@ -245,7 +246,7 @@ static int hfsplus_permission(struct ino
+ */
+ if (S_ISREG(inode->i_mode) && mask & MAY_EXEC && !(inode->i_mode & 0111))
+ return 0;
+- return generic_permission(inode, mask, NULL);
++ return generic_permission(inode, mask, NULL, perm);
+ }
+
+
+diff -upr linux-2.6.16.orig/fs/hostfs/hostfs_kern.c linux-2.6.16-026test015/fs/hostfs/hostfs_kern.c
+--- linux-2.6.16.orig/fs/hostfs/hostfs_kern.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/hostfs/hostfs_kern.c 2006-07-04 14:41:37.000000000 +0400
+@@ -796,7 +796,8 @@ int hostfs_rename(struct inode *from_ino
+ return(err);
+ }
+
+-int hostfs_permission(struct inode *ino, int desired, struct nameidata *nd)
++int hostfs_permission(struct inode *ino, int desired, struct nameidata *nd,
++ struct exec_perm *perm)
+ {
+ char *name;
+ int r = 0, w = 0, x = 0, err;
+@@ -814,7 +815,7 @@ int hostfs_permission(struct inode *ino,
+ err = access_file(name, r, w, x);
+ kfree(name);
+ if(!err)
+- err = generic_permission(ino, desired, NULL);
++ err = generic_permission(ino, desired, NULL, perm);
+ return err;
+ }
+
+diff -upr linux-2.6.16.orig/fs/hpfs/namei.c linux-2.6.16-026test015/fs/hpfs/namei.c
+--- linux-2.6.16.orig/fs/hpfs/namei.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/hpfs/namei.c 2006-07-04 14:41:37.000000000 +0400
+@@ -415,7 +415,7 @@ again:
+ d_drop(dentry);
+ spin_lock(&dentry->d_lock);
+ if (atomic_read(&dentry->d_count) > 1 ||
+- permission(inode, MAY_WRITE, NULL) ||
++ permission(inode, MAY_WRITE, NULL, NULL) ||
+ !S_ISREG(inode->i_mode) ||
+ get_write_access(inode)) {
+ spin_unlock(&dentry->d_lock);
+diff -upr linux-2.6.16.orig/fs/hugetlbfs/inode.c linux-2.6.16-026test015/fs/hugetlbfs/inode.c
+--- linux-2.6.16.orig/fs/hugetlbfs/inode.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/hugetlbfs/inode.c 2006-07-04 14:41:39.000000000 +0400
+@@ -800,7 +800,7 @@ struct file *hugetlb_zero_setup(size_t s
+ struct inode *inode;
+ struct dentry *dentry, *root;
+ struct qstr quick_string;
+- char buf[16];
++ char buf[64];
+
+ if (!can_do_hugetlb_shm())
+ return ERR_PTR(-EPERM);
+@@ -812,7 +812,8 @@ struct file *hugetlb_zero_setup(size_t s
+ return ERR_PTR(-ENOMEM);
+
+ root = hugetlbfs_vfsmount->mnt_root;
+- snprintf(buf, 16, "%lu", hugetlbfs_counter());
++ snprintf(buf, sizeof(buf), "VE%d-%lu",
++ VEID(get_exec_env()), hugetlbfs_counter());
+ quick_string.name = buf;
+ quick_string.len = strlen(quick_string.name);
+ quick_string.hash = 0;
+diff -upr linux-2.6.16.orig/fs/inode.c linux-2.6.16-026test015/fs/inode.c
+--- linux-2.6.16.orig/fs/inode.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/inode.c 2006-07-04 14:41:39.000000000 +0400
+@@ -9,6 +9,7 @@
+ #include <linux/mm.h>
+ #include <linux/dcache.h>
+ #include <linux/init.h>
++#include <linux/kernel_stat.h>
+ #include <linux/quotaops.h>
+ #include <linux/slab.h>
+ #include <linux/writeback.h>
+@@ -98,13 +99,15 @@ DECLARE_MUTEX(iprune_sem);
+ */
+ struct inodes_stat_t inodes_stat;
+
+-static kmem_cache_t * inode_cachep;
++kmem_cache_t *inode_cachep;
++
++static struct address_space_operations vfs_empty_aops;
++struct inode_operations vfs_empty_iops;
++static struct file_operations vfs_empty_fops;
++EXPORT_SYMBOL(vfs_empty_iops);
+
+ static struct inode *alloc_inode(struct super_block *sb)
+ {
+- static struct address_space_operations empty_aops;
+- static struct inode_operations empty_iops;
+- static struct file_operations empty_fops;
+ struct inode *inode;
+
+ if (sb->s_op->alloc_inode)
+@@ -119,8 +122,8 @@ static struct inode *alloc_inode(struct
+ inode->i_blkbits = sb->s_blocksize_bits;
+ inode->i_flags = 0;
+ atomic_set(&inode->i_count, 1);
+- inode->i_op = &empty_iops;
+- inode->i_fop = &empty_fops;
++ inode->i_op = &vfs_empty_iops;
++ inode->i_fop = &vfs_empty_fops;
+ inode->i_nlink = 1;
+ atomic_set(&inode->i_writecount, 0);
+ inode->i_size = 0;
+@@ -144,7 +147,7 @@ static struct inode *alloc_inode(struct
+ return NULL;
+ }
+
+- mapping->a_ops = &empty_aops;
++ mapping->a_ops = &vfs_empty_aops;
+ mapping->host = inode;
+ mapping->flags = 0;
+ mapping_set_gfp_mask(mapping, GFP_HIGHUSER);
+@@ -303,13 +306,57 @@ static void dispose_list(struct list_hea
+ spin_unlock(&inode_lock);
+ }
+
++static void show_header(struct inode *inode)
++{
++ struct super_block *sb = inode->i_sb;
++
++ printk("VFS: Busy inodes after unmount. "
++ "sb = %p, fs type = %s, sb count = %d, "
++ "sb->s_root = %s\n", sb,
++ (sb->s_type != NULL) ? sb->s_type->name : "",
++ sb->s_count,
++ (sb->s_root != NULL) ?
++ (char *)sb->s_root->d_name.name : "");
++}
++
++static void show_inode(struct list_head *tmp, struct inode *inode)
++{
++ struct dentry *d;
++ int i;
++
++ printk("inode = %p, inode->i_count = %d, "
++ "inode->i_nlink = %d, "
++ "inode->i_mode = %d, "
++ "inode->i_state = %ld, "
++ "inode->i_flags = %d, "
++ "inode->i_devices.next = %p, "
++ "inode->i_devices.prev = %p, "
++ "inode->i_ino = %ld\n",
++ tmp,
++ atomic_read(&inode->i_count),
++ inode->i_nlink,
++ inode->i_mode,
++ inode->i_state,
++ inode->i_flags,
++ inode->i_devices.next,
++ inode->i_devices.prev,
++ inode->i_ino);
++ printk("inode dump: ");
++ for (i = 0; i < sizeof(*tmp); i++)
++ printk("%2.2x ", *((u_char *)tmp + i));
++ printk("\n");
++ list_for_each_entry(d, &inode->i_dentry, d_alias)
++ printk(" d_alias %s\n",
++ d->d_name.name);
++}
++
+ /*
+ * Invalidate all inodes for a device.
+ */
+-static int invalidate_list(struct list_head *head, struct list_head *dispose)
++static int invalidate_list(struct list_head *head, struct list_head *dispose, int check)
+ {
+ struct list_head *next;
+- int busy = 0, count = 0;
++ int busy = 0, count = 0, once = 1;
+
+ next = head->next;
+ for (;;) {
+@@ -336,6 +383,14 @@ static int invalidate_list(struct list_h
+ continue;
+ }
+ busy = 1;
++
++ if (check) {
++ if (once) {
++ once = 0;
++ show_header(inode);
++ }
++ show_inode(tmp, inode);
++ }
+ }
+ /* only unused inodes may be cached with i_count zero */
+ inodes_stat.nr_unused -= count;
+@@ -350,7 +405,7 @@ static int invalidate_list(struct list_h
+ * fails because there are busy inodes then a non zero value is returned.
+ * If the discard is successful all the inodes have been discarded.
+ */
+-int invalidate_inodes(struct super_block * sb)
++int invalidate_inodes(struct super_block * sb, int check)
+ {
+ int busy;
+ LIST_HEAD(throw_away);
+@@ -358,7 +413,7 @@ int invalidate_inodes(struct super_block
+ down(&iprune_sem);
+ spin_lock(&inode_lock);
+ inotify_unmount_inodes(&sb->s_inodes);
+- busy = invalidate_list(&sb->s_inodes, &throw_away);
++ busy = invalidate_list(&sb->s_inodes, &throw_away, check);
+ spin_unlock(&inode_lock);
+
+ dispose_list(&throw_away);
+@@ -382,7 +437,7 @@ int __invalidate_device(struct block_dev
+ * hold).
+ */
+ shrink_dcache_sb(sb);
+- res = invalidate_inodes(sb);
++ res = invalidate_inodes(sb, 0);
+ drop_super(sb);
+ }
+ invalidate_bdev(bdev, 0);
+@@ -478,6 +533,7 @@ static void prune_icache(int nr_to_scan)
+ */
+ static int shrink_icache_memory(int nr, gfp_t gfp_mask)
+ {
++ KSTAT_PERF_ENTER(shrink_icache)
+ if (nr) {
+ /*
+ * Nasty deadlock avoidance. We may hold various FS locks,
+@@ -488,6 +544,7 @@ static int shrink_icache_memory(int nr,
+ return -1;
+ prune_icache(nr);
+ }
++ KSTAT_PERF_LEAVE(shrink_icache)
+ return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
+ }
+
+@@ -737,7 +794,7 @@ EXPORT_SYMBOL(iunique);
+ struct inode *igrab(struct inode *inode)
+ {
+ spin_lock(&inode_lock);
+- if (!(inode->i_state & (I_FREEING|I_WILL_FREE)))
++ if (inode && !(inode->i_state & (I_FREEING|I_WILL_FREE)))
+ __iget(inode);
+ else
+ /*
+diff -upr linux-2.6.16.orig/fs/inotify.c linux-2.6.16-026test015/fs/inotify.c
+--- linux-2.6.16.orig/fs/inotify.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/inotify.c 2006-07-04 14:41:37.000000000 +0400
+@@ -374,7 +374,7 @@ static int find_inode(const char __user
+ if (error)
+ return error;
+ /* you can only watch an inode if you have read permissions on it */
+- error = vfs_permission(nd, MAY_READ);
++ error = vfs_permission(nd, MAY_READ, NULL);
+ if (error)
+ path_release(nd);
+ return error;
+diff -upr linux-2.6.16.orig/fs/ioprio.c linux-2.6.16-026test015/fs/ioprio.c
+--- linux-2.6.16.orig/fs/ioprio.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/ioprio.c 2006-07-04 14:41:38.000000000 +0400
+@@ -53,6 +53,9 @@ asmlinkage long sys_ioprio_set(int which
+ struct user_struct *user;
+ int ret;
+
++ if (!ve_is_super(get_exec_env()))
++ return -EPERM;
++
+ switch (class) {
+ case IOPRIO_CLASS_RT:
+ if (!capable(CAP_SYS_ADMIN))
+@@ -78,18 +81,18 @@ asmlinkage long sys_ioprio_set(int which
+ if (!who)
+ p = current;
+ else
+- p = find_task_by_pid(who);
++ p = find_task_by_pid_all(who);
+ if (p)
+ ret = set_task_ioprio(p, ioprio);
+ break;
+ case IOPRIO_WHO_PGRP:
+ if (!who)
+ who = process_group(current);
+- do_each_task_pid(who, PIDTYPE_PGID, p) {
++ do_each_task_pid_all(who, PIDTYPE_PGID, p) {
+ ret = set_task_ioprio(p, ioprio);
+ if (ret)
+ break;
+- } while_each_task_pid(who, PIDTYPE_PGID, p);
++ } while_each_task_pid_all(who, PIDTYPE_PGID, p);
+ break;
+ case IOPRIO_WHO_USER:
+ if (!who)
+@@ -100,13 +103,13 @@ asmlinkage long sys_ioprio_set(int which
+ if (!user)
+ break;
+
+- do_each_thread(g, p) {
++ do_each_thread_all(g, p) {
+ if (p->uid != who)
+ continue;
+ ret = set_task_ioprio(p, ioprio);
+ if (ret)
+ break;
+- } while_each_thread(g, p);
++ } while_each_thread_all(g, p);
+
+ if (who)
+ free_uid(user);
+@@ -131,19 +134,19 @@ asmlinkage long sys_ioprio_get(int which
+ if (!who)
+ p = current;
+ else
+- p = find_task_by_pid(who);
++ p = find_task_by_pid_ve(who);
+ if (p)
+ ret = p->ioprio;
+ break;
+ case IOPRIO_WHO_PGRP:
+ if (!who)
+ who = process_group(current);
+- do_each_task_pid(who, PIDTYPE_PGID, p) {
++ do_each_task_pid_ve(who, PIDTYPE_PGID, p) {
+ if (ret == -ESRCH)
+ ret = p->ioprio;
+ else
+ ret = ioprio_best(ret, p->ioprio);
+- } while_each_task_pid(who, PIDTYPE_PGID, p);
++ } while_each_task_pid_ve(who, PIDTYPE_PGID, p);
+ break;
+ case IOPRIO_WHO_USER:
+ if (!who)
+@@ -154,14 +157,14 @@ asmlinkage long sys_ioprio_get(int which
+ if (!user)
+ break;
+
+- do_each_thread(g, p) {
++ do_each_thread_ve(g, p) {
+ if (p->uid != user->uid)
+ continue;
+ if (ret == -ESRCH)
+ ret = p->ioprio;
+ else
+ ret = ioprio_best(ret, p->ioprio);
+- } while_each_thread(g, p);
++ } while_each_thread_ve(g, p);
+
+ if (who)
+ free_uid(user);
+diff -upr linux-2.6.16.orig/fs/jbd/journal.c linux-2.6.16-026test015/fs/jbd/journal.c
+--- linux-2.6.16.orig/fs/jbd/journal.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/jbd/journal.c 2006-07-04 14:41:37.000000000 +0400
+@@ -210,10 +210,16 @@ end_loop:
+ return 0;
+ }
+
+-static void journal_start_thread(journal_t *journal)
++static int journal_start_thread(journal_t *journal)
+ {
+- kernel_thread(kjournald, journal, CLONE_VM|CLONE_FS|CLONE_FILES);
++ int err;
++
++ err = kernel_thread(kjournald, journal, CLONE_VM|CLONE_FS|CLONE_FILES);
++ if (err < 0)
++ return err;
++
+ wait_event(journal->j_wait_done_commit, journal->j_task != 0);
++ return 0;
+ }
+
+ static void journal_kill_thread(journal_t *journal)
+@@ -839,8 +845,7 @@ static int journal_reset(journal_t *jour
+
+ /* Add the dynamic fields and write it to disk. */
+ journal_update_superblock(journal, 1);
+- journal_start_thread(journal);
+- return 0;
++ return journal_start_thread(journal);
+ }
+
+ /**
+diff -upr linux-2.6.16.orig/fs/jbd/transaction.c linux-2.6.16-026test015/fs/jbd/transaction.c
+--- linux-2.6.16.orig/fs/jbd/transaction.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/jbd/transaction.c 2006-07-04 14:41:37.000000000 +0400
+@@ -1868,6 +1868,7 @@ zap_buffer_unlocked:
+ clear_buffer_mapped(bh);
+ clear_buffer_req(bh);
+ clear_buffer_new(bh);
++ clear_buffer_delay(bh);
+ bh->b_bdev = NULL;
+ return may_free;
+ }
+diff -upr linux-2.6.16.orig/fs/jfs/acl.c linux-2.6.16-026test015/fs/jfs/acl.c
+--- linux-2.6.16.orig/fs/jfs/acl.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/jfs/acl.c 2006-07-04 14:41:37.000000000 +0400
+@@ -140,9 +140,10 @@ static int jfs_check_acl(struct inode *i
+ return -EAGAIN;
+ }
+
+-int jfs_permission(struct inode *inode, int mask, struct nameidata *nd)
++int jfs_permission(struct inode *inode, int mask, struct nameidata *nd,
++ struct exec_perm *perm)
+ {
+- return generic_permission(inode, mask, jfs_check_acl);
++ return generic_permission(inode, mask, jfs_check_acl, perm);
+ }
+
+ int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir)
+diff -upr linux-2.6.16.orig/fs/jfs/jfs_acl.h linux-2.6.16-026test015/fs/jfs/jfs_acl.h
+--- linux-2.6.16.orig/fs/jfs/jfs_acl.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/jfs/jfs_acl.h 2006-07-04 14:41:37.000000000 +0400
+@@ -20,7 +20,7 @@
+
+ #ifdef CONFIG_JFS_POSIX_ACL
+
+-int jfs_permission(struct inode *, int, struct nameidata *);
++int jfs_permission(struct inode *, int, struct nameidata *, struct exec_perm *);
+ int jfs_init_acl(tid_t, struct inode *, struct inode *);
+ int jfs_setattr(struct dentry *, struct iattr *);
+
+diff -upr linux-2.6.16.orig/fs/jfs/jfs_metapage.c linux-2.6.16-026test015/fs/jfs/jfs_metapage.c
+--- linux-2.6.16.orig/fs/jfs/jfs_metapage.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/jfs/jfs_metapage.c 2006-07-04 14:41:36.000000000 +0400
+@@ -543,7 +543,7 @@ add_failed:
+ static int metapage_releasepage(struct page *page, gfp_t gfp_mask)
+ {
+ struct metapage *mp;
+- int busy = 0;
++ int ret = 1;
+ unsigned int offset;
+
+ for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) {
+@@ -553,30 +553,20 @@ static int metapage_releasepage(struct p
+ continue;
+
+ jfs_info("metapage_releasepage: mp = 0x%p", mp);
+- if (mp->count || mp->nohomeok) {
++ if (mp->count || mp->nohomeok ||
++ test_bit(META_dirty, &mp->flag)) {
+ jfs_info("count = %ld, nohomeok = %d", mp->count,
+ mp->nohomeok);
+- busy = 1;
++ ret = 0;
+ continue;
+ }
+- wait_on_page_writeback(page);
+- //WARN_ON(test_bit(META_dirty, &mp->flag));
+- if (test_bit(META_dirty, &mp->flag)) {
+- dump_mem("dirty mp in metapage_releasepage", mp,
+- sizeof(struct metapage));
+- dump_mem("page", page, sizeof(struct page));
+- dump_stack();
+- }
+ if (mp->lsn)
+ remove_from_logsync(mp);
+ remove_metapage(page, mp);
+ INCREMENT(mpStat.pagefree);
+ free_metapage(mp);
+ }
+- if (busy)
+- return -1;
+-
+- return 0;
++ return ret;
+ }
+
+ static int metapage_invalidatepage(struct page *page, unsigned long offset)
+diff -upr linux-2.6.16.orig/fs/lockd/clntproc.c linux-2.6.16-026test015/fs/lockd/clntproc.c
+--- linux-2.6.16.orig/fs/lockd/clntproc.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/lockd/clntproc.c 2006-07-04 14:41:38.000000000 +0400
+@@ -130,10 +130,10 @@ static void nlmclnt_setlockargs(struct n
+ nlmclnt_next_cookie(&argp->cookie);
+ argp->state = nsm_local_state;
+ memcpy(&lock->fh, NFS_FH(fl->fl_file->f_dentry->d_inode), sizeof(struct nfs_fh));
+- lock->caller = system_utsname.nodename;
++ lock->caller = ve_utsname.nodename;
+ lock->oh.data = req->a_owner;
+ lock->oh.len = sprintf(req->a_owner, "%d@%s",
+- current->pid, system_utsname.nodename);
++ current->pid, ve_utsname.nodename);
+ locks_copy_lock(&lock->fl, fl);
+ }
+
+@@ -154,7 +154,7 @@ nlmclnt_setgrantargs(struct nlm_rqst *ca
+ {
+ locks_copy_lock(&call->a_args.lock.fl, &lock->fl);
+ memcpy(&call->a_args.lock.fh, &lock->fh, sizeof(call->a_args.lock.fh));
+- call->a_args.lock.caller = system_utsname.nodename;
++ call->a_args.lock.caller = ve_utsname.nodename;
+ call->a_args.lock.oh.len = lock->oh.len;
+
+ /* set default data area */
+diff -upr linux-2.6.16.orig/fs/lockd/mon.c linux-2.6.16-026test015/fs/lockd/mon.c
+--- linux-2.6.16.orig/fs/lockd/mon.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/lockd/mon.c 2006-07-04 14:41:38.000000000 +0400
+@@ -147,7 +147,7 @@ xdr_encode_common(struct rpc_rqst *rqstp
+ */
+ sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(argp->addr));
+ if (!(p = xdr_encode_string(p, buffer))
+- || !(p = xdr_encode_string(p, system_utsname.nodename)))
++ || !(p = xdr_encode_string(p, ve_utsname.nodename)))
+ return ERR_PTR(-EIO);
+ *p++ = htonl(argp->prog);
+ *p++ = htonl(argp->vers);
+diff -upr linux-2.6.16.orig/fs/locks.c linux-2.6.16-026test015/fs/locks.c
+--- linux-2.6.16.orig/fs/locks.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/locks.c 2006-07-04 14:41:39.000000000 +0400
+@@ -129,6 +129,8 @@
+ #include <asm/semaphore.h>
+ #include <asm/uaccess.h>
+
++#include <ub/ub_misc.h>
++
+ #define IS_POSIX(fl) (fl->fl_flags & FL_POSIX)
+ #define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK)
+ #define IS_LEASE(fl) (fl->fl_flags & FL_LEASE)
+@@ -148,11 +150,28 @@ static LIST_HEAD(blocked_list);
+ static kmem_cache_t *filelock_cache;
+
+ /* Allocate an empty lock structure. */
+-static struct file_lock *locks_alloc_lock(void)
++static struct file_lock *locks_alloc_lock(int charge)
+ {
+- return kmem_cache_alloc(filelock_cache, SLAB_KERNEL);
++ struct file_lock *fl;
++
++ fl = kmem_cache_alloc(filelock_cache, SLAB_KERNEL);
++#ifdef CONFIG_USER_RESOURCE
++ if (fl == NULL)
++ goto out;
++ fl->fl_charged = 0;
++ if (!charge)
++ goto out;
++ if (!ub_flock_charge(fl, 1))
++ goto out;
++
++ kmem_cache_free(filelock_cache, fl);
++ fl = NULL;
++out:
++#endif
++ return fl;
+ }
+
++
+ /* Free a lock which is not in use. */
+ static void locks_free_lock(struct file_lock *fl)
+ {
+@@ -181,6 +200,7 @@ static void locks_free_lock(struct file_
+ fl->fl_lmops = NULL;
+ }
+
++ ub_flock_uncharge(fl);
+ kmem_cache_free(filelock_cache, fl);
+ }
+
+@@ -263,7 +283,7 @@ static int flock_make_lock(struct file *
+ if (type < 0)
+ return type;
+
+- fl = locks_alloc_lock();
++ fl = locks_alloc_lock(type != F_UNLCK);
+ if (fl == NULL)
+ return -ENOMEM;
+
+@@ -432,15 +452,14 @@ static struct lock_manager_operations le
+ */
+ static int lease_init(struct file *filp, int type, struct file_lock *fl)
+ {
++ if (assign_type(fl, type) != 0)
++ return -EINVAL;
++
+ fl->fl_owner = current->files;
+ fl->fl_pid = current->tgid;
+
+ fl->fl_file = filp;
+ fl->fl_flags = FL_LEASE;
+- if (assign_type(fl, type) != 0) {
+- locks_free_lock(fl);
+- return -EINVAL;
+- }
+ fl->fl_start = 0;
+ fl->fl_end = OFFSET_MAX;
+ fl->fl_ops = NULL;
+@@ -451,17 +470,20 @@ static int lease_init(struct file *filp,
+ /* Allocate a file_lock initialised to this type of lease */
+ static int lease_alloc(struct file *filp, int type, struct file_lock **flp)
+ {
+- struct file_lock *fl = locks_alloc_lock();
+- int error;
++ struct file_lock *fl = locks_alloc_lock(1);
++ int error = -ENOMEM;
+
+ if (fl == NULL)
+- return -ENOMEM;
++ goto out;
+
+ error = lease_init(filp, type, fl);
+- if (error)
+- return error;
++ if (error) {
++ locks_free_lock(fl);
++ fl = NULL;
++ }
++out:
+ *flp = fl;
+- return 0;
++ return error;
+ }
+
+ /* Check if two locks overlap each other.
+@@ -712,8 +734,9 @@ EXPORT_SYMBOL(posix_locks_deadlock);
+ * at the head of the list, but that's secret knowledge known only to
+ * flock_lock_file and posix_lock_file.
+ */
+-static int flock_lock_file(struct file *filp, struct file_lock *new_fl)
++static int flock_lock_file(struct file *filp, struct file_lock *request)
+ {
++ struct file_lock *new_fl = NULL;
+ struct file_lock **before;
+ struct inode * inode = filp->f_dentry->d_inode;
+ int error = 0;
+@@ -728,44 +751,60 @@ static int flock_lock_file(struct file *
+ continue;
+ if (filp != fl->fl_file)
+ continue;
+- if (new_fl->fl_type == fl->fl_type)
++ if (request->fl_type == fl->fl_type)
+ goto out;
+ found = 1;
+ locks_delete_lock(before);
+ break;
+ }
+- unlock_kernel();
+
+- if (new_fl->fl_type == F_UNLCK)
+- return 0;
++ if (request->fl_type == F_UNLCK)
++ goto out;
+
+ /*
++ * A non-F_UNLCK request must already be charged in
++ * flock_make_lock().
++ *
++ * Actually new_fl must be charged, not the request,
++ * but we try to fail earlier.
++ */
++ error = -ENOMEM;
++ new_fl = locks_alloc_lock(0);
++ if (new_fl == NULL)
++ goto out;
++ /*
+ * If a higher-priority process was blocked on the old file lock,
+ * give it the opportunity to lock the file.
+ */
+ if (found)
+ cond_resched();
+
+- lock_kernel();
+ for_each_lock(inode, before) {
+ struct file_lock *fl = *before;
+ if (IS_POSIX(fl))
+ break;
+ if (IS_LEASE(fl))
+ continue;
+- if (!flock_locks_conflict(new_fl, fl))
++ if (!flock_locks_conflict(request, fl))
+ continue;
+ error = -EAGAIN;
+- if (new_fl->fl_flags & FL_SLEEP) {
+- locks_insert_block(fl, new_fl);
+- }
++ if (request->fl_flags & FL_SLEEP)
++ locks_insert_block(fl, request);
+ goto out;
+ }
++
++ set_flock_charged(new_fl);
++ unset_flock_charged(request);
++
++ locks_copy_lock(new_fl, request);
+ locks_insert_lock(&inode->i_flock, new_fl);
++ new_fl = NULL;
+ error = 0;
+
+ out:
+ unlock_kernel();
++ if (new_fl)
++ locks_free_lock(new_fl);
+ return error;
+ }
+
+@@ -784,8 +823,11 @@ static int __posix_lock_file(struct inod
+ * We may need two file_lock structures for this operation,
+ * so we get them in advance to avoid races.
+ */
+- new_fl = locks_alloc_lock();
+- new_fl2 = locks_alloc_lock();
++ if (request->fl_type != F_UNLCK)
++ new_fl = locks_alloc_lock(1);
++ else
++ new_fl = NULL;
++ new_fl2 = locks_alloc_lock(0);
+
+ lock_kernel();
+ if (request->fl_type != F_UNLCK) {
+@@ -813,7 +855,7 @@ static int __posix_lock_file(struct inod
+ goto out;
+
+ error = -ENOLCK; /* "no luck" */
+- if (!(new_fl && new_fl2))
++ if (!((request->fl_type == F_UNLCK || new_fl) && new_fl2))
+ goto out;
+
+ /*
+@@ -919,19 +961,30 @@ static int __posix_lock_file(struct inod
+ if (!added) {
+ if (request->fl_type == F_UNLCK)
+ goto out;
++ error = -ENOLCK;
++ if (right && (left == right) && ub_flock_charge(new_fl, 1))
++ goto out;
+ locks_copy_lock(new_fl, request);
+ locks_insert_lock(before, new_fl);
+ new_fl = NULL;
++ error = 0;
+ }
+ if (right) {
+ if (left == right) {
+ /* The new lock breaks the old one in two pieces,
+ * so we have to use the second new lock.
+ */
++ error = -ENOLCK;
++ if (added && ub_flock_charge(new_fl2,
++ request->fl_type != F_UNLCK))
++ goto out;
++ /* FIXME move all fl_charged manipulations in ub code */
++ set_flock_charged(new_fl2);
+ left = new_fl2;
+ new_fl2 = NULL;
+ locks_copy_lock(left, right);
+ locks_insert_lock(before, left);
++ error = 0;
+ }
+ right->fl_start = request->fl_end + 1;
+ locks_wake_up_blocks(right);
+@@ -1337,6 +1390,7 @@ static int __setlease(struct file *filp,
+ goto out;
+
+ if (my_before != NULL) {
++ *flp = *my_before;
+ error = lease->fl_lmops->fl_change(my_before, arg);
+ goto out;
+ }
+@@ -1529,15 +1583,14 @@ asmlinkage long sys_flock(unsigned int f
+ error = flock_lock_file_wait(filp, lock);
+
+ out_free:
+- if (list_empty(&lock->fl_link)) {
+- locks_free_lock(lock);
+- }
++ locks_free_lock(lock);
+
+ out_putf:
+ fput(filp);
+ out:
+ return error;
+ }
++EXPORT_SYMBOL_GPL(sys_flock);
+
+ /* Report the first existing lock that would conflict with l.
+ * This implements the F_GETLK command of fcntl().
+@@ -1573,7 +1626,7 @@ int fcntl_getlk(struct file *filp, struc
+
+ flock.l_type = F_UNLCK;
+ if (fl != NULL) {
+- flock.l_pid = fl->fl_pid;
++ flock.l_pid = pid_type_to_vpid(PIDTYPE_TGID, fl->fl_pid);
+ #if BITS_PER_LONG == 32
+ /*
+ * Make sure we can represent the posix lock via
+@@ -1605,7 +1658,7 @@ out:
+ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
+ struct flock __user *l)
+ {
+- struct file_lock *file_lock = locks_alloc_lock();
++ struct file_lock *file_lock = locks_alloc_lock(0);
+ struct flock flock;
+ struct inode *inode;
+ int error;
+@@ -1727,7 +1780,7 @@ int fcntl_getlk64(struct file *filp, str
+
+ flock.l_type = F_UNLCK;
+ if (fl != NULL) {
+- flock.l_pid = fl->fl_pid;
++ flock.l_pid = pid_type_to_vpid(PIDTYPE_TGID, fl->fl_pid);
+ flock.l_start = fl->fl_start;
+ flock.l_len = fl->fl_end == OFFSET_MAX ? 0 :
+ fl->fl_end - fl->fl_start + 1;
+@@ -1748,7 +1801,7 @@ out:
+ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
+ struct flock64 __user *l)
+ {
+- struct file_lock *file_lock = locks_alloc_lock();
++ struct file_lock *file_lock = locks_alloc_lock(0);
+ struct flock64 flock;
+ struct inode *inode;
+ int error;
+@@ -1976,7 +2029,9 @@ EXPORT_SYMBOL(posix_unblock_lock);
+ static void lock_get_status(char* out, struct file_lock *fl, int id, char *pfx)
+ {
+ struct inode *inode = NULL;
++ unsigned int fl_pid;
+
++ fl_pid = pid_type_to_vpid(PIDTYPE_TGID, fl->fl_pid);
+ if (fl->fl_file != NULL)
+ inode = fl->fl_file->f_dentry->d_inode;
+
+@@ -2018,16 +2073,16 @@ static void lock_get_status(char* out, s
+ }
+ if (inode) {
+ #ifdef WE_CAN_BREAK_LSLK_NOW
+- out += sprintf(out, "%d %s:%ld ", fl->fl_pid,
++ out += sprintf(out, "%d %s:%ld ", fl_pid,
+ inode->i_sb->s_id, inode->i_ino);
+ #else
+ /* userspace relies on this representation of dev_t ;-( */
+- out += sprintf(out, "%d %02x:%02x:%ld ", fl->fl_pid,
++ out += sprintf(out, "%d %02x:%02x:%ld ", fl_pid,
+ MAJOR(inode->i_sb->s_dev),
+ MINOR(inode->i_sb->s_dev), inode->i_ino);
+ #endif
+ } else {
+- out += sprintf(out, "%d <none>:0 ", fl->fl_pid);
++ out += sprintf(out, "%d <none>:0 ", fl_pid);
+ }
+ if (IS_POSIX(fl)) {
+ if (fl->fl_end == OFFSET_MAX)
+@@ -2076,11 +2131,17 @@ int get_locks_status(char *buffer, char
+ char *q = buffer;
+ off_t pos = 0;
+ int i = 0;
++ struct ve_struct *env;
+
+ lock_kernel();
++ env = get_exec_env();
+ list_for_each(tmp, &file_lock_list) {
+ struct list_head *btmp;
+ struct file_lock *fl = list_entry(tmp, struct file_lock, fl_link);
++
++ if (!ve_accessible(VE_OWNER_FILP(fl->fl_file), env))
++ continue;
++
+ lock_get_status(q, fl, ++i, "");
+ move_lock_status(&q, &pos, offset);
+
+@@ -2212,7 +2273,12 @@ void steal_locks(fl_owner_t from)
+
+ lock_kernel();
+ j = 0;
+- rcu_read_lock();
++
++ /*
++ * We are not taking a ref to the file structures, so
++ * we need to acquire ->file_lock.
++ */
++ spin_lock(&files->file_lock);
+ fdt = files_fdtable(files);
+ for (;;) {
+ unsigned long set;
+@@ -2230,7 +2296,7 @@ void steal_locks(fl_owner_t from)
+ set >>= 1;
+ }
+ }
+- rcu_read_unlock();
++ spin_unlock(&files->file_lock);
+ unlock_kernel();
+ }
+ EXPORT_SYMBOL(steal_locks);
+@@ -2238,7 +2304,7 @@ EXPORT_SYMBOL(steal_locks);
+ static int __init filelock_init(void)
+ {
+ filelock_cache = kmem_cache_create("file_lock_cache",
+- sizeof(struct file_lock), 0, SLAB_PANIC,
++ sizeof(struct file_lock), 0, SLAB_PANIC | SLAB_UBC,
+ init_once, NULL);
+ return 0;
+ }
+diff -upr linux-2.6.16.orig/fs/namei.c linux-2.6.16-026test015/fs/namei.c
+--- linux-2.6.16.orig/fs/namei.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/namei.c 2006-07-04 14:41:39.000000000 +0400
+@@ -179,7 +179,7 @@ EXPORT_SYMBOL(putname);
+ * for filesystem access without changing the "normal" uids which
+ * are used for other things..
+ */
+-int generic_permission(struct inode *inode, int mask,
++static int __generic_permission(struct inode *inode, int mask,
+ int (*check_acl)(struct inode *inode, int mask))
+ {
+ umode_t mode = inode->i_mode;
+@@ -225,7 +225,26 @@ int generic_permission(struct inode *ino
+ return -EACCES;
+ }
+
+-int permission(struct inode *inode, int mask, struct nameidata *nd)
++int generic_permission(struct inode *inode, int mask,
++ int (*check_acl)(struct inode *inode, int mask),
++ struct exec_perm *perm)
++{
++ int ret;
++
++ if (perm == NULL)
++ return __generic_permission(inode, mask, check_acl);
++
++ mutex_lock(&inode->i_mutex);
++ ret = __generic_permission(inode, mask, check_acl);
++ if (!ret)
++ set_exec_perm(perm, inode);
++ mutex_unlock(&inode->i_mutex);
++ return ret;
++}
++
++
++int permission(struct inode *inode, int mask, struct nameidata *nd,
++ struct exec_perm *perm)
+ {
+ int retval, submask;
+
+@@ -250,9 +269,9 @@ int permission(struct inode *inode, int
+ /* Ordinary permission routines do not understand MAY_APPEND. */
+ submask = mask & ~MAY_APPEND;
+ if (inode->i_op && inode->i_op->permission)
+- retval = inode->i_op->permission(inode, submask, nd);
++ retval = inode->i_op->permission(inode, submask, nd, perm);
+ else
+- retval = generic_permission(inode, submask, NULL);
++ retval = generic_permission(inode, submask, NULL, perm);
+ if (retval)
+ return retval;
+
+@@ -269,9 +288,9 @@ int permission(struct inode *inode, int
+ * for filesystem access without changing the "normal" uids which
+ * are used for other things.
+ */
+-int vfs_permission(struct nameidata *nd, int mask)
++int vfs_permission(struct nameidata *nd, int mask, struct exec_perm *perm)
+ {
+- return permission(nd->dentry->d_inode, mask, nd);
++ return permission(nd->dentry->d_inode, mask, nd, perm);
+ }
+
+ /**
+@@ -288,7 +307,7 @@ int vfs_permission(struct nameidata *nd,
+ */
+ int file_permission(struct file *file, int mask)
+ {
+- return permission(file->f_dentry->d_inode, mask, NULL);
++ return permission(file->f_dentry->d_inode, mask, NULL, NULL);
+ }
+
+ /*
+@@ -379,6 +398,21 @@ static struct dentry * cached_lookup(str
+ if (!dentry)
+ dentry = d_lookup(parent, name);
+
++ /*
++ * The revalidation rules are simple:
++ * d_revalidate operation is called when we're about to use a cached
++ * dentry rather than call d_lookup.
++ * d_revalidate method may unhash the dentry itself or return FALSE, in
++ * which case if the dentry can be released d_lookup will be called.
++ *
++ * Additionally, by request of NFS people
++ * (http://linux.bkbits.net:8080/linux-2.4/cset@1.181?nav=index.html|src/|src/fs|related/fs/namei.c)
++ * d_revalidate is called when `/', `.' or `..' are looked up.
++ * Since re-lookup is impossible on them, we introduce a hack and
++ * return an error in this case.
++ *
++ * 2003/02/19 SAW
++ */
+ if (dentry && dentry->d_op && dentry->d_op->d_revalidate) {
+ if (!dentry->d_op->d_revalidate(dentry, nd) && !d_invalidate(dentry)) {
+ dput(dentry);
+@@ -441,6 +475,7 @@ static struct dentry * real_lookup(struc
+ struct dentry * result;
+ struct inode *dir = parent->d_inode;
+
++repeat:
+ mutex_lock(&dir->i_mutex);
+ /*
+ * First re-do the cached lookup just in case it was created
+@@ -479,7 +514,7 @@ static struct dentry * real_lookup(struc
+ if (result->d_op && result->d_op->d_revalidate) {
+ if (!result->d_op->d_revalidate(result, nd) && !d_invalidate(result)) {
+ dput(result);
+- result = ERR_PTR(-ENOENT);
++ goto repeat;
+ }
+ }
+ return result;
+@@ -704,7 +739,14 @@ static __always_inline void follow_dotdo
+ read_unlock(&current->fs->lock);
+ break;
+ }
+- read_unlock(&current->fs->lock);
++#ifdef CONFIG_VE
++ if (nd->dentry == get_exec_env()->fs_root &&
++ nd->mnt == get_exec_env()->fs_rootmnt) {
++ read_unlock(&current->fs->lock);
++ break;
++ }
++#endif
++ read_unlock(&current->fs->lock);
+ spin_lock(&dcache_lock);
+ if (nd->dentry != nd->mnt->mnt_root) {
+ nd->dentry = dget(nd->dentry->d_parent);
+@@ -745,6 +787,10 @@ static int do_lookup(struct nameidata *n
+ if (dentry->d_op && dentry->d_op->d_revalidate)
+ goto need_revalidate;
+ done:
++ if ((nd->flags & LOOKUP_STRICT) && d_mountpoint(dentry)) {
++ dput(dentry);
++ return -ENOENT;
++ }
+ path->mnt = mnt;
+ path->dentry = dentry;
+ __follow_mount(path);
+@@ -780,6 +826,7 @@ static fastcall int __link_path_walk(con
+ {
+ struct path next;
+ struct inode *inode;
++ int real_components = 0;
+ int err;
+ unsigned int lookup_flags = nd->flags;
+
+@@ -801,7 +848,7 @@ static fastcall int __link_path_walk(con
+ nd->flags |= LOOKUP_CONTINUE;
+ err = exec_permission_lite(inode, nd);
+ if (err == -EAGAIN)
+- err = vfs_permission(nd, MAY_EXEC);
++ err = vfs_permission(nd, MAY_EXEC, NULL);
+ if (err)
+ break;
+
+@@ -851,6 +898,7 @@ static fastcall int __link_path_walk(con
+ break;
+ }
+ /* This does the actual lookups.. */
++ real_components++;
+ err = do_lookup(nd, &this, &next);
+ if (err)
+ break;
+@@ -864,6 +912,9 @@ static fastcall int __link_path_walk(con
+ goto out_dput;
+
+ if (inode->i_op->follow_link) {
++ err = -ENOENT;
++ if (lookup_flags & LOOKUP_STRICT)
++ goto out_dput;
+ err = do_follow_link(&next, nd);
+ if (err)
+ goto return_err;
+@@ -911,6 +962,7 @@ last_component:
+ break;
+ inode = next.dentry->d_inode;
+ if ((lookup_flags & LOOKUP_FOLLOW)
++ && !(lookup_flags & LOOKUP_STRICT)
+ && inode && inode->i_op && inode->i_op->follow_link) {
+ err = do_follow_link(&next, nd);
+ if (err)
+@@ -932,26 +984,40 @@ lookup_parent:
+ nd->last_type = LAST_NORM;
+ if (this.name[0] != '.')
+ goto return_base;
+- if (this.len == 1)
++ if (this.len == 1) {
+ nd->last_type = LAST_DOT;
+- else if (this.len == 2 && this.name[1] == '.')
++ goto return_reval;
++ } else if (this.len == 2 && this.name[1] == '.') {
+ nd->last_type = LAST_DOTDOT;
+- else
+- goto return_base;
++ goto return_reval;
++ }
++return_base:
++ if (!(nd->flags & LOOKUP_NOAREACHECK)) {
++ err = check_area_access_ve(nd->dentry, nd->mnt);
++ if (err)
++ break;
++ }
++ return 0;
+ return_reval:
+ /*
+ * We bypassed the ordinary revalidation routines.
+ * We may need to check the cached dentry for staleness.
+ */
+- if (nd->dentry && nd->dentry->d_sb &&
++ if (!real_components && nd->dentry && nd->dentry->d_sb &&
+ (nd->dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) {
+ err = -ESTALE;
+ /* Note: we do not d_invalidate() */
+ if (!nd->dentry->d_op->d_revalidate(nd->dentry, nd))
++ /*
++ * This lookup is for `/' or `.' or `..'.
++ * The filesystem unhashed the dentry itself
++ * inside d_revalidate (otherwise, d_invalidate
++ * wouldn't succeed). As a special courtesy to
++ * NFS we return an error. 2003/02/19 SAW
++ */
+ break;
+ }
+-return_base:
+- return 0;
++ goto return_base;
+ out_dput:
+ dput_path(&next, nd);
+ break;
+@@ -1077,8 +1143,8 @@ static int fastcall do_path_lookup(int d
+ nd->flags = flags;
+ nd->depth = 0;
+
+- read_lock(&current->fs->lock);
+ if (*name=='/') {
++ read_lock(&current->fs->lock);
+ if (current->fs->altroot && !(nd->flags & LOOKUP_NOALT)) {
+ nd->mnt = mntget(current->fs->altrootmnt);
+ nd->dentry = dget(current->fs->altroot);
+@@ -1089,33 +1155,35 @@ static int fastcall do_path_lookup(int d
+ }
+ nd->mnt = mntget(current->fs->rootmnt);
+ nd->dentry = dget(current->fs->root);
++ read_unlock(&current->fs->lock);
+ } else if (dfd == AT_FDCWD) {
++ read_lock(&current->fs->lock);
+ nd->mnt = mntget(current->fs->pwdmnt);
+ nd->dentry = dget(current->fs->pwd);
++ read_unlock(&current->fs->lock);
+ } else {
+ struct dentry *dentry;
+
+ file = fget_light(dfd, &fput_needed);
+ retval = -EBADF;
+ if (!file)
+- goto unlock_fail;
++ goto out_fail;
+
+ dentry = file->f_dentry;
+
+ retval = -ENOTDIR;
+ if (!S_ISDIR(dentry->d_inode->i_mode))
+- goto fput_unlock_fail;
++ goto fput_fail;
+
+ retval = file_permission(file, MAY_EXEC);
+ if (retval)
+- goto fput_unlock_fail;
++ goto fput_fail;
+
+ nd->mnt = mntget(file->f_vfsmnt);
+ nd->dentry = dget(dentry);
+
+ fput_light(file, fput_needed);
+ }
+- read_unlock(&current->fs->lock);
+ current->total_link_count = 0;
+ retval = link_path_walk(name, nd);
+ out:
+@@ -1124,13 +1192,12 @@ out:
+ nd->dentry->d_inode))
+ audit_inode(name, nd->dentry->d_inode, flags);
+ }
++out_fail:
+ return retval;
+
+-fput_unlock_fail:
++fput_fail:
+ fput_light(file, fput_needed);
+-unlock_fail:
+- read_unlock(&current->fs->lock);
+- return retval;
++ goto out_fail;
+ }
+
+ int fastcall path_lookup(const char *name, unsigned int flags,
+@@ -1219,7 +1286,7 @@ static struct dentry * __lookup_hash(str
+ int err;
+
+ inode = base->d_inode;
+- err = permission(inode, MAY_EXEC, nd);
++ err = permission(inode, MAY_EXEC, nd, NULL);
+ dentry = ERR_PTR(err);
+ if (err)
+ goto out;
+@@ -1354,7 +1421,7 @@ static int may_delete(struct inode *dir,
+
+ BUG_ON(victim->d_parent->d_inode != dir);
+
+- error = permission(dir,MAY_WRITE | MAY_EXEC, NULL);
++ error = permission(dir,MAY_WRITE | MAY_EXEC, NULL, NULL);
+ if (error)
+ return error;
+ if (IS_APPEND(dir))
+@@ -1391,7 +1458,7 @@ static inline int may_create(struct inod
+ return -EEXIST;
+ if (IS_DEADDIR(dir))
+ return -ENOENT;
+- return permission(dir,MAY_WRITE | MAY_EXEC, nd);
++ return permission(dir,MAY_WRITE | MAY_EXEC, nd, NULL);
+ }
+
+ /*
+@@ -1491,7 +1558,7 @@ int may_open(struct nameidata *nd, int a
+ if (S_ISDIR(inode->i_mode) && (flag & FMODE_WRITE))
+ return -EISDIR;
+
+- error = vfs_permission(nd, acc_mode);
++ error = vfs_permission(nd, acc_mode, NULL);
+ if (error)
+ return error;
+
+@@ -1628,6 +1695,12 @@ do_last:
+ goto exit;
+ }
+
++ if (IS_ERR(nd->intent.open.file)) {
++ mutex_unlock(&dir->d_inode->i_mutex);
++ error = PTR_ERR(nd->intent.open.file);
++ goto exit_dput;
++ }
++
+ /* Negative dentry, just create the file */
+ if (!path.dentry->d_inode) {
+ if (!IS_POSIXACL(dir->d_inode))
+@@ -1851,6 +1924,7 @@ asmlinkage long sys_mknod(const char __u
+ {
+ return sys_mknodat(AT_FDCWD, filename, mode, dev);
+ }
++EXPORT_SYMBOL_GPL(sys_mknod);
+
+ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+ {
+@@ -1909,6 +1983,7 @@ asmlinkage long sys_mkdir(const char __u
+ {
+ return sys_mkdirat(AT_FDCWD, pathname, mode);
+ }
++EXPORT_SYMBOL_GPL(sys_mkdir);
+
+ /*
+ * We try to drop the dentry early: we should have
+@@ -2016,6 +2091,7 @@ asmlinkage long sys_rmdir(const char __u
+ {
+ return do_rmdir(AT_FDCWD, pathname);
+ }
++EXPORT_SYMBOL_GPL(sys_rmdir);
+
+ int vfs_unlink(struct inode *dir, struct dentry *dentry)
+ {
+@@ -2115,6 +2191,7 @@ asmlinkage long sys_unlink(const char __
+ {
+ return do_unlinkat(AT_FDCWD, pathname);
+ }
++EXPORT_SYMBOL_GPL(sys_unlink);
+
+ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname, int mode)
+ {
+@@ -2313,7 +2390,7 @@ static int vfs_rename_dir(struct inode *
+ * we'll need to flip '..'.
+ */
+ if (new_dir != old_dir) {
+- error = permission(old_dentry->d_inode, MAY_WRITE, NULL);
++ error = permission(old_dentry->d_inode, MAY_WRITE, NULL, NULL);
+ if (error)
+ return error;
+ }
+@@ -2380,6 +2457,9 @@ int vfs_rename(struct inode *old_dir, st
+ int is_dir = S_ISDIR(old_dentry->d_inode->i_mode);
+ const char *old_name;
+
++ if (DQUOT_RENAME(old_dentry->d_inode, old_dir, new_dir))
++ return -EXDEV;
++
+ if (old_dentry->d_inode == new_dentry->d_inode)
+ return 0;
+
+diff -upr linux-2.6.16.orig/fs/namespace.c linux-2.6.16-026test015/fs/namespace.c
+--- linux-2.6.16.orig/fs/namespace.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/namespace.c 2006-07-04 14:41:39.000000000 +0400
+@@ -40,13 +40,15 @@ static inline int sysfs_init(void)
+
+ /* spinlock for vfsmount related operations, inplace of dcache_lock */
+ __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock);
++EXPORT_SYMBOL(vfsmount_lock);
+
+ static int event;
+
+ static struct list_head *mount_hashtable;
+ static int hash_mask __read_mostly, hash_bits __read_mostly;
+ static kmem_cache_t *mnt_cache;
+-static struct rw_semaphore namespace_sem;
++struct rw_semaphore namespace_sem;
++EXPORT_SYMBOL(namespace_sem);
+
+ /* /sys/fs */
+ decl_subsys(fs, NULL, NULL);
+@@ -65,6 +67,7 @@ struct vfsmount *alloc_vfsmnt(const char
+ struct vfsmount *mnt = kmem_cache_alloc(mnt_cache, GFP_KERNEL);
+ if (mnt) {
+ memset(mnt, 0, sizeof(struct vfsmount));
++ mnt->owner = VEID(get_exec_env());
+ atomic_set(&mnt->mnt_count, 1);
+ INIT_LIST_HEAD(&mnt->mnt_hash);
+ INIT_LIST_HEAD(&mnt->mnt_child);
+@@ -371,10 +374,32 @@ static int show_vfsmnt(struct seq_file *
+ { 0, NULL }
+ };
+ struct proc_fs_info *fs_infop;
++ char *path_buf, *path;
+
+- mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none");
++ /* skip FS_NOMOUNT mounts (rootfs) */
++ if (mnt->mnt_sb->s_flags & MS_NOUSER)
++ return 0;
++
++ path_buf = (char *) __get_free_page(GFP_KERNEL);
++ if (!path_buf)
++ return -ENOMEM;
++ path = d_path(mnt->mnt_root, mnt, path_buf, PAGE_SIZE);
++ if (IS_ERR(path)) {
++ free_page((unsigned long) path_buf);
++ /*
++ * This means that the file position will be incremented, i.e.
++ * the total number of "invisible" vfsmnt will leak.
++ */
++ return 0;
++ }
++
++ if (ve_is_super(get_exec_env()))
++ mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none");
++ else
++ mangle(m, mnt->mnt_sb->s_type->name);
+ seq_putc(m, ' ');
+- seq_path(m, mnt, mnt->mnt_root, " \t\n\\");
++ mangle(m, path);
++ free_page((unsigned long) path_buf);
+ seq_putc(m, ' ');
+ mangle(m, mnt->mnt_sb->s_type->name);
+ seq_puts(m, mnt->mnt_sb->s_flags & MS_RDONLY ? " ro" : " rw");
+@@ -474,6 +499,7 @@ void release_mounts(struct list_head *he
+ mntput(mnt);
+ }
+ }
++EXPORT_SYMBOL(release_mounts);
+
+ void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
+ {
+@@ -498,6 +524,7 @@ void umount_tree(struct vfsmount *mnt, i
+ change_mnt_propagation(p, MS_PRIVATE);
+ }
+ }
++EXPORT_SYMBOL(umount_tree);
+
+ static int do_umount(struct vfsmount *mnt, int flags)
+ {
+@@ -608,7 +635,7 @@ asmlinkage long sys_umount(char __user *
+ goto dput_and_out;
+
+ retval = -EPERM;
+- if (!capable(CAP_SYS_ADMIN))
++ if (!capable(CAP_VE_SYS_ADMIN))
+ goto dput_and_out;
+
+ retval = do_umount(nd.mnt, flags);
+@@ -632,7 +659,7 @@ asmlinkage long sys_oldumount(char __use
+
+ static int mount_is_safe(struct nameidata *nd)
+ {
+- if (capable(CAP_SYS_ADMIN))
++ if (capable(CAP_VE_SYS_ADMIN))
+ return 0;
+ return -EPERM;
+ #ifdef notyet
+@@ -642,7 +669,7 @@ static int mount_is_safe(struct nameidat
+ if (current->uid != nd->dentry->d_inode->i_uid)
+ return -EPERM;
+ }
+- if (vfs_permission(nd, MAY_WRITE))
++ if (vfs_permission(nd, MAY_WRITE, NULL))
+ return -EPERM;
+ return 0;
+ #endif
+@@ -848,6 +875,8 @@ static int do_change_type(struct nameida
+
+ if (nd->dentry != nd->mnt->mnt_root)
+ return -EINVAL;
++ if (!ve_accessible_veid(nd->mnt->owner, get_exec_env()->veid))
++ return -EPERM;
+
+ down_write(&namespace_sem);
+ spin_lock(&vfsmount_lock);
+@@ -917,7 +946,7 @@ static int do_remount(struct nameidata *
+ int err;
+ struct super_block *sb = nd->mnt->mnt_sb;
+
+- if (!capable(CAP_SYS_ADMIN))
++ if (!capable(CAP_VE_SYS_ADMIN))
+ return -EPERM;
+
+ if (!check_mnt(nd->mnt))
+@@ -926,6 +955,9 @@ static int do_remount(struct nameidata *
+ if (nd->dentry != nd->mnt->mnt_root)
+ return -EINVAL;
+
++ if (!ve_accessible_veid(nd->mnt->owner, get_exec_env()->veid))
++ return -EPERM;
++
+ down_write(&sb->s_umount);
+ err = do_remount_sb(sb, flags, data, 0);
+ if (!err)
+@@ -951,7 +983,7 @@ static int do_move_mount(struct nameidat
+ struct nameidata old_nd, parent_nd;
+ struct vfsmount *p;
+ int err = 0;
+- if (!capable(CAP_SYS_ADMIN))
++ if (!capable(CAP_VE_SYS_ADMIN))
+ return -EPERM;
+ if (!old_name || !*old_name)
+ return -EINVAL;
+@@ -959,6 +991,10 @@ static int do_move_mount(struct nameidat
+ if (err)
+ return err;
+
++ err = -EPERM;
++ if (!ve_accessible_veid(old_nd.mnt->owner, get_exec_env()->veid))
++ goto out_nosem;
++
+ down_write(&namespace_sem);
+ while (d_mountpoint(nd->dentry) && follow_down(&nd->mnt, &nd->dentry))
+ ;
+@@ -1014,6 +1050,7 @@ out:
+ up_write(&namespace_sem);
+ if (!err)
+ path_release(&parent_nd);
++out_nosem:
+ path_release(&old_nd);
+ return err;
+ }
+@@ -1031,7 +1068,7 @@ static int do_new_mount(struct nameidata
+ return -EINVAL;
+
+ /* we need capabilities... */
+- if (!capable(CAP_SYS_ADMIN))
++ if (!capable(CAP_VE_SYS_ADMIN))
+ return -EPERM;
+
+ mnt = do_kern_mount(type, flags, name, data);
+@@ -1072,6 +1109,10 @@ int do_add_mount(struct vfsmount *newmnt
+ if ((err = graft_tree(newmnt, nd)))
+ goto unlock;
+
++ if (newmnt->mnt_mountpoint->d_flags & DCACHE_VIRTUAL)
++ /* inaccessible yet - no lock */
++ newmnt->mnt_root->d_flags |= DCACHE_VIRTUAL;
++
+ if (fslist) {
+ /* add to the specified expiration list */
+ spin_lock(&vfsmount_lock);
+@@ -1469,6 +1510,7 @@ out1:
+ free_page(type_page);
+ return retval;
+ }
++EXPORT_SYMBOL_GPL(sys_mount);
+
+ /*
+ * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values.
+@@ -1520,7 +1562,7 @@ static void chroot_fs_refs(struct nameid
+ struct fs_struct *fs;
+
+ read_lock(&tasklist_lock);
+- do_each_thread(g, p) {
++ do_each_thread_ve(g, p) {
+ task_lock(p);
+ fs = p->fs;
+ if (fs) {
+@@ -1535,7 +1577,7 @@ static void chroot_fs_refs(struct nameid
+ put_fs_struct(fs);
+ } else
+ task_unlock(p);
+- } while_each_thread(g, p);
++ } while_each_thread_ve(g, p);
+ read_unlock(&tasklist_lock);
+ }
+
+@@ -1688,10 +1730,10 @@ static void __init init_mount_tree(void)
+
+ init_task.namespace = namespace;
+ read_lock(&tasklist_lock);
+- do_each_thread(g, p) {
++ do_each_thread_all(g, p) {
+ get_namespace(namespace);
+ p->namespace = namespace;
+- } while_each_thread(g, p);
++ } while_each_thread_all(g, p);
+ read_unlock(&tasklist_lock);
+
+ set_fs_pwd(current->fs, namespace->root, namespace->root->mnt_root);
+@@ -1707,7 +1749,8 @@ void __init mnt_init(unsigned long mempa
+ init_rwsem(&namespace_sem);
+
+ mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct vfsmount),
+- 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL, NULL);
++ 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_UBC,
++ NULL, NULL);
+
+ mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC);
+
+@@ -1763,3 +1806,4 @@ void __put_namespace(struct namespace *n
+ release_mounts(&umount_list);
+ kfree(namespace);
+ }
++EXPORT_SYMBOL_GPL(__put_namespace);
+diff -upr linux-2.6.16.orig/fs/nfs/dir.c linux-2.6.16-026test015/fs/nfs/dir.c
+--- linux-2.6.16.orig/fs/nfs/dir.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/nfs/dir.c 2006-07-04 14:41:37.000000000 +0400
+@@ -1635,7 +1635,8 @@ out:
+ return -EACCES;
+ }
+
+-int nfs_permission(struct inode *inode, int mask, struct nameidata *nd)
++int nfs_permission(struct inode *inode, int mask, struct nameidata *nd,
++ struct exec_perm *perm)
+ {
+ struct rpc_cred *cred;
+ int res = 0;
+@@ -1683,7 +1684,7 @@ out:
+ out_notsup:
+ res = nfs_revalidate_inode(NFS_SERVER(inode), inode);
+ if (res == 0)
+- res = generic_permission(inode, mask, NULL);
++ res = generic_permission(inode, mask, NULL, perm);
+ unlock_kernel();
+ return res;
+ }
+diff -upr linux-2.6.16.orig/fs/nfs/nfsroot.c linux-2.6.16-026test015/fs/nfs/nfsroot.c
+--- linux-2.6.16.orig/fs/nfs/nfsroot.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/nfs/nfsroot.c 2006-07-04 14:41:38.000000000 +0400
+@@ -312,7 +312,7 @@ static int __init root_nfs_name(char *na
+ /* Override them by options set on kernel command-line */
+ root_nfs_parse(name, buf);
+
+- cp = system_utsname.nodename;
++ cp = ve_utsname.nodename;
+ if (strlen(buf) + strlen(cp) > NFS_MAXPATHLEN) {
+ printk(KERN_ERR "Root-NFS: Pathname for remote directory too long.\n");
+ return -1;
+diff -upr linux-2.6.16.orig/fs/nfsd/nfs3proc.c linux-2.6.16-026test015/fs/nfsd/nfs3proc.c
+--- linux-2.6.16.orig/fs/nfsd/nfs3proc.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/nfsd/nfs3proc.c 2006-07-04 14:41:36.000000000 +0400
+@@ -682,7 +682,7 @@ static struct svc_procedure nfsd_proced
+ PROC(lookup, dirop, dirop, fhandle2, RC_NOCACHE, ST+FH+pAT+pAT),
+ PROC(access, access, access, fhandle, RC_NOCACHE, ST+pAT+1),
+ PROC(readlink, readlink, readlink, fhandle, RC_NOCACHE, ST+pAT+1+NFS3_MAXPATHLEN/4),
+- PROC(read, read, read, fhandle, RC_NOCACHE, ST+pAT+4+NFSSVC_MAXBLKSIZE),
++ PROC(read, read, read, fhandle, RC_NOCACHE, ST+pAT+4+NFSSVC_MAXBLKSIZE/4),
+ PROC(write, write, write, fhandle, RC_REPLBUFF, ST+WC+4),
+ PROC(create, create, create, fhandle2, RC_REPLBUFF, ST+(1+FH+pAT)+WC),
+ PROC(mkdir, mkdir, create, fhandle2, RC_REPLBUFF, ST+(1+FH+pAT)+WC),
+diff -upr linux-2.6.16.orig/fs/nfsd/nfs4proc.c linux-2.6.16-026test015/fs/nfsd/nfs4proc.c
+--- linux-2.6.16.orig/fs/nfsd/nfs4proc.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/nfsd/nfs4proc.c 2006-07-04 14:41:36.000000000 +0400
+@@ -975,7 +975,7 @@ struct nfsd4_voidargs { int dummy; };
+ */
+ static struct svc_procedure nfsd_procedures4[2] = {
+ PROC(null, void, void, void, RC_NOCACHE, 1),
+- PROC(compound, compound, compound, compound, RC_NOCACHE, NFSD_BUFSIZE)
++ PROC(compound, compound, compound, compound, RC_NOCACHE, NFSD_BUFSIZE/4)
+ };
+
+ struct svc_version nfsd_version4 = {
+diff -upr linux-2.6.16.orig/fs/nfsd/nfsfh.c linux-2.6.16-026test015/fs/nfsd/nfsfh.c
+--- linux-2.6.16.orig/fs/nfsd/nfsfh.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/nfsd/nfsfh.c 2006-07-04 14:41:37.000000000 +0400
+@@ -56,7 +56,7 @@ static int nfsd_acceptable(void *expv, s
+ /* make sure parents give x permission to user */
+ int err;
+ parent = dget_parent(tdentry);
+- err = permission(parent->d_inode, MAY_EXEC, NULL);
++ err = permission(parent->d_inode, MAY_EXEC, NULL, NULL);
+ if (err < 0) {
+ dput(parent);
+ break;
+diff -upr linux-2.6.16.orig/fs/nfsd/nfsproc.c linux-2.6.16-026test015/fs/nfsd/nfsproc.c
+--- linux-2.6.16.orig/fs/nfsd/nfsproc.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/nfsd/nfsproc.c 2006-07-04 14:41:36.000000000 +0400
+@@ -553,7 +553,7 @@ static struct svc_procedure nfsd_proced
+ PROC(none, void, void, none, RC_NOCACHE, ST),
+ PROC(lookup, diropargs, diropres, fhandle, RC_NOCACHE, ST+FH+AT),
+ PROC(readlink, readlinkargs, readlinkres, none, RC_NOCACHE, ST+1+NFS_MAXPATHLEN/4),
+- PROC(read, readargs, readres, fhandle, RC_NOCACHE, ST+AT+1+NFSSVC_MAXBLKSIZE),
++ PROC(read, readargs, readres, fhandle, RC_NOCACHE, ST+AT+1+NFSSVC_MAXBLKSIZE/4),
+ PROC(none, void, void, none, RC_NOCACHE, ST),
+ PROC(write, writeargs, attrstat, fhandle, RC_REPLBUFF, ST+AT),
+ PROC(create, createargs, diropres, fhandle, RC_REPLBUFF, ST+FH+AT),
+diff -upr linux-2.6.16.orig/fs/nfsd/vfs.c linux-2.6.16-026test015/fs/nfsd/vfs.c
+--- linux-2.6.16.orig/fs/nfsd/vfs.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/nfsd/vfs.c 2006-07-04 14:41:37.000000000 +0400
+@@ -1817,12 +1817,13 @@ nfsd_permission(struct svc_export *exp,
+ inode->i_uid == current->fsuid)
+ return 0;
+
+- err = permission(inode, acc & (MAY_READ|MAY_WRITE|MAY_EXEC), NULL);
++ err = permission(inode, acc & (MAY_READ|MAY_WRITE|MAY_EXEC),
++ NULL, NULL);
+
+ /* Allow read access to binaries even when mode 111 */
+ if (err == -EACCES && S_ISREG(inode->i_mode) &&
+ acc == (MAY_READ | MAY_OWNER_OVERRIDE))
+- err = permission(inode, MAY_EXEC, NULL);
++ err = permission(inode, MAY_EXEC, NULL, NULL);
+
+ return err? nfserrno(err) : 0;
+ }
+diff -upr linux-2.6.16.orig/fs/ntfs/file.c linux-2.6.16-026test015/fs/ntfs/file.c
+--- linux-2.6.16.orig/fs/ntfs/file.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/ntfs/file.c 2006-07-04 14:41:36.000000000 +0400
+@@ -1489,14 +1489,15 @@ static inline void ntfs_flush_dcache_pag
+ unsigned nr_pages)
+ {
+ BUG_ON(!nr_pages);
++ /*
++ * Warning: Do not do the decrement at the same time as the call to
++ * flush_dcache_page() because it is a NULL macro on i386 and hence the
++ * decrement never happens so the loop never terminates.
++ */
+ do {
+- /*
+- * Warning: Do not do the decrement at the same time as the
+- * call because flush_dcache_page() is a NULL macro on i386
+- * and hence the decrement never happens.
+- */
++ --nr_pages;
+ flush_dcache_page(pages[nr_pages]);
+- } while (--nr_pages > 0);
++ } while (nr_pages > 0);
+ }
+
+ /**
+diff -upr linux-2.6.16.orig/fs/ntfs/super.c linux-2.6.16-026test015/fs/ntfs/super.c
+--- linux-2.6.16.orig/fs/ntfs/super.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/ntfs/super.c 2006-07-04 14:41:37.000000000 +0400
+@@ -3033,7 +3033,7 @@ iput_tmp_ino_err_out_now:
+ * method again... FIXME: Do we need to do this twice now because of
+ * attribute inodes? I think not, so leave as is for now... (AIA)
+ */
+- if (invalidate_inodes(sb)) {
++ if (invalidate_inodes(sb, 0)) {
+ ntfs_error(sb, "Busy inodes left. This is most likely a NTFS "
+ "driver bug.");
+ /* Copied from fs/super.c. I just love this message. (-; */
+diff -upr linux-2.6.16.orig/fs/open.c linux-2.6.16-026test015/fs/open.c
+--- linux-2.6.16.orig/fs/open.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/open.c 2006-07-04 14:41:39.000000000 +0400
+@@ -25,6 +25,7 @@
+ #include <linux/fs.h>
+ #include <linux/personality.h>
+ #include <linux/pagemap.h>
++#include <linux/faudit.h>
+ #include <linux/syscalls.h>
+ #include <linux/rcupdate.h>
+
+@@ -51,7 +52,21 @@ int vfs_statfs(struct super_block *sb, s
+
+ EXPORT_SYMBOL(vfs_statfs);
+
+-static int vfs_statfs_native(struct super_block *sb, struct statfs *buf)
++int faudit_statfs(struct super_block *sb, struct kstatfs *buf)
++{
++ struct faudit_statfs_arg arg;
++
++ arg.sb = sb;
++ arg.stat = buf;
++
++ if (virtinfo_notifier_call(VITYPE_FAUDIT, VIRTINFO_FAUDIT_STATFS, &arg)
++ != NOTIFY_DONE)
++ return arg.err;
++ return 0;
++}
++
++static int vfs_statfs_native(struct super_block *sb, struct vfsmount *mnt,
++ struct statfs *buf)
+ {
+ struct kstatfs st;
+ int retval;
+@@ -60,6 +75,10 @@ static int vfs_statfs_native(struct supe
+ if (retval)
+ return retval;
+
++ retval = faudit_statfs(mnt->mnt_sb, &st);
++ if (retval)
++ return retval;
++
+ if (sizeof(*buf) == sizeof(st))
+ memcpy(buf, &st, sizeof(st));
+ else {
+@@ -94,7 +113,8 @@ static int vfs_statfs_native(struct supe
+ return 0;
+ }
+
+-static int vfs_statfs64(struct super_block *sb, struct statfs64 *buf)
++static int vfs_statfs64(struct super_block *sb, struct vfsmount *mnt,
++ struct statfs64 *buf)
+ {
+ struct kstatfs st;
+ int retval;
+@@ -103,6 +123,10 @@ static int vfs_statfs64(struct super_blo
+ if (retval)
+ return retval;
+
++ retval = faudit_statfs(mnt->mnt_sb, &st);
++ if (retval)
++ return retval;
++
+ if (sizeof(*buf) == sizeof(st))
+ memcpy(buf, &st, sizeof(st));
+ else {
+@@ -129,7 +153,8 @@ asmlinkage long sys_statfs(const char __
+ error = user_path_walk(path, &nd);
+ if (!error) {
+ struct statfs tmp;
+- error = vfs_statfs_native(nd.dentry->d_inode->i_sb, &tmp);
++ error = vfs_statfs_native(nd.dentry->d_inode->i_sb,
++ nd.mnt, &tmp);
+ if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
+ error = -EFAULT;
+ path_release(&nd);
+@@ -148,7 +173,8 @@ asmlinkage long sys_statfs64(const char
+ error = user_path_walk(path, &nd);
+ if (!error) {
+ struct statfs64 tmp;
+- error = vfs_statfs64(nd.dentry->d_inode->i_sb, &tmp);
++ error = vfs_statfs64(nd.dentry->d_inode->i_sb,
++ nd.mnt, &tmp);
+ if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
+ error = -EFAULT;
+ path_release(&nd);
+@@ -167,7 +193,8 @@ asmlinkage long sys_fstatfs(unsigned int
+ file = fget(fd);
+ if (!file)
+ goto out;
+- error = vfs_statfs_native(file->f_dentry->d_inode->i_sb, &tmp);
++ error = vfs_statfs_native(file->f_dentry->d_inode->i_sb,
++ file->f_vfsmnt, &tmp);
+ if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
+ error = -EFAULT;
+ fput(file);
+@@ -188,7 +215,8 @@ asmlinkage long sys_fstatfs64(unsigned i
+ file = fget(fd);
+ if (!file)
+ goto out;
+- error = vfs_statfs64(file->f_dentry->d_inode->i_sb, &tmp);
++ error = vfs_statfs64(file->f_dentry->d_inode->i_sb,
++ file->f_vfsmnt, &tmp);
+ if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
+ error = -EFAULT;
+ fput(file);
+@@ -243,7 +271,7 @@ static long do_sys_truncate(const char _
+ if (!S_ISREG(inode->i_mode))
+ goto dput_and_out;
+
+- error = vfs_permission(&nd, MAY_WRITE);
++ error = vfs_permission(&nd, MAY_WRITE, NULL);
+ if (error)
+ goto dput_and_out;
+
+@@ -330,7 +358,10 @@ out:
+
+ asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length)
+ {
+- return do_sys_ftruncate(fd, length, 1);
++ long ret = do_sys_ftruncate(fd, length, 1);
++ /* avoid REGPARM breakage on x86: */
++ prevent_tail_call(ret);
++ return ret;
+ }
+
+ /* LFS versions of truncate are only needed on 32 bit machines */
+@@ -342,7 +373,10 @@ asmlinkage long sys_truncate64(const cha
+
+ asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length)
+ {
+- return do_sys_ftruncate(fd, length, 0);
++ long ret = do_sys_ftruncate(fd, length, 0);
++ /* avoid REGPARM breakage on x86: */
++ prevent_tail_call(ret);
++ return ret;
+ }
+ #endif
+
+@@ -397,7 +431,7 @@ asmlinkage long sys_utime(char __user *
+ goto dput_and_out;
+
+ if (current->fsuid != inode->i_uid &&
+- (error = vfs_permission(&nd, MAY_WRITE)) != 0)
++ (error = vfs_permission(&nd, MAY_WRITE, NULL)) != 0)
+ goto dput_and_out;
+ }
+ mutex_lock(&inode->i_mutex);
+@@ -450,7 +484,7 @@ long do_utimes(int dfd, char __user *fil
+ goto dput_and_out;
+
+ if (current->fsuid != inode->i_uid &&
+- (error = vfs_permission(&nd, MAY_WRITE)) != 0)
++ (error = vfs_permission(&nd, MAY_WRITE, NULL)) != 0)
+ goto dput_and_out;
+ }
+ mutex_lock(&inode->i_mutex);
+@@ -514,7 +548,7 @@ asmlinkage long sys_faccessat(int dfd, c
+
+ res = __user_walk_fd(dfd, filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd);
+ if (!res) {
+- res = vfs_permission(&nd, mode);
++ res = vfs_permission(&nd, mode, NULL);
+ /* SuS v2 requires we report a read only fs too */
+ if(!res && (mode & S_IWOTH) && IS_RDONLY(nd.dentry->d_inode)
+ && !special_file(nd.dentry->d_inode->i_mode))
+@@ -543,7 +577,7 @@ asmlinkage long sys_chdir(const char __u
+ if (error)
+ goto out;
+
+- error = vfs_permission(&nd, MAY_EXEC);
++ error = vfs_permission(&nd, MAY_EXEC, NULL);
+ if (error)
+ goto dput_and_out;
+
+@@ -594,7 +628,7 @@ asmlinkage long sys_chroot(const char __
+ if (error)
+ goto out;
+
+- error = vfs_permission(&nd, MAY_EXEC);
++ error = vfs_permission(&nd, MAY_EXEC, NULL);
+ if (error)
+ goto dput_and_out;
+
+@@ -733,6 +767,7 @@ asmlinkage long sys_chown(const char __u
+ }
+ return error;
+ }
++EXPORT_SYMBOL_GPL(sys_chown);
+
+ asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user,
+ gid_t group, int flag)
+@@ -1083,20 +1118,30 @@ long do_sys_open(int dfd, const char __u
+
+ asmlinkage long sys_open(const char __user *filename, int flags, int mode)
+ {
++ long ret;
++
+ if (force_o_largefile())
+ flags |= O_LARGEFILE;
+
+- return do_sys_open(AT_FDCWD, filename, flags, mode);
++ ret = do_sys_open(AT_FDCWD, filename, flags, mode);
++ /* avoid REGPARM breakage on x86: */
++ prevent_tail_call(ret);
++ return ret;
+ }
+ EXPORT_SYMBOL_GPL(sys_open);
+
+ asmlinkage long sys_openat(int dfd, const char __user *filename, int flags,
+ int mode)
+ {
++ long ret;
++
+ if (force_o_largefile())
+ flags |= O_LARGEFILE;
+
+- return do_sys_open(dfd, filename, flags, mode);
++ ret = do_sys_open(dfd, filename, flags, mode);
++ /* avoid REGPARM breakage on x86: */
++ prevent_tail_call(ret);
++ return ret;
+ }
+ EXPORT_SYMBOL_GPL(sys_openat);
+
+diff -upr linux-2.6.16.orig/fs/partitions/check.c linux-2.6.16-026test015/fs/partitions/check.c
+--- linux-2.6.16.orig/fs/partitions/check.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/partitions/check.c 2006-07-04 14:41:38.000000000 +0400
+@@ -128,6 +128,7 @@ char *disk_name(struct gendisk *hd, int
+
+ return buf;
+ }
++EXPORT_SYMBOL(disk_name);
+
+ const char *bdevname(struct block_device *bdev, char *buf)
+ {
+@@ -345,6 +346,7 @@ static char *make_block_name(struct gend
+ char *name;
+ static char *block_str = "block:";
+ int size;
++ char *s;
+
+ size = strlen(block_str) + strlen(disk->disk_name) + 1;
+ name = kmalloc(size, GFP_KERNEL);
+@@ -352,6 +354,10 @@ static char *make_block_name(struct gend
+ return NULL;
+ strcpy(name, block_str);
+ strcat(name, disk->disk_name);
++ /* ewww... some of these buggers have / in name... */
++ s = strchr(name, '/');
++ if (s)
++ *s = '!';
+ return name;
+ }
+
+diff -upr linux-2.6.16.orig/fs/pipe.c linux-2.6.16-026test015/fs/pipe.c
+--- linux-2.6.16.orig/fs/pipe.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/pipe.c 2006-07-04 14:41:39.000000000 +0400
+@@ -19,6 +19,8 @@
+ #include <asm/uaccess.h>
+ #include <asm/ioctls.h>
+
++#include <ub/ub_mem.h>
++
+ /*
+ * We use a start+len construction, which provides full use of the
+ * allocated memory.
+@@ -284,7 +286,7 @@ pipe_writev(struct file *filp, const str
+ int error;
+
+ if (!page) {
+- page = alloc_page(GFP_HIGHUSER);
++ page = alloc_page(GFP_HIGHUSER | __GFP_UBC);
+ if (unlikely(!page)) {
+ ret = ret ? : -ENOMEM;
+ break;
+@@ -662,7 +664,7 @@ struct inode* pipe_new(struct inode* ino
+ {
+ struct pipe_inode_info *info;
+
+- info = kmalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
++ info = ub_kmalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
+ if (!info)
+ goto fail_page;
+ memset(info, 0, sizeof(*info));
+@@ -797,6 +799,7 @@ close_f1:
+ no_files:
+ return error;
+ }
++EXPORT_SYMBOL_GPL(do_pipe);
+
+ /*
+ * pipefs should _never_ be mounted by userland - too much of security hassle,
+diff -upr linux-2.6.16.orig/fs/proc/array.c linux-2.6.16-026test015/fs/proc/array.c
+--- linux-2.6.16.orig/fs/proc/array.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/proc/array.c 2006-07-04 14:41:39.000000000 +0400
+@@ -75,6 +75,9 @@
+ #include <linux/times.h>
+ #include <linux/cpuset.h>
+ #include <linux/rcupdate.h>
++#include <linux/fairsched.h>
++
++#include <ub/beancounter.h>
+
+ #include <asm/uaccess.h>
+ #include <asm/pgtable.h>
+@@ -161,8 +164,13 @@ static inline char * task_state(struct t
+ struct group_info *group_info;
+ int g;
+ struct fdtable *fdt = NULL;
++ pid_t pid, ppid, tgid;
++
++ pid = get_task_pid(p);
++ tgid = get_task_tgid(p);
+
+ read_lock(&tasklist_lock);
++ ppid = get_task_ppid(p);
+ buffer += sprintf(buffer,
+ "State:\t%s\n"
+ "SleepAVG:\t%lu%%\n"
+@@ -170,13 +178,19 @@ static inline char * task_state(struct t
+ "Pid:\t%d\n"
+ "PPid:\t%d\n"
+ "TracerPid:\t%d\n"
++#ifdef CONFIG_FAIRSCHED
++ "FNid:\t%d\n"
++#endif
+ "Uid:\t%d\t%d\t%d\t%d\n"
+ "Gid:\t%d\t%d\t%d\t%d\n",
+ get_task_state(p),
+ (p->sleep_avg/1024)*100/(1020000000/1024),
+- p->tgid,
+- p->pid, pid_alive(p) ? p->group_leader->real_parent->tgid : 0,
+- pid_alive(p) && p->ptrace ? p->parent->pid : 0,
++ tgid,
++ pid, ppid,
++ pid_alive(p) && p->ptrace ? get_task_pid(p->parent) : 0,
++#ifdef CONFIG_FAIRSCHED
++ task_fairsched_node_id(p),
++#endif
+ p->uid, p->euid, p->suid, p->fsuid,
+ p->gid, p->egid, p->sgid, p->fsgid);
+ read_unlock(&tasklist_lock);
+@@ -199,6 +213,18 @@ static inline char * task_state(struct t
+ put_group_info(group_info);
+
+ buffer += sprintf(buffer, "\n");
++
++#ifdef CONFIG_VE
++ buffer += sprintf(buffer,
++ "envID:\t%d\n"
++ "VPid:\t%d\n"
++ "PNState:\t%u\n"
++ "StopState:\t%u\n",
++ VE_TASK_INFO(p)->owner_env->veid,
++ virt_pid(p),
++ p->pn_state,
++ p->stopped_state);
++#endif
+ return buffer;
+ }
+
+@@ -244,7 +270,7 @@ static void collect_sigign_sigcatch(stru
+
+ static inline char * task_sig(struct task_struct *p, char *buffer)
+ {
+- sigset_t pending, shpending, blocked, ignored, caught;
++ sigset_t pending, shpending, blocked, ignored, caught, saved;
+ int num_threads = 0;
+ unsigned long qsize = 0;
+ unsigned long qlim = 0;
+@@ -254,6 +280,7 @@ static inline char * task_sig(struct tas
+ sigemptyset(&blocked);
+ sigemptyset(&ignored);
+ sigemptyset(&caught);
++ sigemptyset(&saved);
+
+ /* Gather all the data with the appropriate locks held */
+ read_lock(&tasklist_lock);
+@@ -262,6 +289,7 @@ static inline char * task_sig(struct tas
+ pending = p->pending.signal;
+ shpending = p->signal->shared_pending.signal;
+ blocked = p->blocked;
++ saved = p->saved_sigmask;
+ collect_sigign_sigcatch(p, &ignored, &caught);
+ num_threads = atomic_read(&p->signal->count);
+ qsize = atomic_read(&p->user->sigpending);
+@@ -279,6 +307,7 @@ static inline char * task_sig(struct tas
+ buffer = render_sigset_t("SigBlk:\t", &blocked, buffer);
+ buffer = render_sigset_t("SigIgn:\t", &ignored, buffer);
+ buffer = render_sigset_t("SigCgt:\t", &caught, buffer);
++ buffer = render_sigset_t("SigSvd:\t", &saved, buffer);
+
+ return buffer;
+ }
+@@ -293,10 +322,27 @@ static inline char *task_cap(struct task
+ cap_t(p->cap_effective));
+ }
+
++#ifdef CONFIG_USER_RESOURCE
++static inline void ub_dump_task_info(struct task_struct *tsk,
++ char *stsk, int ltsk, char *smm, int lmm)
++{
++ print_ub_uid(tsk->task_bc.task_ub, stsk, ltsk);
++ task_lock(tsk);
++ if (tsk->mm)
++ print_ub_uid(tsk->mm->mm_ub, smm, lmm);
++ else
++ strncpy(smm, "N/A", lmm);
++ task_unlock(tsk);
++}
++#endif
++
+ int proc_pid_status(struct task_struct *task, char * buffer)
+ {
+ char * orig = buffer;
+ struct mm_struct *mm = get_task_mm(task);
++#ifdef CONFIG_USER_RESOURCE
++ char tsk_ub_info[64], mm_ub_info[64];
++#endif
+
+ buffer = task_name(task, buffer);
+ buffer = task_state(task, buffer);
+@@ -311,6 +357,14 @@ int proc_pid_status(struct task_struct *
+ #if defined(CONFIG_S390)
+ buffer = task_show_regs(task, buffer);
+ #endif
++#ifdef CONFIG_USER_RESOURCE
++ ub_dump_task_info(task,
++ tsk_ub_info, sizeof(tsk_ub_info),
++ mm_ub_info, sizeof(mm_ub_info));
++
++ buffer += sprintf(buffer, "TaskUB:\t%s\n", tsk_ub_info);
++ buffer += sprintf(buffer, "MMUB:\t%s\n", mm_ub_info);
++#endif
+ return buffer - orig;
+ }
+
+@@ -333,6 +387,10 @@ static int do_task_stat(struct task_stru
+ DEFINE_KTIME(it_real_value);
+ struct task_struct *t;
+ char tcomm[sizeof(task->comm)];
++#ifdef CONFIG_USER_RESOURCE
++ char ub_task_info[64];
++ char ub_mm_info[64];
++#endif
+
+ state = *get_task_state(task);
+ vsize = eip = esp = 0;
+@@ -370,11 +428,12 @@ static int do_task_stat(struct task_stru
+ }
+ if (task->signal) {
+ if (task->signal->tty) {
+- tty_pgrp = task->signal->tty->pgrp;
++ tty_pgrp = pid_type_to_vpid(PIDTYPE_PGID,
++ task->signal->tty->pgrp);
+ tty_nr = new_encode_dev(tty_devnum(task->signal->tty));
+ }
+- pgid = process_group(task);
+- sid = task->signal->session;
++ pgid = get_task_pgid(task);
++ sid = get_task_sid(task);
+ cmin_flt = task->signal->cmin_flt;
+ cmaj_flt = task->signal->cmaj_flt;
+ cutime = task->signal->cutime;
+@@ -388,7 +447,7 @@ static int do_task_stat(struct task_stru
+ }
+ it_real_value = task->signal->real_timer.expires;
+ }
+- ppid = pid_alive(task) ? task->group_leader->real_parent->tgid : 0;
++ ppid = get_task_ppid(task);
+ read_unlock(&tasklist_lock);
+
+ if (!whole || num_threads<2)
+@@ -407,14 +466,34 @@ static int do_task_stat(struct task_stru
+
+ /* Temporary variable needed for gcc-2.96 */
+ /* convert timespec -> nsec*/
++#ifndef CONFIG_VE
+ start_time = (unsigned long long)task->start_time.tv_sec * NSEC_PER_SEC
+ + task->start_time.tv_nsec;
++#else
++ start_time = (unsigned long long)(task->start_time.tv_sec -
++ get_exec_env()->init_entry->start_time.tv_sec) *
++ NSEC_PER_SEC + task->start_time.tv_nsec -
++ get_exec_env()->init_entry->start_time.tv_nsec;
++#endif
+ /* convert nsec -> ticks */
+ start_time = nsec_to_clock_t(start_time);
+
++#ifdef CONFIG_USER_RESOURCE
++ ub_dump_task_info(task,
++ ub_task_info, sizeof(ub_task_info),
++ ub_mm_info, sizeof(ub_mm_info));
++#endif
++
+ res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \
+ %lu %lu %lu %lu %lu %ld %ld %ld %ld %d %ld %llu %lu %ld %lu %lu %lu %lu %lu \
+-%lu %lu %lu %lu %lu %lu %lu %lu %d %d %lu %lu\n",
++%lu %lu %lu %lu %lu %lu %lu %lu %d %d %lu %lu"
++#ifdef CONFIG_VE
++"0 0 0 0 0 0 0 0 %d %u"
++#endif
++#ifdef CONFIG_USER_RESOURCE
++ " %s %s"
++#endif
++ "\n",
+ task->pid,
+ tcomm,
+ state,
+@@ -459,7 +538,16 @@ static int do_task_stat(struct task_stru
+ task->exit_signal,
+ task_cpu(task),
+ task->rt_priority,
+- task->policy);
++ task->policy
++#ifdef CONFIG_VE
++ , virt_pid(task),
++ VEID(VE_TASK_INFO(task)->owner_env)
++#endif
++#ifdef CONFIG_USER_RESOURCE
++ , ub_task_info,
++ ub_mm_info
++#endif
++ );
+ if(mm)
+ mmput(mm);
+ return res;
+diff -upr linux-2.6.16.orig/fs/proc/base.c linux-2.6.16-026test015/fs/proc/base.c
+--- linux-2.6.16.orig/fs/proc/base.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/proc/base.c 2006-07-04 14:41:38.000000000 +0400
+@@ -291,22 +291,29 @@ static int proc_fd_link(struct inode *in
+ struct files_struct *files;
+ struct file *file;
+ int fd = proc_type(inode) - PROC_TID_FD_DIR;
++ int err = -ENOENT;
+
+ files = get_files_struct(task);
+ if (files) {
+- rcu_read_lock();
++ /*
++ * We are not taking a ref to the file structure, so we must
++ * hold ->file_lock.
++ */
++ spin_lock(&files->file_lock);
+ file = fcheck_files(files, fd);
+ if (file) {
+- *mnt = mntget(file->f_vfsmnt);
+- *dentry = dget(file->f_dentry);
+- rcu_read_unlock();
+- put_files_struct(files);
+- return 0;
++ if (d_root_check(file->f_dentry, file->f_vfsmnt)) {
++ err = -EACCES;
++ } else {
++ *mnt = mntget(file->f_vfsmnt);
++ *dentry = dget(file->f_dentry);
++ err = 0;
++ }
+ }
+- rcu_read_unlock();
++ spin_unlock(&files->file_lock);
+ put_files_struct(files);
+ }
+- return -ENOENT;
++ return err;
+ }
+
+ static struct fs_struct *get_fs_struct(struct task_struct *task)
+@@ -326,10 +333,12 @@ static int proc_cwd_link(struct inode *i
+ int result = -ENOENT;
+ if (fs) {
+ read_lock(&fs->lock);
+- *mnt = mntget(fs->pwdmnt);
+- *dentry = dget(fs->pwd);
++ result = d_root_check(fs->pwd, fs->pwdmnt);
++ if (!result) {
++ *mnt = mntget(fs->pwdmnt);
++ *dentry = dget(fs->pwd);
++ }
+ read_unlock(&fs->lock);
+- result = 0;
+ put_fs_struct(fs);
+ }
+ return result;
+@@ -579,19 +588,21 @@ static int proc_check_root(struct inode
+ return proc_check_chroot(root, vfsmnt);
+ }
+
+-static int proc_permission(struct inode *inode, int mask, struct nameidata *nd)
++static int proc_permission(struct inode *inode, int mask, struct nameidata *nd,
++ struct exec_perm *perm)
+ {
+- if (generic_permission(inode, mask, NULL) != 0)
++ if (generic_permission(inode, mask, NULL, perm) != 0)
+ return -EACCES;
+ return proc_check_root(inode);
+ }
+
+-static int proc_task_permission(struct inode *inode, int mask, struct nameidata *nd)
++static int proc_task_permission(struct inode *inode, int mask, struct nameidata *nd,
++ struct exec_perm *perm)
+ {
+ struct dentry *root;
+ struct vfsmount *vfsmnt;
+
+- if (generic_permission(inode, mask, NULL) != 0)
++ if (generic_permission(inode, mask, NULL, perm) != 0)
+ return -EACCES;
+
+ if (proc_task_root_link(inode, &root, &vfsmnt))
+@@ -1303,6 +1314,10 @@ static struct inode *proc_pid_make_inode
+ struct inode * inode;
+ struct proc_inode *ei;
+
++ if (!ve_accessible(VE_TASK_INFO(task)->owner_env,
++ VE_OWNER_FSTYPE(sb->s_type)))
++ return NULL;
++
+ /* We need a new inode */
+
+ inode = new_inode(sb);
+@@ -1406,6 +1421,10 @@ static void pid_base_iput(struct dentry
+ spin_lock(&task->proc_lock);
+ if (task->proc_dentry == dentry)
+ task->proc_dentry = NULL;
++#ifdef CONFIG_VE
++ if (VE_TASK_INFO(task)->glob_proc_dentry == dentry)
++ VE_TASK_INFO(task)->glob_proc_dentry = NULL;
++#endif
+ spin_unlock(&task->proc_lock);
+ iput(inode);
+ }
+@@ -1485,7 +1504,12 @@ static struct dentry *proc_lookupfd(stru
+ if (!files)
+ goto out_unlock;
+ inode->i_mode = S_IFLNK;
+- rcu_read_lock();
++
++ /*
++ * We are not taking a ref to the file structure, so we must
++ * hold ->file_lock.
++ */
++ spin_lock(&files->file_lock);
+ file = fcheck_files(files, fd);
+ if (!file)
+ goto out_unlock2;
+@@ -1493,7 +1517,7 @@ static struct dentry *proc_lookupfd(stru
+ inode->i_mode |= S_IRUSR | S_IXUSR;
+ if (file->f_mode & 2)
+ inode->i_mode |= S_IWUSR | S_IXUSR;
+- rcu_read_unlock();
++ spin_unlock(&files->file_lock);
+ put_files_struct(files);
+ inode->i_op = &proc_pid_link_inode_operations;
+ inode->i_size = 64;
+@@ -1503,7 +1527,7 @@ static struct dentry *proc_lookupfd(stru
+ return NULL;
+
+ out_unlock2:
+- rcu_read_unlock();
++ spin_unlock(&files->file_lock);
+ put_files_struct(files);
+ out_unlock:
+ iput(inode);
+@@ -1879,14 +1903,14 @@ static int proc_self_readlink(struct den
+ int buflen)
+ {
+ char tmp[30];
+- sprintf(tmp, "%d", current->tgid);
++ sprintf(tmp, "%d", get_task_tgid(current));
+ return vfs_readlink(dentry,buffer,buflen,tmp);
+ }
+
+ static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
+ {
+ char tmp[30];
+- sprintf(tmp, "%d", current->tgid);
++ sprintf(tmp, "%d", get_task_tgid(current));
+ return ERR_PTR(vfs_follow_link(nd,tmp));
+ }
+
+@@ -1911,11 +1935,8 @@ static struct inode_operations proc_self
+ * of PIDTYPE_PID.
+ */
+
+-struct dentry *proc_pid_unhash(struct task_struct *p)
++struct dentry *__proc_pid_unhash(struct task_struct *p, struct dentry *proc_dentry)
+ {
+- struct dentry *proc_dentry;
+-
+- proc_dentry = p->proc_dentry;
+ if (proc_dentry != NULL) {
+
+ spin_lock(&dcache_lock);
+@@ -1933,6 +1954,14 @@ struct dentry *proc_pid_unhash(struct ta
+ return proc_dentry;
+ }
+
++void proc_pid_unhash(struct task_struct *p, struct dentry *pd[2])
++{
++ pd[0] = __proc_pid_unhash(p, p->proc_dentry);
++#ifdef CONFIG_VE
++ pd[1] = __proc_pid_unhash(p, VE_TASK_INFO(p)->glob_proc_dentry);
++#endif
++}
++
+ /**
+ * proc_pid_flush - recover memory used by stale /proc/@pid/x entries
+ * @proc_dentry: directoy to prune.
+@@ -1940,7 +1969,7 @@ struct dentry *proc_pid_unhash(struct ta
+ * Shrink the /proc directory that was used by the just killed thread.
+ */
+
+-void proc_pid_flush(struct dentry *proc_dentry)
++void __proc_pid_flush(struct dentry *proc_dentry)
+ {
+ might_sleep();
+ if(proc_dentry != NULL) {
+@@ -1949,12 +1978,21 @@ void proc_pid_flush(struct dentry *proc_
+ }
+ }
+
++void proc_pid_flush(struct dentry *proc_dentry[2])
++{
++ __proc_pid_flush(proc_dentry[0]);
++#ifdef CONFIG_VE
++ __proc_pid_flush(proc_dentry[1]);
++#endif
++}
++
+ /* SMP-safe */
+ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
+ {
+ struct task_struct *task;
+ struct inode *inode;
+ struct proc_inode *ei;
++ struct dentry *pd[2];
+ unsigned tgid;
+ int died;
+
+@@ -1978,7 +2016,19 @@ struct dentry *proc_pid_lookup(struct in
+ goto out;
+
+ read_lock(&tasklist_lock);
+- task = find_task_by_pid(tgid);
++ task = find_task_by_pid_ve(tgid);
++ /* In theory we are allowed to lookup both /proc/VIRT_PID and
++ * /proc/GLOBAL_PID inside VE. However, current /proc implementation
++ * cannot maintain two references to one task, so that we have
++ * to prohibit /proc/GLOBAL_PID.
++ */
++ if (task && !ve_is_super(get_exec_env()) && !is_virtual_pid(tgid)) {
++ /* However, VE_ENTERed tasks are exception, they use global
++ * pids.
++ */
++ if (virt_pid(task) != tgid)
++ task = NULL;
++ }
+ if (task)
+ get_task_struct(task);
+ read_unlock(&tasklist_lock);
+@@ -2007,16 +2057,23 @@ struct dentry *proc_pid_lookup(struct in
+ died = 0;
+ d_add(dentry, inode);
+ spin_lock(&task->proc_lock);
++#ifdef CONFIG_VE
++ if (ve_is_super(VE_OWNER_FSTYPE(inode->i_sb->s_type)))
++ VE_TASK_INFO(task)->glob_proc_dentry = dentry;
++ else
++ task->proc_dentry = dentry;
++#else
+ task->proc_dentry = dentry;
++#endif
+ if (!pid_alive(task)) {
+- dentry = proc_pid_unhash(task);
++ proc_pid_unhash(task, pd);
+ died = 1;
+ }
+ spin_unlock(&task->proc_lock);
+
+ put_task_struct(task);
+ if (died) {
+- proc_pid_flush(dentry);
++ proc_pid_flush(pd);
+ goto out;
+ }
+ return NULL;
+@@ -2037,7 +2094,12 @@ static struct dentry *proc_task_lookup(s
+ goto out;
+
+ read_lock(&tasklist_lock);
+- task = find_task_by_pid(tid);
++ task = find_task_by_pid_ve(tid);
++ /* See comment above in similar place. */
++ if (task && !ve_is_super(get_exec_env()) && !is_virtual_pid(tid)) {
++ if (virt_pid(task) != tid)
++ task = NULL;
++ }
+ if (task)
+ get_task_struct(task);
+ read_unlock(&tasklist_lock);
+@@ -2081,16 +2143,23 @@ out:
+ * tasklist lock while doing this, and we must release it before
+ * we actually do the filldir itself, so we use a temp buffer..
+ */
+-static int get_tgid_list(int index, unsigned long version, unsigned int *tgids)
++static int get_tgid_list(int index, unsigned long version, unsigned int *tgids,
++ struct ve_struct *ve)
+ {
+ struct task_struct *p;
+ int nr_tgids = 0;
+
+ index--;
+ read_lock(&tasklist_lock);
++ if (list_empty(&ve->vetask_lh))
++ goto out;
+ p = NULL;
+ if (version) {
+- p = find_task_by_pid(version);
++ struct ve_struct *oldve;
++
++ oldve = set_exec_env(ve);
++ p = find_task_by_pid_ve(version);
++ (void)set_exec_env(oldve);
+ if (p && !thread_group_leader(p))
+ p = NULL;
+ }
+@@ -2098,10 +2167,10 @@ static int get_tgid_list(int index, unsi
+ if (p)
+ index = 0;
+ else
+- p = next_task(&init_task);
++ p = __first_task_ve(ve);
+
+- for ( ; p != &init_task; p = next_task(p)) {
+- int tgid = p->pid;
++ for ( ; p != NULL; p = __next_task_ve(ve, p)) {
++ int tgid = get_task_pid_ve(p, ve);
+ if (!pid_alive(p))
+ continue;
+ if (--index >= 0)
+@@ -2111,6 +2180,7 @@ static int get_tgid_list(int index, unsi
+ if (nr_tgids >= PROC_MAXPIDS)
+ break;
+ }
++out:
+ read_unlock(&tasklist_lock);
+ return nr_tgids;
+ }
+@@ -2134,7 +2204,7 @@ static int get_tid_list(int index, unsig
+ * via next_thread().
+ */
+ if (pid_alive(task)) do {
+- int tid = task->pid;
++ int tid = get_task_pid(task);
+
+ if (--index >= 0)
+ continue;
+@@ -2171,7 +2241,8 @@ int proc_pid_readdir(struct file * filp,
+ next_tgid = filp->f_version;
+ filp->f_version = 0;
+ for (;;) {
+- nr_tgids = get_tgid_list(nr, next_tgid, tgid_array);
++ nr_tgids = get_tgid_list(nr, next_tgid, tgid_array,
++ filp->f_dentry->d_sb->s_type->owner_env);
+ if (!nr_tgids) {
+ /* no more entries ! */
+ break;
+diff -upr linux-2.6.16.orig/fs/proc/generic.c linux-2.6.16-026test015/fs/proc/generic.c
+--- linux-2.6.16.orig/fs/proc/generic.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/proc/generic.c 2006-07-04 14:41:38.000000000 +0400
+@@ -10,7 +10,9 @@
+
+ #include <linux/errno.h>
+ #include <linux/time.h>
++#include <linux/fs.h>
+ #include <linux/proc_fs.h>
++#include <linux/ve_owner.h>
+ #include <linux/stat.h>
+ #include <linux/module.h>
+ #include <linux/mount.h>
+@@ -29,6 +31,8 @@ static ssize_t proc_file_write(struct fi
+ size_t count, loff_t *ppos);
+ static loff_t proc_file_lseek(struct file *, loff_t, int);
+
++static DEFINE_RWLOCK(proc_tree_lock);
++
+ int proc_match(int len, const char *name, struct proc_dir_entry *de)
+ {
+ if (de->namelen != len)
+@@ -229,6 +233,7 @@ proc_file_lseek(struct file *file, loff_
+ return retval;
+ }
+
++#ifndef CONFIG_VE
+ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
+ {
+ struct inode *inode = dentry->d_inode;
+@@ -261,9 +266,12 @@ static int proc_getattr(struct vfsmount
+ generic_fillattr(inode, stat);
+ return 0;
+ }
++#endif
+
+ static struct inode_operations proc_file_inode_operations = {
++#ifndef CONFIG_VE
+ .setattr = proc_notify_change,
++#endif
+ };
+
+ /*
+@@ -271,14 +279,20 @@ static struct inode_operations proc_file
+ * returns the struct proc_dir_entry for "/proc/tty/driver", and
+ * returns "serial" in residual.
+ */
+-static int xlate_proc_name(const char *name,
++static int __xlate_proc_name(struct proc_dir_entry *root, const char *name,
+ struct proc_dir_entry **ret, const char **residual)
+ {
+ const char *cp = name, *next;
+ struct proc_dir_entry *de;
+ int len;
+
+- de = &proc_root;
++ if (*ret) {
++ de_get(*ret);
++ return 0;
++ }
++
++ read_lock(&proc_tree_lock);
++ de = root;
+ while (1) {
+ next = strchr(cp, '/');
+ if (!next)
+@@ -289,15 +303,35 @@ static int xlate_proc_name(const char *n
+ if (proc_match(len, cp, de))
+ break;
+ }
+- if (!de)
++ if (!de) {
++ read_unlock(&proc_tree_lock);
+ return -ENOENT;
++ }
+ cp += len + 1;
+ }
+ *residual = cp;
+- *ret = de;
++ *ret = de_get(de);
++ read_unlock(&proc_tree_lock);
+ return 0;
+ }
+
++#ifndef CONFIG_VE
++#define xlate_proc_loc_name xlate_proc_name
++#else
++static int xlate_proc_loc_name(const char *name,
++ struct proc_dir_entry **ret, const char **residual)
++{
++ return __xlate_proc_name(get_exec_env()->proc_root,
++ name, ret, residual);
++}
++#endif
++
++static int xlate_proc_name(const char *name,
++ struct proc_dir_entry **ret, const char **residual)
++{
++ return __xlate_proc_name(&proc_root, name, ret, residual);
++}
++
+ static DEFINE_IDR(proc_inum_idr);
+ static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */
+
+@@ -369,6 +403,20 @@ static struct dentry_operations proc_den
+ .d_delete = proc_delete_dentry,
+ };
+
++static struct proc_dir_entry *__proc_lookup(struct proc_dir_entry *dir,
++ struct dentry *d)
++{
++ struct proc_dir_entry *de;
++
++ for (de = dir->subdir; de; de = de->next) {
++ if (de->namelen != d->d_name.len)
++ continue;
++ if (!memcmp(d->d_name.name, de->name, de->namelen))
++ break;
++ }
++ return de_get(de);
++}
++
+ /*
+ * Don't create negative dentries here, return -ENOENT by hand
+ * instead.
+@@ -376,34 +424,147 @@ static struct dentry_operations proc_den
+ struct dentry *proc_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd)
+ {
+ struct inode *inode = NULL;
+- struct proc_dir_entry * de;
++ struct proc_dir_entry *lde, *gde;
+ int error = -ENOENT;
+
+ lock_kernel();
+- de = PDE(dir);
+- if (de) {
+- for (de = de->subdir; de ; de = de->next) {
+- if (de->namelen != dentry->d_name.len)
+- continue;
+- if (!memcmp(dentry->d_name.name, de->name, de->namelen)) {
+- unsigned int ino = de->low_ino;
++ lde = LPDE(dir);
+
+- error = -EINVAL;
+- inode = proc_get_inode(dir->i_sb, ino, de);
+- break;
+- }
+- }
+- }
++ if (!lde)
++ goto out;
++
++ read_lock(&proc_tree_lock);
++ lde = __proc_lookup(lde, dentry);
++#ifdef CONFIG_VE
++ gde = GPDE(dir);
++ if (gde)
++ gde = __proc_lookup(gde, dentry);
++#else
++ gde = NULL;
++#endif
++ read_unlock(&proc_tree_lock);
++
++ /*
++ * There are following possible cases after lookup:
++ *
++ * lde gde
++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
++ * NULL NULL ENOENT
++ * loc NULL found in local tree
++ * loc glob found in both trees
++ * NULL glob found in global tree
++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
++ *
++ * We initialized inode as follows after lookup:
++ *
++ * inode->lde inode->gde
++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
++ * loc NULL in local tree
++ * loc glob both trees
++ * glob glob global tree
++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
++ * i.e. inode->lde is always initialized
++ */
++
++ if (lde == NULL && gde == NULL)
++ goto out;
++
++ if (lde != NULL)
++ inode = proc_get_inode(dir->i_sb, lde->low_ino, lde);
++ else
++ inode = proc_get_inode(dir->i_sb, gde->low_ino, gde);
++
++ /*
++ * We can sleep in proc_get_inode(), but since we have i_sem
++ * being taken, no one can setup GPDE/LPDE on this inode.
++ */
++ if (!inode)
++ goto out_put;
++
++#ifdef CONFIG_VE
++ GPDE(inode) = de_get(gde);
++ if (gde)
++ __module_get(gde->owner);
++
++ /* if dentry is found in both trees and it is a directory
++ * then inode's nlink count must be altered, because local
++ * and global subtrees may differ.
++ * on the other hand, they may intersect, so actual nlink
++ * value is difficult to calculate - upper estimate is used
++ * instead of it.
++ * dentry found in global tree only must not be writable
++ * in non-super ve.
++ */
++ if (lde && gde && lde != gde && gde->nlink > 1)
++ inode->i_nlink += gde->nlink - 2;
++ if (lde == NULL && !ve_is_super(
++ VE_OWNER_FSTYPE(dir->i_sb->s_type)))
++ inode->i_mode &= ~S_IWUGO;
++#endif
+ unlock_kernel();
++ dentry->d_op = &proc_dentry_operations;
++ d_add(dentry, inode);
++ de_put(lde);
++ de_put(gde);
++ return NULL;
+
+- if (inode) {
+- dentry->d_op = &proc_dentry_operations;
+- d_add(dentry, inode);
+- return NULL;
+- }
++out_put:
++ de_put(lde);
++ de_put(gde);
++out:
++ unlock_kernel();
+ return ERR_PTR(error);
+ }
+
++struct proc_dir_reader {
++ struct list_head list;
++ struct proc_dir_entry *next;
++};
++
++static LIST_HEAD(proc_dir_readers);
++static DEFINE_SPINLOCK(proc_dir_readers_lock);
++
++static inline void add_reader(struct proc_dir_reader *r,
++ struct proc_dir_entry *cur)
++{
++ r->next = cur->next;
++ spin_lock(&proc_dir_readers_lock);
++ list_add(&r->list, &proc_dir_readers);
++ spin_unlock(&proc_dir_readers_lock);
++}
++
++static inline struct proc_dir_entry *del_reader(struct proc_dir_reader *r)
++{
++ spin_lock(&proc_dir_readers_lock);
++ list_del(&r->list);
++ spin_unlock(&proc_dir_readers_lock);
++ return r->next;
++}
++
++static void notify_readers(struct proc_dir_entry *de)
++{
++ struct proc_dir_reader *r;
++
++ /* lockless since proc_tree_lock is taken for writing */
++ list_for_each_entry(r, &proc_dir_readers, list)
++ if (r->next == de)
++ r->next = de->next;
++}
++
++static inline int in_tree(struct proc_dir_entry *de, struct proc_dir_entry *dir)
++{
++ struct proc_dir_entry *gde;
++
++ for (gde = dir->subdir; gde; gde = gde->next) {
++ if (de->namelen != gde->namelen)
++ continue;
++ if (memcmp(de->name, gde->name, gde->namelen))
++ continue;
++ return 1;
++ }
++ return 0;
++}
++
+ /*
+ * This returns non-zero if at EOF, so that the /proc
+ * root directory can use this and check if it should
+@@ -421,6 +582,7 @@ int proc_readdir(struct file * filp,
+ int i;
+ struct inode *inode = filp->f_dentry->d_inode;
+ int ret = 0;
++ struct proc_dir_reader this;
+
+ lock_kernel();
+
+@@ -447,13 +609,12 @@ int proc_readdir(struct file * filp,
+ filp->f_pos++;
+ /* fall through */
+ default:
++ read_lock(&proc_tree_lock);
+ de = de->subdir;
+ i -= 2;
+ for (;;) {
+- if (!de) {
+- ret = 1;
+- goto out;
+- }
++ if (!de)
++ goto chk_global;
+ if (!i)
+ break;
+ de = de->next;
+@@ -461,12 +622,60 @@ int proc_readdir(struct file * filp,
+ }
+
+ do {
+- if (filldir(dirent, de->name, de->namelen, filp->f_pos,
+- de->low_ino, de->mode >> 12) < 0)
++ de_get(de);
++ add_reader(&this, de);
++ read_unlock(&proc_tree_lock);
++ ret = filldir(dirent, de->name, de->namelen,
++ filp->f_pos, de->low_ino,
++ de->mode >> 12);
++ read_lock(&proc_tree_lock);
++ de_put(de);
++ de = del_reader(&this);
++ if (ret < 0) {
++ read_unlock(&proc_tree_lock);
++ ret = 0;
+ goto out;
++ }
+ filp->f_pos++;
+- de = de->next;
+ } while (de);
++chk_global:
++#ifdef CONFIG_VE
++ de = GPDE(inode);
++ if (de == NULL)
++ goto done;
++
++ de = de->subdir;
++ while (de) {
++ if (in_tree(de, LPDE(inode))) {
++ de = de->next;
++ continue;
++ }
++
++ if (i > 0) {
++ i--;
++ de = de->next;
++ continue;
++ }
++
++ de_get(de);
++ add_reader(&this, de);
++ read_unlock(&proc_tree_lock);
++ ret = filldir(dirent, de->name, de->namelen,
++ filp->f_pos, de->low_ino,
++ de->mode >> 12);
++ read_lock(&proc_tree_lock);
++ de_put(de);
++ de = del_reader(&this);
++ if (ret < 0) {
++ read_unlock(&proc_tree_lock);
++ ret = 0;
++ goto out;
++ }
++ filp->f_pos++;
++ }
++done:
++#endif
++ read_unlock(&proc_tree_lock);
+ }
+ ret = 1;
+ out: unlock_kernel();
+@@ -488,8 +697,10 @@ static struct file_operations proc_dir_o
+ */
+ static struct inode_operations proc_dir_inode_operations = {
+ .lookup = proc_lookup,
++#ifndef CONFIG_VE
+ .getattr = proc_getattr,
+ .setattr = proc_notify_change,
++#endif
+ };
+
+ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp)
+@@ -499,10 +710,20 @@ static int proc_register(struct proc_dir
+ i = get_inode_number();
+ if (i == 0)
+ return -EAGAIN;
++
++ write_lock(&proc_tree_lock);
++ if (dir->deleted) {
++ write_unlock(&proc_tree_lock);
++ release_inode_number(i);
++ return -ENOENT;
++ }
++
+ dp->low_ino = i;
+ dp->next = dir->subdir;
+- dp->parent = dir;
++ dp->parent = de_get(dir);
+ dir->subdir = dp;
++ write_unlock(&proc_tree_lock);
++
+ if (S_ISDIR(dp->mode)) {
+ if (dp->proc_iops == NULL) {
+ dp->proc_fops = &proc_dir_operations;
+@@ -556,24 +777,26 @@ static struct proc_dir_entry *proc_creat
+ mode_t mode,
+ nlink_t nlink)
+ {
+- struct proc_dir_entry *ent = NULL;
++ struct proc_dir_entry *ent;
+ const char *fn = name;
+ int len;
+
+ /* make sure name is valid */
+- if (!name || !strlen(name)) goto out;
++ if (!name || !strlen(name))
++ goto out;
+
+- if (!(*parent) && xlate_proc_name(name, parent, &fn) != 0)
++ if (xlate_proc_loc_name(name, parent, &fn) != 0)
+ goto out;
+
+ /* At this point there must not be any '/' characters beyond *fn */
+ if (strchr(fn, '/'))
+- goto out;
++ goto out_put;
+
+ len = strlen(fn);
+
+ ent = kmalloc(sizeof(struct proc_dir_entry) + len + 1, GFP_KERNEL);
+- if (!ent) goto out;
++ if (!ent)
++ goto out_put;
+
+ memset(ent, 0, sizeof(struct proc_dir_entry));
+ memcpy(((char *) ent) + sizeof(struct proc_dir_entry), fn, len + 1);
+@@ -581,8 +804,13 @@ static struct proc_dir_entry *proc_creat
+ ent->namelen = len;
+ ent->mode = mode;
+ ent->nlink = nlink;
+- out:
++ atomic_set(&ent->count, 1);
+ return ent;
++
++out_put:
++ de_put(*parent);
++out:
++ return NULL;
+ }
+
+ struct proc_dir_entry *proc_symlink(const char *name,
+@@ -606,6 +834,7 @@ struct proc_dir_entry *proc_symlink(cons
+ kfree(ent);
+ ent = NULL;
+ }
++ de_put(parent);
+ }
+ return ent;
+ }
+@@ -624,6 +853,7 @@ struct proc_dir_entry *proc_mkdir_mode(c
+ kfree(ent);
+ ent = NULL;
+ }
++ de_put(parent);
+ }
+ return ent;
+ }
+@@ -662,9 +892,28 @@ struct proc_dir_entry *create_proc_entry
+ kfree(ent);
+ ent = NULL;
+ }
++ de_put(parent);
+ }
+ return ent;
+ }
++EXPORT_SYMBOL(remove_proc_glob_entry);
++
++struct proc_dir_entry *create_proc_glob_entry(const char *name, mode_t mode,
++ struct proc_dir_entry *parent)
++{
++ const char *path;
++ struct proc_dir_entry *ent;
++
++ path = name;
++ if (xlate_proc_name(path, &parent, &name) != 0)
++ return NULL;
++
++ ent = create_proc_entry(name, mode, parent);
++ de_put(parent);
++ return ent;
++}
++
++EXPORT_SYMBOL(create_proc_glob_entry);
+
+ void free_proc_entry(struct proc_dir_entry *de)
+ {
+@@ -684,20 +933,21 @@ void free_proc_entry(struct proc_dir_ent
+ * Remove a /proc entry and free it if it's not currently in use.
+ * If it is in use, we set the 'deleted' flag.
+ */
+-void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
++static void __remove_proc_entry(const char *name, struct proc_dir_entry *parent)
+ {
+ struct proc_dir_entry **p;
+ struct proc_dir_entry *de;
+ const char *fn = name;
+ int len;
+
+- if (!parent && xlate_proc_name(name, &parent, &fn) != 0)
+- goto out;
+ len = strlen(fn);
++ write_lock(&proc_tree_lock);
+ for (p = &parent->subdir; *p; p=&(*p)->next ) {
+ if (!proc_match(len, fn, *p))
+ continue;
++
+ de = *p;
++ notify_readers(de);
+ *p = de->next;
+ de->next = NULL;
+ if (S_ISDIR(de->mode))
+@@ -705,15 +955,43 @@ void remove_proc_entry(const char *name,
+ proc_kill_inodes(de);
+ de->nlink = 0;
+ WARN_ON(de->subdir);
+- if (!atomic_read(&de->count))
+- free_proc_entry(de);
+- else {
+- de->deleted = 1;
+- printk("remove_proc_entry: %s/%s busy, count=%d\n",
+- parent->name, de->name, atomic_read(&de->count));
+- }
++ de->deleted = 1;
++ de_put(de);
++ de_put(parent);
+ break;
+ }
+-out:
+- return;
++ write_unlock(&proc_tree_lock);
++}
++
++void remove_proc_loc_entry(const char *name, struct proc_dir_entry *parent)
++{
++ const char *path;
++
++ path = name;
++ if (xlate_proc_loc_name(path, &parent, &name) != 0)
++ return;
++
++ __remove_proc_entry(name, parent);
++ de_put(parent);
++}
++
++void remove_proc_glob_entry(const char *name, struct proc_dir_entry *parent)
++{
++ const char *path;
++
++ path = name;
++ if (xlate_proc_name(path, &parent, &name) != 0)
++ return;
++
++ __remove_proc_entry(name, parent);
++ de_put(parent);
++}
++
++void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
++{
++ remove_proc_loc_entry(name, parent);
++#ifdef CONFIG_VE
++ if (ve_is_super(get_exec_env()))
++ remove_proc_glob_entry(name, parent);
++#endif
+ }
+diff -upr linux-2.6.16.orig/fs/proc/inode.c linux-2.6.16-026test015/fs/proc/inode.c
+--- linux-2.6.16.orig/fs/proc/inode.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/proc/inode.c 2006-07-04 14:41:38.000000000 +0400
+@@ -8,6 +8,7 @@
+ #include <linux/proc_fs.h>
+ #include <linux/kernel.h>
+ #include <linux/mm.h>
++#include <linux/ve_owner.h>
+ #include <linux/string.h>
+ #include <linux/stat.h>
+ #include <linux/file.h>
+@@ -21,34 +22,25 @@
+
+ #include "internal.h"
+
+-static inline struct proc_dir_entry * de_get(struct proc_dir_entry *de)
+-{
+- if (de)
+- atomic_inc(&de->count);
+- return de;
+-}
+-
+ /*
+ * Decrements the use count and checks for deferred deletion.
+ */
+-static void de_put(struct proc_dir_entry *de)
++void de_put(struct proc_dir_entry *de)
+ {
+ if (de) {
+- lock_kernel();
+ if (!atomic_read(&de->count)) {
+ printk("de_put: entry %s already free!\n", de->name);
+- unlock_kernel();
+ return;
+ }
+
+ if (atomic_dec_and_test(&de->count)) {
+- if (de->deleted) {
+- printk("de_put: deferred delete of %s\n",
++ if (unlikely(!de->deleted)) {
++ printk("de_put: early delete of %s\n",
+ de->name);
+- free_proc_entry(de);
++ return;
+ }
++ free_proc_entry(de);
+ }
+- unlock_kernel();
+ }
+ }
+
+@@ -68,12 +60,19 @@ static void proc_delete_inode(struct ino
+ put_task_struct(tsk);
+
+ /* Let go of any associated proc directory entry */
+- de = PROC_I(inode)->pde;
++ de = LPDE(inode);
+ if (de) {
+ if (de->owner)
+ module_put(de->owner);
+ de_put(de);
+ }
++#ifdef CONFIG_VE
++ de = GPDE(inode);
++ if (de) {
++ module_put(de->owner);
++ de_put(de);
++ }
++#endif
+ clear_inode(inode);
+ }
+
+@@ -100,6 +99,9 @@ static struct inode *proc_alloc_inode(st
+ ei->pde = NULL;
+ inode = &ei->vfs_inode;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
++#ifdef CONFIG_VE
++ GPDE(inode) = NULL;
++#endif
+ return inode;
+ }
+
+@@ -209,6 +211,12 @@ int proc_fill_super(struct super_block *
+ s->s_root = d_alloc_root(root_inode);
+ if (!s->s_root)
+ goto out_no_root;
++#ifdef CONFIG_VE
++ LPDE(root_inode) = de_get(get_exec_env()->proc_root);
++ GPDE(root_inode) = &proc_root;
++#else
++ LPDE(root_inode) = &proc_root;
++#endif
+ return 0;
+
+ out_no_root:
+diff -upr linux-2.6.16.orig/fs/proc/kmsg.c linux-2.6.16-026test015/fs/proc/kmsg.c
+--- linux-2.6.16.orig/fs/proc/kmsg.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/proc/kmsg.c 2006-07-04 14:41:38.000000000 +0400
+@@ -11,6 +11,7 @@
+ #include <linux/kernel.h>
+ #include <linux/poll.h>
+ #include <linux/fs.h>
++#include <linux/veprintk.h>
+
+ #include <asm/uaccess.h>
+ #include <asm/io.h>
+@@ -40,7 +41,7 @@ static ssize_t kmsg_read(struct file *fi
+
+ static unsigned int kmsg_poll(struct file *file, poll_table *wait)
+ {
+- poll_wait(file, &log_wait, wait);
++ poll_wait(file, &ve_log_wait, wait);
+ if (do_syslog(9, NULL, 0))
+ return POLLIN | POLLRDNORM;
+ return 0;
+diff -upr linux-2.6.16.orig/fs/proc/proc_misc.c linux-2.6.16-026test015/fs/proc/proc_misc.c
+--- linux-2.6.16.orig/fs/proc/proc_misc.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/proc/proc_misc.c 2006-07-04 14:41:39.000000000 +0400
+@@ -32,6 +32,7 @@
+ #include <linux/pagemap.h>
+ #include <linux/swap.h>
+ #include <linux/slab.h>
++#include <linux/virtinfo.h>
+ #include <linux/smp.h>
+ #include <linux/signal.h>
+ #include <linux/module.h>
+@@ -45,6 +46,8 @@
+ #include <linux/jiffies.h>
+ #include <linux/sysrq.h>
+ #include <linux/vmalloc.h>
++#include <linux/version.h>
++#include <linux/compile.h>
+ #include <linux/crash_dump.h>
+ #include <asm/uaccess.h>
+ #include <asm/pgtable.h>
+@@ -53,8 +56,10 @@
+ #include <asm/div64.h>
+ #include "internal.h"
+
+-#define LOAD_INT(x) ((x) >> FSHIFT)
+-#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
++#ifdef CONFIG_FAIRSCHED
++#include <linux/fairsched.h>
++#endif
++
+ /*
+ * Warning: stuff below (imported functions) assumes that its output will fit
+ * into one page. For some of those functions it may be wrong. Moreover, we
+@@ -84,15 +89,33 @@ static int loadavg_read_proc(char *page,
+ {
+ int a, b, c;
+ int len;
+-
+- a = avenrun[0] + (FIXED_1/200);
+- b = avenrun[1] + (FIXED_1/200);
+- c = avenrun[2] + (FIXED_1/200);
++ unsigned long __nr_running;
++ int __nr_threads;
++ unsigned long *__avenrun;
++ struct ve_struct *ve;
++
++ ve = get_exec_env();
++
++ if (ve_is_super(ve)) {
++ __avenrun = &avenrun[0];
++ __nr_running = nr_running();
++ __nr_threads = nr_threads;
++ }
++#ifdef CONFIG_VE
++ else {
++ __avenrun = &ve->avenrun[0];
++ __nr_running = nr_running_ve(ve);
++ __nr_threads = atomic_read(&ve->pcounter);
++ }
++#endif
++ a = __avenrun[0] + (FIXED_1/200);
++ b = __avenrun[1] + (FIXED_1/200);
++ c = __avenrun[2] + (FIXED_1/200);
+ len = sprintf(page,"%d.%02d %d.%02d %d.%02d %ld/%d %d\n",
+ LOAD_INT(a), LOAD_FRAC(a),
+ LOAD_INT(b), LOAD_FRAC(b),
+ LOAD_INT(c), LOAD_FRAC(c),
+- nr_running(), nr_threads, last_pid);
++ __nr_running, __nr_threads, last_pid);
+ return proc_calc_metrics(page, start, off, count, eof, len);
+ }
+
+@@ -105,6 +128,13 @@ static int uptime_read_proc(char *page,
+ cputime_t idletime = cputime_add(init_task.utime, init_task.stime);
+
+ do_posix_clock_monotonic_gettime(&uptime);
++#ifdef CONFIG_VE
++ if (!ve_is_super(get_exec_env())) {
++ set_normalized_timespec(&uptime,
++ uptime.tv_sec - get_exec_env()->start_timespec.tv_sec,
++ uptime.tv_nsec - get_exec_env()->start_timespec.tv_nsec);
++ }
++#endif
+ cputime_to_timespec(idletime, &idle);
+ len = sprintf(page,"%lu.%02lu %lu.%02lu\n",
+ (unsigned long) uptime.tv_sec,
+@@ -118,35 +148,37 @@ static int uptime_read_proc(char *page,
+ static int meminfo_read_proc(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+ {
+- struct sysinfo i;
++ struct meminfo mi;
+ int len;
+- struct page_state ps;
+- unsigned long inactive;
+- unsigned long active;
+- unsigned long free;
+- unsigned long committed;
+- unsigned long allowed;
++ unsigned long dummy;
+ struct vmalloc_info vmi;
+- long cached;
+
+- get_page_state(&ps);
+- get_zone_counts(&active, &inactive, &free);
++ get_page_state(&mi.ps);
++ get_zone_counts(&mi.active, &mi.inactive, &dummy);
+
+ /*
+ * display in kilobytes.
+ */
+ #define K(x) ((x) << (PAGE_SHIFT - 10))
+- si_meminfo(&i);
+- si_swapinfo(&i);
+- committed = atomic_read(&vm_committed_space);
+- allowed = ((totalram_pages - hugetlb_total_pages())
+- * sysctl_overcommit_ratio / 100) + total_swap_pages;
++ si_meminfo(&mi.si);
++ si_swapinfo(&mi.si);
++ mi.committed_space = atomic_read(&vm_committed_space);
++ mi.swapcache = total_swapcache_pages;
++ mi.cache = get_page_cache_size() - mi.swapcache - mi.si.bufferram;
++ if (mi.cache < 0)
++ mi.cache = 0;
+
+- cached = get_page_cache_size() - total_swapcache_pages - i.bufferram;
+- if (cached < 0)
+- cached = 0;
++ mi.vmalloc_total = (VMALLOC_END - VMALLOC_START) >> PAGE_SHIFT;
++ mi.allowed = ((totalram_pages - hugetlb_total_pages())
++ * sysctl_overcommit_ratio / 100) + total_swap_pages;
+
+ get_vmalloc_info(&vmi);
++ mi.vmalloc_used = vmi.used >> PAGE_SHIFT;
++ mi.vmalloc_largest = vmi.largest_chunk >> PAGE_SHIFT;
++
++ if (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_MEMINFO, &mi)
++ & NOTIFY_FAIL)
++ return -ENOMSG;
+
+ /*
+ * Tagged format, for easy grepping and expansion.
+@@ -175,29 +207,29 @@ static int meminfo_read_proc(char *page,
+ "VmallocTotal: %8lu kB\n"
+ "VmallocUsed: %8lu kB\n"
+ "VmallocChunk: %8lu kB\n",
+- K(i.totalram),
+- K(i.freeram),
+- K(i.bufferram),
+- K(cached),
+- K(total_swapcache_pages),
+- K(active),
+- K(inactive),
+- K(i.totalhigh),
+- K(i.freehigh),
+- K(i.totalram-i.totalhigh),
+- K(i.freeram-i.freehigh),
+- K(i.totalswap),
+- K(i.freeswap),
+- K(ps.nr_dirty),
+- K(ps.nr_writeback),
+- K(ps.nr_mapped),
+- K(ps.nr_slab),
+- K(allowed),
+- K(committed),
+- K(ps.nr_page_table_pages),
+- (unsigned long)VMALLOC_TOTAL >> 10,
+- vmi.used >> 10,
+- vmi.largest_chunk >> 10
++ K(mi.si.totalram),
++ K(mi.si.freeram),
++ K(mi.si.bufferram),
++ K(mi.cache),
++ K(mi.swapcache),
++ K(mi.active),
++ K(mi.inactive),
++ K(mi.si.totalhigh),
++ K(mi.si.freehigh),
++ K(mi.si.totalram-mi.si.totalhigh),
++ K(mi.si.freeram-mi.si.freehigh),
++ K(mi.si.totalswap),
++ K(mi.si.freeswap),
++ K(mi.ps.nr_dirty),
++ K(mi.ps.nr_writeback),
++ K(mi.ps.nr_mapped),
++ K(mi.ps.nr_slab),
++ K(mi.allowed),
++ K(mi.committed_space),
++ K(mi.ps.nr_page_table_pages),
++ K(mi.vmalloc_total),
++ K(mi.vmalloc_used),
++ K(mi.vmalloc_largest)
+ );
+
+ len += hugetlb_report_meminfo(page + len);
+@@ -237,8 +269,15 @@ static int version_read_proc(char *page,
+ int count, int *eof, void *data)
+ {
+ int len;
++ struct new_utsname *utsname = &ve_utsname;
+
+- strcpy(page, linux_banner);
++ if (ve_is_super(get_exec_env()))
++ strcpy(page, linux_banner);
++ else
++ sprintf(page, "Linux version %s ("
++ LINUX_COMPILE_BY "@" LINUX_COMPILE_HOST ") ("
++ LINUX_COMPILER ") %s\n",
++ utsname->release, utsname->version);
+ len = strlen(page);
+ return proc_calc_metrics(page, start, off, count, eof, len);
+ }
+@@ -249,144 +288,60 @@ static int cpuinfo_open(struct inode *in
+ return seq_open(file, &cpuinfo_op);
+ }
+
+-enum devinfo_states {
+- CHR_HDR,
+- CHR_LIST,
+- BLK_HDR,
+- BLK_LIST,
+- DEVINFO_DONE
+-};
+-
+-struct devinfo_state {
+- void *chrdev;
+- void *blkdev;
+- unsigned int num_records;
+- unsigned int cur_record;
+- enum devinfo_states state;
++static struct file_operations proc_cpuinfo_operations = {
++ .open = cpuinfo_open,
++ .read = seq_read,
++ .llseek = seq_lseek,
++ .release = seq_release,
+ };
+
+-static void *devinfo_start(struct seq_file *f, loff_t *pos)
++static int devinfo_show(struct seq_file *f, void *v)
+ {
+- struct devinfo_state *info = f->private;
++ int i = *(loff_t *) v;
+
+- if (*pos) {
+- if ((info) && (*pos <= info->num_records))
+- return info;
+- return NULL;
++ if (i < CHRDEV_MAJOR_HASH_SIZE) {
++ if (i == 0)
++ seq_printf(f, "Character devices:\n");
++ chrdev_show(f, i);
++ } else {
++ i -= CHRDEV_MAJOR_HASH_SIZE;
++ if (i == 0)
++ seq_printf(f, "\nBlock devices:\n");
++ blkdev_show(f, i);
+ }
+- info = kmalloc(sizeof(*info), GFP_KERNEL);
+- f->private = info;
+- info->chrdev = acquire_chrdev_list();
+- info->blkdev = acquire_blkdev_list();
+- info->state = CHR_HDR;
+- info->num_records = count_chrdev_list();
+- info->num_records += count_blkdev_list();
+- info->num_records += 2; /* Character and Block headers */
+- *pos = 1;
+- info->cur_record = *pos;
+- return info;
++ return 0;
+ }
+
+-static void *devinfo_next(struct seq_file *f, void *v, loff_t *pos)
++static void *devinfo_start(struct seq_file *f, loff_t *pos)
+ {
+- int idummy;
+- char *ndummy;
+- struct devinfo_state *info = f->private;
+-
+- switch (info->state) {
+- case CHR_HDR:
+- info->state = CHR_LIST;
+- (*pos)++;
+- /*fallthrough*/
+- case CHR_LIST:
+- if (get_chrdev_info(info->chrdev,&idummy,&ndummy)) {
+- /*
+- * The character dev list is complete
+- */
+- info->state = BLK_HDR;
+- } else {
+- info->chrdev = get_next_chrdev(info->chrdev);
+- }
+- (*pos)++;
+- break;
+- case BLK_HDR:
+- info->state = BLK_LIST;
+- (*pos)++;
+- break;
+- case BLK_LIST:
+- if (get_blkdev_info(info->blkdev,&idummy,&ndummy)) {
+- /*
+- * The block dev list is complete
+- */
+- info->state = DEVINFO_DONE;
+- } else {
+- info->blkdev = get_next_blkdev(info->blkdev);
+- }
+- (*pos)++;
+- break;
+- case DEVINFO_DONE:
+- (*pos)++;
+- info->cur_record = *pos;
+- info = NULL;
+- break;
+- default:
+- break;
+- }
+- if (info)
+- info->cur_record = *pos;
+- return info;
++ if (*pos < (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_HASH_SIZE))
++ return pos;
++ return NULL;
+ }
+
+-static void devinfo_stop(struct seq_file *f, void *v)
++static void *devinfo_next(struct seq_file *f, void *v, loff_t *pos)
+ {
+- struct devinfo_state *info = f->private;
+-
+- if (info) {
+- release_chrdev_list(info->chrdev);
+- release_blkdev_list(info->blkdev);
+- f->private = NULL;
+- kfree(info);
+- }
++ (*pos)++;
++ if (*pos >= (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_HASH_SIZE))
++ return NULL;
++ return pos;
+ }
+
+-static int devinfo_show(struct seq_file *f, void *arg)
+-{
+- int major;
+- char *name;
+- struct devinfo_state *info = f->private;
+-
+- switch(info->state) {
+- case CHR_HDR:
+- seq_printf(f,"Character devices:\n");
+- /* fallthrough */
+- case CHR_LIST:
+- if (!get_chrdev_info(info->chrdev,&major,&name))
+- seq_printf(f,"%3d %s\n",major,name);
+- break;
+- case BLK_HDR:
+- seq_printf(f,"\nBlock devices:\n");
+- /* fallthrough */
+- case BLK_LIST:
+- if (!get_blkdev_info(info->blkdev,&major,&name))
+- seq_printf(f,"%3d %s\n",major,name);
+- break;
+- default:
+- break;
+- }
+-
+- return 0;
++static void devinfo_stop(struct seq_file *f, void *v)
++{
++ /* Nothing to do */
+ }
+
+-static struct seq_operations devinfo_op = {
+- .start = devinfo_start,
+- .next = devinfo_next,
+- .stop = devinfo_stop,
+- .show = devinfo_show,
++static struct seq_operations devinfo_ops = {
++ .start = devinfo_start,
++ .next = devinfo_next,
++ .stop = devinfo_stop,
++ .show = devinfo_show
+ };
+
+-static int devinfo_open(struct inode *inode, struct file *file)
++static int devinfo_open(struct inode *inode, struct file *filp)
+ {
+- return seq_open(file, &devinfo_op);
++ return seq_open(filp, &devinfo_ops);
+ }
+
+ static struct file_operations proc_devinfo_operations = {
+@@ -396,13 +351,6 @@ static struct file_operations proc_devin
+ .release = seq_release,
+ };
+
+-static struct file_operations proc_cpuinfo_operations = {
+- .open = cpuinfo_open,
+- .read = seq_read,
+- .llseek = seq_lseek,
+- .release = seq_release,
+-};
+-
+ extern struct seq_operations vmstat_op;
+ static int vmstat_open(struct inode *inode, struct file *file)
+ {
+@@ -487,18 +435,15 @@ static struct file_operations proc_slabi
+ };
+ #endif
+
+-static int show_stat(struct seq_file *p, void *v)
++static void show_stat_ve0(struct seq_file *p)
+ {
+ int i;
+- unsigned long jif;
++ struct page_state page_state;
+ cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
+ u64 sum = 0;
+
+ user = nice = system = idle = iowait =
+ irq = softirq = steal = cputime64_zero;
+- jif = - wall_to_monotonic.tv_sec;
+- if (wall_to_monotonic.tv_nsec)
+- --jif;
+
+ for_each_cpu(i) {
+ int j;
+@@ -552,9 +497,84 @@ static int show_stat(struct seq_file *p,
+ for (i = 0; i < NR_IRQS; i++)
+ seq_printf(p, " %u", kstat_irqs(i));
+ #endif
++ get_full_page_state(&page_state);
++ seq_printf(p, "\nswap %lu %lu\n", page_state.pswpin, page_state.pswpout);
++}
++
++#ifdef CONFIG_VE
++static void show_stat_ve(struct seq_file *p, struct ve_struct *env)
++{
++ int i;
++ u64 user, nice, system;
++ cycles_t idle, iowait;
++ cpumask_t ve_cpus;
++
++ ve_cpu_online_map(env, &ve_cpus);
++
++ user = nice = system = idle = iowait = 0;
++ for_each_cpu_mask(i, ve_cpus) {
++ user += VE_CPU_STATS(env, i)->user;
++ nice += VE_CPU_STATS(env, i)->nice;
++ system += VE_CPU_STATS(env, i)->system;
++ idle += ve_sched_get_idle_time(env, i);
++ iowait += ve_sched_get_iowait_time(env, i);
++ }
++
++ seq_printf(p, "cpu %llu %llu %llu %llu %llu 0 0 0\n",
++ (unsigned long long)cputime64_to_clock_t(user),
++ (unsigned long long)cputime64_to_clock_t(nice),
++ (unsigned long long)cputime64_to_clock_t(system),
++ (unsigned long long)cycles_to_clocks(idle),
++ (unsigned long long)cycles_to_clocks(iowait));
++
++ for_each_cpu_mask(i, ve_cpus) {
++ user = VE_CPU_STATS(env, i)->user;
++ nice = VE_CPU_STATS(env, i)->nice;
++ system = VE_CPU_STATS(env, i)->system;
++ idle = ve_sched_get_idle_time(env, i);
++ iowait = ve_sched_get_iowait_time(env, i);
++ seq_printf(p, "cpu%d %llu %llu %llu %llu %llu 0 0 0\n",
++ i,
++ (unsigned long long)cputime64_to_clock_t(user),
++ (unsigned long long)cputime64_to_clock_t(nice),
++ (unsigned long long)cputime64_to_clock_t(system),
++ (unsigned long long)cycles_to_clocks(idle),
++ (unsigned long long)cycles_to_clocks(iowait));
++ }
++ seq_printf(p, "intr 0\nswap 0 0\n");
++}
++#endif
++
++int show_stat(struct seq_file *p, void *v)
++{
++ extern unsigned long total_forks;
++ unsigned long seq, jif;
++ struct ve_struct *env;
++ unsigned long __nr_running, __nr_iowait;
++
++ do {
++ seq = read_seqbegin(&xtime_lock);
++ jif = - wall_to_monotonic.tv_sec;
++ if (wall_to_monotonic.tv_nsec)
++ --jif;
++ } while (read_seqretry(&xtime_lock, seq));
++
++ env = get_exec_env();
++ if (ve_is_super(env)) {
++ show_stat_ve0(p);
++ __nr_running = nr_running();
++ __nr_iowait = nr_iowait();
++ }
++#ifdef CONFIG_VE
++ else {
++ show_stat_ve(p, env);
++ __nr_running = nr_running_ve(env);
++ __nr_iowait = nr_iowait_ve(env);
++ }
++#endif
+
+ seq_printf(p,
+- "\nctxt %llu\n"
++ "ctxt %llu\n"
+ "btime %lu\n"
+ "processes %lu\n"
+ "procs_running %lu\n"
+@@ -562,8 +582,8 @@ static int show_stat(struct seq_file *p,
+ nr_context_switches(),
+ (unsigned long)jif,
+ total_forks,
+- nr_running(),
+- nr_iowait());
++ __nr_running,
++ __nr_iowait);
+
+ return 0;
+ }
+@@ -652,7 +672,8 @@ static int cmdline_read_proc(char *page,
+ {
+ int len;
+
+- len = sprintf(page, "%s\n", saved_command_line);
++ len = sprintf(page, "%s\n",
++ ve_is_super(get_exec_env()) ? saved_command_line : "");
+ return proc_calc_metrics(page, start, off, count, eof, len);
+ }
+
+diff -upr linux-2.6.16.orig/fs/proc/proc_tty.c linux-2.6.16-026test015/fs/proc/proc_tty.c
+--- linux-2.6.16.orig/fs/proc/proc_tty.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/proc/proc_tty.c 2006-07-04 14:41:38.000000000 +0400
+@@ -6,6 +6,7 @@
+
+ #include <asm/uaccess.h>
+
++#include <linux/ve_owner.h>
+ #include <linux/init.h>
+ #include <linux/errno.h>
+ #include <linux/time.h>
+@@ -106,24 +107,35 @@ static int show_tty_driver(struct seq_fi
+ /* iterator */
+ static void *t_start(struct seq_file *m, loff_t *pos)
+ {
+- struct list_head *p;
++ struct tty_driver *drv;
++
+ loff_t l = *pos;
+- list_for_each(p, &tty_drivers)
++ read_lock(&tty_driver_guard);
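++	/* Only report tty drivers that belong to the calling VE. */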
++ list_for_each_entry(drv, &tty_drivers, tty_drivers) {
++ if (!ve_accessible_strict(VE_OWNER_TTYDRV(drv), get_exec_env()))
++ continue;
+ if (!l--)
+- return list_entry(p, struct tty_driver, tty_drivers);
++ return drv;
++ }
+ return NULL;
+ }
+
+ static void *t_next(struct seq_file *m, void *v, loff_t *pos)
+ {
+- struct list_head *p = ((struct tty_driver *)v)->tty_drivers.next;
++ struct tty_driver *drv;
++
+ (*pos)++;
+- return p==&tty_drivers ? NULL :
+- list_entry(p, struct tty_driver, tty_drivers);
++ drv = (struct tty_driver *)v;
++ list_for_each_entry_continue(drv, &tty_drivers, tty_drivers) {
++ if (ve_accessible_strict(VE_OWNER_TTYDRV(drv), get_exec_env()))
++ return drv;
++ }
++ return NULL;
+ }
+
+ static void t_stop(struct seq_file *m, void *v)
+ {
++ read_unlock(&tty_driver_guard);
+ }
+
+ static struct seq_operations tty_drivers_op = {
+diff -upr linux-2.6.16.orig/fs/proc/root.c linux-2.6.16-026test015/fs/proc/root.c
+--- linux-2.6.16.orig/fs/proc/root.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/proc/root.c 2006-07-04 14:41:38.000000000 +0400
+@@ -20,7 +20,10 @@
+
+ #include "internal.h"
+
+-struct proc_dir_entry *proc_net, *proc_net_stat, *proc_bus, *proc_root_fs, *proc_root_driver;
++#ifndef CONFIG_VE
++struct proc_dir_entry *proc_net, *proc_net_stat;
++#endif
++struct proc_dir_entry *proc_bus, *proc_root_fs, *proc_root_driver;
+
+ #ifdef CONFIG_SYSCTL
+ struct proc_dir_entry *proc_sys_root;
+@@ -32,12 +35,14 @@ static struct super_block *proc_get_sb(s
+ return get_sb_single(fs_type, flags, data, proc_fill_super);
+ }
+
+-static struct file_system_type proc_fs_type = {
++struct file_system_type proc_fs_type = {
+ .name = "proc",
+ .get_sb = proc_get_sb,
+ .kill_sb = kill_anon_super,
+ };
+
++EXPORT_SYMBOL(proc_fs_type);
++
+ void __init proc_root_init(void)
+ {
+ int err = proc_init_inodecache();
+@@ -157,7 +162,9 @@ EXPORT_SYMBOL(create_proc_entry);
+ EXPORT_SYMBOL(remove_proc_entry);
+ EXPORT_SYMBOL(proc_root);
+ EXPORT_SYMBOL(proc_root_fs);
++#ifndef CONFIG_VE
+ EXPORT_SYMBOL(proc_net);
+ EXPORT_SYMBOL(proc_net_stat);
++#endif
+ EXPORT_SYMBOL(proc_bus);
+ EXPORT_SYMBOL(proc_root_driver);
+diff -upr linux-2.6.16.orig/fs/proc/task_mmu.c linux-2.6.16-026test015/fs/proc/task_mmu.c
+--- linux-2.6.16.orig/fs/proc/task_mmu.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/proc/task_mmu.c 2006-07-04 14:41:38.000000000 +0400
+@@ -90,9 +90,12 @@ int proc_exe_link(struct inode *inode, s
+ }
+
+ if (vma) {
+- *mnt = mntget(vma->vm_file->f_vfsmnt);
+- *dentry = dget(vma->vm_file->f_dentry);
+- result = 0;
++ result = d_root_check(vma->vm_file->f_dentry,
++ vma->vm_file->f_vfsmnt);
++ if (!result) {
++ *mnt = mntget(vma->vm_file->f_vfsmnt);
++ *dentry = dget(vma->vm_file->f_dentry);
++ }
+ }
+
+ up_read(&mm->mmap_sem);
+diff -upr linux-2.6.16.orig/fs/proc/task_nommu.c linux-2.6.16-026test015/fs/proc/task_nommu.c
+--- linux-2.6.16.orig/fs/proc/task_nommu.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/proc/task_nommu.c 2006-07-04 14:41:38.000000000 +0400
+@@ -126,9 +126,12 @@ int proc_exe_link(struct inode *inode, s
+ }
+
+ if (vma) {
+- *mnt = mntget(vma->vm_file->f_vfsmnt);
+- *dentry = dget(vma->vm_file->f_dentry);
+- result = 0;
++ result = d_root_check(vma->vm_file->f_dentry,
++ vma->vm_file->f_vfsmnt);
++ if (!result) {
++ *mnt = mntget(vma->vm_file->f_vfsmnt);
++ *dentry = dget(vma->vm_file->f_dentry);
++ }
+ }
+
+ up_read(&mm->mmap_sem);
+diff -upr linux-2.6.16.orig/fs/proc/vmcore.c linux-2.6.16-026test015/fs/proc/vmcore.c
+--- linux-2.6.16.orig/fs/proc/vmcore.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/proc/vmcore.c 2006-07-04 14:41:36.000000000 +0400
+@@ -103,8 +103,8 @@ static ssize_t read_vmcore(struct file *
+ size_t buflen, loff_t *fpos)
+ {
+ ssize_t acc = 0, tmp;
+- size_t tsz, nr_bytes;
+- u64 start;
++ size_t tsz;
++ u64 start, nr_bytes;
+ struct vmcore *curr_m = NULL;
+
+ if (buflen == 0 || *fpos >= vmcore_size)
+diff -upr linux-2.6.16.orig/fs/quota.c linux-2.6.16-026test015/fs/quota.c
+--- linux-2.6.16.orig/fs/quota.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/quota.c 2006-07-04 14:41:39.000000000 +0400
+@@ -81,11 +81,11 @@ static int generic_quotactl_valid(struct
+ if (cmd == Q_GETQUOTA) {
+ if (((type == USRQUOTA && current->euid != id) ||
+ (type == GRPQUOTA && !in_egroup_p(id))) &&
+- !capable(CAP_SYS_ADMIN))
++ !capable(CAP_VE_SYS_ADMIN))
+ return -EPERM;
+ }
+ else if (cmd != Q_GETFMT && cmd != Q_SYNC && cmd != Q_GETINFO)
+- if (!capable(CAP_SYS_ADMIN))
++ if (!capable(CAP_VE_SYS_ADMIN))
+ return -EPERM;
+
+ return 0;
+@@ -132,10 +132,10 @@ static int xqm_quotactl_valid(struct sup
+ if (cmd == Q_XGETQUOTA) {
+ if (((type == XQM_USRQUOTA && current->euid != id) ||
+ (type == XQM_GRPQUOTA && !in_egroup_p(id))) &&
+- !capable(CAP_SYS_ADMIN))
++ !capable(CAP_VE_SYS_ADMIN))
+ return -EPERM;
+ } else if (cmd != Q_XGETQSTAT && cmd != Q_XQUOTASYNC) {
+- if (!capable(CAP_SYS_ADMIN))
++ if (!capable(CAP_VE_SYS_ADMIN))
+ return -EPERM;
+ }
+
+@@ -216,7 +216,7 @@ restart:
+ sb->s_count++;
+ spin_unlock(&sb_lock);
+ down_read(&sb->s_umount);
+- if (sb->s_root && sb->s_qcop->quota_sync)
++ if (sb->s_root && sb->s_qcop && sb->s_qcop->quota_sync)
+ quota_sync_sb(sb, type);
+ up_read(&sb->s_umount);
+ spin_lock(&sb_lock);
+@@ -337,6 +337,235 @@ static int do_quotactl(struct super_bloc
+ return 0;
+ }
+
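++/* Resolve the user-supplied device path into its mounted super block. */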
++static struct super_block *quota_get_sb(const char __user *special)
++{
++ struct super_block *sb;
++ struct block_device *bdev;
++ char *tmp;
++
++ tmp = getname(special);
++ if (IS_ERR(tmp))
++ return (struct super_block *)tmp;
++ bdev = lookup_bdev(tmp, FMODE_QUOTACTL);
++ putname(tmp);
++ if (IS_ERR(bdev))
++ return (struct super_block *)bdev;
++ sb = get_super(bdev);
++ bdput(bdev);
++ if (!sb)
++ return ERR_PTR(-ENODEV);
++ return sb;
++}
++
++#ifdef CONFIG_QUOTA_COMPAT
++
++#define QC_QUOTAON 0x0100 /* enable quotas */
++#define QC_QUOTAOFF 0x0200 /* disable quotas */
++/* GETQUOTA, SETQUOTA and SETUSE, which were at 0x0300-0x0500, now have other parameters */
++#define QC_SYNC		0x0600	/* sync disk copy of a filesystem's quotas */
++#define QC_SETQLIM 0x0700 /* set limits */
++/* GETSTATS at 0x0800 is now longer... */
++#define QC_GETINFO 0x0900 /* get info about quotas - graces, flags... */
++#define QC_SETINFO 0x0A00 /* set info about quotas */
++#define QC_SETGRACE 0x0B00 /* set inode and block grace */
++#define QC_SETFLAGS 0x0C00 /* set flags for quota */
++#define QC_GETQUOTA 0x0D00 /* get limits and usage */
++#define QC_SETQUOTA 0x0E00 /* set limits and usage */
++#define QC_SETUSE 0x0F00 /* set usage */
++/* 0x1000 used by old RSQUASH */
++#define QC_GETSTATS 0x1100 /* get collected stats */
++
++struct compat_dqblk {
++ unsigned int dqb_ihardlimit;
++ unsigned int dqb_isoftlimit;
++ unsigned int dqb_curinodes;
++ unsigned int dqb_bhardlimit;
++ unsigned int dqb_bsoftlimit;
++ qsize_t dqb_curspace;
++ __kernel_time_t dqb_btime;
++ __kernel_time_t dqb_itime;
++};
++
++struct compat_dqinfo {
++ unsigned int dqi_bgrace;
++ unsigned int dqi_igrace;
++ unsigned int dqi_flags;
++ unsigned int dqi_blocks;
++ unsigned int dqi_free_blk;
++ unsigned int dqi_free_entry;
++};
++
++struct compat_dqstats {
++ __u32 lookups;
++ __u32 drops;
++ __u32 reads;
++ __u32 writes;
++ __u32 cache_hits;
++ __u32 allocated_dquots;
++ __u32 free_dquots;
++ __u32 syncs;
++ __u32 version;
++};
++
++asmlinkage long sys_quotactl(unsigned int cmd, const char __user *special, qid_t id, void __user *addr);
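++/*
++ * Emulate the old-style quotactl command encodings (QC_*): quotaon/quotaoff
++ * and sync are forwarded to sys_quotactl(), the other commands are emulated
++ * here by converting the compat structures and using the superblock's quota
++ * operations.
++ */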
++static long compat_quotactl(unsigned int cmds, unsigned int type,
++ const char __user *special, qid_t id,
++ void __user *addr)
++{
++ struct super_block *sb;
++ long ret;
++
++ sb = NULL;
++ switch (cmds) {
++ case QC_QUOTAON:
++ return sys_quotactl(QCMD(Q_QUOTAON, type),
++ special, id, addr);
++
++ case QC_QUOTAOFF:
++ return sys_quotactl(QCMD(Q_QUOTAOFF, type),
++ special, id, addr);
++
++ case QC_SYNC:
++ return sys_quotactl(QCMD(Q_SYNC, type),
++ special, id, addr);
++
++ case QC_GETQUOTA: {
++ struct if_dqblk idq;
++ struct compat_dqblk cdq;
++
++ sb = quota_get_sb(special);
++ ret = PTR_ERR(sb);
++ if (IS_ERR(sb))
++ break;
++ ret = check_quotactl_valid(sb, type, Q_GETQUOTA, id);
++ if (ret)
++ break;
++ ret = sb->s_qcop->get_dqblk(sb, type, id, &idq);
++ if (ret)
++ break;
++ cdq.dqb_ihardlimit = idq.dqb_ihardlimit;
++ cdq.dqb_isoftlimit = idq.dqb_isoftlimit;
++ cdq.dqb_curinodes = idq.dqb_curinodes;
++ cdq.dqb_bhardlimit = idq.dqb_bhardlimit;
++ cdq.dqb_bsoftlimit = idq.dqb_bsoftlimit;
++ cdq.dqb_curspace = idq.dqb_curspace;
++ cdq.dqb_btime = idq.dqb_btime;
++ cdq.dqb_itime = idq.dqb_itime;
++ ret = 0;
++ if (copy_to_user(addr, &cdq, sizeof(cdq)))
++ ret = -EFAULT;
++ break;
++ }
++
++ case QC_SETQUOTA:
++ case QC_SETUSE:
++ case QC_SETQLIM: {
++ struct if_dqblk idq;
++ struct compat_dqblk cdq;
++
++ sb = quota_get_sb(special);
++ ret = PTR_ERR(sb);
++ if (IS_ERR(sb))
++ break;
++ ret = check_quotactl_valid(sb, type, Q_SETQUOTA, id);
++ if (ret)
++ break;
++ ret = -EFAULT;
++ if (copy_from_user(&cdq, addr, sizeof(cdq)))
++ break;
++ idq.dqb_ihardlimit = cdq.dqb_ihardlimit;
++ idq.dqb_isoftlimit = cdq.dqb_isoftlimit;
++ idq.dqb_curinodes = cdq.dqb_curinodes;
++ idq.dqb_bhardlimit = cdq.dqb_bhardlimit;
++ idq.dqb_bsoftlimit = cdq.dqb_bsoftlimit;
++ idq.dqb_curspace = cdq.dqb_curspace;
++ idq.dqb_valid = 0;
++ if (cmds == QC_SETQUOTA || cmds == QC_SETQLIM)
++ idq.dqb_valid |= QIF_LIMITS;
++ if (cmds == QC_SETQUOTA || cmds == QC_SETUSE)
++ idq.dqb_valid |= QIF_USAGE;
++ ret = sb->s_qcop->set_dqblk(sb, type, id, &idq);
++ break;
++ }
++
++ case QC_GETINFO: {
++ struct if_dqinfo iinf;
++ struct compat_dqinfo cinf;
++
++ sb = quota_get_sb(special);
++ ret = PTR_ERR(sb);
++ if (IS_ERR(sb))
++ break;
++ ret = check_quotactl_valid(sb, type, Q_GETQUOTA, id);
++ if (ret)
++ break;
++ ret = sb->s_qcop->get_info(sb, type, &iinf);
++ if (ret)
++ break;
++ cinf.dqi_bgrace = iinf.dqi_bgrace;
++ cinf.dqi_igrace = iinf.dqi_igrace;
++ cinf.dqi_flags = 0;
++ if (iinf.dqi_flags & DQF_INFO_DIRTY)
++ cinf.dqi_flags |= 0x0010;
++ cinf.dqi_blocks = 0;
++ cinf.dqi_free_blk = 0;
++ cinf.dqi_free_entry = 0;
++ ret = 0;
++ if (copy_to_user(addr, &cinf, sizeof(cinf)))
++ ret = -EFAULT;
++ break;
++ }
++
++ case QC_SETINFO:
++ case QC_SETGRACE:
++ case QC_SETFLAGS: {
++ struct if_dqinfo iinf;
++ struct compat_dqinfo cinf;
++
++ sb = quota_get_sb(special);
++ ret = PTR_ERR(sb);
++ if (IS_ERR(sb))
++ break;
++ ret = check_quotactl_valid(sb, type, Q_SETINFO, id);
++ if (ret)
++ break;
++ ret = -EFAULT;
++ if (copy_from_user(&cinf, addr, sizeof(cinf)))
++ break;
++ iinf.dqi_bgrace = cinf.dqi_bgrace;
++ iinf.dqi_igrace = cinf.dqi_igrace;
++ iinf.dqi_flags = cinf.dqi_flags;
++ iinf.dqi_valid = 0;
++ if (cmds == QC_SETINFO || cmds == QC_SETGRACE)
++ iinf.dqi_valid |= IIF_BGRACE | IIF_IGRACE;
++ if (cmds == QC_SETINFO || cmds == QC_SETFLAGS)
++ iinf.dqi_valid |= IIF_FLAGS;
++ ret = sb->s_qcop->set_info(sb, type, &iinf);
++ break;
++ }
++
++ case QC_GETSTATS: {
++ struct compat_dqstats stat;
++
++ memset(&stat, 0, sizeof(stat));
++ stat.version = 6*10000+5*100+0;
++ ret = 0;
++ if (copy_to_user(addr, &stat, sizeof(stat)))
++ ret = -EFAULT;
++ break;
++ }
++
++ default:
++ ret = -ENOSYS;
++ break;
++ }
++ if (sb && !IS_ERR(sb))
++ drop_super(sb);
++ return ret;
++}
++
++#endif
++
+ /*
+ * This is the system call interface. This communicates with
+ * the user-level programs. Currently this only supports diskquota
+@@ -347,25 +576,20 @@ asmlinkage long sys_quotactl(unsigned in
+ {
+ uint cmds, type;
+ struct super_block *sb = NULL;
+- struct block_device *bdev;
+- char *tmp;
+ int ret;
+
+ cmds = cmd >> SUBCMDSHIFT;
+ type = cmd & SUBCMDMASK;
+
++#ifdef CONFIG_QUOTA_COMPAT
++ if (cmds >= 0x0100 && cmds < 0x3000)
++ return compat_quotactl(cmds, type, special, id, addr);
++#endif
++
+ if (cmds != Q_SYNC || special) {
+- tmp = getname(special);
+- if (IS_ERR(tmp))
+- return PTR_ERR(tmp);
+- bdev = lookup_bdev(tmp);
+- putname(tmp);
+- if (IS_ERR(bdev))
+- return PTR_ERR(bdev);
+- sb = get_super(bdev);
+- bdput(bdev);
+- if (!sb)
+- return -ENODEV;
++ sb = quota_get_sb(special);
++ if (IS_ERR(sb))
++ return PTR_ERR(sb);
+ }
+
+ ret = check_quotactl_valid(sb, type, cmds, id);
+diff -upr linux-2.6.16.orig/fs/reiserfs/namei.c linux-2.6.16-026test015/fs/reiserfs/namei.c
+--- linux-2.6.16.orig/fs/reiserfs/namei.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/reiserfs/namei.c 2006-07-04 14:41:39.000000000 +0400
+@@ -864,6 +864,9 @@ static int reiserfs_rmdir(struct inode *
+ INITIALIZE_PATH(path);
+ struct reiserfs_dir_entry de;
+
++ inode = dentry->d_inode;
++ DQUOT_INIT(inode);
++
+ /* we will be doing 2 balancings and update 2 stat data, we change quotas
+ * of the owner of the directory and of the owner of the parent directory.
+ * The quota structure is possibly deleted only on last iput => outside
+@@ -888,8 +891,6 @@ static int reiserfs_rmdir(struct inode *
+ goto end_rmdir;
+ }
+
+- inode = dentry->d_inode;
+-
+ reiserfs_update_inode_transaction(inode);
+ reiserfs_update_inode_transaction(dir);
+
+@@ -952,6 +953,7 @@ static int reiserfs_unlink(struct inode
+ unsigned long savelink;
+
+ inode = dentry->d_inode;
++ DQUOT_INIT(inode);
+
+ /* in this transaction we can be doing at max two balancings and update
+ * two stat datas, we change quotas of the owner of the directory and of
+@@ -1259,6 +1261,8 @@ static int reiserfs_rename(struct inode
+
+ old_inode = old_dentry->d_inode;
+ new_dentry_inode = new_dentry->d_inode;
++ if (new_dentry_inode)
++ DQUOT_INIT(new_dentry_inode);
+
+ // make sure, that oldname still exists and points to an object we
+ // are going to rename
+diff -upr linux-2.6.16.orig/fs/reiserfs/xattr.c linux-2.6.16-026test015/fs/reiserfs/xattr.c
+--- linux-2.6.16.orig/fs/reiserfs/xattr.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/reiserfs/xattr.c 2006-07-04 14:41:37.000000000 +0400
+@@ -1343,7 +1343,8 @@ static int reiserfs_check_acl(struct ino
+ return error;
+ }
+
+-int reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd)
++int reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd,
++ struct exec_perm *perm)
+ {
+ /*
+ * We don't do permission checks on the internal objects.
+@@ -1356,7 +1357,7 @@ int reiserfs_permission(struct inode *in
+ * Stat data v1 doesn't support ACLs.
+ */
+ if (get_inode_sd_version(inode) == STAT_DATA_V1)
+- return generic_permission(inode, mask, NULL);
++ return generic_permission(inode, mask, NULL, perm);
+ else
+- return generic_permission(inode, mask, reiserfs_check_acl);
++ return generic_permission(inode, mask, reiserfs_check_acl, perm);
+ }
+diff -upr linux-2.6.16.orig/fs/reiserfs/xattr_acl.c linux-2.6.16-026test015/fs/reiserfs/xattr_acl.c
+--- linux-2.6.16.orig/fs/reiserfs/xattr_acl.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/reiserfs/xattr_acl.c 2006-07-04 14:41:36.000000000 +0400
+@@ -408,8 +408,9 @@ int reiserfs_cache_default_acl(struct in
+ acl = reiserfs_get_acl(inode, ACL_TYPE_DEFAULT);
+ reiserfs_read_unlock_xattrs(inode->i_sb);
+ reiserfs_read_unlock_xattr_i(inode);
+- ret = acl ? 1 : 0;
+- posix_acl_release(acl);
++ ret = (acl && !IS_ERR(acl));
++ if (ret)
++ posix_acl_release(acl);
+ }
+
+ return ret;
+diff -upr linux-2.6.16.orig/fs/select.c linux-2.6.16-026test015/fs/select.c
+--- linux-2.6.16.orig/fs/select.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/select.c 2006-07-04 14:41:37.000000000 +0400
+@@ -24,6 +24,8 @@
+ #include <linux/fs.h>
+ #include <linux/rcupdate.h>
+
++#include <ub/ub_mem.h>
++
+ #include <asm/uaccess.h>
+
+ #define ROUND_UP(x,y) (((x)+(y)-1)/(y))
+@@ -286,7 +288,7 @@ int do_select(int n, fd_set_bits *fds, s
+
+ static void *select_bits_alloc(int size)
+ {
+- return kmalloc(6 * size, GFP_KERNEL);
++ return ub_kmalloc(6 * size, GFP_KERNEL);
+ }
+
+ static void select_bits_free(void *bits, int size)
+@@ -645,7 +647,7 @@ int do_sys_poll(struct pollfd __user *uf
+ err = -ENOMEM;
+ while(i!=0) {
+ struct poll_list *pp;
+- pp = kmalloc(sizeof(struct poll_list)+
++ pp = ub_kmalloc(sizeof(struct poll_list)+
+ sizeof(struct pollfd)*
+ (i>POLLFD_PER_PAGE?POLLFD_PER_PAGE:i),
+ GFP_KERNEL);
+diff -upr linux-2.6.16.orig/fs/seq_file.c linux-2.6.16-026test015/fs/seq_file.c
+--- linux-2.6.16.orig/fs/seq_file.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/seq_file.c 2006-07-04 14:41:38.000000000 +0400
+@@ -345,6 +345,8 @@ int seq_path(struct seq_file *m,
+ if (m->count < m->size) {
+ char *s = m->buf + m->count;
+ char *p = d_path(dentry, mnt, s, m->size - m->count);
++ if (IS_ERR(p) && PTR_ERR(p) != -ENAMETOOLONG)
++ return 0;
+ if (!IS_ERR(p)) {
+ while (s <= p) {
+ char c = *p++;
+diff -upr linux-2.6.16.orig/fs/simfs.c linux-2.6.16-026test015/fs/simfs.c
+--- linux-2.6.16.orig/fs/simfs.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/fs/simfs.c 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,290 @@
++/*
++ * fs/simfs.c
++ *
++ * Copyright (C) 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/config.h>
++#include <linux/fs.h>
++#include <linux/file.h>
++#include <linux/init.h>
++#include <linux/namei.h>
++#include <linux/err.h>
++#include <linux/module.h>
++#include <linux/mount.h>
++#include <linux/vzquota.h>
++#include <linux/statfs.h>
++#include <linux/virtinfo.h>
++#include <linux/faudit.h>
++#include <linux/genhd.h>
++
++#include <asm/unistd.h>
++#include <asm/uaccess.h>
++
++#define SIMFS_GET_LOWER_FS_SB(sb) sb->s_root->d_sb
++
++static struct super_operations sim_super_ops;
++
++static int sim_getattr(struct vfsmount *mnt, struct dentry *dentry,
++ struct kstat *stat)
++{
++ struct super_block *sb;
++ struct inode *inode;
++
++ inode = dentry->d_inode;
++ if (!inode->i_op->getattr) {
++ generic_fillattr(inode, stat);
++ if (!stat->blksize) {
++ unsigned blocks;
++
++ sb = inode->i_sb;
++ blocks = (stat->size + sb->s_blocksize-1) >>
++ sb->s_blocksize_bits;
++ stat->blocks = (sb->s_blocksize / 512) * blocks;
++ stat->blksize = sb->s_blocksize;
++ }
++ } else {
++ int err;
++
++ err = inode->i_op->getattr(mnt, dentry, stat);
++ if (err)
++ return err;
++ }
++
++ sb = mnt->mnt_sb;
++ if (sb->s_op == &sim_super_ops)
++ stat->dev = sb->s_dev;
++ return 0;
++}
++
++static void quota_get_stat(struct super_block *sb, struct kstatfs *buf)
++{
++ int err;
++ struct dq_stat qstat;
++ struct virt_info_quota q;
++ long free_file, adj_file;
++ s64 blk, free_blk, adj_blk;
++ int bsize_bits;
++
++ q.super = sb;
++ q.qstat = &qstat;
++ err = virtinfo_notifier_call(VITYPE_QUOTA, VIRTINFO_QUOTA_GETSTAT, &q);
++ if (err != NOTIFY_OK)
++ return;
++
++ bsize_bits = ffs(buf->f_bsize) - 1;
++ free_blk = (s64)(qstat.bsoftlimit - qstat.bcurrent) >> bsize_bits;
++ if (free_blk < 0)
++ free_blk = 0;
++ /*
++ * In the regular case, we always set buf->f_bfree and buf->f_blocks to
++ * the values reported by quota. In case of real disk space shortage,
++ * we adjust the values. We want this adjustment to look as if the
++ * total disk space were reduced, not as if the usage were increased.
++ * -- SAW
++ */
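++	/*
++	 * Example: quota would allow 100 more blocks but the device has only
++	 * 60 free: adj_blk = 40, f_bfree stays at 60 and f_blocks shrinks by
++	 * 40, so the reported usage (f_blocks - f_bfree) is unchanged.
++	 */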
++ adj_blk = 0;
++ if (buf->f_bfree < free_blk)
++ adj_blk = free_blk - buf->f_bfree;
++ buf->f_bfree = (long)(free_blk - adj_blk);
++
++ if (free_blk < buf->f_bavail)
++ buf->f_bavail = (long)free_blk; /* min(f_bavail, free_blk) */
++
++ blk = (qstat.bsoftlimit >> bsize_bits) - adj_blk;
++ buf->f_blocks = blk > LONG_MAX ? LONG_MAX : blk;
++
++ free_file = qstat.isoftlimit - qstat.icurrent;
++ if (free_file < 0)
++ free_file = 0;
++ if (buf->f_ffree == -1)
++ /*
++ * One filesystem uses -1 to represent the fact that it doesn't
++ * have a detached limit for inode number.
++		 * Maybe because -1 is a good candidate for the maximum value
++		 * of the signed long type, maybe because it's just nice to have
++ * an exceptional case... Guess what that filesystem is :-)
++ * -- SAW
++ */
++ buf->f_ffree = free_file;
++ adj_file = 0;
++ if (buf->f_ffree < free_file)
++ adj_file = free_file - buf->f_ffree;
++ buf->f_ffree = free_file - adj_file;
++ buf->f_files = qstat.isoftlimit - adj_file;
++}
++
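++/*
++ * statfs for a simfs mount: take the numbers from the underlying filesystem
++ * and then let quota_get_stat() cap them with the per-VE quota limits.
++ */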
++static int sim_statfs(struct super_block *sb, struct kstatfs *buf)
++{
++ int err;
++ struct super_block *lsb;
++ struct kstatfs statbuf;
++
++ err = 0;
++ if (sb->s_op != &sim_super_ops)
++ return 0;
++
++ lsb = SIMFS_GET_LOWER_FS_SB(sb);
++
++ err = -ENOSYS;
++ if (lsb && lsb->s_op && lsb->s_op->statfs)
++ err = lsb->s_op->statfs(lsb, &statbuf);
++ if (err)
++ return err;
++
++ quota_get_stat(sb, &statbuf);
++
++ buf->f_files = statbuf.f_files;
++ buf->f_ffree = statbuf.f_ffree;
++ buf->f_blocks = statbuf.f_blocks;
++ buf->f_bfree = statbuf.f_bfree;
++ buf->f_bavail = statbuf.f_bavail;
++ return 0;
++}
++
++static int sim_systemcall(struct vnotifier_block *me, unsigned long n,
++ void *d, int old_ret)
++{
++ int err;
++
++ switch (n) {
++ case VIRTINFO_FAUDIT_STAT: {
++ struct faudit_stat_arg *arg;
++
++ arg = (struct faudit_stat_arg *)d;
++ err = sim_getattr(arg->mnt, arg->dentry, arg->stat);
++ arg->err = err;
++ }
++ break;
++ case VIRTINFO_FAUDIT_STATFS: {
++ struct faudit_statfs_arg *arg;
++
++ arg = (struct faudit_statfs_arg *)d;
++ err = sim_statfs(arg->sb, arg->stat);
++ arg->err = err;
++ }
++ break;
++ default:
++ return old_ret;
++ }
++ return (err ? NOTIFY_BAD : NOTIFY_OK);
++}
++
++static struct inode *sim_quota_root(struct super_block *sb)
++{
++ return sb->s_root->d_inode;
++}
++
++void sim_put_super(struct super_block *sb)
++{
++ struct virt_info_quota viq;
++
++ viq.super = sb;
++ virtinfo_notifier_call(VITYPE_QUOTA, VIRTINFO_QUOTA_OFF, &viq);
++ bdput(sb->s_bdev);
++}
++
++static struct super_operations sim_super_ops = {
++ .get_quota_root = sim_quota_root,
++ .put_super = sim_put_super,
++};
++
++static int sim_fill_super(struct super_block *s, void *data)
++{
++ int err;
++ struct nameidata *nd;
++
++ err = set_anon_super(s, NULL);
++ if (err)
++ goto out;
++
++ err = 0;
++ nd = (struct nameidata *)data;
++ s->s_root = dget(nd->dentry);
++ s->s_op = &sim_super_ops;
++out:
++ return err;
++}
++
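++/*
++ * simfs is mounted over an existing directory (passed via the mount data):
++ * that dentry becomes the root of the new super block, which also gets a
++ * block device reference (with a fake hd_struct) so the VZ quota layer can
++ * be enabled for it.
++ */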
++struct super_block *sim_get_sb(struct file_system_type *type,
++ int flags, const char *dev_name, void *opt)
++{
++ int err;
++ struct nameidata nd;
++ struct super_block *sb;
++ struct block_device *bd;
++ struct virt_info_quota viq;
++ static struct hd_struct fake_hds;
++
++ sb = ERR_PTR(-EINVAL);
++ if (opt == NULL)
++ goto out;
++
++ err = path_lookup(opt, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &nd);
++ sb = ERR_PTR(err);
++ if (err)
++ goto out;
++
++ sb = sget(type, NULL, sim_fill_super, &nd);
++ if (IS_ERR(sb))
++ goto out_path;
++
++ bd = bdget(sb->s_dev);
++ if (!bd)
++ goto out_killsb;
++
++ sb->s_bdev = bd;
++ bd->bd_part = &fake_hds;
++ viq.super = sb;
++ virtinfo_notifier_call(VITYPE_QUOTA, VIRTINFO_QUOTA_ON, &viq);
++out_path:
++ path_release(&nd);
++out:
++ return sb;
++
++out_killsb:
++ up_write(&sb->s_umount);
++ deactivate_super(sb);
++ sb = ERR_PTR(-ENODEV);
++ goto out_path;
++}
++
++static struct file_system_type sim_fs_type = {
++ .owner = THIS_MODULE,
++ .name = "simfs",
++ .get_sb = sim_get_sb,
++ .kill_sb = kill_anon_super,
++};
++
++static struct vnotifier_block sim_syscalls = {
++ .notifier_call = sim_systemcall,
++};
++
++static int __init init_simfs(void)
++{
++ int err;
++
++ err = register_filesystem(&sim_fs_type);
++ if (err)
++ return err;
++
++ virtinfo_notifier_register(VITYPE_FAUDIT, &sim_syscalls);
++ return 0;
++}
++
++static void __exit exit_simfs(void)
++{
++ virtinfo_notifier_unregister(VITYPE_FAUDIT, &sim_syscalls);
++ unregister_filesystem(&sim_fs_type);
++}
++
++MODULE_AUTHOR("SWsoft <info@sw-soft.com>");
++MODULE_DESCRIPTION("Open Virtuozzo Simulation of File System");
++MODULE_LICENSE("GPL v2");
++
++module_init(init_simfs);
++module_exit(exit_simfs);
+diff -upr linux-2.6.16.orig/fs/smbfs/dir.c linux-2.6.16-026test015/fs/smbfs/dir.c
+--- linux-2.6.16.orig/fs/smbfs/dir.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/smbfs/dir.c 2006-07-04 14:41:36.000000000 +0400
+@@ -434,6 +434,11 @@ smb_lookup(struct inode *dir, struct den
+ if (dentry->d_name.len > SMB_MAXNAMELEN)
+ goto out;
+
++ /* Do not allow lookup of names with backslashes in */
++ error = -EINVAL;
++ if (memchr(dentry->d_name.name, '\\', dentry->d_name.len))
++ goto out;
++
+ lock_kernel();
+ error = smb_proc_getattr(dentry, &finfo);
+ #ifdef SMBFS_PARANOIA
+diff -upr linux-2.6.16.orig/fs/smbfs/file.c linux-2.6.16-026test015/fs/smbfs/file.c
+--- linux-2.6.16.orig/fs/smbfs/file.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/smbfs/file.c 2006-07-04 14:41:37.000000000 +0400
+@@ -387,7 +387,8 @@ smb_file_release(struct inode *inode, st
+ * privileges, so we need our own check for this.
+ */
+ static int
+-smb_file_permission(struct inode *inode, int mask, struct nameidata *nd)
++smb_file_permission(struct inode *inode, int mask, struct nameidata *nd,
++ struct exec_perm *perm)
+ {
+ int mode = inode->i_mode;
+ int error = 0;
+diff -upr linux-2.6.16.orig/fs/smbfs/inode.c linux-2.6.16-026test015/fs/smbfs/inode.c
+--- linux-2.6.16.orig/fs/smbfs/inode.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/smbfs/inode.c 2006-07-04 14:41:37.000000000 +0400
+@@ -233,7 +233,7 @@ smb_invalidate_inodes(struct smb_sb_info
+ {
+ VERBOSE("\n");
+ shrink_dcache_sb(SB_of(server));
+- invalidate_inodes(SB_of(server));
++ invalidate_inodes(SB_of(server), 0);
+ }
+
+ /*
+diff -upr linux-2.6.16.orig/fs/smbfs/request.c linux-2.6.16-026test015/fs/smbfs/request.c
+--- linux-2.6.16.orig/fs/smbfs/request.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/smbfs/request.c 2006-07-04 14:41:36.000000000 +0400
+@@ -339,9 +339,11 @@ int smb_add_request(struct smb_request *
+ /*
+ * On timeout or on interrupt we want to try and remove the
+ * request from the recvq/xmitq.
++ * First check if the request is still part of a queue. (May
++ * have been removed by some error condition)
+ */
+ smb_lock_server(server);
+- if (!(req->rq_flags & SMB_REQ_RECEIVED)) {
++ if (!list_empty(&req->rq_queue)) {
+ list_del_init(&req->rq_queue);
+ smb_rput(req);
+ }
+diff -upr linux-2.6.16.orig/fs/stat.c linux-2.6.16-026test015/fs/stat.c
+--- linux-2.6.16.orig/fs/stat.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/stat.c 2006-07-04 14:41:39.000000000 +0400
+@@ -15,6 +15,7 @@
+ #include <linux/namei.h>
+ #include <linux/security.h>
+ #include <linux/syscalls.h>
++#include <linux/faudit.h>
+
+ #include <asm/uaccess.h>
+ #include <asm/unistd.h>
+@@ -42,11 +43,19 @@ int vfs_getattr(struct vfsmount *mnt, st
+ {
+ struct inode *inode = dentry->d_inode;
+ int retval;
++ struct faudit_stat_arg arg;
+
+ retval = security_inode_getattr(mnt, dentry);
+ if (retval)
+ return retval;
+
++ arg.mnt = mnt;
++ arg.dentry = dentry;
++ arg.stat = stat;
++ if (virtinfo_notifier_call(VITYPE_FAUDIT, VIRTINFO_FAUDIT_STAT, &arg)
++ != NOTIFY_DONE)
++ return arg.err;
++
+ if (inode->i_op->getattr)
+ return inode->i_op->getattr(mnt, dentry, stat);
+
+diff -upr linux-2.6.16.orig/fs/super.c linux-2.6.16-026test015/fs/super.c
+--- linux-2.6.16.orig/fs/super.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/super.c 2006-07-04 14:41:38.000000000 +0400
+@@ -23,6 +23,7 @@
+ #include <linux/config.h>
+ #include <linux/module.h>
+ #include <linux/slab.h>
++#include <linux/ve_owner.h>
+ #include <linux/init.h>
+ #include <linux/smp_lock.h>
+ #include <linux/acct.h>
+@@ -231,13 +232,13 @@ void generic_shutdown_super(struct super
+ if (root) {
+ sb->s_root = NULL;
+ shrink_dcache_parent(root);
+- shrink_dcache_anon(&sb->s_anon);
++ shrink_dcache_anon(sb);
+ dput(root);
+ fsync_super(sb);
+ lock_super(sb);
+ sb->s_flags &= ~MS_ACTIVE;
+ /* bad name - it should be evict_inodes() */
+- invalidate_inodes(sb);
++ invalidate_inodes(sb, 0);
+ lock_kernel();
+
+ if (sop->write_super && sb->s_dirt)
+@@ -246,7 +247,7 @@ void generic_shutdown_super(struct super
+ sop->put_super(sb);
+
+ /* Forget any remaining inodes */
+- if (invalidate_inodes(sb)) {
++ if (invalidate_inodes(sb, 1)) {
+ printk("VFS: Busy inodes after unmount of %s. "
+ "Self-destruct in 5 seconds. Have a nice day...\n",
+ sb->s_id);
+@@ -481,11 +482,20 @@ asmlinkage long sys_ustat(unsigned dev,
+ struct super_block *s;
+ struct ustat tmp;
+ struct kstatfs sbuf;
+- int err = -EINVAL;
++ dev_t kdev;
++ int err;
++
++ kdev = new_decode_dev(dev);
++#ifdef CONFIG_VE
++ err = get_device_perms_ve(S_IFBLK, kdev, FMODE_READ);
++ if (err)
++ goto out;
++#endif
+
+- s = user_get_super(new_decode_dev(dev));
+- if (s == NULL)
+- goto out;
++ err = -EINVAL;
++ s = user_get_super(kdev);
++ if (s == NULL)
++ goto out;
+ err = vfs_statfs(s, &sbuf);
+ drop_super(s);
+ if (err)
+@@ -599,6 +609,13 @@ void emergency_remount(void)
+ static struct idr unnamed_dev_idr;
+ static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects the above */
+
++/* for compatibility with coreutils still unaware of new minor sizes */
++int unnamed_dev_majors[] = {
++ 0, 144, 145, 146, 242, 243, 244, 245,
++ 246, 247, 248, 249, 250, 251, 252, 253
++};
++EXPORT_SYMBOL(unnamed_dev_majors);
++
+ int set_anon_super(struct super_block *s, void *data)
+ {
+ int dev;
+@@ -616,13 +633,13 @@ int set_anon_super(struct super_block *s
+ else if (error)
+ return -EAGAIN;
+
+- if ((dev & MAX_ID_MASK) == (1 << MINORBITS)) {
++ if ((dev & MAX_ID_MASK) >= (1 << MINORBITS)) {
+ spin_lock(&unnamed_dev_lock);
+ idr_remove(&unnamed_dev_idr, dev);
+ spin_unlock(&unnamed_dev_lock);
+ return -EMFILE;
+ }
+- s->s_dev = MKDEV(0, dev & MINORMASK);
++ s->s_dev = make_unnamed_dev(dev);
+ return 0;
+ }
+
+@@ -630,8 +647,9 @@ EXPORT_SYMBOL(set_anon_super);
+
+ void kill_anon_super(struct super_block *sb)
+ {
+- int slot = MINOR(sb->s_dev);
++ int slot;
+
++ slot = unnamed_dev_idx(sb->s_dev);
+ generic_shutdown_super(sb);
+ spin_lock(&unnamed_dev_lock);
+ idr_remove(&unnamed_dev_idr, slot);
+diff -upr linux-2.6.16.orig/fs/sysfs/bin.c linux-2.6.16-026test015/fs/sysfs/bin.c
+--- linux-2.6.16.orig/fs/sysfs/bin.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/sysfs/bin.c 2006-07-04 14:41:37.000000000 +0400
+@@ -120,6 +120,9 @@ static int open(struct inode * inode, st
+ struct bin_attribute * attr = to_bin_attr(file->f_dentry);
+ int error = -EINVAL;
+
++ if (!ve_sysfs_alowed())
++ return 0;
++
+ if (!kobj || !attr)
+ goto Done;
+
+@@ -196,6 +199,9 @@ int sysfs_create_bin_file(struct kobject
+
+ int sysfs_remove_bin_file(struct kobject * kobj, struct bin_attribute * attr)
+ {
++ if (!ve_sysfs_alowed())
++ return 0;
++
+ sysfs_hash_and_remove(kobj->dentry,attr->attr.name);
+ return 0;
+ }
+diff -upr linux-2.6.16.orig/fs/sysfs/dir.c linux-2.6.16-026test015/fs/sysfs/dir.c
+--- linux-2.6.16.orig/fs/sysfs/dir.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/sysfs/dir.c 2006-07-04 14:41:37.000000000 +0400
+@@ -144,6 +144,9 @@ int sysfs_create_dir(struct kobject * ko
+ struct dentry * parent;
+ int error = 0;
+
++ if (!ve_sysfs_alowed())
++ return 0;
++
+ BUG_ON(!kobj);
+
+ if (kobj->parent)
+@@ -278,10 +281,14 @@ void sysfs_remove_subdir(struct dentry *
+
+ void sysfs_remove_dir(struct kobject * kobj)
+ {
+- struct dentry * dentry = dget(kobj->dentry);
++ struct dentry * dentry;
+ struct sysfs_dirent * parent_sd;
+ struct sysfs_dirent * sd, * tmp;
+
++ if (!ve_sysfs_alowed())
++ return;
++
++ dentry = dget(kobj->dentry);
+ if (!dentry)
+ return;
+
+@@ -302,6 +309,7 @@ void sysfs_remove_dir(struct kobject * k
+ * Drop reference from dget() on entrance.
+ */
+ dput(dentry);
++ kobj->dentry = NULL;
+ }
+
+ int sysfs_rename_dir(struct kobject * kobj, const char *new_name)
+@@ -309,6 +317,9 @@ int sysfs_rename_dir(struct kobject * ko
+ int error = 0;
+ struct dentry * new_dentry, * parent;
+
++ if (!ve_sysfs_alowed())
++ return 0;
++
+ if (!strcmp(kobject_name(kobj), new_name))
+ return -EINVAL;
+
+diff -upr linux-2.6.16.orig/fs/sysfs/file.c linux-2.6.16-026test015/fs/sysfs/file.c
+--- linux-2.6.16.orig/fs/sysfs/file.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/sysfs/file.c 2006-07-04 14:41:37.000000000 +0400
+@@ -183,7 +183,7 @@ fill_write_buffer(struct sysfs_buffer *
+ return -ENOMEM;
+
+ if (count >= PAGE_SIZE)
+- count = PAGE_SIZE;
++ count = PAGE_SIZE - 1;
+ error = copy_from_user(buffer->page,buf,count);
+ buffer->needs_read_fill = 1;
+ return error ? -EFAULT : count;
+@@ -380,6 +380,9 @@ int sysfs_add_file(struct dentry * dir,
+
+ int sysfs_create_file(struct kobject * kobj, const struct attribute * attr)
+ {
++ if (!ve_sysfs_alowed())
++ return 0;
++
+ BUG_ON(!kobj || !kobj->dentry || !attr);
+
+ return sysfs_add_file(kobj->dentry, attr, SYSFS_KOBJ_ATTR);
+@@ -398,6 +401,9 @@ int sysfs_update_file(struct kobject * k
+ struct dentry * victim;
+ int res = -ENOENT;
+
++ if (!ve_sysfs_alowed())
++ return 0;
++
+ mutex_lock(&dir->d_inode->i_mutex);
+ victim = lookup_one_len(attr->name, dir, strlen(attr->name));
+ if (!IS_ERR(victim)) {
+@@ -473,6 +479,9 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file);
+
+ void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr)
+ {
++ if (!ve_sysfs_alowed())
++ return;
++
+ sysfs_hash_and_remove(kobj->dentry,attr->name);
+ }
+
+diff -upr linux-2.6.16.orig/fs/sysfs/group.c linux-2.6.16-026test015/fs/sysfs/group.c
+--- linux-2.6.16.orig/fs/sysfs/group.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/sysfs/group.c 2006-07-04 14:41:37.000000000 +0400
+@@ -46,6 +46,9 @@ int sysfs_create_group(struct kobject *
+ struct dentry * dir;
+ int error;
+
++ if (!ve_sysfs_alowed())
++ return 0;
++
+ BUG_ON(!kobj || !kobj->dentry);
+
+ if (grp->name) {
+@@ -68,6 +71,9 @@ void sysfs_remove_group(struct kobject *
+ {
+ struct dentry * dir;
+
++ if (!ve_sysfs_alowed())
++ return;
++
+ if (grp->name)
+ dir = lookup_one_len(grp->name, kobj->dentry,
+ strlen(grp->name));
+diff -upr linux-2.6.16.orig/fs/sysfs/inode.c linux-2.6.16-026test015/fs/sysfs/inode.c
+--- linux-2.6.16.orig/fs/sysfs/inode.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/sysfs/inode.c 2006-07-04 14:41:37.000000000 +0400
+@@ -8,14 +8,13 @@
+
+ #undef DEBUG
+
++#include <linux/config.h>
+ #include <linux/pagemap.h>
+ #include <linux/namei.h>
+ #include <linux/backing-dev.h>
+ #include <linux/capability.h>
+ #include "sysfs.h"
+
+-extern struct super_block * sysfs_sb;
+-
+ static struct address_space_operations sysfs_aops = {
+ .readpage = simple_readpage,
+ .prepare_write = simple_prepare_write,
+@@ -227,12 +226,16 @@ void sysfs_drop_dentry(struct sysfs_dire
+ void sysfs_hash_and_remove(struct dentry * dir, const char * name)
+ {
+ struct sysfs_dirent * sd;
+- struct sysfs_dirent * parent_sd = dir->d_fsdata;
++ struct sysfs_dirent * parent_sd;
++
++ if (!dir)
++ return;
+
+ if (dir->d_inode == NULL)
+ /* no inode means this hasn't been made visible yet */
+ return;
+
++ parent_sd = dir->d_fsdata;
+ mutex_lock(&dir->d_inode->i_mutex);
+ list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
+ if (!sd->s_element)
+diff -upr linux-2.6.16.orig/fs/sysfs/mount.c linux-2.6.16-026test015/fs/sysfs/mount.c
+--- linux-2.6.16.orig/fs/sysfs/mount.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/sysfs/mount.c 2006-07-04 14:41:38.000000000 +0400
+@@ -7,6 +7,7 @@
+ #include <linux/fs.h>
+ #include <linux/mount.h>
+ #include <linux/pagemap.h>
++#include <linux/module.h>
+ #include <linux/init.h>
+
+ #include "sysfs.h"
+@@ -14,8 +15,11 @@
+ /* Random magic number */
+ #define SYSFS_MAGIC 0x62656572
+
++#ifndef CONFIG_VE
+ struct vfsmount *sysfs_mount;
+ struct super_block * sysfs_sb = NULL;
++#endif
++
+ kmem_cache_t *sysfs_dir_cachep;
+
+ static struct super_operations sysfs_ops = {
+@@ -31,6 +35,15 @@ static struct sysfs_dirent sysfs_root =
+ .s_iattr = NULL,
+ };
+
++#ifdef CONFIG_VE
++static void init_ve0_sysfs_root(void)
++{
++ get_ve0()->sysfs_root = &sysfs_root;
++}
++
++#define sysfs_root (*(get_exec_env()->sysfs_root))
++#endif
++
+ static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
+ {
+ struct inode *inode;
+@@ -72,16 +85,21 @@ static struct super_block *sysfs_get_sb(
+ return get_sb_single(fs_type, flags, data, sysfs_fill_super);
+ }
+
+-static struct file_system_type sysfs_fs_type = {
++struct file_system_type sysfs_fs_type = {
+ .name = "sysfs",
+ .get_sb = sysfs_get_sb,
+ .kill_sb = kill_litter_super,
+ };
+
++EXPORT_SYMBOL(sysfs_fs_type);
++
+ int __init sysfs_init(void)
+ {
+ int err = -ENOMEM;
+
++#ifdef CONFIG_VE
++ init_ve0_sysfs_root();
++#endif
+ sysfs_dir_cachep = kmem_cache_create("sysfs_dir_cache",
+ sizeof(struct sysfs_dirent),
+ 0, 0, NULL, NULL);
+diff -upr linux-2.6.16.orig/fs/sysfs/symlink.c linux-2.6.16-026test015/fs/sysfs/symlink.c
+--- linux-2.6.16.orig/fs/sysfs/symlink.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/sysfs/symlink.c 2006-07-04 14:41:37.000000000 +0400
+@@ -66,6 +66,7 @@ static int sysfs_add_link(struct dentry
+ if (!error)
+ return 0;
+
++ kobject_put(target);
+ kfree(sl->link_name);
+ exit2:
+ kfree(sl);
+@@ -86,6 +87,9 @@ int sysfs_create_link(struct kobject * k
+
+ BUG_ON(!kobj || !kobj->dentry || !name);
+
++ if (!ve_sysfs_alowed())
++ return 0;
++
+ mutex_lock(&dentry->d_inode->i_mutex);
+ error = sysfs_add_link(dentry, name, target);
+ mutex_unlock(&dentry->d_inode->i_mutex);
+@@ -101,6 +105,9 @@ int sysfs_create_link(struct kobject * k
+
+ void sysfs_remove_link(struct kobject * kobj, const char * name)
+ {
++	if (!ve_sysfs_alowed())
++ return;
++
+ sysfs_hash_and_remove(kobj->dentry,name);
+ }
+
+diff -upr linux-2.6.16.orig/fs/sysfs/sysfs.h linux-2.6.16-026test015/fs/sysfs/sysfs.h
+--- linux-2.6.16.orig/fs/sysfs/sysfs.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/sysfs/sysfs.h 2006-07-04 14:41:38.000000000 +0400
+@@ -1,5 +1,14 @@
+
+-extern struct vfsmount * sysfs_mount;
++#ifndef CONFIG_VE
++extern struct vfsmount *sysfs_mount;
++extern struct super_block *sysfs_sb;
++#define ve_sysfs_alowed() (1)
++#else
++#define sysfs_mount (get_exec_env()->sysfs_mnt)
++#define sysfs_sb (get_exec_env()->sysfs_sb)
++#define ve_sysfs_alowed() (sysfs_sb != NULL)
++#endif
++
+ extern kmem_cache_t *sysfs_dir_cachep;
+
+ extern struct inode * sysfs_new_inode(mode_t mode, struct sysfs_dirent *);
+@@ -19,7 +28,6 @@ extern void sysfs_drop_dentry(struct sys
+ extern int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
+
+ extern struct rw_semaphore sysfs_rename_sem;
+-extern struct super_block * sysfs_sb;
+ extern struct file_operations sysfs_dir_operations;
+ extern struct file_operations sysfs_file_operations;
+ extern struct file_operations bin_fops;
+diff -upr linux-2.6.16.orig/fs/vzdq_file.c linux-2.6.16-026test015/fs/vzdq_file.c
+--- linux-2.6.16.orig/fs/vzdq_file.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/fs/vzdq_file.c 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,851 @@
++/*
++ *
++ * Copyright (C) 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ * This file implements Virtuozzo quota files as proc entries.
++ * It is required for standard quota tools to work correctly, as they
++ * expect aquota.user and aquota.group files.
++ */
++
++#include <linux/ctype.h>
++#include <linux/slab.h>
++#include <linux/list.h>
++#include <linux/module.h>
++#include <linux/proc_fs.h>
++#include <linux/sysctl.h>
++#include <linux/mount.h>
++#include <linux/namespace.h>
++#include <linux/quotaio_v2.h>
++#include <asm/uaccess.h>
++
++#include <linux/ve.h>
++#include <linux/ve_proto.h>
++#include <linux/vzdq_tree.h>
++#include <linux/vzquota.h>
++
++/* ----------------------------------------------------------------------
++ *
++ * File read operation
++ *
++ * FIXME: functions in this section (as well as many functions in vzdq_ugid.c,
++ * perhaps) abuse vz_quota_sem.
++ * Taking a global semaphore for lengthy and user-controlled operations inside
++ * VPSs is not a good idea in general.
++ * In this case, the reasons for taking this semaphore are completely unclear,
++ * especially taking into account that the only function that has comments
++ * about the necessity to be called under this semaphore
++ * (create_proc_quotafile) is actually called OUTSIDE it.
++ *
++ * --------------------------------------------------------------------- */
++
++#define DQBLOCK_SIZE 1024
++#define DQUOTBLKNUM 21U
++#define DQTREE_DEPTH 4
++#define TREENUM_2_BLKNUM(num) (((num) + 1) << 1)
++#define ISINDBLOCK(num) ((num)%2 != 0)
++#define FIRST_DATABLK 2 /* first even number */
++#define LAST_IND_LEVEL (DQTREE_DEPTH - 1)
++#define CONVERT_LEVEL(level) ((level) * (QUOTAID_EBITS/QUOTAID_BBITS))
++#define GETLEVINDX(ind, lev) (((ind) >> QUOTAID_BBITS*(lev)) \
++ & QUOTATREE_BMASK)
++
++#if (QUOTAID_EBITS / QUOTAID_BBITS) != (QUOTATREE_DEPTH / DQTREE_DEPTH)
++#error xBITS and DQTREE_DEPTH do not correspond
++#endif
++
++#define BLOCK_NOT_FOUND 1
++
++/* data for quota file -- one per proc entry */
++struct quotatree_data {
++ struct list_head list;
++ struct vz_quota_master *qmblk;
++ int type; /* type of the tree */
++};
++
++/* serialized by vz_quota_sem */
++static LIST_HEAD(qf_data_head);
++
++static const u_int32_t vzquota_magics[] = V2_INITQMAGICS;
++static const u_int32_t vzquota_versions[] = V2_INITQVERSIONS;
++
++static inline loff_t get_depoff(int depth)
++{
++ loff_t res = 1;
++ while (depth) {
++ res += (1 << ((depth - 1)*QUOTAID_EBITS + 1));
++ depth--;
++ }
++ return res;
++}
++
++static inline loff_t get_blknum(loff_t num, int depth)
++{
++ loff_t res;
++ res = (num << 1) + get_depoff(depth);
++ return res;
++}
++
++static int get_depth(loff_t num)
++{
++ int i;
++ for (i = 0; i < DQTREE_DEPTH; i++) {
++ if (num >= get_depoff(i) && (i == DQTREE_DEPTH - 1
++ || num < get_depoff(i + 1)))
++ return i;
++ }
++ return -1;
++}
++
++static inline loff_t get_offset(loff_t num)
++{
++ loff_t res, tmp;
++
++ tmp = get_depth(num);
++ if (tmp < 0)
++ return -1;
++ num -= get_depoff(tmp);
++ BUG_ON(num < 0);
++ res = num >> 1;
++
++ return res;
++}
++
++static inline loff_t get_quot_blk_num(struct quotatree_tree *tree, int level)
++{
++ /* return maximum available block num */
++ return tree->levels[level].freenum;
++}
++
++static inline loff_t get_block_num(struct quotatree_tree *tree)
++{
++ loff_t ind_blk_num, quot_blk_num, max_ind, max_quot;
++
++ quot_blk_num = get_quot_blk_num(tree, CONVERT_LEVEL(DQTREE_DEPTH) - 1);
++ max_quot = TREENUM_2_BLKNUM(quot_blk_num);
++ ind_blk_num = get_quot_blk_num(tree, CONVERT_LEVEL(DQTREE_DEPTH - 1));
++ max_ind = (quot_blk_num) ? get_blknum(ind_blk_num, LAST_IND_LEVEL)
++ : get_blknum(ind_blk_num, 0);
++
++ return (max_ind > max_quot) ? max_ind + 1 : max_quot + 1;
++}
++
++/* Write quota file header */
++static int read_header(void *buf, struct quotatree_tree *tree,
++ struct dq_info *dq_ugid_info, int type)
++{
++ struct v2_disk_dqheader *dqh;
++ struct v2_disk_dqinfo *dq_disk_info;
++
++ dqh = buf;
++ dq_disk_info = buf + sizeof(struct v2_disk_dqheader);
++
++ dqh->dqh_magic = vzquota_magics[type];
++ dqh->dqh_version = vzquota_versions[type];
++
++ dq_disk_info->dqi_bgrace = dq_ugid_info[type].bexpire;
++ dq_disk_info->dqi_igrace = dq_ugid_info[type].iexpire;
++ dq_disk_info->dqi_flags = 0; /* no flags */
++ dq_disk_info->dqi_blocks = get_block_num(tree);
++ dq_disk_info->dqi_free_blk = 0; /* first block in the file */
++ dq_disk_info->dqi_free_entry = FIRST_DATABLK;
++
++ return 0;
++}
++
++static int get_block_child(int depth, struct quotatree_node *p, u_int32_t *buf)
++{
++ int i, j, lev_num;
++
++ lev_num = QUOTATREE_DEPTH/DQTREE_DEPTH - 1;
++ for (i = 0; i < BLOCK_SIZE/sizeof(u_int32_t); i++) {
++ struct quotatree_node *next, *parent;
++
++ parent = p;
++ next = p;
++ for (j = lev_num; j >= 0; j--) {
++ if (!next->blocks[GETLEVINDX(i,j)]) {
++ buf[i] = 0;
++ goto bad_branch;
++ }
++ parent = next;
++ next = next->blocks[GETLEVINDX(i,j)];
++ }
++ buf[i] = (depth == DQTREE_DEPTH - 1) ?
++ TREENUM_2_BLKNUM(parent->num)
++ : get_blknum(next->num, depth + 1);
++
++ bad_branch:
++ ;
++ }
++
++ return 0;
++}
++
++/*
++ * Write index block to disk (or buffer)
++ * @buf has length 256*sizeof(u_int32_t) bytes
++ */
++static int read_index_block(int num, u_int32_t *buf,
++ struct quotatree_tree *tree)
++{
++ struct quotatree_node *p;
++ u_int32_t index;
++ loff_t off;
++ int depth, res;
++
++ res = BLOCK_NOT_FOUND;
++ index = 0;
++ depth = get_depth(num);
++ off = get_offset(num);
++ if (depth < 0 || off < 0)
++ return -EINVAL;
++
++ list_for_each_entry(p, &tree->levels[CONVERT_LEVEL(depth)].usedlh,
++ list) {
++ if (p->num >= off)
++ res = 0;
++ if (p->num != off)
++ continue;
++ get_block_child(depth, p, buf);
++ break;
++ }
++
++ return res;
++}
++
++static inline void convert_quot_format(struct v2_disk_dqblk *dq,
++ struct vz_quota_ugid *vzq)
++{
++ dq->dqb_id = vzq->qugid_id;
++ dq->dqb_ihardlimit = vzq->qugid_stat.ihardlimit;
++ dq->dqb_isoftlimit = vzq->qugid_stat.isoftlimit;
++ dq->dqb_curinodes = vzq->qugid_stat.icurrent;
++ dq->dqb_bhardlimit = vzq->qugid_stat.bhardlimit / QUOTABLOCK_SIZE;
++ dq->dqb_bsoftlimit = vzq->qugid_stat.bsoftlimit / QUOTABLOCK_SIZE;
++ dq->dqb_curspace = vzq->qugid_stat.bcurrent;
++ dq->dqb_btime = vzq->qugid_stat.btime;
++ dq->dqb_itime = vzq->qugid_stat.itime;
++}
++
++static int read_dquot(loff_t num, void *buf, struct quotatree_tree *tree)
++{
++ int res, i, entries = 0;
++ struct v2_disk_dqdbheader *dq_header;
++ struct quotatree_node *p;
++ struct v2_disk_dqblk *blk = buf + sizeof(struct v2_disk_dqdbheader);
++
++ res = BLOCK_NOT_FOUND;
++ dq_header = buf;
++ memset(dq_header, 0, sizeof(*dq_header));
++
++ list_for_each_entry(p, &(tree->levels[QUOTATREE_DEPTH - 1].usedlh),
++ list) {
++ if (TREENUM_2_BLKNUM(p->num) >= num)
++ res = 0;
++ if (TREENUM_2_BLKNUM(p->num) != num)
++ continue;
++
++ for (i = 0; i < QUOTATREE_BSIZE; i++) {
++ if (!p->blocks[i])
++ continue;
++ convert_quot_format(blk + entries,
++ (struct vz_quota_ugid *)p->blocks[i]);
++ entries++;
++ res = 0;
++ }
++ break;
++ }
++ dq_header->dqdh_entries = entries;
++
++ return res;
++}
++
++static int read_block(int num, void *buf, struct quotatree_tree *tree,
++ struct dq_info *dq_ugid_info, int magic)
++{
++ int res;
++
++ memset(buf, 0, DQBLOCK_SIZE);
++ if (!num)
++ res = read_header(buf, tree, dq_ugid_info, magic);
++ else if (ISINDBLOCK(num))
++ res = read_index_block(num, (u_int32_t*)buf, tree);
++ else
++ res = read_dquot(num, buf, tree);
++
++ return res;
++}
++
++/*
++ * FIXME: this function can handle quota files up to 2GB only.
++ */
++static int read_proc_quotafile(char *page, char **start, off_t off, int count,
++ int *eof, void *data)
++{
++ off_t blk_num, blk_off, buf_off;
++ char *tmp;
++ size_t buf_size;
++ struct quotatree_data *qtd;
++ struct quotatree_tree *tree;
++ struct dq_info *dqi;
++ int res;
++
++ tmp = kmalloc(DQBLOCK_SIZE, GFP_KERNEL);
++ if (!tmp)
++ return -ENOMEM;
++
++ qtd = data;
++ down(&vz_quota_sem);
++ down(&qtd->qmblk->dq_sem);
++
++ res = 0;
++ tree = QUGID_TREE(qtd->qmblk, qtd->type);
++ if (!tree) {
++ *eof = 1;
++ goto out_dq;
++ }
++
++ dqi = &qtd->qmblk->dq_ugid_info[qtd->type];
++
++ buf_off = 0;
++ buf_size = count;
++ blk_num = off / DQBLOCK_SIZE;
++ blk_off = off % DQBLOCK_SIZE;
++
++ while (buf_size > 0) {
++ off_t len;
++
++ len = min((size_t)(DQBLOCK_SIZE-blk_off), buf_size);
++ res = read_block(blk_num, tmp, tree, dqi, qtd->type);
++ if (res < 0)
++ goto out_err;
++ if (res == BLOCK_NOT_FOUND) {
++ *eof = 1;
++ break;
++ }
++ memcpy(page + buf_off, tmp + blk_off, len);
++
++ blk_num++;
++ buf_size -= len;
++ blk_off = 0;
++ buf_off += len;
++ }
++ res = buf_off;
++
++out_err:
++ *start = NULL + count;
++out_dq:
++ up(&qtd->qmblk->dq_sem);
++ up(&vz_quota_sem);
++ kfree(tmp);
++
++ return res;
++}
++
++
++/* ----------------------------------------------------------------------
++ *
++ * /proc/vz/vzaquota/QID/aquota.* files
++ *
++ * FIXME: this code lacks serialization of read/readdir/lseek.
++ * However, this problem should be fixed after the mainstream issue of what
++ * appears to be non-atomic read and update of file position in sys_read.
++ *
++ * --------------------------------------------------------------------- */
++
++static inline unsigned long vzdq_aquot_getino(dev_t dev)
++{
++ return 0xec000000UL + dev;
++}
++
++static inline dev_t vzdq_aquot_getidev(struct inode *inode)
++{
++ return (dev_t)(unsigned long)PROC_I(inode)->op.proc_get_link;
++}
++
++static inline void vzdq_aquot_setidev(struct inode *inode, dev_t dev)
++{
++ PROC_I(inode)->op.proc_get_link = (void *)(unsigned long)dev;
++}
++
++static ssize_t vzdq_aquotf_read(struct file *file,
++ char __user *buf, size_t size, loff_t *ppos)
++{
++ char *page;
++ size_t bufsize;
++ ssize_t l, l2, copied;
++ char *start;
++ struct inode *inode;
++ struct block_device *bdev;
++ struct super_block *sb;
++ struct quotatree_data data;
++ int eof, err;
++
++ err = -ENOMEM;
++ page = (char *)__get_free_page(GFP_KERNEL);
++ if (page == NULL)
++ goto out_err;
++
++ err = -ENODEV;
++ inode = file->f_dentry->d_inode;
++ bdev = bdget(vzdq_aquot_getidev(inode));
++ if (bdev == NULL)
++ goto out_err;
++ sb = get_super(bdev);
++ bdput(bdev);
++ if (sb == NULL)
++ goto out_err;
++ data.qmblk = vzquota_find_qmblk(sb);
++ data.type = PROC_I(inode)->type - 1;
++ drop_super(sb);
++ if (data.qmblk == NULL || data.qmblk == VZ_QUOTA_BAD)
++ goto out_err;
++
++ copied = 0;
++ l = l2 = 0;
++ while (1) {
++ bufsize = min(size, (size_t)PAGE_SIZE);
++ if (bufsize <= 0)
++ break;
++
++ l = read_proc_quotafile(page, &start, *ppos, bufsize,
++ &eof, &data);
++ if (l <= 0)
++ break;
++
++ l2 = copy_to_user(buf, page, l);
++ copied += l - l2;
++ if (l2)
++ break;
++
++ buf += l;
++ size -= l;
++ *ppos += (unsigned long)start;
++ l = l2 = 0;
++ }
++
++ qmblk_put(data.qmblk);
++ free_page((unsigned long)page);
++ if (copied)
++ return copied;
++ else if (l2) /* last copy_to_user failed */
++ return -EFAULT;
++ else /* read error or EOF */
++ return l;
++
++out_err:
++ if (page != NULL)
++ free_page((unsigned long)page);
++ return err;
++}
++
++static struct file_operations vzdq_aquotf_file_operations = {
++ .read = &vzdq_aquotf_read,
++};
++
++static struct inode_operations vzdq_aquotf_inode_operations = {
++};
++
++
++/* ----------------------------------------------------------------------
++ *
++ * /proc/vz/vzaquota/QID directory
++ *
++ * --------------------------------------------------------------------- */
++
++static int vzdq_aquotq_readdir(struct file *file, void *data, filldir_t filler)
++{
++ loff_t n;
++ int err;
++
++ n = file->f_pos;
++ for (err = 0; !err; n++) {
++ switch (n) {
++ case 0:
++ err = (*filler)(data, ".", 1, n,
++ file->f_dentry->d_inode->i_ino,
++ DT_DIR);
++ break;
++ case 1:
++ err = (*filler)(data, "..", 2, n,
++ parent_ino(file->f_dentry), DT_DIR);
++ break;
++ case 2:
++ err = (*filler)(data, "aquota.user", 11, n,
++ file->f_dentry->d_inode->i_ino
++ + USRQUOTA + 1,
++ DT_REG);
++ break;
++ case 3:
++ err = (*filler)(data, "aquota.group", 12, n,
++ file->f_dentry->d_inode->i_ino
++ + GRPQUOTA + 1,
++ DT_REG);
++ break;
++ default:
++ goto out;
++ }
++ }
++out:
++ file->f_pos = n;
++ return err;
++}
++
++struct vzdq_aquotq_lookdata {
++ dev_t dev;
++ int type;
++};
++
++static int vzdq_aquotq_looktest(struct inode *inode, void *data)
++{
++ struct vzdq_aquotq_lookdata *d;
++
++ d = data;
++ return inode->i_op == &vzdq_aquotf_inode_operations &&
++ vzdq_aquot_getidev(inode) == d->dev &&
++ PROC_I(inode)->type == d->type + 1;
++}
++
++static int vzdq_aquotq_lookset(struct inode *inode, void *data)
++{
++ struct vzdq_aquotq_lookdata *d;
++
++ d = data;
++ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
++ inode->i_ino = vzdq_aquot_getino(d->dev) + d->type + 1;
++ inode->i_mode = S_IFREG | S_IRUSR;
++ inode->i_uid = 0;
++ inode->i_gid = 0;
++ inode->i_nlink = 1;
++ inode->i_op = &vzdq_aquotf_inode_operations;
++ inode->i_fop = &vzdq_aquotf_file_operations;
++ PROC_I(inode)->type = d->type + 1;
++ vzdq_aquot_setidev(inode, d->dev);
++ return 0;
++}
++
++static struct dentry *vzdq_aquotq_lookup(struct inode *dir,
++ struct dentry *dentry,
++ struct nameidata *nd)
++{
++ struct inode *inode;
++ struct vzdq_aquotq_lookdata d;
++ int k;
++
++ if (dentry->d_name.len == 11) {
++ if (memcmp(dentry->d_name.name, "aquota.user", 11))
++ goto out;
++ k = USRQUOTA;
++ } else if (dentry->d_name.len == 12) {
++		if (memcmp(dentry->d_name.name, "aquota.group", 12))
++ goto out;
++ k = GRPQUOTA;
++ } else
++ goto out;
++ d.dev = vzdq_aquot_getidev(dir);
++ d.type = k;
++ inode = iget5_locked(dir->i_sb, dir->i_ino + k + 1,
++ vzdq_aquotq_looktest, vzdq_aquotq_lookset, &d);
++ if (inode == NULL)
++ goto out;
++ unlock_new_inode(inode);
++ d_add(dentry, inode);
++ return NULL;
++
++out:
++ return ERR_PTR(-ENOENT);
++}
++
++static struct file_operations vzdq_aquotq_file_operations = {
++ .read = &generic_read_dir,
++ .readdir = &vzdq_aquotq_readdir,
++};
++
++static struct inode_operations vzdq_aquotq_inode_operations = {
++ .lookup = &vzdq_aquotq_lookup,
++};
++
++
++/* ----------------------------------------------------------------------
++ *
++ * /proc/vz/vzaquota directory
++ *
++ * --------------------------------------------------------------------- */
++
++struct vzdq_aquot_de {
++ struct list_head list;
++ struct vfsmount *mnt;
++};
++
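++/*
++ * Walk the VE's mount tree starting from its root mount and collect one
++ * list entry per super block; the list is consumed by the readdir below.
++ */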
++static int vzdq_aquot_buildmntlist(struct ve_struct *ve,
++ struct list_head *head)
++{
++ struct vfsmount *rmnt, *mnt;
++ struct vzdq_aquot_de *p;
++ int err;
++
++#ifdef CONFIG_VE
++ rmnt = mntget(ve->fs_rootmnt);
++#else
++ read_lock(&current->fs->lock);
++ rmnt = mntget(current->fs->rootmnt);
++ read_unlock(&current->fs->lock);
++#endif
++ mnt = rmnt;
++ spin_lock(&vfsmount_lock);
++ while (1) {
++ list_for_each_entry(p, head, list) {
++ if (p->mnt->mnt_sb == mnt->mnt_sb)
++ goto skip;
++ }
++
++ err = -ENOMEM;
++ p = kmalloc(sizeof(*p), GFP_KERNEL);
++ if (p == NULL)
++ goto out;
++ p->mnt = mntget(mnt);
++ list_add_tail(&p->list, head);
++
++skip:
++ err = 0;
++ if (list_empty(&mnt->mnt_mounts)) {
++ while (1) {
++ if (mnt == rmnt)
++ goto out;
++ if (mnt->mnt_child.next !=
++ &mnt->mnt_parent->mnt_mounts)
++ break;
++ mnt = mnt->mnt_parent;
++ }
++ mnt = list_entry(mnt->mnt_child.next,
++ struct vfsmount, mnt_child);
++ } else
++ mnt = list_entry(mnt->mnt_mounts.next,
++ struct vfsmount, mnt_child);
++ }
++out:
++ spin_unlock(&vfsmount_lock);
++ mntput(rmnt);
++ return err;
++}
++
++static void vzdq_aquot_releasemntlist(struct ve_struct *ve,
++ struct list_head *head)
++{
++ struct vzdq_aquot_de *p;
++
++ while (!list_empty(head)) {
++ p = list_entry(head->next, typeof(*p), list);
++ mntput(p->mnt);
++ list_del(&p->list);
++ kfree(p);
++ }
++}
++
++static int vzdq_aquotd_readdir(struct file *file, void *data, filldir_t filler)
++{
++ struct ve_struct *ve, *old_ve;
++ struct list_head mntlist;
++ struct vzdq_aquot_de *de;
++ struct super_block *sb;
++ struct vz_quota_master *qmblk;
++ loff_t i, n;
++ char buf[24];
++ int l, err;
++
++ i = 0;
++ n = file->f_pos;
++ ve = VE_OWNER_FSTYPE(file->f_dentry->d_sb->s_type);
++ old_ve = set_exec_env(ve);
++
++ INIT_LIST_HEAD(&mntlist);
++#ifdef CONFIG_VE
++ /*
++	 * The only reason for disabling readdir for the host system is that
++	 * this readdir can be slow and CPU consuming with a large number of VPSs
++ * (or just mount points).
++ */
++ err = ve_is_super(ve);
++#else
++ err = 0;
++#endif
++ if (!err) {
++ err = vzdq_aquot_buildmntlist(ve, &mntlist);
++ if (err)
++ goto out_err;
++ }
++
++ if (i >= n) {
++ if ((*filler)(data, ".", 1, i,
++ file->f_dentry->d_inode->i_ino, DT_DIR))
++ goto out_fill;
++ }
++ i++;
++
++ if (i >= n) {
++ if ((*filler)(data, "..", 2, i,
++ parent_ino(file->f_dentry), DT_DIR))
++ goto out_fill;
++ }
++ i++;
++
++ list_for_each_entry (de, &mntlist, list) {
++ sb = de->mnt->mnt_sb;
++#ifdef CONFIG_VE
++ if (get_device_perms_ve(S_IFBLK, sb->s_dev, FMODE_QUOTACTL))
++ continue;
++#endif
++ qmblk = vzquota_find_qmblk(sb);
++ if (qmblk == NULL || qmblk == VZ_QUOTA_BAD)
++ continue;
++
++ qmblk_put(qmblk);
++ i++;
++ if (i <= n)
++ continue;
++
++ l = sprintf(buf, "%08x", new_encode_dev(sb->s_dev));
++ if ((*filler)(data, buf, l, i - 1,
++ vzdq_aquot_getino(sb->s_dev), DT_DIR))
++ break;
++ }
++
++out_fill:
++ err = 0;
++ file->f_pos = i;
++out_err:
++ vzdq_aquot_releasemntlist(ve, &mntlist);
++ (void)set_exec_env(old_ve);
++ return err;
++}
++
++static int vzdq_aquotd_looktest(struct inode *inode, void *data)
++{
++ return inode->i_op == &vzdq_aquotq_inode_operations &&
++ vzdq_aquot_getidev(inode) == (dev_t)(unsigned long)data;
++}
++
++static int vzdq_aquotd_lookset(struct inode *inode, void *data)
++{
++ dev_t dev;
++
++ dev = (dev_t)(unsigned long)data;
++ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
++ inode->i_ino = vzdq_aquot_getino(dev);
++ inode->i_mode = S_IFDIR | S_IRUSR | S_IXUSR;
++ inode->i_uid = 0;
++ inode->i_gid = 0;
++ inode->i_nlink = 2;
++ inode->i_op = &vzdq_aquotq_inode_operations;
++ inode->i_fop = &vzdq_aquotq_file_operations;
++ vzdq_aquot_setidev(inode, dev);
++ return 0;
++}
++
++static struct dentry *vzdq_aquotd_lookup(struct inode *dir,
++ struct dentry *dentry,
++ struct nameidata *nd)
++{
++ struct ve_struct *ve, *old_ve;
++ const unsigned char *s;
++ int l;
++ dev_t dev;
++ struct inode *inode;
++
++ ve = VE_OWNER_FSTYPE(dir->i_sb->s_type);
++ old_ve = set_exec_env(ve);
++#ifdef CONFIG_VE
++ /*
++	 * Lookup is much lighter than readdir, so it could be allowed for the
++	 * host system. But it would be strange to allow lookup without
++	 * readdir...
++ */
++ if (ve_is_super(ve))
++ goto out;
++#endif
++
++ dev = 0;
++ l = dentry->d_name.len;
++ if (l <= 0)
++ goto out;
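++	/* parse the directory name as a hexadecimal device number */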
++ for (s = dentry->d_name.name; l > 0; s++, l--) {
++ if (!isxdigit(*s))
++ goto out;
++ if (dev & ~(~0UL >> 4))
++ goto out;
++ dev <<= 4;
++ if (isdigit(*s))
++ dev += *s - '0';
++ else if (islower(*s))
++ dev += *s - 'a' + 10;
++ else
++ dev += *s - 'A' + 10;
++ }
++ dev = new_decode_dev(dev);
++
++#ifdef CONFIG_VE
++ if (get_device_perms_ve(S_IFBLK, dev, FMODE_QUOTACTL))
++ goto out;
++#endif
++
++ inode = iget5_locked(dir->i_sb, vzdq_aquot_getino(dev),
++ vzdq_aquotd_looktest, vzdq_aquotd_lookset,
++ (void *)(unsigned long)dev);
++ if (inode == NULL)
++ goto out;
++ unlock_new_inode(inode);
++
++ d_add(dentry, inode);
++ (void)set_exec_env(old_ve);
++ return NULL;
++
++out:
++ (void)set_exec_env(old_ve);
++ return ERR_PTR(-ENOENT);
++}
++
++static struct file_operations vzdq_aquotd_file_operations = {
++ .read = &generic_read_dir,
++ .readdir = &vzdq_aquotd_readdir,
++};
++
++static struct inode_operations vzdq_aquotd_inode_operations = {
++ .lookup = &vzdq_aquotd_lookup,
++};
++
++
++/* ----------------------------------------------------------------------
++ *
++ * Initialization and deinitialization
++ *
++ * --------------------------------------------------------------------- */
++
++/*
++ * FIXME: creation of proc entries here is unsafe with respect to module
++ * unloading.
++ */
++void vzaquota_init(void)
++{
++ struct proc_dir_entry *de;
++
++ de = create_proc_glob_entry("vz/vzaquota",
++ S_IFDIR | S_IRUSR | S_IXUSR, NULL);
++ if (de != NULL) {
++ de->proc_iops = &vzdq_aquotd_inode_operations;
++ de->proc_fops = &vzdq_aquotd_file_operations;
++ } else
++ printk("VZDQ: vz/vzaquota creation failed\n");
++#if defined(CONFIG_SYSCTL)
++ de = create_proc_glob_entry("sys/fs/quota",
++ S_IFDIR | S_IRUSR | S_IXUSR, NULL);
++ if (de == NULL)
++ printk("VZDQ: sys/fs/quota creation failed\n");
++#endif
++}
++
++void vzaquota_fini(void)
++{
++}
+diff -upr linux-2.6.16.orig/fs/vzdq_mgmt.c linux-2.6.16-026test015/fs/vzdq_mgmt.c
+--- linux-2.6.16.orig/fs/vzdq_mgmt.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/fs/vzdq_mgmt.c 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,735 @@
++/*
++ * Copyright (C) 2001, 2002, 2004, 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ */
++
++#include <linux/config.h>
++#include <linux/kernel.h>
++#include <linux/string.h>
++#include <linux/list.h>
++#include <asm/semaphore.h>
++#include <linux/sched.h>
++#include <linux/fs.h>
++#include <linux/dcache.h>
++#include <linux/mount.h>
++#include <linux/namei.h>
++#include <linux/writeback.h>
++#include <linux/gfp.h>
++#include <asm/uaccess.h>
++#include <linux/proc_fs.h>
++#include <linux/quota.h>
++#include <linux/vzctl_quota.h>
++#include <linux/vzquota.h>
++
++
++/* ----------------------------------------------------------------------
++ * Switching quota on.
++ * --------------------------------------------------------------------- */
++
++/*
++ * check limits copied from user
++ */
++int vzquota_check_sane_limits(struct dq_stat *qstat)
++{
++ int err;
++
++ err = -EINVAL;
++
++	/* softlimit must not exceed hardlimit */
++ if (qstat->bsoftlimit > qstat->bhardlimit)
++ goto out;
++
++ if (qstat->isoftlimit > qstat->ihardlimit)
++ goto out;
++
++ err = 0;
++out:
++ return err;
++}
++
++/*
++ * check usage values copied from user
++ */
++int vzquota_check_sane_values(struct dq_stat *qstat)
++{
++ int err;
++
++ err = -EINVAL;
++
++ /* expiration time must not be set if softlimit was not exceeded */
++ if (qstat->bcurrent < qstat->bsoftlimit && qstat->btime != (time_t)0)
++ goto out;
++
++ if (qstat->icurrent < qstat->isoftlimit && qstat->itime != (time_t)0)
++ goto out;
++
++ err = vzquota_check_sane_limits(qstat);
++out:
++ return err;
++}
++
++/*
++ * create new quota master block
++ * this function should:
++ * - copy limits and usage parameters from user buffer;
++ * - allocate and initialize the quota block and insert it into the hash;
++ */
++static int vzquota_create(unsigned int quota_id, struct vz_quota_stat *u_qstat)
++{
++ int err;
++ struct vz_quota_stat qstat;
++ struct vz_quota_master *qmblk;
++
++ down(&vz_quota_sem);
++
++ err = -EFAULT;
++ if (copy_from_user(&qstat, u_qstat, sizeof(qstat)))
++ goto out;
++
++ err = -EINVAL;
++ if (quota_id == 0)
++ goto out;
++
++ if (vzquota_check_sane_values(&qstat.dq_stat))
++ goto out;
++ err = 0;
++ qmblk = vzquota_alloc_master(quota_id, &qstat);
++
++ if (IS_ERR(qmblk)) /* ENOMEM or EEXIST */
++ err = PTR_ERR(qmblk);
++out:
++ up(&vz_quota_sem);
++
++ return err;
++}
++
++/**
++ * vzquota_on - turn quota on
++ *
++ * This function should:
++ * - find and take a reference on the directory entry for the quota root and
++ *   the corresponding mountpoint;
++ * - find corresponding quota block and mark it with given path;
++ * - check quota tree;
++ * - initialize quota for the tree root.
++ */
++static int vzquota_on(unsigned int quota_id, const char *quota_root)
++{
++ int err;
++ struct nameidata nd;
++ struct vz_quota_master *qmblk;
++ struct super_block *dqsb;
++
++ dqsb = NULL;
++ down(&vz_quota_sem);
++
++ err = -ENOENT;
++ qmblk = vzquota_find_master(quota_id);
++ if (qmblk == NULL)
++ goto out;
++
++ err = -EBUSY;
++ if (qmblk->dq_state != VZDQ_STARTING)
++ goto out;
++
++ err = user_path_walk(quota_root, &nd);
++ if (err)
++ goto out;
++ /* init path must be a directory */
++ err = -ENOTDIR;
++ if (!S_ISDIR(nd.dentry->d_inode->i_mode))
++ goto out_path;
++
++ qmblk->dq_root_dentry = nd.dentry;
++ qmblk->dq_root_mnt = nd.mnt;
++ qmblk->dq_sb = nd.dentry->d_inode->i_sb;
++ err = vzquota_get_super(qmblk->dq_sb);
++ if (err)
++ goto out_super;
++
++ /*
++ * Serialization with quota initialization and operations is performed
++ * through generation check: generation is memorized before qmblk is
++ * found and compared under inode_qmblk_lock with assignment.
++ *
++ * Note that the dentry tree is shrunk only for high-level logical
++	 * serialization, purely as a courtesy to the user: to get consistent
++	 * quota statistics, files should be closed, etc., when quota is
++	 * switched on.
++ */
++ err = vzquota_on_qmblk(qmblk->dq_sb, qmblk->dq_root_dentry->d_inode,
++ qmblk);
++ if (err)
++ goto out_init;
++ qmblk->dq_state = VZDQ_WORKING;
++
++ up(&vz_quota_sem);
++ return 0;
++
++out_init:
++ dqsb = qmblk->dq_sb;
++out_super:
++ /* clear for qmblk_put/quota_free_master */
++ qmblk->dq_sb = NULL;
++ qmblk->dq_root_dentry = NULL;
++ qmblk->dq_root_mnt = NULL;
++out_path:
++ path_release(&nd);
++out:
++ if (dqsb)
++ vzquota_put_super(dqsb);
++ up(&vz_quota_sem);
++ return err;
++}
++
++
++/* ----------------------------------------------------------------------
++ * Switching quota off.
++ * --------------------------------------------------------------------- */
++
++/*
++ * destroy quota block by ID
++ */
++static int vzquota_destroy(unsigned int quota_id)
++{
++ int err;
++ struct vz_quota_master *qmblk;
++ struct dentry *dentry;
++ struct vfsmount *mnt;
++
++ down(&vz_quota_sem);
++
++ err = -ENOENT;
++ qmblk = vzquota_find_master(quota_id);
++ if (qmblk == NULL)
++ goto out;
++
++ err = -EBUSY;
++ if (qmblk->dq_state == VZDQ_WORKING)
++ goto out; /* quota_off first */
++
++ list_del_init(&qmblk->dq_hash);
++ dentry = qmblk->dq_root_dentry;
++ qmblk->dq_root_dentry = NULL;
++ mnt = qmblk->dq_root_mnt;
++ qmblk->dq_root_mnt = NULL;
++
++ if (qmblk->dq_sb)
++ vzquota_put_super(qmblk->dq_sb);
++ up(&vz_quota_sem);
++
++ qmblk_put(qmblk);
++ dput(dentry);
++ mntput(mnt);
++ return 0;
++
++out:
++ up(&vz_quota_sem);
++ return err;
++}
++
++/**
++ * vzquota_off - turn quota off
++ */
++
++static int __vzquota_sync_list(struct list_head *lh,
++ struct vz_quota_master *qmblk,
++ enum writeback_sync_modes sync_mode)
++{
++ struct writeback_control wbc;
++ LIST_HEAD(list);
++ struct vz_quota_ilink *qlnk;
++ struct inode *inode;
++ int err;
++
++ memset(&wbc, 0, sizeof(wbc));
++ wbc.sync_mode = sync_mode;
++
++ err = 0;
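++	/* write back each inode linked to this quota block, dropping
++	 * inode_qmblk_lock around sync_inode() */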
++ while (!list_empty(lh) && !err) {
++ if (need_resched()) {
++ inode_qmblk_unlock(qmblk->dq_sb);
++ schedule();
++ inode_qmblk_lock(qmblk->dq_sb);
++ }
++
++ qlnk = list_first_entry(lh, struct vz_quota_ilink, list);
++ list_move(&qlnk->list, &list);
++
++ inode = igrab(QLNK_INODE(qlnk));
++ if (!inode)
++ continue;
++
++ inode_qmblk_unlock(qmblk->dq_sb);
++
++ wbc.nr_to_write = LONG_MAX;
++ err = sync_inode(inode, &wbc);
++ iput(inode);
++
++ inode_qmblk_lock(qmblk->dq_sb);
++ }
++
++ list_splice(&list, lh);
++ return err;
++}
++
++static int vzquota_sync_list(struct list_head *lh,
++ struct vz_quota_master *qmblk)
++{
++ int err;
++
++ err = __vzquota_sync_list(lh, qmblk, WB_SYNC_NONE);
++ if (err)
++ return err;
++
++ err = __vzquota_sync_list(lh, qmblk, WB_SYNC_ALL);
++ if (err)
++ return err;
++
++ return 0;
++}
++
++static int vzquota_sync_inodes(struct vz_quota_master *qmblk)
++{
++ int err;
++ LIST_HEAD(qlnk_list);
++
++ list_splice_init(&qmblk->dq_ilink_list, &qlnk_list);
++ err = vzquota_sync_list(&qlnk_list, qmblk);
++ if (!err && !list_empty(&qmblk->dq_ilink_list))
++ err = -EBUSY;
++ list_splice(&qlnk_list, &qmblk->dq_ilink_list);
++
++ return err;
++}
++
++static int vzquota_off(unsigned int quota_id)
++{
++ int err;
++ struct vz_quota_master *qmblk;
++
++ down(&vz_quota_sem);
++
++ err = -ENOENT;
++ qmblk = vzquota_find_master(quota_id);
++ if (qmblk == NULL)
++ goto out;
++
++ err = -EALREADY;
++ if (qmblk->dq_state != VZDQ_WORKING)
++ goto out;
++
++ inode_qmblk_lock(qmblk->dq_sb); /* protects dq_ilink_list also */
++ err = vzquota_sync_inodes(qmblk);
++ if (err)
++ goto out_unlock;
++ inode_qmblk_unlock(qmblk->dq_sb);
++
++ err = vzquota_off_qmblk(qmblk->dq_sb, qmblk);
++ if (err)
++ goto out;
++
++ /* vzquota_destroy will free resources */
++ qmblk->dq_state = VZDQ_STOPING;
++out:
++ up(&vz_quota_sem);
++
++ return err;
++
++out_unlock:
++ inode_qmblk_unlock(qmblk->dq_sb);
++ goto out;
++}
++
++
++/* ----------------------------------------------------------------------
++ * Other VZQUOTA ioctl's.
++ * --------------------------------------------------------------------- */
++
++/*
++ * this function should:
++ * - set new limits/buffer under quota master block lock
++ * - if the new softlimit is less than the usage, set the expiration time
++ * - no need to alloc ugid hash table - we'll do that on demand
++ */
++int vzquota_update_limit(struct dq_stat *_qstat,
++ struct dq_stat *qstat)
++{
++ int err;
++
++ err = -EINVAL;
++ if (vzquota_check_sane_limits(qstat))
++ goto out;
++
++ err = 0;
++
++ /* limits */
++ _qstat->bsoftlimit = qstat->bsoftlimit;
++ _qstat->bhardlimit = qstat->bhardlimit;
++ /*
++ * If the soft limit is exceeded, administrator can override the moment
++ * when the grace period for limit exceeding ends.
++ * Specifying the moment may be useful if the soft limit is set to be
++ * lower than the current usage. In the latter case, if the grace
++ * period end isn't specified, the grace period will start from the
++ * moment of the first write operation.
++ * There is a race with the user level. Soft limit may be already
++ * exceeded before the limit change, and grace period end calculated by
++	 * the kernel will be overridden. User level may check whether the limit
++	 * is already exceeded, but the check and set calls are not atomic.
++	 * This race isn't dangerous. Under normal circumstances, the
++	 * difference between the grace period end calculated by the kernel and
++	 * by the user level should not be greater than the difference between
++	 * the moments of the check and set calls, i.e. not bigger than the quota
++ * timer resolution - 1 sec.
++ */
++ if (qstat->btime != (time_t)0 &&
++ _qstat->bcurrent >= _qstat->bsoftlimit)
++ _qstat->btime = qstat->btime;
++
++ _qstat->isoftlimit = qstat->isoftlimit;
++ _qstat->ihardlimit = qstat->ihardlimit;
++ if (qstat->itime != (time_t)0 &&
++ _qstat->icurrent >= _qstat->isoftlimit)
++ _qstat->itime = qstat->itime;
++
++out:
++ return err;
++}
++
++/*
++ * set new quota limits.
++ * this function should:
++ * - copy new limits from user level;
++ * - find quota block
++ * - set new limits and flags.
++ */
++static int vzquota_setlimit(unsigned int quota_id,
++ struct vz_quota_stat *u_qstat)
++{
++ int err;
++ struct vz_quota_stat qstat;
++ struct vz_quota_master *qmblk;
++
++ down(&vz_quota_sem); /* for hash list protection */
++
++ err = -ENOENT;
++ qmblk = vzquota_find_master(quota_id);
++ if (qmblk == NULL)
++ goto out;
++
++ err = -EFAULT;
++ if (copy_from_user(&qstat, u_qstat, sizeof(qstat)))
++ goto out;
++
++ qmblk_data_write_lock(qmblk);
++ err = vzquota_update_limit(&qmblk->dq_stat, &qstat.dq_stat);
++ if (err == 0)
++ qmblk->dq_info = qstat.dq_info;
++ qmblk_data_write_unlock(qmblk);
++
++out:
++ up(&vz_quota_sem);
++ return err;
++}
++
++/*
++ * get quota limits.
++ * very simple - just return stat buffer to user
++ */
++static int vzquota_getstat(unsigned int quota_id,
++ struct vz_quota_stat *u_qstat)
++{
++ int err;
++ struct vz_quota_stat qstat;
++ struct vz_quota_master *qmblk;
++
++ down(&vz_quota_sem);
++
++ err = -ENOENT;
++ qmblk = vzquota_find_master(quota_id);
++ if (qmblk == NULL)
++ goto out;
++
++ qmblk_data_read_lock(qmblk);
++ /* copy whole buffer under lock */
++ memcpy(&qstat.dq_stat, &qmblk->dq_stat, sizeof(qstat.dq_stat));
++ memcpy(&qstat.dq_info, &qmblk->dq_info, sizeof(qstat.dq_info));
++ qmblk_data_read_unlock(qmblk);
++
++ err = copy_to_user(u_qstat, &qstat, sizeof(qstat));
++ if (err)
++ err = -EFAULT;
++
++out:
++ up(&vz_quota_sem);
++ return err;
++}
++
++/*
++ * This is a system call to turn per-VE disk quota on.
++ * Note this call is allowed to run ONLY from VE0
++ */
++long do_vzquotactl(int cmd, unsigned int quota_id,
++ struct vz_quota_stat *qstat, const char *ve_root)
++{
++ int ret;
++
++ ret = -EPERM;
++ /* access allowed only from root of VE0 */
++ if (!capable(CAP_SYS_RESOURCE) ||
++ !capable(CAP_SYS_ADMIN))
++ goto out;
++
++ switch (cmd) {
++ case VZ_DQ_CREATE:
++ ret = vzquota_create(quota_id, qstat);
++ break;
++ case VZ_DQ_DESTROY:
++ ret = vzquota_destroy(quota_id);
++ break;
++ case VZ_DQ_ON:
++ ret = vzquota_on(quota_id, ve_root);
++ break;
++ case VZ_DQ_OFF:
++ ret = vzquota_off(quota_id);
++ break;
++ case VZ_DQ_SETLIMIT:
++ ret = vzquota_setlimit(quota_id, qstat);
++ break;
++ case VZ_DQ_GETSTAT:
++ ret = vzquota_getstat(quota_id, qstat);
++ break;
++
++ default:
++ ret = -EINVAL;
++ goto out;
++ }
++
++out:
++ return ret;
++}
++
++
++/* ----------------------------------------------------------------------
++ * Proc filesystem routines
++ * ---------------------------------------------------------------------*/
++
++#if defined(CONFIG_PROC_FS)
++
++#define QUOTA_UINT_LEN 15
++#define QUOTA_TIME_LEN_FMT_UINT "%11u"
++#define QUOTA_NUM_LEN_FMT_UINT "%15u"
++#define QUOTA_NUM_LEN_FMT_ULL "%15Lu"
++#define QUOTA_TIME_LEN_FMT_STR "%11s"
++#define QUOTA_NUM_LEN_FMT_STR "%15s"
++#define QUOTA_PROC_MAX_LINE_LEN 2048
++
++/*
++ * prints /proc/ve_dq header line
++ */
++static int print_proc_header(char * buffer)
++{
++ return sprintf(buffer,
++ "%-11s"
++ QUOTA_NUM_LEN_FMT_STR
++ QUOTA_NUM_LEN_FMT_STR
++ QUOTA_NUM_LEN_FMT_STR
++ QUOTA_TIME_LEN_FMT_STR
++ QUOTA_TIME_LEN_FMT_STR
++ "\n",
++ "qid: path",
++ "usage", "softlimit", "hardlimit", "time", "expire");
++}
++
++/*
++ * prints proc master record id, dentry path
++ */
++static int print_proc_master_id(char * buffer, char * path_buf,
++ struct vz_quota_master * qp)
++{
++ char *path;
++ int over;
++
++ path = NULL;
++ switch (qp->dq_state) {
++ case VZDQ_WORKING:
++ if (!path_buf) {
++ path = "";
++ break;
++ }
++ path = d_path(qp->dq_root_dentry,
++ qp->dq_root_mnt, path_buf, PAGE_SIZE);
++ if (IS_ERR(path)) {
++ path = "";
++ break;
++ }
++ /* do not print large path, truncate it */
++ over = strlen(path) -
++ (QUOTA_PROC_MAX_LINE_LEN - 3 - 3 -
++ QUOTA_UINT_LEN);
++ if (over > 0) {
++ path += over - 3;
++			path[0] = path[1] = path[2] = '.';
++ }
++ break;
++ case VZDQ_STARTING:
++ path = "-- started --";
++ break;
++ case VZDQ_STOPING:
++ path = "-- stopped --";
++ break;
++ }
++
++ return sprintf(buffer, "%u: %s\n", qp->dq_id, path);
++}
++
++/*
++ * prints struct vz_quota_stat data
++ */
++static int print_proc_stat(char * buffer, struct dq_stat *qs,
++ struct dq_info *qi)
++{
++ return sprintf(buffer,
++ "%11s"
++ QUOTA_NUM_LEN_FMT_ULL
++ QUOTA_NUM_LEN_FMT_ULL
++ QUOTA_NUM_LEN_FMT_ULL
++ QUOTA_TIME_LEN_FMT_UINT
++ QUOTA_TIME_LEN_FMT_UINT
++ "\n"
++ "%11s"
++ QUOTA_NUM_LEN_FMT_UINT
++ QUOTA_NUM_LEN_FMT_UINT
++ QUOTA_NUM_LEN_FMT_UINT
++ QUOTA_TIME_LEN_FMT_UINT
++ QUOTA_TIME_LEN_FMT_UINT
++ "\n",
++ "1k-blocks",
++ qs->bcurrent >> 10,
++ qs->bsoftlimit >> 10,
++ qs->bhardlimit >> 10,
++ (unsigned int)qs->btime,
++ (unsigned int)qi->bexpire,
++ "inodes",
++ qs->icurrent,
++ qs->isoftlimit,
++ qs->ihardlimit,
++ (unsigned int)qs->itime,
++ (unsigned int)qi->iexpire);
++}
++
++
++/*
++ * for /proc filesystem output
++ */
++static int vzquota_read_proc(char *page, char **start, off_t off, int count,
++ int *eof, void *data)
++{
++ int len, i;
++ off_t printed = 0;
++ char *p = page;
++ struct vz_quota_master *qp;
++ struct vz_quota_ilink *ql2;
++ struct list_head *listp;
++ char *path_buf;
++
++ path_buf = (char*)__get_free_page(GFP_KERNEL);
++ if (path_buf == NULL)
++ return -ENOMEM;
++
++ len = print_proc_header(p);
++ printed += len;
++ if (off < printed) /* keep header in output */ {
++ *start = p + off;
++ p += len;
++ }
++
++ down(&vz_quota_sem);
++
++ /* traverse master hash table for all records */
++ for (i = 0; i < vzquota_hash_size; i++) {
++ list_for_each(listp, &vzquota_hash_table[i]) {
++ qp = list_entry(listp,
++ struct vz_quota_master, dq_hash);
++
++ /* Skip other VE's information if not root of VE0 */
++ if ((!capable(CAP_SYS_ADMIN) ||
++ !capable(CAP_SYS_RESOURCE))) {
++ ql2 = INODE_QLNK(current->fs->root->d_inode);
++ if (ql2 == NULL || qp != ql2->qmblk)
++ continue;
++ }
++ /*
++ * Now print the next record
++ */
++ len = 0;
++ /* we print quotaid and path only in VE0 */
++ if (capable(CAP_SYS_ADMIN))
++ len += print_proc_master_id(p+len,path_buf, qp);
++ len += print_proc_stat(p+len, &qp->dq_stat,
++ &qp->dq_info);
++ printed += len;
++ /* skip unnecessary lines */
++ if (printed <= off)
++ continue;
++ p += len;
++ /* provide start offset */
++ if (*start == NULL)
++ *start = p + (off - printed);
++ /* have we printed all requested size? */
++ if (PAGE_SIZE - (p - page) < QUOTA_PROC_MAX_LINE_LEN ||
++ (p - *start) >= count)
++ goto out;
++ }
++ }
++
++ *eof = 1; /* checked all hash */
++out:
++ up(&vz_quota_sem);
++
++ len = 0;
++ if (*start != NULL) {
++ len = (p - *start);
++ if (len > count)
++ len = count;
++ }
++
++ if (path_buf)
++ free_page((unsigned long) path_buf);
++
++ return len;
++}
++
++/*
++ * Register procfs read callback
++ */
++int vzquota_proc_init(void)
++{
++ struct proc_dir_entry *de;
++
++ de = create_proc_entry("vz/vzquota", S_IFREG|S_IRUSR, NULL);
++ if (de == NULL) {
++ /* create "vz" subdirectory, if not exist */
++ de = create_proc_entry("vz", S_IFDIR|S_IRUGO|S_IXUGO, NULL);
++ if (de == NULL)
++ goto out_err;
++ de = create_proc_entry("vzquota", S_IFREG|S_IRUSR, de);
++ if (de == NULL)
++ goto out_err;
++ }
++ de->read_proc = vzquota_read_proc;
++ de->data = NULL;
++ return 0;
++out_err:
++ return -EBUSY;
++}
++
++void vzquota_proc_release(void)
++{
++ /* Unregister procfs read callback */
++ remove_proc_entry("vz/vzquota", NULL);
++}
++
++#endif
+diff -upr linux-2.6.16.orig/fs/vzdq_ops.c linux-2.6.16-026test015/fs/vzdq_ops.c
+--- linux-2.6.16.orig/fs/vzdq_ops.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/fs/vzdq_ops.c 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,565 @@
++/*
++ * Copyright (C) 2001, 2002, 2004, 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ */
++
++#include <linux/config.h>
++#include <linux/kernel.h>
++#include <linux/types.h>
++#include <asm/semaphore.h>
++#include <linux/sched.h>
++#include <linux/fs.h>
++#include <linux/quota.h>
++#include <linux/vzquota.h>
++
++
++/* ----------------------------------------------------------------------
++ * Quota superblock operations - helper functions.
++ * --------------------------------------------------------------------- */
++
++static inline void vzquota_incr_inodes(struct dq_stat *dqstat,
++ unsigned long number)
++{
++ dqstat->icurrent += number;
++}
++
++static inline void vzquota_incr_space(struct dq_stat *dqstat,
++ __u64 number)
++{
++ dqstat->bcurrent += number;
++}
++
++static inline void vzquota_decr_inodes(struct dq_stat *dqstat,
++ unsigned long number)
++{
++ if (dqstat->icurrent > number)
++ dqstat->icurrent -= number;
++ else
++ dqstat->icurrent = 0;
++ if (dqstat->icurrent < dqstat->isoftlimit)
++ dqstat->itime = (time_t) 0;
++}
++
++static inline void vzquota_decr_space(struct dq_stat *dqstat,
++ __u64 number)
++{
++ if (dqstat->bcurrent > number)
++ dqstat->bcurrent -= number;
++ else
++ dqstat->bcurrent = 0;
++ if (dqstat->bcurrent < dqstat->bsoftlimit)
++ dqstat->btime = (time_t) 0;
++}
++
++/*
++ * a better printk() message would help here, or a /proc/vzquotamsg
++ * interface similar to /proc/kmsg
++ */
++static inline void vzquota_warn(struct dq_info *dq_info, int dq_id, int flag,
++ const char *fmt)
++{
++ if (dq_info->flags & flag) /* warning already printed for this
++ masterblock */
++ return;
++ printk(fmt, dq_id);
++ dq_info->flags |= flag;
++}
++
++/*
++ * ignore_hardlimit -
++ *
++ * Intended to allow the superuser of VE0 to override hardlimits.
++ *
++ * ignore_hardlimit() has a very bad feature:
++ *
++ * a writepage() operation on a writable mapping of a file with holes
++ * may trigger get_block() with the wrong current task and, as a consequence,
++ * open a possibility to overcommit hardlimits
++ */
++/* for the reason above, it is disabled now */
++static inline int ignore_hardlimit(struct dq_info *dqstat)
++{
++#if 0
++ return ve_is_super(get_exec_env()) &&
++ capable(CAP_SYS_RESOURCE) &&
++ (dqstat->options & VZ_QUOTA_OPT_RSQUASH);
++#else
++ return 0;
++#endif
++}
++
++static int vzquota_check_inodes(struct dq_info *dq_info,
++ struct dq_stat *dqstat,
++ unsigned long number, int dq_id)
++{
++ if (number == 0)
++ return QUOTA_OK;
++
++ if (dqstat->icurrent + number > dqstat->ihardlimit &&
++ !ignore_hardlimit(dq_info)) {
++ vzquota_warn(dq_info, dq_id, VZ_QUOTA_INODES,
++ "VZ QUOTA: file hardlimit reached for id=%d\n");
++ return NO_QUOTA;
++ }
++
++ if (dqstat->icurrent + number > dqstat->isoftlimit) {
++ if (dqstat->itime == (time_t)0) {
++ vzquota_warn(dq_info, dq_id, 0,
++ "VZ QUOTA: file softlimit exceeded "
++ "for id=%d\n");
++ dqstat->itime = CURRENT_TIME_SECONDS +
++ dq_info->iexpire;
++ } else if (CURRENT_TIME_SECONDS >= dqstat->itime &&
++ !ignore_hardlimit(dq_info)) {
++ vzquota_warn(dq_info, dq_id, VZ_QUOTA_INODES,
++ "VZ QUOTA: file softlimit expired "
++ "for id=%d\n");
++ return NO_QUOTA;
++ }
++ }
++
++ return QUOTA_OK;
++}
++
++static int vzquota_check_space(struct dq_info *dq_info,
++ struct dq_stat *dqstat,
++ __u64 number, int dq_id, char prealloc)
++{
++ if (number == 0)
++ return QUOTA_OK;
++
++ if (dqstat->bcurrent + number > dqstat->bhardlimit &&
++ !ignore_hardlimit(dq_info)) {
++ if (!prealloc)
++ vzquota_warn(dq_info, dq_id, VZ_QUOTA_SPACE,
++ "VZ QUOTA: disk hardlimit reached "
++ "for id=%d\n");
++ return NO_QUOTA;
++ }
++
++ if (dqstat->bcurrent + number > dqstat->bsoftlimit) {
++ if (dqstat->btime == (time_t)0) {
++ if (!prealloc) {
++ vzquota_warn(dq_info, dq_id, 0,
++ "VZ QUOTA: disk softlimit exceeded "
++ "for id=%d\n");
++ dqstat->btime = CURRENT_TIME_SECONDS
++ + dq_info->bexpire;
++ } else {
++ /*
++ * Original Linux quota doesn't allow
++ * preallocation to exceed softlimit so
++ * exceeding will be always printed
++				 * exceeding will always be printed
++ return NO_QUOTA;
++ }
++ } else if (CURRENT_TIME_SECONDS >= dqstat->btime &&
++ !ignore_hardlimit(dq_info)) {
++ if (!prealloc)
++ vzquota_warn(dq_info, dq_id, VZ_QUOTA_SPACE,
++ "VZ QUOTA: disk quota "
++ "softlimit expired "
++ "for id=%d\n");
++ return NO_QUOTA;
++ }
++ }
++
++ return QUOTA_OK;
++}
++
++static int vzquota_check_ugid_inodes(struct vz_quota_master *qmblk,
++ struct vz_quota_ugid *qugid[],
++ int type, unsigned long number)
++{
++ struct dq_info *dqinfo;
++ struct dq_stat *dqstat;
++
++ if (qugid[type] == NULL)
++ return QUOTA_OK;
++ if (qugid[type] == VZ_QUOTA_UGBAD)
++ return NO_QUOTA;
++
++ if (type == USRQUOTA && !(qmblk->dq_flags & VZDQ_USRQUOTA))
++ return QUOTA_OK;
++ if (type == GRPQUOTA && !(qmblk->dq_flags & VZDQ_GRPQUOTA))
++ return QUOTA_OK;
++ if (number == 0)
++ return QUOTA_OK;
++
++ dqinfo = &qmblk->dq_ugid_info[type];
++ dqstat = &qugid[type]->qugid_stat;
++
++ if (dqstat->ihardlimit != 0 &&
++ dqstat->icurrent + number > dqstat->ihardlimit)
++ return NO_QUOTA;
++
++ if (dqstat->isoftlimit != 0 &&
++ dqstat->icurrent + number > dqstat->isoftlimit) {
++ if (dqstat->itime == (time_t)0)
++ dqstat->itime = CURRENT_TIME_SECONDS +
++ dqinfo->iexpire;
++ else if (CURRENT_TIME_SECONDS >= dqstat->itime)
++ return NO_QUOTA;
++ }
++
++ return QUOTA_OK;
++}
++
++static int vzquota_check_ugid_space(struct vz_quota_master *qmblk,
++ struct vz_quota_ugid *qugid[],
++ int type, __u64 number, char prealloc)
++{
++ struct dq_info *dqinfo;
++ struct dq_stat *dqstat;
++
++ if (qugid[type] == NULL)
++ return QUOTA_OK;
++ if (qugid[type] == VZ_QUOTA_UGBAD)
++ return NO_QUOTA;
++
++ if (type == USRQUOTA && !(qmblk->dq_flags & VZDQ_USRQUOTA))
++ return QUOTA_OK;
++ if (type == GRPQUOTA && !(qmblk->dq_flags & VZDQ_GRPQUOTA))
++ return QUOTA_OK;
++ if (number == 0)
++ return QUOTA_OK;
++
++ dqinfo = &qmblk->dq_ugid_info[type];
++ dqstat = &qugid[type]->qugid_stat;
++
++ if (dqstat->bhardlimit != 0 &&
++ dqstat->bcurrent + number > dqstat->bhardlimit)
++ return NO_QUOTA;
++
++ if (dqstat->bsoftlimit != 0 &&
++ dqstat->bcurrent + number > dqstat->bsoftlimit) {
++ if (dqstat->btime == (time_t)0) {
++ if (!prealloc)
++ dqstat->btime = CURRENT_TIME_SECONDS
++ + dqinfo->bexpire;
++ else
++ /*
++ * Original Linux quota doesn't allow
++ * preallocation to exceed softlimit so
++				 * exceeding will always be printed
++ */
++ return NO_QUOTA;
++ } else if (CURRENT_TIME_SECONDS >= dqstat->btime)
++ return NO_QUOTA;
++ }
++
++ return QUOTA_OK;
++}
++
++/* ----------------------------------------------------------------------
++ * Quota superblock operations
++ * --------------------------------------------------------------------- */
++
++/*
++ * S_NOQUOTA note.
++ * In the current kernel (2.6.8.1), S_NOQUOTA flag is set only for
++ * - quota file (absent in our case)
++ * - after explicit DQUOT_DROP (earlier than clear_inode) in functions like
++ * filesystem-specific new_inode, before the inode gets outside links.
++ * For the latter case, the only quota operation where care about S_NOQUOTA
++ * might be required is vzquota_drop, but there S_NOQUOTA has already been
++ * checked in DQUOT_DROP().
++ * So, S_NOQUOTA may be ignored for now in the VZDQ code.
++ *
++ * The above note is not entirely correct.
++ * Both for ext2 and ext3 filesystems, DQUOT_FREE_INODE is called from
++ * delete_inode if new_inode fails (for example, because of inode quota
++ * limits), so S_NOQUOTA check is needed in free_inode.
++ * This seems to be a dark corner of the current quota API.
++ */
++
++/*
++ * Initialize quota operations for the specified inode.
++ */
++static int vzquota_initialize(struct inode *inode, int type)
++{
++ vzquota_inode_init_call(inode);
++ return 0; /* ignored by caller */
++}
++
++/*
++ * Release quota for the specified inode.
++ */
++static int vzquota_drop(struct inode *inode)
++{
++ vzquota_inode_drop_call(inode);
++ return 0; /* ignored by caller */
++}
++
++/*
++ * Allocate block callback.
++ *
++ * If (prealloc) disk quota exceeding warning is not printed.
++ * See Linux quota to know why.
++ *
++ * Return:
++ * QUOTA_OK == 0 on SUCCESS
++ * NO_QUOTA == 1 if allocation should fail
++ */
++static int vzquota_alloc_space(struct inode *inode,
++ qsize_t number, int prealloc)
++{
++ struct vz_quota_master *qmblk;
++ struct vz_quota_datast data;
++ int ret = QUOTA_OK;
++
++ qmblk = vzquota_inode_data(inode, &data);
++ if (qmblk == VZ_QUOTA_BAD)
++ return NO_QUOTA;
++ if (qmblk != NULL) {
++#ifdef CONFIG_VZ_QUOTA_UGID
++ int cnt;
++ struct vz_quota_ugid * qugid[MAXQUOTAS];
++#endif
++
++ /* checking first */
++ ret = vzquota_check_space(&qmblk->dq_info, &qmblk->dq_stat,
++ number, qmblk->dq_id, prealloc);
++ if (ret == NO_QUOTA)
++ goto no_quota;
++#ifdef CONFIG_VZ_QUOTA_UGID
++ for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
++ qugid[cnt] = INODE_QLNK(inode)->qugid[cnt];
++ ret = vzquota_check_ugid_space(qmblk, qugid,
++ cnt, number, prealloc);
++ if (ret == NO_QUOTA)
++ goto no_quota;
++ }
++ /* check ok, may increment */
++ for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
++ if (qugid[cnt] == NULL)
++ continue;
++ vzquota_incr_space(&qugid[cnt]->qugid_stat, number);
++ }
++#endif
++ vzquota_incr_space(&qmblk->dq_stat, number);
++ vzquota_data_unlock(inode, &data);
++ }
++
++ inode_add_bytes(inode, number);
++ might_sleep();
++ return QUOTA_OK;
++
++no_quota:
++ vzquota_data_unlock(inode, &data);
++ return NO_QUOTA;
++}
++
++/*
++ * Allocate inodes callback.
++ *
++ * Return:
++ * QUOTA_OK == 0 on SUCCESS
++ * NO_QUOTA == 1 if allocation should fail
++ */
++static int vzquota_alloc_inode(const struct inode *inode, unsigned long number)
++{
++ struct vz_quota_master *qmblk;
++ struct vz_quota_datast data;
++ int ret = QUOTA_OK;
++
++ qmblk = vzquota_inode_data((struct inode *)inode, &data);
++ if (qmblk == VZ_QUOTA_BAD)
++ return NO_QUOTA;
++ if (qmblk != NULL) {
++#ifdef CONFIG_VZ_QUOTA_UGID
++ int cnt;
++ struct vz_quota_ugid *qugid[MAXQUOTAS];
++#endif
++
++ /* checking first */
++ ret = vzquota_check_inodes(&qmblk->dq_info, &qmblk->dq_stat,
++ number, qmblk->dq_id);
++ if (ret == NO_QUOTA)
++ goto no_quota;
++#ifdef CONFIG_VZ_QUOTA_UGID
++ for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
++ qugid[cnt] = INODE_QLNK(inode)->qugid[cnt];
++ ret = vzquota_check_ugid_inodes(qmblk, qugid,
++ cnt, number);
++ if (ret == NO_QUOTA)
++ goto no_quota;
++ }
++ /* check ok, may increment */
++ for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
++ if (qugid[cnt] == NULL)
++ continue;
++ vzquota_incr_inodes(&qugid[cnt]->qugid_stat, number);
++ }
++#endif
++ vzquota_incr_inodes(&qmblk->dq_stat, number);
++ vzquota_data_unlock((struct inode *)inode, &data);
++ }
++
++ might_sleep();
++ return QUOTA_OK;
++
++no_quota:
++ vzquota_data_unlock((struct inode *)inode, &data);
++ return NO_QUOTA;
++}
++
++/*
++ * Free space callback.
++ */
++static int vzquota_free_space(struct inode *inode, qsize_t number)
++{
++ struct vz_quota_master *qmblk;
++ struct vz_quota_datast data;
++
++ qmblk = vzquota_inode_data(inode, &data);
++ if (qmblk == VZ_QUOTA_BAD)
++ return NO_QUOTA; /* isn't checked by the caller */
++ if (qmblk != NULL) {
++#ifdef CONFIG_VZ_QUOTA_UGID
++ int cnt;
++ struct vz_quota_ugid * qugid;
++#endif
++
++ vzquota_decr_space(&qmblk->dq_stat, number);
++#ifdef CONFIG_VZ_QUOTA_UGID
++ for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
++ qugid = INODE_QLNK(inode)->qugid[cnt];
++ if (qugid == NULL || qugid == VZ_QUOTA_UGBAD)
++ continue;
++ vzquota_decr_space(&qugid->qugid_stat, number);
++ }
++#endif
++ vzquota_data_unlock(inode, &data);
++ }
++ inode_sub_bytes(inode, number);
++ might_sleep();
++ return QUOTA_OK;
++}
++
++/*
++ * Free inodes callback.
++ */
++static int vzquota_free_inode(const struct inode *inode, unsigned long number)
++{
++ struct vz_quota_master *qmblk;
++ struct vz_quota_datast data;
++
++ if (IS_NOQUOTA(inode))
++ return QUOTA_OK;
++
++ qmblk = vzquota_inode_data((struct inode *)inode, &data);
++ if (qmblk == VZ_QUOTA_BAD)
++ return NO_QUOTA;
++ if (qmblk != NULL) {
++#ifdef CONFIG_VZ_QUOTA_UGID
++ int cnt;
++ struct vz_quota_ugid * qugid;
++#endif
++
++ vzquota_decr_inodes(&qmblk->dq_stat, number);
++#ifdef CONFIG_VZ_QUOTA_UGID
++ for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
++ qugid = INODE_QLNK(inode)->qugid[cnt];
++ if (qugid == NULL || qugid == VZ_QUOTA_UGBAD)
++ continue;
++ vzquota_decr_inodes(&qugid->qugid_stat, number);
++ }
++#endif
++ vzquota_data_unlock((struct inode *)inode, &data);
++ }
++ might_sleep();
++ return QUOTA_OK;
++}
++
++#if defined(CONFIG_VZ_QUOTA_UGID)
++
++/*
++ * helper function for quota_transfer
++ * check that we can add inode to this quota_id
++ */
++static int vzquota_transfer_check(struct vz_quota_master *qmblk,
++ struct vz_quota_ugid *qugid[],
++ unsigned int type, __u64 size)
++{
++ if (vzquota_check_ugid_space(qmblk, qugid, type, size, 0) != QUOTA_OK ||
++ vzquota_check_ugid_inodes(qmblk, qugid, type, 1) != QUOTA_OK)
++ return -1;
++ return 0;
++}
++
++int vzquota_transfer_usage(struct inode *inode,
++ int mask,
++ struct vz_quota_ilink *qlnk)
++{
++ struct vz_quota_ugid *qugid_old;
++ __u64 space;
++ int i;
++
++ space = inode_get_bytes(inode);
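++	/* first pass: check that the new ugids can accept the usage;
++	 * second pass: actually move the usage */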
++ for (i = 0; i < MAXQUOTAS; i++) {
++ if (!(mask & (1 << i)))
++ continue;
++ if (vzquota_transfer_check(qlnk->qmblk, qlnk->qugid, i, space))
++ return -1;
++ }
++
++ for (i = 0; i < MAXQUOTAS; i++) {
++ if (!(mask & (1 << i)))
++ continue;
++ qugid_old = INODE_QLNK(inode)->qugid[i];
++ vzquota_decr_space(&qugid_old->qugid_stat, space);
++ vzquota_decr_inodes(&qugid_old->qugid_stat, 1);
++ vzquota_incr_space(&qlnk->qugid[i]->qugid_stat, space);
++ vzquota_incr_inodes(&qlnk->qugid[i]->qugid_stat, 1);
++ }
++ return 0;
++}
++
++/*
++ * Transfer the inode between different user/group quotas.
++ */
++static int vzquota_transfer(struct inode *inode, struct iattr *iattr)
++{
++ return vzquota_inode_transfer_call(inode, iattr) ?
++ NO_QUOTA : QUOTA_OK;
++}
++
++#else /* CONFIG_VZ_QUOTA_UGID */
++
++static int vzquota_transfer(struct inode *inode, struct iattr *iattr)
++{
++ return QUOTA_OK;
++}
++
++#endif
++
++/*
++ * Called under following semaphores:
++ * old_d->d_inode->i_sb->s_vfs_rename_sem
++ * old_d->d_inode->i_sem
++ * new_d->d_inode->i_sem
++ * [not verified --SAW]
++ */
++static int vzquota_rename(struct inode *inode,
++ struct inode *old_dir, struct inode *new_dir)
++{
++ return vzquota_rename_check(inode, old_dir, new_dir) ?
++ NO_QUOTA : QUOTA_OK;
++}
++
++/*
++ * Structure of superblock diskquota operations.
++ */
++struct dquot_operations vz_quota_operations = {
++ initialize: vzquota_initialize,
++ drop: vzquota_drop,
++ alloc_space: vzquota_alloc_space,
++ alloc_inode: vzquota_alloc_inode,
++ free_space: vzquota_free_space,
++ free_inode: vzquota_free_inode,
++ transfer: vzquota_transfer,
++ rename: vzquota_rename
++};
+diff -upr linux-2.6.16.orig/fs/vzdq_tree.c linux-2.6.16-026test015/fs/vzdq_tree.c
+--- linux-2.6.16.orig/fs/vzdq_tree.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/fs/vzdq_tree.c 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,286 @@
++/*
++ *
++ * Copyright (C) 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ * This file contains Virtuozzo quota tree implementation
++ */
++
++#include <linux/errno.h>
++#include <linux/slab.h>
++#include <linux/vzdq_tree.h>
++
++struct quotatree_tree *quotatree_alloc(void)
++{
++ int l;
++ struct quotatree_tree *tree;
++
++ tree = kmalloc(sizeof(struct quotatree_tree), GFP_KERNEL);
++ if (tree == NULL)
++ goto out;
++
++ for (l = 0; l < QUOTATREE_DEPTH; l++) {
++ INIT_LIST_HEAD(&tree->levels[l].usedlh);
++ INIT_LIST_HEAD(&tree->levels[l].freelh);
++ tree->levels[l].freenum = 0;
++ }
++ tree->root = NULL;
++ tree->leaf_num = 0;
++out:
++ return tree;
++}
++
++static struct quotatree_node *
++quotatree_follow(struct quotatree_tree *tree, quotaid_t id, int level,
++ struct quotatree_find_state *st)
++{
++ void **block;
++ struct quotatree_node *parent;
++ int l, index;
++
++ parent = NULL;
++ block = (void **)&tree->root;
++ l = 0;
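++	/* descend from the root toward @level, indexing each node by the
++	 * bits of @id for that level; stop early at a missing child */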
++ while (l < level && *block != NULL) {
++ index = (id >> QUOTATREE_BSHIFT(l)) & QUOTATREE_BMASK;
++ parent = *block;
++ block = parent->blocks + index;
++ l++;
++ }
++ if (st != NULL) {
++ st->block = block;
++ st->level = l;
++ }
++
++ return parent;
++}
++
++void *quotatree_find(struct quotatree_tree *tree, quotaid_t id,
++ struct quotatree_find_state *st)
++{
++ quotatree_follow(tree, id, QUOTATREE_DEPTH, st);
++ if (st->level == QUOTATREE_DEPTH)
++ return *st->block;
++ else
++ return NULL;
++}
++
++void *quotatree_leaf_byindex(struct quotatree_tree *tree, unsigned int index)
++{
++ int i, count;
++ struct quotatree_node *p;
++ void *leaf;
++
++ if (QTREE_LEAFNUM(tree) <= index)
++ return NULL;
++
++ count = 0;
++ list_for_each_entry(p, &QTREE_LEAFLVL(tree)->usedlh, list) {
++ for (i = 0; i < QUOTATREE_BSIZE; i++) {
++ leaf = p->blocks[i];
++ if (leaf == NULL)
++ continue;
++ if (count == index)
++ return leaf;
++ count++;
++ }
++ }
++ return NULL;
++}
++
++/* returns the data leaf (vz_quota_ugid) following an _existing_ ugid (@id)
++ * in the tree... */
++void *quotatree_get_next(struct quotatree_tree *tree, quotaid_t id)
++{
++ int off;
++ struct quotatree_node *parent, *p;
++ struct list_head *lh;
++
++	/* get the parent node referring to the correct last-level quota tree node */
++ parent = quotatree_follow(tree, id, QUOTATREE_DEPTH, NULL);
++ if (!parent)
++ return NULL;
++
++ off = (id & QUOTATREE_BMASK) + 1; /* next ugid */
++ lh = &parent->list;
++ do {
++ p = list_entry(lh, struct quotatree_node, list);
++ for ( ; off < QUOTATREE_BSIZE; off++)
++ if (p->blocks[off])
++ return p->blocks[off];
++ off = 0;
++ lh = lh->next;
++ } while (lh != &QTREE_LEAFLVL(tree)->usedlh);
++
++ return NULL;
++}
++
++int quotatree_insert(struct quotatree_tree *tree, quotaid_t id,
++ struct quotatree_find_state *st, void *data)
++{
++ struct quotatree_node *p;
++ int l, index;
++
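++	/* allocate the missing intermediate nodes down to the leaf level,
++	 * reusing nodes from the per-level free lists when possible */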
++ while (st->level < QUOTATREE_DEPTH) {
++ l = st->level;
++ if (!list_empty(&tree->levels[l].freelh)) {
++ p = list_entry(tree->levels[l].freelh.next,
++ struct quotatree_node, list);
++ list_del(&p->list);
++ } else {
++ p = kmalloc(sizeof(struct quotatree_node), GFP_NOFS | __GFP_NOFAIL);
++ if (p == NULL)
++ return -ENOMEM;
++			/* save the block number at level l;
++			 * it is used for quota file generation */
++ p->num = tree->levels[l].freenum++;
++ }
++ list_add(&p->list, &tree->levels[l].usedlh);
++ memset(p->blocks, 0, sizeof(p->blocks));
++ *st->block = p;
++
++ index = (id >> QUOTATREE_BSHIFT(l)) & QUOTATREE_BMASK;
++ st->block = p->blocks + index;
++ st->level++;
++ }
++ tree->leaf_num++;
++ *st->block = data;
++
++ return 0;
++}
++
++static struct quotatree_node *
++quotatree_remove_ptr(struct quotatree_tree *tree, quotaid_t id,
++ int level)
++{
++ struct quotatree_node *parent;
++ struct quotatree_find_state st;
++
++ parent = quotatree_follow(tree, id, level, &st);
++ if (st.level == QUOTATREE_DEPTH)
++ tree->leaf_num--;
++ *st.block = NULL;
++ return parent;
++}
++
++void quotatree_remove(struct quotatree_tree *tree, quotaid_t id)
++{
++ struct quotatree_node *p;
++ int level, i;
++
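++	/* drop the leaf pointer, then walk upward, moving intermediate nodes
++	 * that became empty to the per-level free lists */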
++ p = quotatree_remove_ptr(tree, id, QUOTATREE_DEPTH);
++ for (level = QUOTATREE_DEPTH - 1; level >= QUOTATREE_CDEPTH; level--) {
++ for (i = 0; i < QUOTATREE_BSIZE; i++)
++ if (p->blocks[i] != NULL)
++ return;
++ list_move(&p->list, &tree->levels[level].freelh);
++ p = quotatree_remove_ptr(tree, id, level);
++ }
++}
++
++#if 0
++static void quotatree_walk(struct quotatree_tree *tree,
++ struct quotatree_node *node_start,
++ quotaid_t id_start,
++ int level_start, int level_end,
++ int (*callback)(struct quotatree_tree *,
++ quotaid_t id,
++ int level,
++ void *ptr,
++ void *data),
++ void *data)
++{
++ struct quotatree_node *p;
++ int l, shift, index;
++ quotaid_t id;
++ struct quotatree_find_state st;
++
++ p = node_start;
++ l = level_start;
++ shift = (QUOTATREE_DEPTH - l) * QUOTAID_BBITS;
++ id = id_start;
++ index = 0;
++
++ /*
++ * Invariants:
++ * shift == (QUOTATREE_DEPTH - l) * QUOTAID_BBITS;
++ * id & ((1 << shift) - 1) == 0
++ * p is l-level node corresponding to id
++ */
++ do {
++ if (!p)
++ break;
++
++ if (l < level_end) {
++ for (; index < QUOTATREE_BSIZE; index++)
++ if (p->blocks[index] != NULL)
++ break;
++ if (index < QUOTATREE_BSIZE) {
++ /* descend */
++ p = p->blocks[index];
++ l++;
++ shift -= QUOTAID_BBITS;
++ id += (quotaid_t)index << shift;
++ index = 0;
++ continue;
++ }
++ }
++
++ if ((*callback)(tree, id, l, p, data))
++ break;
++
++ /* ascend and to the next node */
++ p = quotatree_follow(tree, id, l, &st);
++
++ index = ((id >> shift) & QUOTATREE_BMASK) + 1;
++ l--;
++ shift += QUOTAID_BBITS;
++ id &= ~(((quotaid_t)1 << shift) - 1);
++ } while (l >= level_start);
++}
++#endif
++
++static void free_list(struct list_head *node_list)
++{
++ struct quotatree_node *p, *tmp;
++
++ list_for_each_entry_safe(p, tmp, node_list, list) {
++ list_del(&p->list);
++ kfree(p);
++ }
++}
++
++static inline void quotatree_free_nodes(struct quotatree_tree *tree)
++{
++ int i;
++
++ for (i = 0; i < QUOTATREE_DEPTH; i++) {
++ free_list(&tree->levels[i].usedlh);
++ free_list(&tree->levels[i].freelh);
++ }
++}
++
++static void quotatree_free_leafs(struct quotatree_tree *tree,
++ void (*dtor)(void *))
++{
++ int i;
++ struct quotatree_node *p;
++
++ list_for_each_entry(p, &QTREE_LEAFLVL(tree)->usedlh, list) {
++ for (i = 0; i < QUOTATREE_BSIZE; i++) {
++ if (p->blocks[i] == NULL)
++ continue;
++
++ dtor(p->blocks[i]);
++ }
++ }
++}
++
++void quotatree_free(struct quotatree_tree *tree, void (*dtor)(void *))
++{
++ quotatree_free_leafs(tree, dtor);
++ quotatree_free_nodes(tree);
++ kfree(tree);
++}
+diff -upr linux-2.6.16.orig/fs/vzdq_ugid.c linux-2.6.16-026test015/fs/vzdq_ugid.c
+--- linux-2.6.16.orig/fs/vzdq_ugid.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/fs/vzdq_ugid.c 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,1130 @@
++/*
++ * Copyright (C) 2002 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ * This file contains Virtuozzo UID/GID disk quota implementation
++ */
++
++#include <linux/config.h>
++#include <linux/string.h>
++#include <linux/slab.h>
++#include <linux/list.h>
++#include <linux/smp_lock.h>
++#include <linux/rcupdate.h>
++#include <asm/uaccess.h>
++#include <linux/proc_fs.h>
++#include <linux/init.h>
++#include <linux/module.h>
++#include <linux/quota.h>
++#include <linux/quotaio_v2.h>
++#include <linux/virtinfo.h>
++
++#include <linux/vzctl.h>
++#include <linux/vzctl_quota.h>
++#include <linux/vzquota.h>
++
++/*
++ * XXX
++ * maybe something is needed for sb->s_dquot->info[]?
++ */
++
++#define USRQUOTA_MASK (1 << USRQUOTA)
++#define GRPQUOTA_MASK (1 << GRPQUOTA)
++#define QTYPE2MASK(type) (1 << (type))
++
++static kmem_cache_t *vz_quota_ugid_cachep;
++
++/* guard to protect vz_quota_master from being destroyed in quota_on/off.
++ * Also protects the lists in the hash table */
++extern struct semaphore vz_quota_sem;
++
++inline struct vz_quota_ugid *vzquota_get_ugid(struct vz_quota_ugid *qugid)
++{
++ if (qugid != VZ_QUOTA_UGBAD)
++ atomic_inc(&qugid->qugid_count);
++ return qugid;
++}
++
++/* we don't limit users with zero limits */
++static inline int vzquota_fake_stat(struct dq_stat *stat)
++{
++ return stat->bhardlimit == 0 && stat->bsoftlimit == 0 &&
++ stat->ihardlimit == 0 && stat->isoftlimit == 0;
++}
++
++/* callback function for quotatree_free() */
++static inline void vzquota_free_qugid(void *ptr)
++{
++ kmem_cache_free(vz_quota_ugid_cachep, ptr);
++}
++
++/*
++ * destroy the ugid if it has zero refcount, limits and usage;
++ * must be called under qmblk->dq_sem
++ */
++void vzquota_put_ugid(struct vz_quota_master *qmblk,
++ struct vz_quota_ugid *qugid)
++{
++ if (qugid == VZ_QUOTA_UGBAD)
++ return;
++ qmblk_data_read_lock(qmblk);
++ if (atomic_dec_and_test(&qugid->qugid_count) &&
++ (qmblk->dq_flags & VZDQUG_FIXED_SET) == 0 &&
++ vzquota_fake_stat(&qugid->qugid_stat) &&
++ qugid->qugid_stat.bcurrent == 0 &&
++ qugid->qugid_stat.icurrent == 0) {
++ quotatree_remove(QUGID_TREE(qmblk, qugid->qugid_type),
++ qugid->qugid_id);
++ qmblk->dq_ugid_count--;
++ vzquota_free_qugid(qugid);
++ }
++ qmblk_data_read_unlock(qmblk);
++}
++
++/*
++ * Get a ugid block by its index, as if it were stored in an array.
++ * In reality, this is not an array but a chain of the tree's leaves.
++ * NULL if index is out of range.
++ * qmblk semaphore is required to protect the tree.
++ */
++static inline struct vz_quota_ugid *
++vzquota_get_byindex(struct vz_quota_master *qmblk, unsigned int index, int type)
++{
++ return quotatree_leaf_byindex(QUGID_TREE(qmblk, type), index);
++}
++
++/*
++ * get next element from ugid "virtual array"
++ * the ugid must be in the current array, and the array may not change between
++ * two accesses (guaranteed by the "stopped" quota state and the quota semaphore)
++ * qmblk semaphore is required to protect the tree
++ */
++static inline struct vz_quota_ugid *
++vzquota_get_next(struct vz_quota_master *qmblk, struct vz_quota_ugid *qugid)
++{
++ return quotatree_get_next(QUGID_TREE(qmblk, qugid->qugid_type),
++ qugid->qugid_id);
++}
++
++/*
++ * requires dq_sem
++ */
++struct vz_quota_ugid *__vzquota_find_ugid(struct vz_quota_master *qmblk,
++ unsigned int quota_id, int type, int flags)
++{
++ struct vz_quota_ugid *qugid;
++ struct quotatree_tree *tree;
++ struct quotatree_find_state st;
++
++ tree = QUGID_TREE(qmblk, type);
++ qugid = quotatree_find(tree, quota_id, &st);
++ if (qugid)
++ goto success;
++
++ /* caller does not want alloc */
++ if (flags & VZDQUG_FIND_DONT_ALLOC)
++ goto fail;
++
++ if (flags & VZDQUG_FIND_FAKE)
++ goto doit;
++
++ /* check limit */
++ if (qmblk->dq_ugid_count >= qmblk->dq_ugid_max)
++ goto fail;
++
++ /* see comment at VZDQUG_FIXED_SET define */
++ if (qmblk->dq_flags & VZDQUG_FIXED_SET)
++ goto fail;
++
++doit:
++ /* alloc new structure */
++ qugid = kmem_cache_alloc(vz_quota_ugid_cachep,
++ SLAB_NOFS | __GFP_NOFAIL);
++ if (qugid == NULL)
++ goto fail;
++
++ /* initialize new structure */
++ qugid->qugid_id = quota_id;
++ memset(&qugid->qugid_stat, 0, sizeof(qugid->qugid_stat));
++ qugid->qugid_type = type;
++ atomic_set(&qugid->qugid_count, 0);
++
++ /* insert in tree */
++ if (quotatree_insert(tree, quota_id, &st, qugid) < 0)
++ goto fail_insert;
++ qmblk->dq_ugid_count++;
++
++success:
++ vzquota_get_ugid(qugid);
++ return qugid;
++
++fail_insert:
++ vzquota_free_qugid(qugid);
++fail:
++ return VZ_QUOTA_UGBAD;
++}
++
++/*
++ * takes dq_sem, may schedule
++ */
++struct vz_quota_ugid *vzquota_find_ugid(struct vz_quota_master *qmblk,
++ unsigned int quota_id, int type, int flags)
++{
++ struct vz_quota_ugid *qugid;
++
++ down(&qmblk->dq_sem);
++ qugid = __vzquota_find_ugid(qmblk, quota_id, type, flags);
++ up(&qmblk->dq_sem);
++
++ return qugid;
++}
++
++/*
++ * destroy all ugid records on given quota master
++ */
++void vzquota_kill_ugid(struct vz_quota_master *qmblk)
++{
++ BUG_ON((qmblk->dq_gid_tree == NULL && qmblk->dq_uid_tree != NULL) ||
++ (qmblk->dq_uid_tree == NULL && qmblk->dq_gid_tree != NULL));
++
++ if (qmblk->dq_uid_tree != NULL) {
++ quotatree_free(qmblk->dq_uid_tree, vzquota_free_qugid);
++ quotatree_free(qmblk->dq_gid_tree, vzquota_free_qugid);
++ }
++}
++
++
++/* ----------------------------------------------------------------------
++ * Management interface to ugid quota for (super)users.
++ * --------------------------------------------------------------------- */
++
++/**
++ * vzquota_find_qmblk - helper to emulate quota on virtual filesystems
++ *
++ * This function finds a quota master block corresponding to the root of
++ * a virtual filesystem.
++ * Returns a quota master block with reference taken, or %NULL if not under
++ * quota, or %VZ_QUOTA_BAD if quota inconsistency is found (and all allocation
++ * operations will fail).
++ *
++ * Note: this function uses vzquota_inode_qmblk().
++ * The latter is a rather confusing function: it returns the qmblk that used to
++ * be on the inode some time ago (with no guarantee that it still has any
++ * relation to the inode). So, vzquota_find_qmblk() leaves it up to the
++ * caller to think whether the inode could have changed its qmblk and what to
++ * do in that case.
++ * Currently, the callers appear to not care :(
++ */
++struct vz_quota_master *vzquota_find_qmblk(struct super_block *sb)
++{
++ struct inode *qrinode;
++ struct vz_quota_master *qmblk;
++
++ qmblk = NULL;
++ qrinode = NULL;
++ if (sb->s_op->get_quota_root != NULL)
++ qrinode = sb->s_op->get_quota_root(sb);
++ if (qrinode != NULL)
++ qmblk = vzquota_inode_qmblk(qrinode);
++ return qmblk;
++}
++
++static int vzquota_initialize2(struct inode *inode, int type)
++{
++ return QUOTA_OK;
++}
++
++static int vzquota_drop2(struct inode *inode)
++{
++ return QUOTA_OK;
++}
++
++static int vzquota_alloc_space2(struct inode *inode,
++ qsize_t number, int prealloc)
++{
++ inode_add_bytes(inode, number);
++ return QUOTA_OK;
++}
++
++static int vzquota_alloc_inode2(const struct inode *inode, unsigned long number)
++{
++ return QUOTA_OK;
++}
++
++static int vzquota_free_space2(struct inode *inode, qsize_t number)
++{
++ inode_sub_bytes(inode, number);
++ return QUOTA_OK;
++}
++
++static int vzquota_free_inode2(const struct inode *inode, unsigned long number)
++{
++ return QUOTA_OK;
++}
++
++static int vzquota_transfer2(struct inode *inode, struct iattr *iattr)
++{
++ return QUOTA_OK;
++}
++
++struct dquot_operations vz_quota_operations2 = {
++ initialize: vzquota_initialize2,
++ drop: vzquota_drop2,
++ alloc_space: vzquota_alloc_space2,
++ alloc_inode: vzquota_alloc_inode2,
++ free_space: vzquota_free_space2,
++ free_inode: vzquota_free_inode2,
++ transfer: vzquota_transfer2
++};
++
++static int vz_quota_on(struct super_block *sb, int type,
++ int format_id, char *path)
++{
++ struct vz_quota_master *qmblk;
++ int mask, mask2;
++ int err;
++
++ qmblk = vzquota_find_qmblk(sb);
++ down(&vz_quota_sem);
++ err = -ESRCH;
++ if (qmblk == NULL)
++ goto out;
++ err = -EIO;
++ if (qmblk == VZ_QUOTA_BAD)
++ goto out;
++
++ mask = 0;
++ mask2 = 0;
++ sb->dq_op = &vz_quota_operations2;
++ sb->s_qcop = &vz_quotactl_operations;
++ if (type == USRQUOTA) {
++ mask = DQUOT_USR_ENABLED;
++ mask2 = VZDQ_USRQUOTA;
++ }
++ if (type == GRPQUOTA) {
++ mask = DQUOT_GRP_ENABLED;
++ mask2 = VZDQ_GRPQUOTA;
++ }
++ err = -EBUSY;
++ if (qmblk->dq_flags & mask2)
++ goto out;
++
++ err = 0;
++ qmblk->dq_flags |= mask2;
++ sb->s_dquot.flags |= mask;
++
++out:
++ up(&vz_quota_sem);
++ if (qmblk != NULL && qmblk != VZ_QUOTA_BAD)
++ qmblk_put(qmblk);
++ return err;
++}
++
++static int vz_quota_off(struct super_block *sb, int type)
++{
++ struct vz_quota_master *qmblk;
++ int mask2;
++ int err;
++
++ qmblk = vzquota_find_qmblk(sb);
++ down(&vz_quota_sem);
++ err = -ESRCH;
++ if (qmblk == NULL)
++ goto out;
++ err = -EIO;
++ if (qmblk == VZ_QUOTA_BAD)
++ goto out;
++
++ mask2 = 0;
++ if (type == USRQUOTA)
++ mask2 = VZDQ_USRQUOTA;
++ if (type == GRPQUOTA)
++ mask2 = VZDQ_GRPQUOTA;
++ err = -EINVAL;
++ if (!(qmblk->dq_flags & mask2))
++ goto out;
++
++ qmblk->dq_flags &= ~mask2;
++ err = 0;
++
++out:
++ up(&vz_quota_sem);
++ if (qmblk != NULL && qmblk != VZ_QUOTA_BAD)
++ qmblk_put(qmblk);
++ return err;
++}
++
++static int vz_quota_sync(struct super_block *sb, int type)
++{
++ return 0; /* vz quota is always uptodate */
++}
++
++static int vz_get_dqblk(struct super_block *sb, int type,
++ qid_t id, struct if_dqblk *di)
++{
++ struct vz_quota_master *qmblk;
++ struct vz_quota_ugid *ugid;
++ int err;
++
++ qmblk = vzquota_find_qmblk(sb);
++ down(&vz_quota_sem);
++ err = -ESRCH;
++ if (qmblk == NULL)
++ goto out;
++ err = -EIO;
++ if (qmblk == VZ_QUOTA_BAD)
++ goto out;
++
++ err = 0;
++ ugid = vzquota_find_ugid(qmblk, id, type, VZDQUG_FIND_DONT_ALLOC);
++ if (ugid != VZ_QUOTA_UGBAD) {
++ qmblk_data_read_lock(qmblk);
++ di->dqb_bhardlimit = ugid->qugid_stat.bhardlimit >> 10;
++ di->dqb_bsoftlimit = ugid->qugid_stat.bsoftlimit >> 10;
++ di->dqb_curspace = ugid->qugid_stat.bcurrent;
++ di->dqb_ihardlimit = ugid->qugid_stat.ihardlimit;
++ di->dqb_isoftlimit = ugid->qugid_stat.isoftlimit;
++ di->dqb_curinodes = ugid->qugid_stat.icurrent;
++ di->dqb_btime = ugid->qugid_stat.btime;
++ di->dqb_itime = ugid->qugid_stat.itime;
++ qmblk_data_read_unlock(qmblk);
++ di->dqb_valid = QIF_ALL;
++ vzquota_put_ugid(qmblk, ugid);
++ } else {
++ memset(di, 0, sizeof(*di));
++ di->dqb_valid = QIF_ALL;
++ }
++
++out:
++ up(&vz_quota_sem);
++ if (qmblk != NULL && qmblk != VZ_QUOTA_BAD)
++ qmblk_put(qmblk);
++ return err;
++}
++
++/* must be called under vz_quota_sem */
++static int __vz_set_dqblk(struct vz_quota_master *qmblk,
++ int type, qid_t id, struct if_dqblk *di)
++{
++ struct vz_quota_ugid *ugid;
++
++ ugid = vzquota_find_ugid(qmblk, id, type, 0);
++ if (ugid == VZ_QUOTA_UGBAD)
++ return -ESRCH;
++
++ qmblk_data_write_lock(qmblk);
++ /*
++ * Subtle compatibility breakage.
++ *
++	 * Some old non-vz kernel quota didn't start the grace period
++	 * if the new soft limit happened to be below the usage.
++ * Non-vz kernel quota in 2.4.20 starts the grace period
++ * (if it hasn't been started).
++ * Current non-vz kernel performs even more complicated
++ * manipulations...
++ *
++	 * Also, current non-vz kernels have an inconsistency related to
++ * the grace time start. In regular operations the grace period
++ * is started if the usage is greater than the soft limit (and,
++ * strangely, is cancelled if the usage is less).
++ * However, set_dqblk starts the grace period if the usage is greater
++ * or equal to the soft limit.
++ *
++ * Here we try to mimic the behavior of the current non-vz kernel.
++ */
++ if (di->dqb_valid & QIF_BLIMITS) {
++ ugid->qugid_stat.bhardlimit =
++ (__u64)di->dqb_bhardlimit << 10;
++ ugid->qugid_stat.bsoftlimit =
++ (__u64)di->dqb_bsoftlimit << 10;
++ if (di->dqb_bsoftlimit == 0 ||
++ ugid->qugid_stat.bcurrent < ugid->qugid_stat.bsoftlimit)
++ ugid->qugid_stat.btime = 0;
++ else if (!(di->dqb_valid & QIF_BTIME))
++ ugid->qugid_stat.btime = CURRENT_TIME_SECONDS
++ + qmblk->dq_ugid_info[type].bexpire;
++ else
++ ugid->qugid_stat.btime = di->dqb_btime;
++ }
++ if (di->dqb_valid & QIF_ILIMITS) {
++ ugid->qugid_stat.ihardlimit = di->dqb_ihardlimit;
++ ugid->qugid_stat.isoftlimit = di->dqb_isoftlimit;
++ if (di->dqb_isoftlimit == 0 ||
++ ugid->qugid_stat.icurrent < ugid->qugid_stat.isoftlimit)
++ ugid->qugid_stat.itime = 0;
++ else if (!(di->dqb_valid & QIF_ITIME))
++ ugid->qugid_stat.itime = CURRENT_TIME_SECONDS
++ + qmblk->dq_ugid_info[type].iexpire;
++ else
++ ugid->qugid_stat.itime = di->dqb_itime;
++ }
++ qmblk_data_write_unlock(qmblk);
++ vzquota_put_ugid(qmblk, ugid);
++
++ return 0;
++}
++
++static int vz_set_dqblk(struct super_block *sb, int type,
++ qid_t id, struct if_dqblk *di)
++{
++ struct vz_quota_master *qmblk;
++ int err;
++
++ qmblk = vzquota_find_qmblk(sb);
++ down(&vz_quota_sem);
++ err = -ESRCH;
++ if (qmblk == NULL)
++ goto out;
++ err = -EIO;
++ if (qmblk == VZ_QUOTA_BAD)
++ goto out;
++ err = __vz_set_dqblk(qmblk, type, id, di);
++out:
++ up(&vz_quota_sem);
++ if (qmblk != NULL && qmblk != VZ_QUOTA_BAD)
++ qmblk_put(qmblk);
++ return err;
++}
++
++static int vz_get_dqinfo(struct super_block *sb, int type,
++ struct if_dqinfo *ii)
++{
++ struct vz_quota_master *qmblk;
++ int err;
++
++ qmblk = vzquota_find_qmblk(sb);
++ down(&vz_quota_sem);
++ err = -ESRCH;
++ if (qmblk == NULL)
++ goto out;
++ err = -EIO;
++ if (qmblk == VZ_QUOTA_BAD)
++ goto out;
++
++ err = 0;
++ ii->dqi_bgrace = qmblk->dq_ugid_info[type].bexpire;
++ ii->dqi_igrace = qmblk->dq_ugid_info[type].iexpire;
++ ii->dqi_flags = 0;
++ ii->dqi_valid = IIF_ALL;
++
++out:
++ up(&vz_quota_sem);
++ if (qmblk != NULL && qmblk != VZ_QUOTA_BAD)
++ qmblk_put(qmblk);
++ return err;
++}
++
++/* must be called under vz_quota_sem */
++static int __vz_set_dqinfo(struct vz_quota_master *qmblk,
++ int type, struct if_dqinfo *ii)
++{
++ if (ii->dqi_valid & IIF_FLAGS)
++ if (ii->dqi_flags & DQF_MASK)
++ return -EINVAL;
++
++ if (ii->dqi_valid & IIF_BGRACE)
++ qmblk->dq_ugid_info[type].bexpire = ii->dqi_bgrace;
++ if (ii->dqi_valid & IIF_IGRACE)
++ qmblk->dq_ugid_info[type].iexpire = ii->dqi_igrace;
++ return 0;
++}
++
++static int vz_set_dqinfo(struct super_block *sb, int type,
++ struct if_dqinfo *ii)
++{
++ struct vz_quota_master *qmblk;
++ int err;
++
++ qmblk = vzquota_find_qmblk(sb);
++ down(&vz_quota_sem);
++ err = -ESRCH;
++ if (qmblk == NULL)
++ goto out;
++ err = -EIO;
++ if (qmblk == VZ_QUOTA_BAD)
++ goto out;
++ err = __vz_set_dqinfo(qmblk, type, ii);
++out:
++ up(&vz_quota_sem);
++ if (qmblk != NULL && qmblk != VZ_QUOTA_BAD)
++ qmblk_put(qmblk);
++ return err;
++}
++
++#ifdef CONFIG_QUOTA_COMPAT
++
++#define Q_GETQUOTI_SIZE 1024
++
++#define UGID2DQBLK(dst, src) \
++ do { \
++ (dst)->dqb_ihardlimit = (src)->qugid_stat.ihardlimit; \
++ (dst)->dqb_isoftlimit = (src)->qugid_stat.isoftlimit; \
++ (dst)->dqb_curinodes = (src)->qugid_stat.icurrent; \
++ /* in 1K blocks */ \
++ (dst)->dqb_bhardlimit = (src)->qugid_stat.bhardlimit >> 10; \
++ /* in 1K blocks */ \
++ (dst)->dqb_bsoftlimit = (src)->qugid_stat.bsoftlimit >> 10; \
++ /* in bytes, 64 bit */ \
++ (dst)->dqb_curspace = (src)->qugid_stat.bcurrent; \
++ (dst)->dqb_btime = (src)->qugid_stat.btime; \
++ (dst)->dqb_itime = (src)->qugid_stat.itime; \
++ } while (0)
++
++static int vz_get_quoti(struct super_block *sb, int type, qid_t idx,
++ struct v2_disk_dqblk *dqblk)
++{
++ struct vz_quota_master *qmblk;
++ struct v2_disk_dqblk *data, *kbuf;
++ struct vz_quota_ugid *ugid;
++ int count;
++ int err;
++
++ qmblk = vzquota_find_qmblk(sb);
++ err = -ESRCH;
++ if (qmblk == NULL)
++ goto out;
++ err = -EIO;
++ if (qmblk == VZ_QUOTA_BAD)
++ goto out;
++
++ err = -ENOMEM;
++ kbuf = vmalloc(Q_GETQUOTI_SIZE * sizeof(*kbuf));
++ if (!kbuf)
++ goto out;
++
++ down(&vz_quota_sem);
++ down(&qmblk->dq_sem);
++ for (ugid = vzquota_get_byindex(qmblk, idx, type), count = 0;
++ ugid != NULL && count < Q_GETQUOTI_SIZE;
++ count++)
++ {
++ data = kbuf + count;
++ qmblk_data_read_lock(qmblk);
++ UGID2DQBLK(data, ugid);
++ qmblk_data_read_unlock(qmblk);
++ data->dqb_id = ugid->qugid_id;
++
++ /* Find next entry */
++ ugid = vzquota_get_next(qmblk, ugid);
++ BUG_ON(ugid != NULL && ugid->qugid_type != type);
++ }
++ up(&qmblk->dq_sem);
++ up(&vz_quota_sem);
++
++ err = count;
++ if (copy_to_user(dqblk, kbuf, count * sizeof(*kbuf)))
++ err = -EFAULT;
++
++ vfree(kbuf);
++out:
++ if (qmblk != NULL && qmblk != VZ_QUOTA_BAD)
++ qmblk_put(qmblk);
++
++ return err;
++}
++
++#endif
++
++struct quotactl_ops vz_quotactl_operations = {
++ quota_on: vz_quota_on,
++ quota_off: vz_quota_off,
++ quota_sync: vz_quota_sync,
++ get_info: vz_get_dqinfo,
++ set_info: vz_set_dqinfo,
++ get_dqblk: vz_get_dqblk,
++ set_dqblk: vz_set_dqblk,
++#ifdef CONFIG_QUOTA_COMPAT
++ get_quoti: vz_get_quoti
++#endif
++};
++
++
++/* ----------------------------------------------------------------------
++ * Management interface for host system admins.
++ * --------------------------------------------------------------------- */
++
++static int quota_ugid_addstat(unsigned int quota_id, unsigned int ugid_size,
++ struct vz_quota_iface *u_ugid_buf)
++{
++ struct vz_quota_master *qmblk;
++ int ret;
++
++ down(&vz_quota_sem);
++
++ ret = -ENOENT;
++ qmblk = vzquota_find_master(quota_id);
++ if (qmblk == NULL)
++ goto out;
++
++ ret = -EBUSY;
++ if (qmblk->dq_state != VZDQ_STARTING)
++ goto out; /* working quota doesn't accept new ugids */
++
++ ret = 0;
++ /* start to add ugids */
++ for (ret = 0; ret < ugid_size; ret++) {
++ struct vz_quota_iface ugid_buf;
++ struct vz_quota_ugid *ugid;
++
++ if (copy_from_user(&ugid_buf, u_ugid_buf, sizeof(ugid_buf)))
++ break;
++
++ if (ugid_buf.qi_type >= MAXQUOTAS)
++ break; /* bad quota type - this is the only check */
++
++ ugid = vzquota_find_ugid(qmblk,
++ ugid_buf.qi_id, ugid_buf.qi_type, 0);
++ if (ugid == VZ_QUOTA_UGBAD) {
++ qmblk->dq_flags |= VZDQUG_FIXED_SET;
++ break; /* limit reached */
++ }
++
++ /* update usage/limits
++ * we can copy the data without the lock, because the data
++ * cannot be modified in VZDQ_STARTING state */
++ ugid->qugid_stat = ugid_buf.qi_stat;
++
++ vzquota_put_ugid(qmblk, ugid);
++
++ u_ugid_buf++; /* next user buffer */
++ }
++out:
++ up(&vz_quota_sem);
++
++ return ret;
++}
++
++static int quota_ugid_setgrace(unsigned int quota_id,
++ struct dq_info u_dq_info[])
++{
++ struct vz_quota_master *qmblk;
++ struct dq_info dq_info[MAXQUOTAS];
++ struct dq_info *target;
++ int err, type;
++
++ down(&vz_quota_sem);
++
++ err = -ENOENT;
++ qmblk = vzquota_find_master(quota_id);
++ if (qmblk == NULL)
++ goto out;
++
++ err = -EBUSY;
++ if (qmblk->dq_state != VZDQ_STARTING)
++ goto out; /* working quota doesn't accept changing options */
++
++ err = -EFAULT;
++ if (copy_from_user(dq_info, u_dq_info, sizeof(dq_info)))
++ goto out;
++
++ err = 0;
++
++ /* update in qmblk */
++ for (type = 0; type < MAXQUOTAS; type ++) {
++ target = &qmblk->dq_ugid_info[type];
++ target->bexpire = dq_info[type].bexpire;
++ target->iexpire = dq_info[type].iexpire;
++ }
++out:
++ up(&vz_quota_sem);
++
++ return err;
++}
++
++static int do_quota_ugid_getstat(struct vz_quota_master *qmblk, int index, int size,
++ struct vz_quota_iface *u_ugid_buf)
++{
++ int type, count;
++ struct vz_quota_ugid *ugid;
++
++ if (QTREE_LEAFNUM(qmblk->dq_uid_tree) +
++ QTREE_LEAFNUM(qmblk->dq_gid_tree)
++ <= index)
++ return 0;
++
++ count = 0;
++
++ type = index < QTREE_LEAFNUM(qmblk->dq_uid_tree) ? USRQUOTA : GRPQUOTA;
++ if (type == GRPQUOTA)
++ index -= QTREE_LEAFNUM(qmblk->dq_uid_tree);
++
++ /* loop through ugid and then qgid quota */
++repeat:
++ for (ugid = vzquota_get_byindex(qmblk, index, type);
++ ugid != NULL && count < size;
++ ugid = vzquota_get_next(qmblk, ugid), count++)
++ {
++ struct vz_quota_iface ugid_buf;
++
++ /* form interface buffer and send in to user-level */
++ qmblk_data_read_lock(qmblk);
++ memcpy(&ugid_buf.qi_stat, &ugid->qugid_stat,
++ sizeof(ugid_buf.qi_stat));
++ qmblk_data_read_unlock(qmblk);
++ ugid_buf.qi_id = ugid->qugid_id;
++ ugid_buf.qi_type = ugid->qugid_type;
++
++ memcpy(u_ugid_buf, &ugid_buf, sizeof(ugid_buf));
++ u_ugid_buf++; /* next portion of user buffer */
++ }
++
++ if (type == USRQUOTA && count < size) {
++ type = GRPQUOTA;
++ index = 0;
++ goto repeat;
++ }
++
++ return count;
++}
++
++static int quota_ugid_getstat(unsigned int quota_id,
++ int index, int size, struct vz_quota_iface *u_ugid_buf)
++{
++ struct vz_quota_master *qmblk;
++ struct vz_quota_iface *k_ugid_buf;
++ int err;
++
++ if (index < 0 || size < 0)
++ return -EINVAL;
++
++ if (size > INT_MAX / sizeof(struct vz_quota_iface))
++ return -EINVAL;
++
++ k_ugid_buf = vmalloc(size * sizeof(struct vz_quota_iface));
++ if (k_ugid_buf == NULL)
++ return -ENOMEM;
++
++ down(&vz_quota_sem);
++
++ err = -ENOENT;
++ qmblk = vzquota_find_master(quota_id);
++ if (qmblk == NULL)
++ goto out;
++
++ down(&qmblk->dq_sem);
++ err = do_quota_ugid_getstat(qmblk, index, size, k_ugid_buf);
++ up(&qmblk->dq_sem);
++ if (err < 0)
++ goto out;
++
++ if (copy_to_user(u_ugid_buf, k_ugid_buf,
++ size * sizeof(struct vz_quota_iface)))
++ err = -EFAULT;
++
++out:
++ up(&vz_quota_sem);
++ vfree(k_ugid_buf);
++ return err;
++}
++
++static int quota_ugid_getgrace(unsigned int quota_id,
++ struct dq_info u_dq_info[])
++{
++ struct vz_quota_master *qmblk;
++ struct dq_info dq_info[MAXQUOTAS];
++ struct dq_info *target;
++ int err, type;
++
++ down(&vz_quota_sem);
++
++ err = -ENOENT;
++ qmblk = vzquota_find_master(quota_id);
++ if (qmblk == NULL)
++ goto out;
++
++ err = 0;
++ /* update from qmblk */
++ for (type = 0; type < MAXQUOTAS; type ++) {
++ target = &qmblk->dq_ugid_info[type];
++ dq_info[type].bexpire = target->bexpire;
++ dq_info[type].iexpire = target->iexpire;
++ dq_info[type].flags = target->flags;
++ }
++
++ if (copy_to_user(u_dq_info, dq_info, sizeof(dq_info)))
++ err = -EFAULT;
++out:
++ up(&vz_quota_sem);
++
++ return err;
++}
++
++static int quota_ugid_getconfig(unsigned int quota_id,
++ struct vz_quota_ugid_stat *info)
++{
++ struct vz_quota_master *qmblk;
++ struct vz_quota_ugid_stat kinfo;
++ int err;
++
++ down(&vz_quota_sem);
++
++ err = -ENOENT;
++ qmblk = vzquota_find_master(quota_id);
++ if (qmblk == NULL)
++ goto out;
++
++ err = 0;
++ kinfo.limit = qmblk->dq_ugid_max;
++ kinfo.count = qmblk->dq_ugid_count;
++ kinfo.flags = qmblk->dq_flags;
++
++ if (copy_to_user(info, &kinfo, sizeof(kinfo)))
++ err = -EFAULT;
++out:
++ up(&vz_quota_sem);
++
++ return err;
++}
++
++static int quota_ugid_setconfig(unsigned int quota_id,
++ struct vz_quota_ugid_stat *info)
++{
++ struct vz_quota_master *qmblk;
++ struct vz_quota_ugid_stat kinfo;
++ int err;
++
++ down(&vz_quota_sem);
++
++ err = -ENOENT;
++ qmblk = vzquota_find_master(quota_id);
++ if (qmblk == NULL)
++ goto out;
++
++ err = -EFAULT;
++ if (copy_from_user(&kinfo, info, sizeof(kinfo)))
++ goto out;
++
++ err = 0;
++ qmblk->dq_ugid_max = kinfo.limit;
++ if (qmblk->dq_state == VZDQ_STARTING) {
++ qmblk->dq_flags = kinfo.flags;
++ if (qmblk->dq_flags & VZDQUG_ON)
++ qmblk->dq_flags |= VZDQ_USRQUOTA | VZDQ_GRPQUOTA;
++ }
++
++out:
++ up(&vz_quota_sem);
++
++ return err;
++}
++
++static int quota_ugid_setlimit(unsigned int quota_id,
++ struct vz_quota_ugid_setlimit *u_lim)
++{
++ struct vz_quota_master *qmblk;
++ struct vz_quota_ugid_setlimit lim;
++ int err;
++
++ down(&vz_quota_sem);
++
++ err = -ESRCH;
++ qmblk = vzquota_find_master(quota_id);
++ if (qmblk == NULL)
++ goto out;
++
++ err = -EFAULT;
++ if (copy_from_user(&lim, u_lim, sizeof(lim)))
++ goto out;
++
++ err = __vz_set_dqblk(qmblk, lim.type, lim.id, &lim.dqb);
++
++out:
++ up(&vz_quota_sem);
++
++ return err;
++}
++
++static int quota_ugid_setinfo(unsigned int quota_id,
++ struct vz_quota_ugid_setinfo *u_info)
++{
++ struct vz_quota_master *qmblk;
++ struct vz_quota_ugid_setinfo info;
++ int err;
++
++ down(&vz_quota_sem);
++
++ err = -ESRCH;
++ qmblk = vzquota_find_master(quota_id);
++ if (qmblk == NULL)
++ goto out;
++
++ err = -EFAULT;
++ if (copy_from_user(&info, u_info, sizeof(info)))
++ goto out;
++
++ err = __vz_set_dqinfo(qmblk, info.type, &info.dqi);
++
++out:
++ up(&vz_quota_sem);
++
++ return err;
++}
++
++/*
++ * This is a system call to maintain UGID quotas
++ * Note this call is allowed to run ONLY from VE0
++ */
++long do_vzquotaugidctl(struct vzctl_quotaugidctl *qub)
++{
++ int ret;
++
++ ret = -EPERM;
++ /* access allowed only from root of VE0 */
++ if (!capable(CAP_SYS_RESOURCE) ||
++ !capable(CAP_SYS_ADMIN))
++ goto out;
++
++ switch (qub->cmd) {
++ case VZ_DQ_UGID_GETSTAT:
++ ret = quota_ugid_getstat(qub->quota_id,
++ qub->ugid_index, qub->ugid_size,
++ (struct vz_quota_iface *)qub->addr);
++ break;
++ case VZ_DQ_UGID_ADDSTAT:
++ ret = quota_ugid_addstat(qub->quota_id, qub->ugid_size,
++ (struct vz_quota_iface *)qub->addr);
++ break;
++ case VZ_DQ_UGID_GETGRACE:
++ ret = quota_ugid_getgrace(qub->quota_id,
++ (struct dq_info *)qub->addr);
++ break;
++ case VZ_DQ_UGID_SETGRACE:
++ ret = quota_ugid_setgrace(qub->quota_id,
++ (struct dq_info *)qub->addr);
++ break;
++ case VZ_DQ_UGID_GETCONFIG:
++ ret = quota_ugid_getconfig(qub->quota_id,
++ (struct vz_quota_ugid_stat *)qub->addr);
++ break;
++ case VZ_DQ_UGID_SETCONFIG:
++ ret = quota_ugid_setconfig(qub->quota_id,
++ (struct vz_quota_ugid_stat *)qub->addr);
++ break;
++ case VZ_DQ_UGID_SETLIMIT:
++ ret = quota_ugid_setlimit(qub->quota_id,
++ (struct vz_quota_ugid_setlimit *)
++ qub->addr);
++ break;
++ case VZ_DQ_UGID_SETINFO:
++ ret = quota_ugid_setinfo(qub->quota_id,
++ (struct vz_quota_ugid_setinfo *)
++ qub->addr);
++ break;
++ default:
++ ret = -EINVAL;
++ goto out;
++ }
++out:
++ return ret;
++}
++
++static void ugid_quota_on_sb(struct super_block *sb)
++{
++ struct super_block *real_sb;
++ struct vz_quota_master *qmblk;
++
++ if (!sb->s_op->get_quota_root)
++ return;
++
++ real_sb = sb->s_op->get_quota_root(sb)->i_sb;
++ if (real_sb->dq_op != &vz_quota_operations)
++ return;
++
++ sb->dq_op = &vz_quota_operations2;
++ sb->s_qcop = &vz_quotactl_operations;
++ INIT_LIST_HEAD(&sb->s_dquot.info[USRQUOTA].dqi_dirty_list);
++ INIT_LIST_HEAD(&sb->s_dquot.info[GRPQUOTA].dqi_dirty_list);
++ sb->s_dquot.info[USRQUOTA].dqi_format = &vz_quota_empty_v2_format;
++ sb->s_dquot.info[GRPQUOTA].dqi_format = &vz_quota_empty_v2_format;
++
++ qmblk = vzquota_find_qmblk(sb);
++ if ((qmblk == NULL) || (qmblk == VZ_QUOTA_BAD))
++ return;
++ down(&vz_quota_sem);
++ if (qmblk->dq_flags & VZDQ_USRQUOTA)
++ sb->s_dquot.flags |= DQUOT_USR_ENABLED;
++ if (qmblk->dq_flags & VZDQ_GRPQUOTA)
++ sb->s_dquot.flags |= DQUOT_GRP_ENABLED;
++ up(&vz_quota_sem);
++ qmblk_put(qmblk);
++}
++
++static void ugid_quota_off_sb(struct super_block *sb)
++{
++ /* can't make quota off on mounted super block */
++ BUG_ON(sb->s_root != NULL);
++}
++
++static int ugid_notifier_call(struct vnotifier_block *self,
++ unsigned long n, void *data, int old_ret)
++{
++ struct virt_info_quota *viq;
++
++ viq = (struct virt_info_quota *)data;
++
++ switch (n) {
++ case VIRTINFO_QUOTA_ON:
++ ugid_quota_on_sb(viq->super);
++ break;
++ case VIRTINFO_QUOTA_OFF:
++ ugid_quota_off_sb(viq->super);
++ break;
++ case VIRTINFO_QUOTA_GETSTAT:
++ break;
++ default:
++ return old_ret;
++ }
++ return NOTIFY_OK;
++}
++
++static struct vnotifier_block ugid_notifier_block = {
++ .notifier_call = ugid_notifier_call,
++};
++
++/* ----------------------------------------------------------------------
++ * Init/exit.
++ * --------------------------------------------------------------------- */
++
++struct quota_format_type vz_quota_empty_v2_format = {
++ qf_fmt_id: QFMT_VFS_V0,
++ qf_ops: NULL,
++ qf_owner: THIS_MODULE
++};
++
++int vzquota_ugid_init()
++{
++ int err;
++
++ vz_quota_ugid_cachep = kmem_cache_create("vz_quota_ugid",
++ sizeof(struct vz_quota_ugid),
++ 0, SLAB_HWCACHE_ALIGN,
++ NULL, NULL);
++ if (vz_quota_ugid_cachep == NULL)
++ goto err_slab;
++
++ err = register_quota_format(&vz_quota_empty_v2_format);
++ if (err)
++ goto err_reg;
++
++ virtinfo_notifier_register(VITYPE_QUOTA, &ugid_notifier_block);
++ return 0;
++
++err_reg:
++ kmem_cache_destroy(vz_quota_ugid_cachep);
++ return err;
++
++err_slab:
++ printk(KERN_ERR "Cannot create VZ_QUOTA SLAB cache\n");
++ return -ENOMEM;
++}
++
++void vzquota_ugid_release()
++{
++ virtinfo_notifier_unregister(VITYPE_QUOTA, &ugid_notifier_block);
++ unregister_quota_format(&vz_quota_empty_v2_format);
++
++ if (kmem_cache_destroy(vz_quota_ugid_cachep))
++ printk(KERN_ERR "VZQUOTA: kmem_cache_destroy failed\n");
++}
+diff -upr linux-2.6.16.orig/fs/vzdquot.c linux-2.6.16-026test015/fs/vzdquot.c
+--- linux-2.6.16.orig/fs/vzdquot.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/fs/vzdquot.c 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,1705 @@
++/*
++ * Copyright (C) 2001, 2002, 2004, 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ * This file contains the core of Virtuozzo disk quota implementation:
++ * maintenance of VZDQ information in inodes,
++ * external interfaces,
++ * module entry.
++ */
++
++#include <linux/config.h>
++#include <linux/kernel.h>
++#include <linux/string.h>
++#include <linux/list.h>
++#include <asm/atomic.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/slab.h>
++#include <linux/fs.h>
++#include <linux/dcache.h>
++#include <linux/quota.h>
++#include <linux/rcupdate.h>
++#include <linux/module.h>
++#include <asm/uaccess.h>
++#include <linux/vzctl.h>
++#include <linux/vzctl_quota.h>
++#include <linux/vzquota.h>
++#include <linux/virtinfo.h>
++#include <linux/vzdq_tree.h>
++
++/* ----------------------------------------------------------------------
++ *
++ * Locking
++ *
++ * ---------------------------------------------------------------------- */
++
++/*
++ * Serializes on/off and all other do_vzquotactl operations.
++ * Protects qmblk hash.
++ */
++struct semaphore vz_quota_sem;
++
++/*
++ * Data access locks
++ * inode_qmblk
++ * protects qmblk pointers in all inodes and qlnk content in general
++ * (but not qmblk content);
++ * also protects related qmblk invalidation procedures;
++ * can't be per-inode because of vzquota_dtree_qmblk complications
++ * and problems with serialization with quota_on,
++ * but can be per-superblock;
++ * qmblk_data
++ * protects qmblk fields (such as current usage)
++ * quota_data
++ * protects charge/uncharge operations, thus, implies
++ * qmblk_data lock and, if CONFIG_VZ_QUOTA_UGID, inode_qmblk lock
++ * (to protect ugid pointers).
++ *
++ * Lock order:
++ * inode_qmblk_lock -> dcache_lock
++ * inode_qmblk_lock -> qmblk_data
++ */
++static spinlock_t vzdq_qmblk_lock = SPIN_LOCK_UNLOCKED;
++
++inline void inode_qmblk_lock(struct super_block *sb)
++{
++ spin_lock(&vzdq_qmblk_lock);
++}
++
++inline void inode_qmblk_unlock(struct super_block *sb)
++{
++ spin_unlock(&vzdq_qmblk_lock);
++}
++
++inline void qmblk_data_read_lock(struct vz_quota_master *qmblk)
++{
++ spin_lock(&qmblk->dq_data_lock);
++}
++
++inline void qmblk_data_read_unlock(struct vz_quota_master *qmblk)
++{
++ spin_unlock(&qmblk->dq_data_lock);
++}
++
++inline void qmblk_data_write_lock(struct vz_quota_master *qmblk)
++{
++ spin_lock(&qmblk->dq_data_lock);
++}
++
++inline void qmblk_data_write_unlock(struct vz_quota_master *qmblk)
++{
++ spin_unlock(&qmblk->dq_data_lock);
++}
++
++
++/* ----------------------------------------------------------------------
++ *
++ * Master hash table handling.
++ *
++ * SMP not safe, serialized by vz_quota_sem within quota syscalls
++ *
++ * --------------------------------------------------------------------- */
++
++static kmem_cache_t *vzquota_cachep;
++
++/*
++ * Hash function.
++ */
++#define QHASH_BITS 6
++#define VZ_QUOTA_HASH_SIZE (1 << QHASH_BITS)
++#define QHASH_MASK (VZ_QUOTA_HASH_SIZE - 1)
++
++struct list_head vzquota_hash_table[VZ_QUOTA_HASH_SIZE];
++int vzquota_hash_size = VZ_QUOTA_HASH_SIZE;
++
++static inline int vzquota_hash_func(unsigned int qid)
++{
++ return (((qid >> QHASH_BITS) ^ qid) & QHASH_MASK);
++}
++
++/**
++ * vzquota_alloc_master - alloc and instantiate master quota record
++ *
++ * Returns:
++ * pointer to newly created record if SUCCESS
++ * -ENOMEM if out of memory
++ * -EEXIST if record with given quota_id already exist
++ */
++struct vz_quota_master *vzquota_alloc_master(unsigned int quota_id,
++ struct vz_quota_stat *qstat)
++{
++ int err;
++ struct vz_quota_master *qmblk;
++
++ err = -EEXIST;
++ if (vzquota_find_master(quota_id) != NULL)
++ goto out;
++
++ err = -ENOMEM;
++ qmblk = kmem_cache_alloc(vzquota_cachep, SLAB_KERNEL);
++ if (qmblk == NULL)
++ goto out;
++#ifdef CONFIG_VZ_QUOTA_UGID
++ qmblk->dq_uid_tree = quotatree_alloc();
++ if (!qmblk->dq_uid_tree)
++ goto out_free;
++
++ qmblk->dq_gid_tree = quotatree_alloc();
++ if (!qmblk->dq_gid_tree)
++ goto out_free_tree;
++#endif
++
++ qmblk->dq_state = VZDQ_STARTING;
++ init_MUTEX(&qmblk->dq_sem);
++ spin_lock_init(&qmblk->dq_data_lock);
++
++ qmblk->dq_id = quota_id;
++ qmblk->dq_stat = qstat->dq_stat;
++ qmblk->dq_info = qstat->dq_info;
++ qmblk->dq_root_dentry = NULL;
++ qmblk->dq_root_mnt = NULL;
++ qmblk->dq_sb = NULL;
++ qmblk->dq_ugid_count = 0;
++ qmblk->dq_ugid_max = 0;
++ qmblk->dq_flags = 0;
++ memset(qmblk->dq_ugid_info, 0, sizeof(qmblk->dq_ugid_info));
++ INIT_LIST_HEAD(&qmblk->dq_ilink_list);
++
++ atomic_set(&qmblk->dq_count, 1);
++
++ /* insert in hash chain */
++ list_add(&qmblk->dq_hash,
++ &vzquota_hash_table[vzquota_hash_func(quota_id)]);
++
++ /* success */
++ return qmblk;
++
++out_free_tree:
++ quotatree_free(qmblk->dq_uid_tree, NULL);
++out_free:
++ kmem_cache_free(vzquota_cachep, qmblk);
++out:
++ return ERR_PTR(err);
++}
++
++static struct vz_quota_master *vzquota_alloc_fake(void)
++{
++ struct vz_quota_master *qmblk;
++
++ qmblk = kmem_cache_alloc(vzquota_cachep, SLAB_KERNEL);
++ if (qmblk == NULL)
++ return NULL;
++ memset(qmblk, 0, sizeof(*qmblk));
++ qmblk->dq_state = VZDQ_STOPING;
++ qmblk->dq_flags = VZDQ_NOQUOT;
++ spin_lock_init(&qmblk->dq_data_lock);
++ INIT_LIST_HEAD(&qmblk->dq_ilink_list);
++ atomic_set(&qmblk->dq_count, 1);
++ return qmblk;
++}
++
++/**
++ * vzquota_find_master - find master record with given id
++ *
++ * Returns qmblk without touching its refcounter.
++ * Called under vz_quota_sem.
++ */
++struct vz_quota_master *vzquota_find_master(unsigned int quota_id)
++{
++ int i;
++ struct vz_quota_master *qp;
++
++ i = vzquota_hash_func(quota_id);
++ list_for_each_entry(qp, &vzquota_hash_table[i], dq_hash) {
++ if (qp->dq_id == quota_id)
++ return qp;
++ }
++ return NULL;
++}
++
++/**
++ * vzquota_free_master - release resources taken by qmblk, freeing memory
++ *
++ * qmblk is assumed to be already taken out from the hash.
++ * Should be called outside vz_quota_sem.
++ */
++void vzquota_free_master(struct vz_quota_master *qmblk)
++{
++#ifdef CONFIG_VZ_QUOTA_UGID
++ vzquota_kill_ugid(qmblk);
++#endif
++ BUG_ON(!list_empty(&qmblk->dq_ilink_list));
++ kmem_cache_free(vzquota_cachep, qmblk);
++}
++
++
++/* ----------------------------------------------------------------------
++ *
++ * Passing quota information through current
++ *
++ * Used in inode -> qmblk lookup at inode creation stage (since at that
++ * time there are no links between the inode being created and its parent
++ * directory).
++ *
++ * --------------------------------------------------------------------- */
++
++#define VZDQ_CUR_MAGIC 0x57d0fee2
++
++static inline int vzquota_cur_qmblk_check(void)
++{
++ return current->magic == VZDQ_CUR_MAGIC;
++}
++
++static inline struct inode *vzquota_cur_qmblk_fetch(void)
++{
++ return current->ino;
++}
++
++static inline void vzquota_cur_qmblk_set(struct inode *data)
++{
++ struct task_struct *tsk;
++
++ tsk = current;
++ tsk->magic = VZDQ_CUR_MAGIC;
++ tsk->ino = data;
++}
++
++#if 0
++static inline void vzquota_cur_qmblk_reset(void)
++{
++ current->magic = 0;
++}
++#endif
++
++
++/* ----------------------------------------------------------------------
++ *
++ * Superblock quota operations
++ *
++ * --------------------------------------------------------------------- */
++
++/*
++ * Kernel structure abuse.
++ * We use the files[0] pointer as an int variable:
++ * a reference counter of how many quota blocks use this superblock.
++ * files[1] is used for a generations structure which helps us track
++ * when traversal of dentries is really required.
++ */
++#define __VZ_QUOTA_NOQUOTA(sb) sb->s_dquot.vzdq_master
++#define __VZ_QUOTA_TSTAMP(sb) ((struct timeval *)\
++ &sb->s_dquot.dqio_sem)
++
++#if defined(VZ_QUOTA_UNLOAD)
++
++#define __VZ_QUOTA_SBREF(sb) sb->s_dquot.vzdq_count
++
++struct dquot_operations *orig_dq_op;
++struct quotactl_ops *orig_dq_cop;
++
++/**
++ * quota_get_super - account for a new quota tree under the superblock
++ *
++ * One superblock can have multiple directory subtrees with different VZ
++ * quotas. We keep a counter of such subtrees and set VZ quota operations or
++ * reset the default ones.
++ *
++ * Called under vz_quota_sem (from quota_on).
++ */
++int vzquota_get_super(struct super_block *sb)
++{
++ if (sb->dq_op != &vz_quota_operations) {
++ down(&sb->s_dquot.dqonoff_sem);
++ if (sb->s_dquot.flags & (DQUOT_USR_ENABLED|DQUOT_GRP_ENABLED)) {
++ up(&sb->s_dquot.dqonoff_sem);
++ return -EEXIST;
++ }
++ if (orig_dq_op == NULL && sb->dq_op != NULL)
++ orig_dq_op = sb->dq_op;
++ sb->dq_op = &vz_quota_operations;
++ if (orig_dq_cop == NULL && sb->s_qcop != NULL)
++ orig_dq_cop = sb->s_qcop;
++ /* XXX this may race with sys_quotactl */
++#ifdef CONFIG_VZ_QUOTA_UGID
++ sb->s_qcop = &vz_quotactl_operations;
++#else
++ sb->s_qcop = NULL;
++#endif
++ do_gettimeofday(__VZ_QUOTA_TSTAMP(sb));
++ memset(&sb->s_dquot.info, 0, sizeof(sb->s_dquot.info));
++
++ INIT_LIST_HEAD(&sb->s_dquot.info[USRQUOTA].dqi_dirty_list);
++ INIT_LIST_HEAD(&sb->s_dquot.info[GRPQUOTA].dqi_dirty_list);
++ sb->s_dquot.info[USRQUOTA].dqi_format = &vz_quota_empty_v2_format;
++ sb->s_dquot.info[GRPQUOTA].dqi_format = &vz_quota_empty_v2_format;
++ /*
++ * To get quotaops.h to call us we need to mark the superblock
++ * as having quota. These flags mark the moment when
++ * our dq_op methods start to be called.
++ *
++ * The ordering of dq_op and s_dquot.flags assignment
++ * needs to be enforced, but other CPUs do not do rmb()
++ * between s_dquot.flags and dq_op accesses.
++ */
++ wmb(); synchronize_sched();
++ sb->s_dquot.flags = DQUOT_USR_ENABLED|DQUOT_GRP_ENABLED;
++ __module_get(THIS_MODULE);
++ up(&sb->s_dquot.dqonoff_sem);
++ }
++ /* protected by vz_quota_sem */
++ __VZ_QUOTA_SBREF(sb)++;
++ return 0;
++}
++
++/**
++ * quota_put_super - release superblock when one quota tree goes away
++ *
++ * Called under vz_quota_sem.
++ */
++void vzquota_put_super(struct super_block *sb)
++{
++ int count;
++
++ count = --__VZ_QUOTA_SBREF(sb);
++ if (count == 0) {
++ down(&sb->s_dquot.dqonoff_sem);
++ sb->s_dquot.flags = 0;
++ wmb(); synchronize_sched();
++ sema_init(&sb->s_dquot.dqio_sem, 1);
++ sb->s_qcop = orig_dq_cop;
++ sb->dq_op = orig_dq_op;
++ inode_qmblk_lock(sb);
++ quota_gen_put(SB_QGEN(sb));
++ SB_QGEN(sb) = NULL;
++ /* release qlnk's without qmblk */
++ remove_inode_quota_links_list(&non_vzquota_inodes_lh,
++ sb, NULL);
++ /*
++ * Races with quota initialization:
++ * after this inode_qmblk_unlock all inode's generations are
++ * invalidated, quota_inode_qmblk checks superblock operations.
++ */
++ inode_qmblk_unlock(sb);
++ /*
++ * Module refcounting: in theory, this is the best place
++ * to call module_put(THIS_MODULE).
++ * In reality, it can't be done because we can't be sure that
++ * other CPUs do not enter our code segment through dq_op
++ * cached long time ago. Quotaops interface isn't supposed to
++ * go into modules currently (that is, into unloadable
++ * modules). By omitting module_put, our module isn't
++ * unloadable.
++ */
++ up(&sb->s_dquot.dqonoff_sem);
++ }
++}
++
++#else
++
++struct vzquota_new_sop {
++ struct super_operations new_op;
++ struct super_operations *old_op;
++};
++
++/**
++ * vzquota_shutdown_super - callback on umount
++ */
++void vzquota_shutdown_super(struct super_block *sb)
++{
++ struct vz_quota_master *qmblk;
++ struct vzquota_new_sop *sop;
++
++ qmblk = __VZ_QUOTA_NOQUOTA(sb);
++ __VZ_QUOTA_NOQUOTA(sb) = NULL;
++ if (qmblk != NULL)
++ qmblk_put(qmblk);
++ sop = container_of(sb->s_op, struct vzquota_new_sop, new_op);
++ sb->s_op = sop->old_op;
++ kfree(sop);
++ (*sb->s_op->put_super)(sb);
++}
++
++/**
++ * vzquota_get_super - account for a new quota tree under the superblock
++ *
++ * One superblock can have multiple directory subtrees with different VZ
++ * quotas.
++ *
++ * Called under vz_quota_sem (from vzquota_on).
++ */
++int vzquota_get_super(struct super_block *sb)
++{
++ struct vz_quota_master *qnew;
++ struct vzquota_new_sop *sop;
++ int err;
++
++ down(&sb->s_dquot.dqonoff_sem);
++ err = -EEXIST;
++ if ((sb->s_dquot.flags & (DQUOT_USR_ENABLED|DQUOT_GRP_ENABLED)) &&
++ sb->dq_op != &vz_quota_operations)
++ goto out_up;
++
++ /*
++ * This allocation code should be under sb->dq_op check below, but
++ * it doesn't really matter...
++ */
++ if (__VZ_QUOTA_NOQUOTA(sb) == NULL) {
++ qnew = vzquota_alloc_fake();
++ if (qnew == NULL)
++ goto out_up;
++ __VZ_QUOTA_NOQUOTA(sb) = qnew;
++ }
++
++ if (sb->dq_op != &vz_quota_operations) {
++ sop = kmalloc(sizeof(*sop), GFP_KERNEL);
++ if (sop == NULL) {
++ vzquota_free_master(__VZ_QUOTA_NOQUOTA(sb));
++ __VZ_QUOTA_NOQUOTA(sb) = NULL;
++ goto out_up;
++ }
++ memcpy(&sop->new_op, sb->s_op, sizeof(sop->new_op));
++ sop->new_op.put_super = &vzquota_shutdown_super;
++ sop->old_op = sb->s_op;
++ sb->s_op = &sop->new_op;
++
++ sb->dq_op = &vz_quota_operations;
++#ifdef CONFIG_VZ_QUOTA_UGID
++ sb->s_qcop = &vz_quotactl_operations;
++#else
++ sb->s_qcop = NULL;
++#endif
++ do_gettimeofday(__VZ_QUOTA_TSTAMP(sb));
++
++ memset(&sb->s_dquot.info, 0, sizeof(sb->s_dquot.info));
++ /* these 2 list heads are checked in sync_dquots() */
++ INIT_LIST_HEAD(&sb->s_dquot.info[USRQUOTA].dqi_dirty_list);
++ INIT_LIST_HEAD(&sb->s_dquot.info[GRPQUOTA].dqi_dirty_list);
++ sb->s_dquot.info[USRQUOTA].dqi_format =
++ &vz_quota_empty_v2_format;
++ sb->s_dquot.info[GRPQUOTA].dqi_format =
++ &vz_quota_empty_v2_format;
++
++ /*
++ * To get quotaops.h to call us we need to mark the superblock
++ * as having quota. These flags mark the moment when
++ * our dq_op methods start to be called.
++ *
++ * The ordering of dq_op and s_dquot.flags assignment
++ * needs to be enforced, but other CPUs do not do rmb()
++ * between s_dquot.flags and dq_op accesses.
++ */
++ wmb(); synchronize_sched();
++ sb->s_dquot.flags = DQUOT_USR_ENABLED|DQUOT_GRP_ENABLED;
++ }
++ err = 0;
++
++out_up:
++ up(&sb->s_dquot.dqonoff_sem);
++ return err;
++}
++
++/**
++ * vzquota_put_super - one quota tree less on this superblock
++ *
++ * Called under vz_quota_sem.
++ */
++void vzquota_put_super(struct super_block *sb)
++{
++ /*
++ * Even if this put is the last one,
++ * sb->s_dquot.flags can't be cleared, because otherwise vzquota_drop
++ * won't be called and the remaining qmblk references won't be put.
++ */
++}
++
++#endif
++
++
++/* ----------------------------------------------------------------------
++ *
++ * Helpers for inode -> qmblk link maintenance
++ *
++ * --------------------------------------------------------------------- */
++
++#define __VZ_QUOTA_EMPTY ((void *)0xbdbdbdbd)
++#define VZ_QUOTA_IS_NOQUOTA(qm, sb) ((qm)->dq_flags & VZDQ_NOQUOT)
++#define VZ_QUOTA_EMPTY_IOPS (&vfs_empty_iops)
++extern struct inode_operations vfs_empty_iops;
++
++static int VZ_QUOTA_IS_ACTUAL(struct inode *inode)
++{
++ struct vz_quota_master *qmblk;
++
++ qmblk = INODE_QLNK(inode)->qmblk;
++ if (qmblk == VZ_QUOTA_BAD)
++ return 1;
++ if (qmblk == __VZ_QUOTA_EMPTY)
++ return 0;
++ if (qmblk->dq_flags & VZDQ_NOACT)
++ /* not actual (invalidated) qmblk */
++ return 0;
++ return 1;
++}
++
++static inline int vzquota_qlnk_is_empty(struct vz_quota_ilink *qlnk)
++{
++ return qlnk->qmblk == __VZ_QUOTA_EMPTY;
++}
++
++static inline void vzquota_qlnk_set_empty(struct vz_quota_ilink *qlnk)
++{
++ qlnk->qmblk = __VZ_QUOTA_EMPTY;
++ qlnk->origin = VZ_QUOTAO_SETE;
++}
++
++void vzquota_qlnk_init(struct vz_quota_ilink *qlnk)
++{
++ memset(qlnk, 0, sizeof(*qlnk));
++ INIT_LIST_HEAD(&qlnk->list);
++ vzquota_qlnk_set_empty(qlnk);
++ qlnk->origin = VZ_QUOTAO_INIT;
++}
++
++void vzquota_qlnk_destroy(struct vz_quota_ilink *qlnk)
++{
++ might_sleep();
++ if (vzquota_qlnk_is_empty(qlnk))
++ return;
++#if defined(CONFIG_VZ_QUOTA_UGID)
++ if (qlnk->qmblk != NULL && qlnk->qmblk != VZ_QUOTA_BAD) {
++ struct vz_quota_master *qmblk;
++ struct vz_quota_ugid *quid, *qgid;
++ qmblk = qlnk->qmblk;
++ quid = qlnk->qugid[USRQUOTA];
++ qgid = qlnk->qugid[GRPQUOTA];
++ if (quid != NULL || qgid != NULL) {
++ down(&qmblk->dq_sem);
++ if (qgid != NULL)
++ vzquota_put_ugid(qmblk, qgid);
++ if (quid != NULL)
++ vzquota_put_ugid(qmblk, quid);
++ up(&qmblk->dq_sem);
++ }
++ }
++#endif
++ if (qlnk->qmblk != NULL && qlnk->qmblk != VZ_QUOTA_BAD)
++ qmblk_put(qlnk->qmblk);
++ qlnk->origin = VZ_QUOTAO_DESTR;
++}
++
++/**
++ * vzquota_qlnk_swap - swap inode's and temporary vz_quota_ilink contents
++ * @qlt: temporary
++ * @qli: inode's
++ *
++ * Locking is provided by the caller (depending on the context).
++ * After swap, @qli is inserted into the corresponding dq_ilink_list,
++ * @qlt list is reinitialized.
++ */
++static void vzquota_qlnk_swap(struct vz_quota_ilink *qlt,
++ struct vz_quota_ilink *qli)
++{
++ struct vz_quota_master *qb;
++ struct vz_quota_ugid *qu;
++ int i;
++
++ qb = qlt->qmblk;
++ qlt->qmblk = qli->qmblk;
++ qli->qmblk = qb;
++ list_del_init(&qli->list);
++ if (qb != __VZ_QUOTA_EMPTY && qb != VZ_QUOTA_BAD)
++ list_add(&qli->list, &qb->dq_ilink_list);
++ INIT_LIST_HEAD(&qlt->list);
++ qli->origin = VZ_QUOTAO_SWAP;
++
++ for (i = 0; i < MAXQUOTAS; i++) {
++ qu = qlt->qugid[i];
++ qlt->qugid[i] = qli->qugid[i];
++ qli->qugid[i] = qu;
++ }
++}
++
++/**
++ * vzquota_qlnk_reinit_locked - destroy qlnk content, called under locks
++ *
++ * Called under dcache_lock and inode_qmblk locks.
++ * Returns 1 if locks were dropped inside, 0 if atomic.
++ */
++static int vzquota_qlnk_reinit_locked(struct vz_quota_ilink *qlnk,
++ struct inode *inode)
++{
++ if (vzquota_qlnk_is_empty(qlnk))
++ return 0;
++ if (qlnk->qmblk == VZ_QUOTA_BAD) {
++ vzquota_qlnk_set_empty(qlnk);
++ return 0;
++ }
++ spin_unlock(&dcache_lock);
++ inode_qmblk_unlock(inode->i_sb);
++ vzquota_qlnk_destroy(qlnk);
++ vzquota_qlnk_init(qlnk);
++ inode_qmblk_lock(inode->i_sb);
++ spin_lock(&dcache_lock);
++ return 1;
++}
++
++#if defined(CONFIG_VZ_QUOTA_UGID)
++/**
++ * vzquota_qlnk_reinit_attr - destroy and reinit qlnk content
++ *
++ * Similar to vzquota_qlnk_reinit_locked, called under different locks.
++ */
++static int vzquota_qlnk_reinit_attr(struct vz_quota_ilink *qlnk,
++ struct inode *inode,
++ struct vz_quota_master *qmblk)
++{
++ if (vzquota_qlnk_is_empty(qlnk))
++ return 0;
++ /* may be optimized if qlnk->qugid all NULLs */
++ qmblk_data_write_unlock(qmblk);
++ inode_qmblk_unlock(inode->i_sb);
++ vzquota_qlnk_destroy(qlnk);
++ vzquota_qlnk_init(qlnk);
++ inode_qmblk_lock(inode->i_sb);
++ qmblk_data_write_lock(qmblk);
++ return 1;
++}
++#endif
++
++/**
++ * vzquota_qlnk_fill - fill vz_quota_ilink content
++ * @qlnk: vz_quota_ilink to fill
++ * @inode: inode for which @qlnk is filled (i_sb, i_uid, i_gid)
++ * @qmblk: qmblk to which this @qlnk will belong
++ *
++ * Called under dcache_lock and inode_qmblk locks.
++ * Returns 1 if locks were dropped inside, 0 if atomic.
++ * @qlnk is expected to be empty.
++ */
++static int vzquota_qlnk_fill(struct vz_quota_ilink *qlnk,
++ struct inode *inode,
++ struct vz_quota_master *qmblk)
++{
++ if (qmblk != VZ_QUOTA_BAD)
++ qmblk_get(qmblk);
++ qlnk->qmblk = qmblk;
++
++#if defined(CONFIG_VZ_QUOTA_UGID)
++ if (qmblk != VZ_QUOTA_BAD &&
++ !VZ_QUOTA_IS_NOQUOTA(qmblk, inode->i_sb) &&
++ (qmblk->dq_flags & VZDQUG_ON)) {
++ struct vz_quota_ugid *quid, *qgid;
++
++ spin_unlock(&dcache_lock);
++ inode_qmblk_unlock(inode->i_sb);
++
++ down(&qmblk->dq_sem);
++ quid = __vzquota_find_ugid(qmblk, inode->i_uid, USRQUOTA, 0);
++ qgid = __vzquota_find_ugid(qmblk, inode->i_gid, GRPQUOTA, 0);
++ up(&qmblk->dq_sem);
++
++ inode_qmblk_lock(inode->i_sb);
++ spin_lock(&dcache_lock);
++ qlnk->qugid[USRQUOTA] = quid;
++ qlnk->qugid[GRPQUOTA] = qgid;
++ return 1;
++ }
++#endif
++
++ return 0;
++}
++
++#if defined(CONFIG_VZ_QUOTA_UGID)
++/**
++ * vzquota_qlnk_fill_attr - fill vz_quota_ilink content for uid, gid
++ *
++ * This function is a helper for vzquota_transfer, and differs from
++ * vzquota_qlnk_fill only by locking.
++ */
++static int vzquota_qlnk_fill_attr(struct vz_quota_ilink *qlnk,
++ struct inode *inode,
++ struct iattr *iattr,
++ int mask,
++ struct vz_quota_master *qmblk)
++{
++ qmblk_get(qmblk);
++ qlnk->qmblk = qmblk;
++
++ if (mask) {
++ struct vz_quota_ugid *quid, *qgid;
++
++ quid = qgid = NULL; /* to make gcc happy */
++ if (!(mask & (1 << USRQUOTA)))
++ quid = vzquota_get_ugid(INODE_QLNK(inode)->
++ qugid[USRQUOTA]);
++ if (!(mask & (1 << GRPQUOTA)))
++ qgid = vzquota_get_ugid(INODE_QLNK(inode)->
++ qugid[GRPQUOTA]);
++
++ qmblk_data_write_unlock(qmblk);
++ inode_qmblk_unlock(inode->i_sb);
++
++ down(&qmblk->dq_sem);
++ if (mask & (1 << USRQUOTA))
++ quid = __vzquota_find_ugid(qmblk, iattr->ia_uid,
++ USRQUOTA, 0);
++ if (mask & (1 << GRPQUOTA))
++ qgid = __vzquota_find_ugid(qmblk, iattr->ia_gid,
++ GRPQUOTA, 0);
++ up(&qmblk->dq_sem);
++
++ inode_qmblk_lock(inode->i_sb);
++ qmblk_data_write_lock(qmblk);
++ qlnk->qugid[USRQUOTA] = quid;
++ qlnk->qugid[GRPQUOTA] = qgid;
++ return 1;
++ }
++
++ return 0;
++}
++#endif
++
++/**
++ * __vzquota_inode_init - make sure inode's qlnk is initialized
++ *
++ * May be called if qlnk is already initialized, detects this situation itself.
++ * Called under inode_qmblk_lock.
++ */
++static void __vzquota_inode_init(struct inode *inode, unsigned char origin)
++{
++ if (inode->i_dquot[USRQUOTA] == NODQUOT) {
++ vzquota_qlnk_init(INODE_QLNK(inode));
++ inode->i_dquot[USRQUOTA] = (void *)~(unsigned long)NODQUOT;
++ }
++ INODE_QLNK(inode)->origin = origin;
++}
++
++/**
++ * vzquota_inode_drop - destroy VZ quota information in the inode
++ *
++ * Inode must not be externally accessible or dirty.
++ */
++static void vzquota_inode_drop(struct inode *inode)
++{
++ struct vz_quota_ilink qlnk;
++
++ vzquota_qlnk_init(&qlnk);
++ inode_qmblk_lock(inode->i_sb);
++ vzquota_qlnk_swap(&qlnk, INODE_QLNK(inode));
++ INODE_QLNK(inode)->origin = VZ_QUOTAO_DRCAL;
++ inode->i_dquot[USRQUOTA] = NODQUOT;
++ inode_qmblk_unlock(inode->i_sb);
++ vzquota_qlnk_destroy(&qlnk);
++}
++
++/**
++ * vzquota_inode_qmblk_set - initialize inode's qlnk
++ * @inode: inode to be initialized
++ * @qmblk: quota master block to which this inode should belong (may be BAD)
++ * @qlnk: placeholder to store data to resolve locking issues
++ *
++ * Returns 1 if locks were dropped and rechecks possibly needed, 0 otherwise.
++ * Called under dcache_lock and inode_qmblk locks.
++ * @qlnk will be destroyed in the caller chain.
++ *
++ * It is not mandatory to restart parent checks since quota on/off currently
++ * shrinks the dentry tree and checks that there are no outside references.
++ * But if at some point that shrink is removed, restarts will be required.
++ * Additionally, the restarts prevent inconsistencies if the dentry tree
++ * changes (inode is moved). This is not a big deal, but anyway...
++ */
++static int vzquota_inode_qmblk_set(struct inode *inode,
++ struct vz_quota_master *qmblk,
++ struct vz_quota_ilink *qlnk)
++{
++ if (qmblk == NULL) {
++ printk(KERN_ERR "VZDQ: NULL in set, "
++ "orig %u, dev %s, inode %lu, fs %s\n",
++ INODE_QLNK(inode)->origin,
++ inode->i_sb->s_id, inode->i_ino,
++ inode->i_sb->s_type->name);
++ printk(KERN_ERR "current %d (%s), VE %d\n",
++ current->pid, current->comm,
++ VEID(get_exec_env()));
++ dump_stack();
++ qmblk = VZ_QUOTA_BAD;
++ }
++ while (1) {
++ if (vzquota_qlnk_is_empty(qlnk) &&
++ vzquota_qlnk_fill(qlnk, inode, qmblk))
++ return 1;
++ if (qlnk->qmblk == qmblk)
++ break;
++ if (vzquota_qlnk_reinit_locked(qlnk, inode))
++ return 1;
++ }
++ vzquota_qlnk_swap(qlnk, INODE_QLNK(inode));
++ INODE_QLNK(inode)->origin = VZ_QUOTAO_QSET;
++ return 0;
++}
++
++
++/* ----------------------------------------------------------------------
++ *
++ * vzquota_inode_qmblk (inode -> qmblk lookup) parts
++ *
++ * --------------------------------------------------------------------- */
++
++static int vzquota_dparents_check_attach(struct inode *inode)
++{
++ if (!list_empty(&inode->i_dentry))
++ return 0;
++ printk(KERN_ERR "VZDQ: no parent for "
++ "dev %s, inode %lu, fs %s\n",
++ inode->i_sb->s_id,
++ inode->i_ino,
++ inode->i_sb->s_type->name);
++ return -1;
++}
++
++static struct inode *vzquota_dparents_check_actual(struct inode *inode)
++{
++ struct dentry *de;
++
++ list_for_each_entry(de, &inode->i_dentry, d_alias) {
++ if (de->d_parent == de) /* detached dentry, perhaps */
++ continue;
++ /* first access to parent, make sure its qlnk initialized */
++ __vzquota_inode_init(de->d_parent->d_inode, VZ_QUOTAO_ACT);
++ if (!VZ_QUOTA_IS_ACTUAL(de->d_parent->d_inode))
++ return de->d_parent->d_inode;
++ }
++ return NULL;
++}
++
++static struct vz_quota_master *vzquota_dparents_check_same(struct inode *inode)
++{
++ struct dentry *de;
++ struct vz_quota_master *qmblk;
++
++ qmblk = NULL;
++ list_for_each_entry(de, &inode->i_dentry, d_alias) {
++ if (de->d_parent == de) /* detached dentry, perhaps */
++ continue;
++ if (qmblk == NULL) {
++ qmblk = INODE_QLNK(de->d_parent->d_inode)->qmblk;
++ continue;
++ }
++ if (INODE_QLNK(de->d_parent->d_inode)->qmblk != qmblk) {
++ printk(KERN_WARNING "VZDQ: multiple quotas for "
++ "dev %s, inode %lu, fs %s\n",
++ inode->i_sb->s_id,
++ inode->i_ino,
++ inode->i_sb->s_type->name);
++ qmblk = VZ_QUOTA_BAD;
++ break;
++ }
++ }
++ if (qmblk == NULL) {
++ printk(KERN_WARNING "VZDQ: not attached to tree, "
++ "dev %s, inode %lu, fs %s\n",
++ inode->i_sb->s_id,
++ inode->i_ino,
++ inode->i_sb->s_type->name);
++ qmblk = VZ_QUOTA_BAD;
++ }
++ return qmblk;
++}
++
++static void vzquota_dbranch_actualize(struct inode *inode,
++ struct inode *refinode)
++{
++ struct inode *pinode;
++ struct vz_quota_master *qmblk;
++ struct vz_quota_ilink qlnk;
++
++ vzquota_qlnk_init(&qlnk);
++
++start:
++ if (inode == inode->i_sb->s_root->d_inode) {
++ /* filesystem root */
++ atomic_inc(&inode->i_count);
++ do {
++ qmblk = __VZ_QUOTA_NOQUOTA(inode->i_sb);
++ } while (vzquota_inode_qmblk_set(inode, qmblk, &qlnk));
++ goto out;
++ }
++
++ if (!vzquota_dparents_check_attach(inode)) {
++ pinode = vzquota_dparents_check_actual(inode);
++ if (pinode != NULL) {
++ inode = pinode;
++ goto start;
++ }
++ }
++
++ atomic_inc(&inode->i_count);
++ while (1) {
++ if (VZ_QUOTA_IS_ACTUAL(inode)) /* actualized without us */
++ break;
++ /*
++ * Need to check parents again if we have slept inside
++ * vzquota_inode_qmblk_set() in the loop.
++ * If the state of parents is different, just return and repeat
++ * the actualizing process again from the inode passed to
++ * vzquota_inode_qmblk_recalc().
++ */
++ if (!vzquota_dparents_check_attach(inode)) {
++ if (vzquota_dparents_check_actual(inode) != NULL)
++ break;
++ qmblk = vzquota_dparents_check_same(inode);
++ } else
++ qmblk = VZ_QUOTA_BAD;
++ if (!vzquota_inode_qmblk_set(inode, qmblk, &qlnk)){/* success */
++ INODE_QLNK(inode)->origin = VZ_QUOTAO_ACT;
++ break;
++ }
++ }
++
++out:
++ spin_unlock(&dcache_lock);
++ inode_qmblk_unlock(refinode->i_sb);
++ vzquota_qlnk_destroy(&qlnk);
++ iput(inode);
++ inode_qmblk_lock(refinode->i_sb);
++ spin_lock(&dcache_lock);
++}
++
++static void vzquota_dtree_qmblk_recalc(struct inode *inode,
++ struct vz_quota_ilink *qlnk)
++{
++ struct inode *pinode;
++ struct vz_quota_master *qmblk;
++
++ if (inode == inode->i_sb->s_root->d_inode) {
++ /* filesystem root */
++ do {
++ qmblk = __VZ_QUOTA_NOQUOTA(inode->i_sb);
++ } while (vzquota_inode_qmblk_set(inode, qmblk, qlnk));
++ return;
++ }
++
++start:
++ if (VZ_QUOTA_IS_ACTUAL(inode))
++ return;
++ /*
++ * Here qmblk is (re-)initialized for all ancestors.
++ * This is not a very efficient procedure, but it guarantees that
++ * the quota tree is consistent (that is, the inode doesn't have two
++ * ancestors with different qmblk).
++ */
++ if (!vzquota_dparents_check_attach(inode)) {
++ pinode = vzquota_dparents_check_actual(inode);
++ if (pinode != NULL) {
++ vzquota_dbranch_actualize(pinode, inode);
++ goto start;
++ }
++ qmblk = vzquota_dparents_check_same(inode);
++ } else
++ qmblk = VZ_QUOTA_BAD;
++
++ if (vzquota_inode_qmblk_set(inode, qmblk, qlnk))
++ goto start;
++ INODE_QLNK(inode)->origin = VZ_QUOTAO_DTREE;
++}
++
++static void vzquota_det_qmblk_recalc(struct inode *inode,
++ struct vz_quota_ilink *qlnk)
++{
++ struct inode *parent;
++ struct vz_quota_master *qmblk;
++ char *msg;
++ int cnt;
++ time_t timeout;
++
++ cnt = 0;
++ parent = NULL;
++start:
++ /*
++ * qmblk of detached inodes shouldn't be considered as not actual.
++ * They are not in any dentry tree, so quota on/off shouldn't affect
++ * them.
++ */
++ if (!vzquota_qlnk_is_empty(INODE_QLNK(inode)))
++ return;
++
++ timeout = 3;
++ qmblk = __VZ_QUOTA_NOQUOTA(inode->i_sb);
++ msg = "detached inode not in creation";
++ if (inode->i_op != VZ_QUOTA_EMPTY_IOPS)
++ goto fail;
++ qmblk = VZ_QUOTA_BAD;
++ msg = "unexpected creation context";
++ if (!vzquota_cur_qmblk_check())
++ goto fail;
++ timeout = 0;
++ parent = vzquota_cur_qmblk_fetch();
++ msg = "uninitialized parent";
++ if (vzquota_qlnk_is_empty(INODE_QLNK(parent)))
++ goto fail;
++ msg = "parent not in tree";
++ if (list_empty(&parent->i_dentry))
++ goto fail;
++ msg = "parent has 0 refcount";
++ if (!atomic_read(&parent->i_count))
++ goto fail;
++ msg = "parent has different sb";
++ if (parent->i_sb != inode->i_sb)
++ goto fail;
++ if (!VZ_QUOTA_IS_ACTUAL(parent)) {
++ vzquota_dbranch_actualize(parent, inode);
++ goto start;
++ }
++
++ qmblk = INODE_QLNK(parent)->qmblk;
++set:
++ if (vzquota_inode_qmblk_set(inode, qmblk, qlnk))
++ goto start;
++ INODE_QLNK(inode)->origin = VZ_QUOTAO_DET;
++ return;
++
++fail:
++ {
++ struct timeval tv, tvo;
++ do_gettimeofday(&tv);
++ memcpy(&tvo, __VZ_QUOTA_TSTAMP(inode->i_sb), sizeof(tvo));
++ tv.tv_sec -= tvo.tv_sec;
++ if (tv.tv_usec < tvo.tv_usec) {
++ tv.tv_sec--;
++ tv.tv_usec += USEC_PER_SEC - tvo.tv_usec;
++ } else
++ tv.tv_usec -= tvo.tv_usec;
++ if (tv.tv_sec < timeout)
++ goto set;
++ printk(KERN_ERR "VZDQ: %s, orig %u,"
++ " dev %s, inode %lu, fs %s\n",
++ msg, INODE_QLNK(inode)->origin,
++ inode->i_sb->s_id, inode->i_ino,
++ inode->i_sb->s_type->name);
++ if (!cnt++) {
++ printk(KERN_ERR "current %d (%s), VE %d,"
++ " time %ld.%06ld\n",
++ current->pid, current->comm,
++ VEID(get_exec_env()),
++ tv.tv_sec, tv.tv_usec);
++ dump_stack();
++ }
++ if (parent != NULL)
++ printk(KERN_ERR "VZDQ: parent of %lu is %lu\n",
++ inode->i_ino, parent->i_ino);
++ }
++ goto set;
++}
++
++static void vzquota_inode_qmblk_recalc(struct inode *inode,
++ struct vz_quota_ilink *qlnk)
++{
++ spin_lock(&dcache_lock);
++ if (!list_empty(&inode->i_dentry))
++ vzquota_dtree_qmblk_recalc(inode, qlnk);
++ else
++ vzquota_det_qmblk_recalc(inode, qlnk);
++ spin_unlock(&dcache_lock);
++}
++
++/**
++ * vzquota_inode_qmblk - obtain inode's qmblk
++ *
++ * Returns qmblk with refcounter taken, %NULL if not under
++ * VZ quota or %VZ_QUOTA_BAD.
++ *
++ * FIXME: This function should be removed when vzquota_find_qmblk /
++ * get_quota_root / vzquota_dstat code is cleaned up.
++ */
++struct vz_quota_master *vzquota_inode_qmblk(struct inode *inode)
++{
++ struct vz_quota_master *qmblk;
++ struct vz_quota_ilink qlnk;
++
++ might_sleep();
++
++ if (inode->i_sb->dq_op != &vz_quota_operations)
++ return NULL;
++#if defined(VZ_QUOTA_UNLOAD)
++#error Make sure qmblk does not disappear
++#endif
++
++ vzquota_qlnk_init(&qlnk);
++ inode_qmblk_lock(inode->i_sb);
++ __vzquota_inode_init(inode, VZ_QUOTAO_INICAL);
++
++ if (vzquota_qlnk_is_empty(INODE_QLNK(inode)) ||
++ !VZ_QUOTA_IS_ACTUAL(inode))
++ vzquota_inode_qmblk_recalc(inode, &qlnk);
++
++ qmblk = INODE_QLNK(inode)->qmblk;
++ if (qmblk != VZ_QUOTA_BAD) {
++ if (!VZ_QUOTA_IS_NOQUOTA(qmblk, inode->i_sb))
++ qmblk_get(qmblk);
++ else
++ qmblk = NULL;
++ }
++
++ inode_qmblk_unlock(inode->i_sb);
++ vzquota_qlnk_destroy(&qlnk);
++ return qmblk;
++}
++
++
++/* ----------------------------------------------------------------------
++ *
++ * Calls from quota operations
++ *
++ * --------------------------------------------------------------------- */
++
++/**
++ * vzquota_inode_init_call - call from DQUOT_INIT
++ */
++void vzquota_inode_init_call(struct inode *inode)
++{
++ struct vz_quota_master *qmblk;
++ struct vz_quota_datast data;
++
++ /* initializes inode's quota inside */
++ qmblk = vzquota_inode_data(inode, &data);
++ if (qmblk != NULL && qmblk != VZ_QUOTA_BAD)
++ vzquota_data_unlock(inode, &data);
++
++ /*
++ * The check is needed for repeated new_inode() calls from a single
++ * ext3 call like create or mkdir in case of -ENOSPC.
++ */
++ spin_lock(&dcache_lock);
++ if (!list_empty(&inode->i_dentry))
++ vzquota_cur_qmblk_set(inode);
++ spin_unlock(&dcache_lock);
++}
++
++/**
++ * vzquota_inode_drop_call - call from DQUOT_DROP
++ */
++void vzquota_inode_drop_call(struct inode *inode)
++{
++ vzquota_inode_drop(inode);
++}
++
++/**
++ * vzquota_inode_data - initialize (if necessary) and lock inode quota ptrs
++ * @inode: the inode
++ * @data: storage space
++ *
++ * Returns: qmblk is NULL or VZ_QUOTA_BAD or actualized qmblk.
++ * On return if qmblk is neither NULL nor VZ_QUOTA_BAD:
++ * qmblk in inode's qlnk is the same as returned,
++ * ugid pointers inside inode's qlnk are valid,
++ * some locks are taken (and should be released by vzquota_data_unlock).
++ * If qmblk is NULL or VZ_QUOTA_BAD, locks are NOT taken.
++ */
++struct vz_quota_master *vzquota_inode_data(struct inode *inode,
++ struct vz_quota_datast *data)
++{
++ struct vz_quota_master *qmblk;
++
++ might_sleep();
++
++ vzquota_qlnk_init(&data->qlnk);
++ inode_qmblk_lock(inode->i_sb);
++ __vzquota_inode_init(inode, VZ_QUOTAO_INICAL);
++
++ if (vzquota_qlnk_is_empty(INODE_QLNK(inode)) ||
++ !VZ_QUOTA_IS_ACTUAL(inode))
++ vzquota_inode_qmblk_recalc(inode, &data->qlnk);
++
++ qmblk = INODE_QLNK(inode)->qmblk;
++ if (qmblk != VZ_QUOTA_BAD) {
++ if (!VZ_QUOTA_IS_NOQUOTA(qmblk, inode->i_sb)) {
++ /*
++ * Note that in the current implementation,
++ * inode_qmblk_lock can theoretically be dropped here.
++ * This place is serialized with quota_off because
++ * quota_off fails when there are extra dentry
++ * references and syncs inodes before removing quota
++ * information from them.
++ * However, quota usage information should stop being
++ * updated immediately after vzquota_off.
++ */
++ qmblk_data_write_lock(qmblk);
++ } else {
++ inode_qmblk_unlock(inode->i_sb);
++ qmblk = NULL;
++ }
++ } else {
++ inode_qmblk_unlock(inode->i_sb);
++ }
++ return qmblk;
++}
++
++void vzquota_data_unlock(struct inode *inode,
++ struct vz_quota_datast *data)
++{
++ qmblk_data_write_unlock(INODE_QLNK(inode)->qmblk);
++ inode_qmblk_unlock(inode->i_sb);
++ vzquota_qlnk_destroy(&data->qlnk);
++}
++
++#if defined(CONFIG_VZ_QUOTA_UGID)
++/**
++ * vzquota_inode_transfer_call - call from vzquota_transfer
++ */
++int vzquota_inode_transfer_call(struct inode *inode, struct iattr *iattr)
++{
++ struct vz_quota_master *qmblk;
++ struct vz_quota_datast data;
++ struct vz_quota_ilink qlnew;
++ int mask;
++ int ret;
++
++ might_sleep();
++ vzquota_qlnk_init(&qlnew);
++start:
++ qmblk = vzquota_inode_data(inode, &data);
++ ret = NO_QUOTA;
++ if (qmblk == VZ_QUOTA_BAD)
++ goto out_destr;
++ ret = QUOTA_OK;
++ if (qmblk == NULL)
++ goto out_destr;
++ qmblk_get(qmblk);
++
++ ret = QUOTA_OK;
++ if (!(qmblk->dq_flags & VZDQUG_ON))
++ /* no ugid quotas */
++ goto out_unlock;
++
++ mask = 0;
++ if ((iattr->ia_valid & ATTR_UID) && iattr->ia_uid != inode->i_uid)
++ mask |= 1 << USRQUOTA;
++ if ((iattr->ia_valid & ATTR_GID) && iattr->ia_gid != inode->i_gid)
++ mask |= 1 << GRPQUOTA;
++ while (1) {
++ if (vzquota_qlnk_is_empty(&qlnew) &&
++ vzquota_qlnk_fill_attr(&qlnew, inode, iattr, mask, qmblk))
++ break;
++ if (qlnew.qmblk == INODE_QLNK(inode)->qmblk &&
++ qlnew.qmblk == qmblk)
++ goto finish;
++ if (vzquota_qlnk_reinit_attr(&qlnew, inode, qmblk))
++ break;
++ }
++
++ /* prepare for restart */
++ vzquota_data_unlock(inode, &data);
++ qmblk_put(qmblk);
++ goto start;
++
++finish:
++ /* all references obtained successfully */
++ ret = vzquota_transfer_usage(inode, mask, &qlnew);
++ if (!ret) {
++ vzquota_qlnk_swap(&qlnew, INODE_QLNK(inode));
++ INODE_QLNK(inode)->origin = VZ_QUOTAO_TRANS;
++ }
++out_unlock:
++ vzquota_data_unlock(inode, &data);
++ qmblk_put(qmblk);
++out_destr:
++ vzquota_qlnk_destroy(&qlnew);
++ return ret;
++}
++#endif
++
++int vzquota_rename_check(struct inode *inode,
++ struct inode *old_dir, struct inode *new_dir)
++{
++ struct vz_quota_master *qmblk;
++ struct vz_quota_ilink qlnk1, qlnk2;
++ int c, ret;
++
++ if (inode->i_sb != old_dir->i_sb || inode->i_sb != new_dir->i_sb)
++ return -1;
++
++ might_sleep();
++
++ vzquota_qlnk_init(&qlnk1);
++ vzquota_qlnk_init(&qlnk2);
++ inode_qmblk_lock(inode->i_sb);
++ __vzquota_inode_init(inode, VZ_QUOTAO_INICAL);
++ __vzquota_inode_init(old_dir, VZ_QUOTAO_INICAL);
++ __vzquota_inode_init(new_dir, VZ_QUOTAO_INICAL);
++
++ do {
++ c = 0;
++ if (vzquota_qlnk_is_empty(INODE_QLNK(inode)) ||
++ !VZ_QUOTA_IS_ACTUAL(inode)) {
++ vzquota_inode_qmblk_recalc(inode, &qlnk1);
++ c++;
++ }
++ if (vzquota_qlnk_is_empty(INODE_QLNK(new_dir)) ||
++ !VZ_QUOTA_IS_ACTUAL(new_dir)) {
++ vzquota_inode_qmblk_recalc(new_dir, &qlnk2);
++ c++;
++ }
++ } while (c);
++
++ ret = 0;
++ qmblk = INODE_QLNK(inode)->qmblk;
++ if (qmblk != INODE_QLNK(new_dir)->qmblk) {
++ ret = -1;
++ if (qmblk != VZ_QUOTA_BAD &&
++ !VZ_QUOTA_IS_NOQUOTA(qmblk, inode->i_sb) &&
++ qmblk->dq_root_dentry->d_inode == inode &&
++ VZ_QUOTA_IS_NOQUOTA(INODE_QLNK(new_dir)->qmblk,
++ inode->i_sb) &&
++ VZ_QUOTA_IS_NOQUOTA(INODE_QLNK(old_dir)->qmblk,
++ inode->i_sb))
++ /* quota root rename is allowed */
++ ret = 0;
++ }
++
++ inode_qmblk_unlock(inode->i_sb);
++ vzquota_qlnk_destroy(&qlnk2);
++ vzquota_qlnk_destroy(&qlnk1);
++ return ret;
++}
++
++
++/* ----------------------------------------------------------------------
++ *
++ * qmblk-related parts of on/off operations
++ *
++ * --------------------------------------------------------------------- */
++
++/**
++ * vzquota_check_dtree - check dentry tree if quota on/off is allowed
++ *
++ * This function doesn't allow quota to be turned on/off if some dentries in
++ * the tree have external references.
++ * In addition to technical reasons, it enforces user-space correctness:
++ * current usage (taken from or reported to the user space) can be meaningful
++ * and accurate only if the tree is not being modified.
++ * Side effect: additional vfsmount structures referencing the tree (bind
++ * mounts of tree nodes to some other places) are not allowed at on/off time.
++ */
++int vzquota_check_dtree(struct vz_quota_master *qmblk, int off)
++{
++ struct dentry *dentry;
++ int err, count;
++
++ err = -EBUSY;
++ dentry = qmblk->dq_root_dentry;
++
++ if (d_unhashed(dentry) && dentry != dentry->d_sb->s_root)
++ goto unhashed;
++
++ /* attempt to shrink */
++ if (!list_empty(&dentry->d_subdirs)) {
++ spin_unlock(&dcache_lock);
++ inode_qmblk_unlock(dentry->d_sb);
++ shrink_dcache_parent(dentry);
++ inode_qmblk_lock(dentry->d_sb);
++ spin_lock(&dcache_lock);
++ if (!list_empty(&dentry->d_subdirs))
++ goto out;
++
++ count = 1;
++ if (dentry == dentry->d_sb->s_root)
++ count += 2; /* sb and mnt refs */
++ if (atomic_read(&dentry->d_count) < count) {
++ printk(KERN_ERR "%s: too small count %d vs %d.\n",
++ __FUNCTION__,
++ atomic_read(&dentry->d_count), count);
++ goto out;
++ }
++ if (atomic_read(&dentry->d_count) > count)
++ goto out;
++ }
++
++ err = 0;
++out:
++ return err;
++
++unhashed:
++ /*
++ * Quota root is removed.
++ * Allow to turn quota off, but not on.
++ */
++ if (off)
++ err = 0;
++ goto out;
++}
++
++int vzquota_on_qmblk(struct super_block *sb, struct inode *inode,
++ struct vz_quota_master *qmblk)
++{
++ struct vz_quota_ilink qlnk;
++ struct vz_quota_master *qold, *qnew;
++ int err;
++
++ might_sleep();
++
++ qold = NULL;
++ qnew = vzquota_alloc_fake();
++ if (qnew == NULL)
++ return -ENOMEM;
++
++ vzquota_qlnk_init(&qlnk);
++ inode_qmblk_lock(sb);
++ __vzquota_inode_init(inode, VZ_QUOTAO_INICAL);
++
++ spin_lock(&dcache_lock);
++ while (1) {
++ err = vzquota_check_dtree(qmblk, 0);
++ if (err)
++ break;
++ if (!vzquota_inode_qmblk_set(inode, qmblk, &qlnk))
++ break;
++ }
++ INODE_QLNK(inode)->origin = VZ_QUOTAO_ON;
++ spin_unlock(&dcache_lock);
++
++ if (!err) {
++ qold = __VZ_QUOTA_NOQUOTA(sb);
++ qold->dq_flags |= VZDQ_NOACT;
++ __VZ_QUOTA_NOQUOTA(sb) = qnew;
++ }
++
++ inode_qmblk_unlock(sb);
++ vzquota_qlnk_destroy(&qlnk);
++ if (qold != NULL)
++ qmblk_put(qold);
++
++ return err;
++}
++
++int vzquota_off_qmblk(struct super_block *sb, struct vz_quota_master *qmblk)
++{
++ int ret;
++
++ ret = 0;
++ inode_qmblk_lock(sb);
++
++ spin_lock(&dcache_lock);
++ if (vzquota_check_dtree(qmblk, 1))
++ ret = -EBUSY;
++ spin_unlock(&dcache_lock);
++
++ if (!ret)
++ qmblk->dq_flags |= VZDQ_NOACT | VZDQ_NOQUOT;
++ inode_qmblk_unlock(sb);
++ return ret;
++}
++
++
++/* ----------------------------------------------------------------------
++ *
++ * External interfaces
++ *
++ * ---------------------------------------------------------------------*/
++
++static int vzquota_ioctl(struct inode *ino, struct file *file,
++ unsigned int cmd, unsigned long arg)
++{
++ int err;
++ struct vzctl_quotactl qb;
++ struct vzctl_quotaugidctl qub;
++
++ switch (cmd) {
++ case VZCTL_QUOTA_CTL:
++ err = -ENOTTY;
++ break;
++ case VZCTL_QUOTA_NEW_CTL:
++ err = -EFAULT;
++ if (copy_from_user(&qb, (void *)arg, sizeof(qb)))
++ break;
++ err = do_vzquotactl(qb.cmd, qb.quota_id,
++ qb.qstat, qb.ve_root);
++ break;
++#ifdef CONFIG_VZ_QUOTA_UGID
++ case VZCTL_QUOTA_UGID_CTL:
++ err = -EFAULT;
++ if (copy_from_user(&qub, (void *)arg, sizeof(qub)))
++ break;
++ err = do_vzquotaugidctl(&qub);
++ break;
++#endif
++ default:
++ err = -ENOTTY;
++ }
++ might_sleep(); /* debug */
++ return err;
++}
++
++static struct vzioctlinfo vzdqcalls = {
++ .type = VZDQCTLTYPE,
++ .func = vzquota_ioctl,
++ .owner = THIS_MODULE,
++};
++
++/**
++ * vzquota_dstat - get quota usage info for virtual superblock
++ */
++static int vzquota_dstat(struct super_block *super, struct dq_stat *qstat)
++{
++ struct vz_quota_master *qmblk;
++
++ qmblk = vzquota_find_qmblk(super);
++ if (qmblk == NULL)
++ return -ENOENT;
++ if (qmblk == VZ_QUOTA_BAD) {
++ memset(qstat, 0, sizeof(*qstat));
++ return 0;
++ }
++
++ qmblk_data_read_lock(qmblk);
++ memcpy(qstat, &qmblk->dq_stat, sizeof(*qstat));
++ qmblk_data_read_unlock(qmblk);
++ qmblk_put(qmblk);
++ return 0;
++}
++
++
++/* ----------------------------------------------------------------------
++ *
++ * Init/exit helpers
++ *
++ * ---------------------------------------------------------------------*/
++
++static int vzquota_cache_init(void)
++{
++ int i;
++
++ vzquota_cachep = kmem_cache_create("vz_quota_master",
++ sizeof(struct vz_quota_master),
++ 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
++ if (vzquota_cachep == NULL) {
++ printk(KERN_ERR "Cannot create VZ_QUOTA SLAB cache\n");
++ goto nomem2;
++ }
++ for (i = 0; i < VZ_QUOTA_HASH_SIZE; i++)
++ INIT_LIST_HEAD(&vzquota_hash_table[i]);
++
++ return 0;
++
++nomem2:
++ return -ENOMEM;
++}
++
++static void vzquota_cache_release(void)
++{
++ int i;
++
++ /* sanity check */
++ for (i = 0; i < VZ_QUOTA_HASH_SIZE; i++)
++ if (!list_empty(&vzquota_hash_table[i]))
++ BUG();
++
++ /* release caches */
++ if (kmem_cache_destroy(vzquota_cachep))
++ printk(KERN_ERR
++ "VZQUOTA: vz_quota_master kmem_cache_destroy failed\n");
++ vzquota_cachep = NULL;
++}
++
++static int quota_notifier_call(struct vnotifier_block *self,
++ unsigned long n, void *data, int err)
++{
++ struct virt_info_quota *viq;
++ struct super_block *sb;
++
++ viq = (struct virt_info_quota *)data;
++ switch (n) {
++ case VIRTINFO_QUOTA_ON:
++ err = NOTIFY_BAD;
++ if (!try_module_get(THIS_MODULE))
++ break;
++ sb = viq->super;
++ memset(&sb->s_dquot.info, 0, sizeof(sb->s_dquot.info));
++ INIT_LIST_HEAD(&sb->s_dquot.info[USRQUOTA].dqi_dirty_list);
++ INIT_LIST_HEAD(&sb->s_dquot.info[GRPQUOTA].dqi_dirty_list);
++ err = NOTIFY_OK;
++ break;
++ case VIRTINFO_QUOTA_OFF:
++ module_put(THIS_MODULE);
++ err = NOTIFY_OK;
++ break;
++ case VIRTINFO_QUOTA_GETSTAT:
++ err = NOTIFY_BAD;
++ if (vzquota_dstat(viq->super, viq->qstat))
++ break;
++ err = NOTIFY_OK;
++ break;
++ }
++ return err;
++}
++
++struct vnotifier_block quota_notifier_block = {
++ .notifier_call = quota_notifier_call,
++ .priority = INT_MAX,
++};
++
++/* ----------------------------------------------------------------------
++ *
++ * Init/exit procedures
++ *
++ * ---------------------------------------------------------------------*/
++
++static int __init vzquota_init(void)
++{
++ int err;
++
++ if ((err = vzquota_cache_init()) != 0)
++ goto out_cache;
++
++ if ((err = vzquota_proc_init()) != 0)
++ goto out_proc;
++
++#ifdef CONFIG_VZ_QUOTA_UGID
++ if ((err = vzquota_ugid_init()) != 0)
++ goto out_ugid;
++#endif
++
++ init_MUTEX(&vz_quota_sem);
++ vzioctl_register(&vzdqcalls);
++ virtinfo_notifier_register(VITYPE_QUOTA, &quota_notifier_block);
++#if defined(CONFIG_VZ_QUOTA_UGID) && defined(CONFIG_PROC_FS)
++ vzaquota_init();
++#endif
++
++ return 0;
++
++#ifdef CONFIG_VZ_QUOTA_UGID
++out_ugid:
++ vzquota_proc_release();
++#endif
++out_proc:
++ vzquota_cache_release();
++out_cache:
++ return err;
++}
++
++#if defined(VZ_QUOTA_UNLOAD)
++static void __exit vzquota_release(void)
++{
++ virtinfo_notifier_unregister(VITYPE_QUOTA, &quota_notifier_block);
++ vzioctl_unregister(&vzdqcalls);
++#ifdef CONFIG_VZ_QUOTA_UGID
++#ifdef CONFIG_PROC_FS
++ vzaquota_fini();
++#endif
++ vzquota_ugid_release();
++#endif
++ vzquota_proc_release();
++ vzquota_cache_release();
++}
++#endif
++
++MODULE_AUTHOR("SWsoft <info@sw-soft.com>");
++MODULE_DESCRIPTION("Virtuozzo Disk Quota");
++MODULE_LICENSE("GPL v2");
++
++module_init(vzquota_init)
++#if defined(VZ_QUOTA_UNLOAD)
++module_exit(vzquota_release)
++#endif
+diff -upr linux-2.6.16.orig/fs/xattr.c linux-2.6.16-026test015/fs/xattr.c
+--- linux-2.6.16.orig/fs/xattr.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/xattr.c 2006-07-04 14:41:37.000000000 +0400
+@@ -58,7 +58,7 @@ xattr_permission(struct inode *inode, co
+ return -EPERM;
+ }
+
+- return permission(inode, mask, NULL);
++ return permission(inode, mask, NULL, NULL);
+ }
+
+ int
+diff -upr linux-2.6.16.orig/fs/xfs/linux-2.6/xfs_aops.c linux-2.6.16-026test015/fs/xfs/linux-2.6/xfs_aops.c
+--- linux-2.6.16.orig/fs/xfs/linux-2.6/xfs_aops.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/xfs/linux-2.6/xfs_aops.c 2006-07-04 14:41:36.000000000 +0400
+@@ -616,7 +616,7 @@ xfs_is_delayed_page(
+ acceptable = (type == IOMAP_UNWRITTEN);
+ else if (buffer_delay(bh))
+ acceptable = (type == IOMAP_DELAY);
+- else if (buffer_mapped(bh))
++ else if (buffer_dirty(bh) && buffer_mapped(bh))
+ acceptable = (type == 0);
+ else
+ break;
+diff -upr linux-2.6.16.orig/fs/xfs/linux-2.6/xfs_iops.c linux-2.6.16-026test015/fs/xfs/linux-2.6/xfs_iops.c
+--- linux-2.6.16.orig/fs/xfs/linux-2.6/xfs_iops.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/fs/xfs/linux-2.6/xfs_iops.c 2006-07-04 14:41:37.000000000 +0400
+@@ -615,7 +615,8 @@ STATIC int
+ linvfs_permission(
+ struct inode *inode,
+ int mode,
+- struct nameidata *nd)
++ struct nameidata *nd,
++ struct exec_perm *perm)
+ {
+ vnode_t *vp = LINVFS_GET_VP(inode);
+ int error;
+@@ -673,8 +674,7 @@ linvfs_setattr(
+ if (ia_valid & ATTR_ATIME) {
+ vattr.va_mask |= XFS_AT_ATIME;
+ vattr.va_atime = attr->ia_atime;
+- if (ia_valid & ATTR_ATIME_SET)
+- inode->i_atime = attr->ia_atime;
++ inode->i_atime = attr->ia_atime;
+ }
+ if (ia_valid & ATTR_MTIME) {
+ vattr.va_mask |= XFS_AT_MTIME;
+diff -upr linux-2.6.16.orig/include/asm-arm26/tlbflush.h linux-2.6.16-026test015/include/asm-arm26/tlbflush.h
+--- linux-2.6.16.orig/include/asm-arm26/tlbflush.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/asm-arm26/tlbflush.h 2006-07-04 14:41:38.000000000 +0400
+@@ -25,7 +25,7 @@ static inline void memc_update_all(void)
+ {
+ struct task_struct *p;
+ cpu_memc_update_all(init_mm.pgd);
+- for_each_process(p) {
++ for_each_process_all(p) {
+ if (!p->mm)
+ continue;
+ cpu_memc_update_all(p->mm->pgd);
+diff -upr linux-2.6.16.orig/include/asm-generic/atomic.h linux-2.6.16-026test015/include/asm-generic/atomic.h
+--- linux-2.6.16.orig/include/asm-generic/atomic.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/asm-generic/atomic.h 2006-07-04 14:41:37.000000000 +0400
+@@ -66,6 +66,13 @@ static inline void atomic_long_sub(long
+ atomic64_sub(i, v);
+ }
+
++static inline int atomic_long_add_negative(long i, atomic_long_t *l)
++{
++ atomic64_t *v = (atomic64_t *)l;
++
++ return atomic64_add_negative(i, v);
++}
++
+ #else
+
+ typedef atomic_t atomic_long_t;
+@@ -113,5 +120,12 @@ static inline void atomic_long_sub(long
+ atomic_sub(i, v);
+ }
+
++static inline int atomic_long_add_negative(long i, atomic_long_t *l)
++{
++ atomic_t *v = (atomic_t *)l;
++
++ return atomic_add_negative(i, v);
++}
++
+ #endif
+ #endif
+diff -upr linux-2.6.16.orig/include/asm-generic/pgtable.h linux-2.6.16-026test015/include/asm-generic/pgtable.h
+--- linux-2.6.16.orig/include/asm-generic/pgtable.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/asm-generic/pgtable.h 2006-07-04 14:41:36.000000000 +0400
+@@ -159,17 +159,8 @@ static inline void ptep_set_wrprotect(st
+ #define lazy_mmu_prot_update(pte) do { } while (0)
+ #endif
+
+-#ifndef __HAVE_ARCH_MULTIPLE_ZERO_PAGE
++#ifndef __HAVE_ARCH_MOVE_PTE
+ #define move_pte(pte, prot, old_addr, new_addr) (pte)
+-#else
+-#define move_pte(pte, prot, old_addr, new_addr) \
+-({ \
+- pte_t newpte = (pte); \
+- if (pte_present(pte) && pfn_valid(pte_pfn(pte)) && \
+- pte_page(pte) == ZERO_PAGE(old_addr)) \
+- newpte = mk_pte(ZERO_PAGE(new_addr), (prot)); \
+- newpte; \
+-})
+ #endif
+
+ /*
+diff -upr linux-2.6.16.orig/include/asm-i386/bug.h linux-2.6.16-026test015/include/asm-i386/bug.h
+--- linux-2.6.16.orig/include/asm-i386/bug.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/asm-i386/bug.h 2006-07-04 14:41:37.000000000 +0400
+@@ -14,7 +14,10 @@
+ #ifdef CONFIG_DEBUG_BUGVERBOSE
+ #define BUG() \
+ __asm__ __volatile__( "ud2\n" \
++ "\t.byte 0x66\n"\
++ "\t.byte 0xb8\n" /* mov $xxx, %ax */\
+ "\t.word %c0\n" \
++ "\t.byte 0xb8\n" /* mov $xxx, %eax */\
+ "\t.long %c1\n" \
+ : : "i" (__LINE__), "i" (__FILE__))
+ #else
+diff -upr linux-2.6.16.orig/include/asm-i386/cpufeature.h linux-2.6.16-026test015/include/asm-i386/cpufeature.h
+--- linux-2.6.16.orig/include/asm-i386/cpufeature.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/asm-i386/cpufeature.h 2006-07-04 14:41:36.000000000 +0400
+@@ -70,6 +70,7 @@
+ #define X86_FEATURE_P3 (3*32+ 6) /* P3 */
+ #define X86_FEATURE_P4 (3*32+ 7) /* P4 */
+ #define X86_FEATURE_CONSTANT_TSC (3*32+ 8) /* TSC ticks at a constant rate */
++#define X86_FEATURE_FXSAVE_LEAK (3*32+10) /* FXSAVE leaks FDP/FIP/FOP */
+
+ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
+ #define X86_FEATURE_XMM3 (4*32+ 0) /* Streaming SIMD Extensions-3 */
+diff -upr linux-2.6.16.orig/include/asm-i386/elf.h linux-2.6.16-026test015/include/asm-i386/elf.h
+--- linux-2.6.16.orig/include/asm-i386/elf.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/asm-i386/elf.h 2006-07-04 14:41:39.000000000 +0400
+@@ -108,7 +108,7 @@ typedef struct user_fxsr_struct elf_fpxr
+ For the moment, we have only optimizations for the Intel generations,
+ but that could change... */
+
+-#define ELF_PLATFORM (system_utsname.machine)
++#define ELF_PLATFORM (ve_utsname.machine)
+
+ #ifdef __KERNEL__
+ #define SET_PERSONALITY(ex, ibcs2) do { } while (0)
+@@ -136,8 +136,10 @@ extern void __kernel_vsyscall;
+
+ #define ARCH_DLINFO \
+ do { \
++ if (sysctl_at_vsyscall) { \
+ NEW_AUX_ENT(AT_SYSINFO, VSYSCALL_ENTRY); \
+ NEW_AUX_ENT(AT_SYSINFO_EHDR, VSYSCALL_BASE); \
++ } \
+ } while (0)
+
+ /*
+diff -upr linux-2.6.16.orig/include/asm-i386/i387.h linux-2.6.16-026test015/include/asm-i386/i387.h
+--- linux-2.6.16.orig/include/asm-i386/i387.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/asm-i386/i387.h 2006-07-04 14:41:36.000000000 +0400
+@@ -13,6 +13,7 @@
+
+ #include <linux/sched.h>
+ #include <linux/init.h>
++#include <linux/kernel_stat.h>
+ #include <asm/processor.h>
+ #include <asm/sigcontext.h>
+ #include <asm/user.h>
+@@ -38,17 +39,38 @@ extern void init_fpu(struct task_struct
+ extern void kernel_fpu_begin(void);
+ #define kernel_fpu_end() do { stts(); preempt_enable(); } while(0)
+
++/* We need a safe address that is cheap to find and that is already
++ in L1 during context switch. The best choices are unfortunately
++ different for UP and SMP */
++#ifdef CONFIG_SMP
++#define safe_address (__per_cpu_offset[0])
++#else
++#define safe_address (kstat_cpu(0).cpustat.user)
++#endif
++
+ /*
+ * These must be called with preempt disabled
+ */
+ static inline void __save_init_fpu( struct task_struct *tsk )
+ {
++ /* Use more nops than strictly needed in case the compiler
++ varies code */
+ alternative_input(
+- "fnsave %1 ; fwait ;" GENERIC_NOP2,
+- "fxsave %1 ; fnclex",
++ "fnsave %[fx] ;fwait;" GENERIC_NOP8 GENERIC_NOP4,
++ "fxsave %[fx]\n"
++ "bt $7,%[fsw] ; jnc 1f ; fnclex\n1:",
+ X86_FEATURE_FXSR,
+- "m" (tsk->thread.i387.fxsave)
+- :"memory");
++ [fx] "m" (tsk->thread.i387.fxsave),
++ [fsw] "m" (tsk->thread.i387.fxsave.swd) : "memory");
++ /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
++ is pending. Clear the x87 state here by setting it to fixed
++ values. safe_address is an arbitrary variable that should be in L1 */
++ alternative_input(
++ GENERIC_NOP8 GENERIC_NOP2,
++ "emms\n\t" /* clear stack tags */
++ "fildl %[addr]", /* set F?P to defined value */
++ X86_FEATURE_FXSAVE_LEAK,
++ [addr] "m" (safe_address));
+ task_thread_info(tsk)->status &= ~TS_USEDFPU;
+ }
+
+diff -upr linux-2.6.16.orig/include/asm-i386/mman.h linux-2.6.16-026test015/include/asm-i386/mman.h
+--- linux-2.6.16.orig/include/asm-i386/mman.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/asm-i386/mman.h 2006-07-04 14:41:37.000000000 +0400
+@@ -10,6 +10,7 @@
+ #define MAP_NORESERVE 0x4000 /* don't check for reservations */
+ #define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */
+ #define MAP_NONBLOCK 0x10000 /* do not block on IO */
++#define MAP_EXECPRIO 0x20000 /* do soft ubc charge */
+
+ #define MCL_CURRENT 1 /* lock all current mappings */
+ #define MCL_FUTURE 2 /* lock all future mappings */
+diff -upr linux-2.6.16.orig/include/asm-i386/nmi.h linux-2.6.16-026test015/include/asm-i386/nmi.h
+--- linux-2.6.16.orig/include/asm-i386/nmi.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/asm-i386/nmi.h 2006-07-04 14:41:37.000000000 +0400
+@@ -17,6 +17,7 @@ typedef int (*nmi_callback_t)(struct pt_
+ * set. Return 1 if the NMI was handled.
+ */
+ void set_nmi_callback(nmi_callback_t callback);
++void set_nmi_ipi_callback(nmi_callback_t callback);
+
+ /**
+ * unset_nmi_callback
+@@ -24,5 +25,6 @@ void set_nmi_callback(nmi_callback_t cal
+ * Remove the handler previously set.
+ */
+ void unset_nmi_callback(void);
++void unset_nmi_ipi_callback(void);
+
+ #endif /* ASM_NMI_H */
+diff -upr linux-2.6.16.orig/include/asm-i386/pgtable-2level.h linux-2.6.16-026test015/include/asm-i386/pgtable-2level.h
+--- linux-2.6.16.orig/include/asm-i386/pgtable-2level.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/asm-i386/pgtable-2level.h 2006-07-04 14:41:36.000000000 +0400
+@@ -18,6 +18,9 @@
+ #define set_pte_atomic(pteptr, pteval) set_pte(pteptr,pteval)
+ #define set_pmd(pmdptr, pmdval) (*(pmdptr) = (pmdval))
+
++#define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
++#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
++
+ #define ptep_get_and_clear(mm,addr,xp) __pte(xchg(&(xp)->pte_low, 0))
+ #define pte_same(a, b) ((a).pte_low == (b).pte_low)
+ #define pte_page(x) pfn_to_page(pte_pfn(x))
+diff -upr linux-2.6.16.orig/include/asm-i386/pgtable-3level.h linux-2.6.16-026test015/include/asm-i386/pgtable-3level.h
+--- linux-2.6.16.orig/include/asm-i386/pgtable-3level.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/asm-i386/pgtable-3level.h 2006-07-04 14:41:36.000000000 +0400
+@@ -85,6 +85,26 @@ static inline void pud_clear (pud_t * pu
+ #define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \
+ pmd_index(address))
+
++/*
++ * For PTEs and PDEs, we must clear the P-bit first when clearing a page table
++ * entry, so clear the bottom half first and enforce ordering with a compiler
++ * barrier.
++ */
++static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
++{
++ ptep->pte_low = 0;
++ smp_wmb();
++ ptep->pte_high = 0;
++}
++
++static inline void pmd_clear(pmd_t *pmd)
++{
++ u32 *tmp = (u32 *)pmd;
++ *tmp = 0;
++ smp_wmb();
++ *(tmp + 1) = 0;
++}
++
+ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+ {
+ pte_t res;
+diff -upr linux-2.6.16.orig/include/asm-i386/pgtable.h linux-2.6.16-026test015/include/asm-i386/pgtable.h
+--- linux-2.6.16.orig/include/asm-i386/pgtable.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/asm-i386/pgtable.h 2006-07-04 14:41:36.000000000 +0400
+@@ -204,12 +204,10 @@ extern unsigned long long __PAGE_KERNEL,
+ extern unsigned long pg0[];
+
+ #define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE))
+-#define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
+
+ /* To avoid harmful races, pmd_none(x) should check only the lower when PAE */
+ #define pmd_none(x) (!(unsigned long)pmd_val(x))
+ #define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT)
+-#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
+ #define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
+
+
+@@ -269,7 +267,7 @@ static inline pte_t ptep_get_and_clear_f
+ pte_t pte;
+ if (full) {
+ pte = *ptep;
+- *ptep = __pte(0);
++ pte_clear(mm, addr, ptep);
+ } else {
+ pte = ptep_get_and_clear(mm, addr, ptep);
+ }
+diff -upr linux-2.6.16.orig/include/asm-i386/thread_info.h linux-2.6.16-026test015/include/asm-i386/thread_info.h
+--- linux-2.6.16.orig/include/asm-i386/thread_info.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/asm-i386/thread_info.h 2006-07-04 14:41:39.000000000 +0400
+@@ -101,13 +101,13 @@ register unsigned long current_stack_poi
+ ({ \
+ struct thread_info *ret; \
+ \
+- ret = kmalloc(THREAD_SIZE, GFP_KERNEL); \
++ ret = kmalloc(THREAD_SIZE, GFP_KERNEL_UBC); \
+ if (ret) \
+ memset(ret, 0, THREAD_SIZE); \
+ ret; \
+ })
+ #else
+-#define alloc_thread_info(tsk) kmalloc(THREAD_SIZE, GFP_KERNEL)
++#define alloc_thread_info(tsk) kmalloc(THREAD_SIZE, GFP_KERNEL_UBC)
+ #endif
+
+ #define free_thread_info(info) kfree(info)
+@@ -142,7 +142,8 @@ register unsigned long current_stack_poi
+ #define TIF_SECCOMP 8 /* secure computing */
+ #define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal() */
+ #define TIF_POLLING_NRFLAG 16 /* true if poll_idle() is polling TIF_NEED_RESCHED */
+-#define TIF_MEMDIE 17
++#define TIF_FREEZE 17 /* Freeze request, atomic version of PF_FREEZE */
++#define TIF_MEMDIE 18
+
+ #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE)
+ #define _TIF_NOTIFY_RESUME (1<<TIF_NOTIFY_RESUME)
+diff -upr linux-2.6.16.orig/include/asm-i386/timex.h linux-2.6.16-026test015/include/asm-i386/timex.h
+--- linux-2.6.16.orig/include/asm-i386/timex.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/asm-i386/timex.h 2006-07-04 14:41:38.000000000 +0400
+@@ -36,13 +36,17 @@ static inline cycles_t get_cycles (void)
+ {
+ unsigned long long ret=0;
+
+-#ifndef CONFIG_X86_TSC
+- if (!cpu_has_tsc)
+- return 0;
+-#endif
+-
+ #if defined(CONFIG_X86_GENERIC) || defined(CONFIG_X86_TSC)
+ rdtscll(ret);
++#elif defined(CONFIG_VE)
++ /*
++ * get_cycles is used in the following calculations:
++ * - VPS idle and iowait times in kernel/shced.h
++ * - task's sleep time to be shown with SyRq-t
++ * - kstat latencies in linux/vzstat.h
++ * - sched latency via wakeup_stamp in linux/ve_task.h
++ */
++#warning "some VPS statistics won't be correct without get_cycles() (kstat_lat, ve_idle, etc.)"
+ #endif
+ return ret;
+ }
+diff -upr linux-2.6.16.orig/include/asm-i386/unistd.h linux-2.6.16-026test015/include/asm-i386/unistd.h
+--- linux-2.6.16.orig/include/asm-i386/unistd.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/asm-i386/unistd.h 2006-07-04 14:41:39.000000000 +0400
+@@ -316,8 +316,16 @@
+ #define __NR_pselect6 308
+ #define __NR_ppoll 309
+ #define __NR_unshare 310
+-
+-#define NR_syscalls 311
++#define __NR_fairsched_mknod 500 /* FairScheduler syscalls */
++#define __NR_fairsched_rmnod 501
++#define __NR_fairsched_chwt 502
++#define __NR_fairsched_mvpr 503
++#define __NR_fairsched_rate 504
++#define __NR_getluid 510
++#define __NR_setluid 511
++#define __NR_setublimit 512
++#define __NR_ubstat 513
++#define NR_syscalls 513
+
+ /*
+ * user-visible error numbers are in the range -1 - -128: see
+diff -upr linux-2.6.16.orig/include/asm-ia64/mman.h linux-2.6.16-026test015/include/asm-ia64/mman.h
+--- linux-2.6.16.orig/include/asm-ia64/mman.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/asm-ia64/mman.h 2006-07-04 14:41:37.000000000 +0400
+@@ -18,6 +18,7 @@
+ #define MAP_NORESERVE 0x04000 /* don't check for reservations */
+ #define MAP_POPULATE 0x08000 /* populate (prefault) pagetables */
+ #define MAP_NONBLOCK 0x10000 /* do not block on IO */
++#define MAP_EXECPRIO 0x20000 /* soft ubc charge */
+
+ #define MCL_CURRENT 1 /* lock all current mappings */
+ #define MCL_FUTURE 2 /* lock all future mappings */
+diff -upr linux-2.6.16.orig/include/asm-ia64/pgalloc.h linux-2.6.16-026test015/include/asm-ia64/pgalloc.h
+--- linux-2.6.16.orig/include/asm-ia64/pgalloc.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/asm-ia64/pgalloc.h 2006-07-04 14:41:37.000000000 +0400
+@@ -20,6 +20,8 @@
+ #include <linux/page-flags.h>
+ #include <linux/threads.h>
+
++#include <ub/ub_mem.h>
++
+ #include <asm/mmu_context.h>
+
+ DECLARE_PER_CPU(unsigned long *, __pgtable_quicklist);
+@@ -38,7 +40,7 @@ static inline long pgtable_quicklist_tot
+ return ql_size;
+ }
+
+-static inline void *pgtable_quicklist_alloc(void)
++static inline void *pgtable_quicklist_alloc(int charge)
+ {
+ unsigned long *ret = NULL;
+
+@@ -46,13 +48,19 @@ static inline void *pgtable_quicklist_al
+
+ ret = pgtable_quicklist;
+ if (likely(ret != NULL)) {
++ if (ub_page_charge(virt_to_page(ret), 0,
++ charge ? __GFP_UBC|__GFP_SOFT_UBC : 0))
++ goto out;
++
+ pgtable_quicklist = (unsigned long *)(*ret);
+ ret[0] = 0;
+ --pgtable_quicklist_size;
++out:
+ preempt_enable();
+ } else {
+ preempt_enable();
+- ret = (unsigned long *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
++ ret = (unsigned long *)__get_free_page(GFP_KERNEL | __GFP_ZERO |
++ (charge ? __GFP_UBC | __GFP_SOFT_UBC : 0));
+ }
+
+ return ret;
+@@ -70,6 +78,7 @@ static inline void pgtable_quicklist_fre
+ #endif
+
+ preempt_disable();
++ ub_page_uncharge(virt_to_page(pgtable_entry), 0);
+ *(unsigned long *)pgtable_entry = (unsigned long)pgtable_quicklist;
+ pgtable_quicklist = (unsigned long *)pgtable_entry;
+ ++pgtable_quicklist_size;
+@@ -78,7 +87,7 @@ static inline void pgtable_quicklist_fre
+
+ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
+ {
+- return pgtable_quicklist_alloc();
++ return pgtable_quicklist_alloc(1);
+ }
+
+ static inline void pgd_free(pgd_t * pgd)
+@@ -95,7 +104,7 @@ pgd_populate(struct mm_struct *mm, pgd_t
+
+ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
+ {
+- return pgtable_quicklist_alloc();
++ return pgtable_quicklist_alloc(1);
+ }
+
+ static inline void pud_free(pud_t * pud)
+@@ -113,7 +122,7 @@ pud_populate(struct mm_struct *mm, pud_t
+
+ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
+ {
+- return pgtable_quicklist_alloc();
++ return pgtable_quicklist_alloc(1);
+ }
+
+ static inline void pmd_free(pmd_t * pmd)
+@@ -138,13 +147,13 @@ pmd_populate_kernel(struct mm_struct *mm
+ static inline struct page *pte_alloc_one(struct mm_struct *mm,
+ unsigned long addr)
+ {
+- return virt_to_page(pgtable_quicklist_alloc());
++ return virt_to_page(pgtable_quicklist_alloc(1));
+ }
+
+ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
+ unsigned long addr)
+ {
+- return pgtable_quicklist_alloc();
++ return pgtable_quicklist_alloc(0);
+ }
+
+ static inline void pte_free(struct page *pte)
+diff -upr linux-2.6.16.orig/include/asm-ia64/processor.h linux-2.6.16-026test015/include/asm-ia64/processor.h
+--- linux-2.6.16.orig/include/asm-ia64/processor.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/asm-ia64/processor.h 2006-07-04 14:41:38.000000000 +0400
+@@ -306,7 +306,7 @@ struct thread_struct {
+ regs->loadrs = 0; \
+ regs->r8 = current->mm->dumpable; /* set "don't zap registers" flag */ \
+ regs->r12 = new_sp - 16; /* allocate 16 byte scratch area */ \
+- if (unlikely(!current->mm->dumpable)) { \
++ if (unlikely(!current->mm->dumpable || !current->mm->vps_dumpable)) { \
+ /* \
+ * Zap scratch regs to avoid leaking bits between processes with different \
+ * uid/privileges. \
+diff -upr linux-2.6.16.orig/include/asm-ia64/thread_info.h linux-2.6.16-026test015/include/asm-ia64/thread_info.h
+--- linux-2.6.16.orig/include/asm-ia64/thread_info.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/asm-ia64/thread_info.h 2006-07-04 14:41:37.000000000 +0400
+@@ -94,6 +94,7 @@ struct thread_info {
+ #define TIF_MEMDIE 17
+ #define TIF_MCA_INIT 18 /* this task is processing MCA or INIT */
+ #define TIF_DB_DISABLED 19 /* debug trap disabled for fsyscall */
++#define TIF_FREEZE 20 /* Freeze request, atomic version of PF_FREEZE */
+
+ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
+ #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
+diff -upr linux-2.6.16.orig/include/asm-ia64/unistd.h linux-2.6.16-026test015/include/asm-ia64/unistd.h
+--- linux-2.6.16.orig/include/asm-ia64/unistd.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/asm-ia64/unistd.h 2006-07-04 14:41:39.000000000 +0400
+@@ -285,12 +285,22 @@
+ #define __NR_faccessat 1293
+ /* 1294, 1295 reserved for pselect/ppoll */
+ #define __NR_unshare 1296
++#define __NR_fairsched_mknod 1500
++#define __NR_fairsched_rmnod 1501
++#define __NR_fairsched_chwt 1502
++#define __NR_fairsched_mvpr 1503
++#define __NR_fairsched_rate 1504
++#define __NR_getluid 1505
++#define __NR_setluid 1506
++#define __NR_setublimit 1507
++#define __NR_ubstat 1508
+
+ #ifdef __KERNEL__
+
+ #include <linux/config.h>
+
+-#define NR_syscalls 273 /* length of syscall table */
++/* length of syscall table */
++#define NR_syscalls (__NR_ubstat - __NR_ni_syscall + 1)
+
+ #define __ARCH_WANT_SYS_RT_SIGACTION
+
+diff -upr linux-2.6.16.orig/include/asm-m32r/smp.h linux-2.6.16-026test015/include/asm-m32r/smp.h
+--- linux-2.6.16.orig/include/asm-m32r/smp.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/asm-m32r/smp.h 2006-07-04 14:41:36.000000000 +0400
+@@ -67,7 +67,8 @@ extern volatile int cpu_2_physid[NR_CPUS
+ #define raw_smp_processor_id() (current_thread_info()->cpu)
+
+ extern cpumask_t cpu_callout_map;
+-#define cpu_possible_map cpu_callout_map
++extern cpumask_t cpu_possible_map;
++extern cpumask_t cpu_present_map;
+
+ static __inline__ int hard_smp_processor_id(void)
+ {
+diff -upr linux-2.6.16.orig/include/asm-m32r/uaccess.h linux-2.6.16-026test015/include/asm-m32r/uaccess.h
+--- linux-2.6.16.orig/include/asm-m32r/uaccess.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/asm-m32r/uaccess.h 2006-07-04 14:41:36.000000000 +0400
+@@ -5,17 +5,9 @@
+ * linux/include/asm-m32r/uaccess.h
+ *
+ * M32R version.
+- * Copyright (C) 2004 Hirokazu Takata <takata at linux-m32r.org>
++ * Copyright (C) 2004, 2006 Hirokazu Takata <takata at linux-m32r.org>
+ */
+
+-#undef UACCESS_DEBUG
+-
+-#ifdef UACCESS_DEBUG
+-#define UAPRINTK(args...) printk(args)
+-#else
+-#define UAPRINTK(args...)
+-#endif /* UACCESS_DEBUG */
+-
+ /*
+ * User space memory access functions
+ */
+@@ -38,27 +30,29 @@
+ #define MAKE_MM_SEG(s) ((mm_segment_t) { (s) })
+
+ #ifdef CONFIG_MMU
++
+ #define KERNEL_DS MAKE_MM_SEG(0xFFFFFFFF)
+ #define USER_DS MAKE_MM_SEG(PAGE_OFFSET)
+-#else
+-#define KERNEL_DS MAKE_MM_SEG(0xFFFFFFFF)
+-#define USER_DS MAKE_MM_SEG(0xFFFFFFFF)
+-#endif /* CONFIG_MMU */
+-
+ #define get_ds() (KERNEL_DS)
+-#ifdef CONFIG_MMU
+ #define get_fs() (current_thread_info()->addr_limit)
+ #define set_fs(x) (current_thread_info()->addr_limit = (x))
+-#else
++
++#else /* not CONFIG_MMU */
++
++#define KERNEL_DS MAKE_MM_SEG(0xFFFFFFFF)
++#define USER_DS MAKE_MM_SEG(0xFFFFFFFF)
++#define get_ds() (KERNEL_DS)
++
+ static inline mm_segment_t get_fs(void)
+ {
+- return USER_DS;
++ return USER_DS;
+ }
+
+ static inline void set_fs(mm_segment_t s)
+ {
+ }
+-#endif /* CONFIG_MMU */
++
++#endif /* not CONFIG_MMU */
+
+ #define segment_eq(a,b) ((a).seg == (b).seg)
+
+@@ -83,9 +77,9 @@ static inline void set_fs(mm_segment_t s
+ " subx %0, %0\n" \
+ " cmpu %4, %1\n" \
+ " subx %0, %5\n" \
+- : "=&r"(flag), "=r"(sum) \
+- : "1"(addr), "r"((int)(size)), \
+- "r"(current_thread_info()->addr_limit.seg), "r"(0) \
++ : "=&r" (flag), "=r" (sum) \
++ : "1" (addr), "r" ((int)(size)), \
++ "r" (current_thread_info()->addr_limit.seg), "r" (0) \
+ : "cbit" ); \
+ flag; })
+
+@@ -113,10 +107,10 @@ static inline void set_fs(mm_segment_t s
+ #else
+ static inline int access_ok(int type, const void *addr, unsigned long size)
+ {
+- extern unsigned long memory_start, memory_end;
+- unsigned long val = (unsigned long)addr;
++ extern unsigned long memory_start, memory_end;
++ unsigned long val = (unsigned long)addr;
+
+- return ((val >= memory_start) && ((val + size) < memory_end));
++ return ((val >= memory_start) && ((val + size) < memory_end));
+ }
+ #endif /* CONFIG_MMU */
+
+@@ -155,39 +149,6 @@ extern int fixup_exception(struct pt_reg
+ * accesses to the same area of user memory).
+ */
+
+-extern void __get_user_1(void);
+-extern void __get_user_2(void);
+-extern void __get_user_4(void);
+-
+-#ifndef MODULE
+-#define __get_user_x(size,ret,x,ptr) \
+- __asm__ __volatile__( \
+- " mv r0, %0\n" \
+- " mv r1, %1\n" \
+- " bl __get_user_" #size "\n" \
+- " mv %0, r0\n" \
+- " mv %1, r1\n" \
+- : "=r"(ret), "=r"(x) \
+- : "0"(ptr) \
+- : "r0", "r1", "r14" )
+-#else /* MODULE */
+-/*
+- * Use "jl" instead of "bl" for MODULE
+- */
+-#define __get_user_x(size,ret,x,ptr) \
+- __asm__ __volatile__( \
+- " mv r0, %0\n" \
+- " mv r1, %1\n" \
+- " seth lr, #high(__get_user_" #size ")\n" \
+- " or3 lr, lr, #low(__get_user_" #size ")\n" \
+- " jl lr\n" \
+- " mv %0, r0\n" \
+- " mv %1, r1\n" \
+- : "=r"(ret), "=r"(x) \
+- : "0"(ptr) \
+- : "r0", "r1", "r14" )
+-#endif
+-
+ /* Careful: we have to cast the result to the type of the pointer for sign
+ reasons */
+ /**
+@@ -208,20 +169,7 @@ extern void __get_user_4(void);
+ * On error, the variable @x is set to zero.
+ */
+ #define get_user(x,ptr) \
+-({ int __ret_gu; \
+- unsigned long __val_gu; \
+- __chk_user_ptr(ptr); \
+- switch(sizeof (*(ptr))) { \
+- case 1: __get_user_x(1,__ret_gu,__val_gu,ptr); break; \
+- case 2: __get_user_x(2,__ret_gu,__val_gu,ptr); break; \
+- case 4: __get_user_x(4,__ret_gu,__val_gu,ptr); break; \
+- default: __get_user_x(X,__ret_gu,__val_gu,ptr); break; \
+- } \
+- (x) = (__typeof__(*(ptr)))__val_gu; \
+- __ret_gu; \
+-})
+-
+-extern void __put_user_bad(void);
++ __get_user_check((x),(ptr),sizeof(*(ptr)))
+
+ /**
+ * put_user: - Write a simple value into user space.
+@@ -240,8 +188,7 @@ extern void __put_user_bad(void);
+ * Returns zero on success, or -EFAULT on error.
+ */
+ #define put_user(x,ptr) \
+- __put_user_check((__typeof__(*(ptr)))(x),(ptr),sizeof(*(ptr)))
+-
++ __put_user_check((__typeof__(*(ptr)))(x),(ptr),sizeof(*(ptr)))
+
+ /**
+ * __get_user: - Get a simple variable from user space, with less checking.
+@@ -264,8 +211,64 @@ extern void __put_user_bad(void);
+ * On error, the variable @x is set to zero.
+ */
+ #define __get_user(x,ptr) \
+- __get_user_nocheck((x),(ptr),sizeof(*(ptr)))
++ __get_user_nocheck((x),(ptr),sizeof(*(ptr)))
+
++#define __get_user_nocheck(x,ptr,size) \
++({ \
++ long __gu_err = 0; \
++ unsigned long __gu_val; \
++ might_sleep(); \
++ __get_user_size(__gu_val,(ptr),(size),__gu_err); \
++ (x) = (__typeof__(*(ptr)))__gu_val; \
++ __gu_err; \
++})
++
++#define __get_user_check(x,ptr,size) \
++({ \
++ long __gu_err = -EFAULT; \
++ unsigned long __gu_val = 0; \
++ const __typeof__(*(ptr)) __user *__gu_addr = (ptr); \
++ might_sleep(); \
++ if (access_ok(VERIFY_READ,__gu_addr,size)) \
++ __get_user_size(__gu_val,__gu_addr,(size),__gu_err); \
++ (x) = (__typeof__(*(ptr)))__gu_val; \
++ __gu_err; \
++})
++
++extern long __get_user_bad(void);
++
++#define __get_user_size(x,ptr,size,retval) \
++do { \
++ retval = 0; \
++ __chk_user_ptr(ptr); \
++ switch (size) { \
++ case 1: __get_user_asm(x,ptr,retval,"ub"); break; \
++ case 2: __get_user_asm(x,ptr,retval,"uh"); break; \
++ case 4: __get_user_asm(x,ptr,retval,""); break; \
++ default: (x) = __get_user_bad(); \
++ } \
++} while (0)
++
++#define __get_user_asm(x, addr, err, itype) \
++ __asm__ __volatile__( \
++ " .fillinsn\n" \
++ "1: ld"itype" %1,@%2\n" \
++ " .fillinsn\n" \
++ "2:\n" \
++ ".section .fixup,\"ax\"\n" \
++ " .balign 4\n" \
++ "3: ldi %0,%3\n" \
++ " seth r14,#high(2b)\n" \
++ " or3 r14,r14,#low(2b)\n" \
++ " jmp r14\n" \
++ ".previous\n" \
++ ".section __ex_table,\"a\"\n" \
++ " .balign 4\n" \
++ " .long 1b,3b\n" \
++ ".previous" \
++ : "=&r" (err), "=&r" (x) \
++ : "r" (addr), "i" (-EFAULT), "0" (err) \
++ : "r14", "memory")
+
+ /**
+ * __put_user: - Write a simple value into user space, with less checking.
+@@ -287,11 +290,13 @@ extern void __put_user_bad(void);
+ * Returns zero on success, or -EFAULT on error.
+ */
+ #define __put_user(x,ptr) \
+- __put_user_nocheck((__typeof__(*(ptr)))(x),(ptr),sizeof(*(ptr)))
++ __put_user_nocheck((__typeof__(*(ptr)))(x),(ptr),sizeof(*(ptr)))
++
+
+ #define __put_user_nocheck(x,ptr,size) \
+ ({ \
+ long __pu_err; \
++ might_sleep(); \
+ __put_user_size((x),(ptr),(size),__pu_err); \
+ __pu_err; \
+ })
+@@ -308,28 +313,28 @@ extern void __put_user_bad(void);
+ })
+
+ #if defined(__LITTLE_ENDIAN__)
+-#define __put_user_u64(x, addr, err) \
+- __asm__ __volatile__( \
+- " .fillinsn\n" \
+- "1: st %L1,@%2\n" \
+- " .fillinsn\n" \
+- "2: st %H1,@(4,%2)\n" \
+- " .fillinsn\n" \
+- "3:\n" \
+- ".section .fixup,\"ax\"\n" \
+- " .balign 4\n" \
+- "4: ldi %0,%3\n" \
+- " seth r14,#high(3b)\n" \
+- " or3 r14,r14,#low(3b)\n" \
+- " jmp r14\n" \
+- ".previous\n" \
+- ".section __ex_table,\"a\"\n" \
+- " .balign 4\n" \
+- " .long 1b,4b\n" \
+- " .long 2b,4b\n" \
+- ".previous" \
+- : "=&r"(err) \
+- : "r"(x), "r"(addr), "i"(-EFAULT), "0"(err) \
++#define __put_user_u64(x, addr, err) \
++ __asm__ __volatile__( \
++ " .fillinsn\n" \
++ "1: st %L1,@%2\n" \
++ " .fillinsn\n" \
++ "2: st %H1,@(4,%2)\n" \
++ " .fillinsn\n" \
++ "3:\n" \
++ ".section .fixup,\"ax\"\n" \
++ " .balign 4\n" \
++ "4: ldi %0,%3\n" \
++ " seth r14,#high(3b)\n" \
++ " or3 r14,r14,#low(3b)\n" \
++ " jmp r14\n" \
++ ".previous\n" \
++ ".section __ex_table,\"a\"\n" \
++ " .balign 4\n" \
++ " .long 1b,4b\n" \
++ " .long 2b,4b\n" \
++ ".previous" \
++ : "=&r" (err) \
++ : "r" (x), "r" (addr), "i" (-EFAULT), "0" (err) \
+ : "r14", "memory")
+
+ #elif defined(__BIG_ENDIAN__)
+@@ -353,13 +358,15 @@ extern void __put_user_bad(void);
+ " .long 1b,4b\n" \
+ " .long 2b,4b\n" \
+ ".previous" \
+- : "=&r"(err) \
+- : "r"(x), "r"(addr), "i"(-EFAULT), "0"(err) \
++ : "=&r" (err) \
++ : "r" (x), "r" (addr), "i" (-EFAULT), "0" (err) \
+ : "r14", "memory")
+ #else
+ #error no endian defined
+ #endif
+
++extern void __put_user_bad(void);
++
+ #define __put_user_size(x,ptr,size,retval) \
+ do { \
+ retval = 0; \
+@@ -398,52 +405,8 @@ struct __large_struct { unsigned long bu
+ " .balign 4\n" \
+ " .long 1b,3b\n" \
+ ".previous" \
+- : "=&r"(err) \
+- : "r"(x), "r"(addr), "i"(-EFAULT), "0"(err) \
+- : "r14", "memory")
+-
+-#define __get_user_nocheck(x,ptr,size) \
+-({ \
+- long __gu_err; \
+- unsigned long __gu_val; \
+- __get_user_size(__gu_val,(ptr),(size),__gu_err); \
+- (x) = (__typeof__(*(ptr)))__gu_val; \
+- __gu_err; \
+-})
+-
+-extern long __get_user_bad(void);
+-
+-#define __get_user_size(x,ptr,size,retval) \
+-do { \
+- retval = 0; \
+- __chk_user_ptr(ptr); \
+- switch (size) { \
+- case 1: __get_user_asm(x,ptr,retval,"ub"); break; \
+- case 2: __get_user_asm(x,ptr,retval,"uh"); break; \
+- case 4: __get_user_asm(x,ptr,retval,""); break; \
+- default: (x) = __get_user_bad(); \
+- } \
+-} while (0)
+-
+-#define __get_user_asm(x, addr, err, itype) \
+- __asm__ __volatile__( \
+- " .fillinsn\n" \
+- "1: ld"itype" %1,@%2\n" \
+- " .fillinsn\n" \
+- "2:\n" \
+- ".section .fixup,\"ax\"\n" \
+- " .balign 4\n" \
+- "3: ldi %0,%3\n" \
+- " seth r14,#high(2b)\n" \
+- " or3 r14,r14,#low(2b)\n" \
+- " jmp r14\n" \
+- ".previous\n" \
+- ".section __ex_table,\"a\"\n" \
+- " .balign 4\n" \
+- " .long 1b,3b\n" \
+- ".previous" \
+- : "=&r"(err), "=&r"(x) \
+- : "r"(addr), "i"(-EFAULT), "0"(err) \
++ : "=&r" (err) \
++ : "r" (x), "r" (addr), "i" (-EFAULT), "0" (err) \
+ : "r14", "memory")
+
+ /*
+@@ -453,7 +416,6 @@ do { \
+ * anything, so this is accurate.
+ */
+
+-
+ /*
+ * Copy To/From Userspace
+ */
+@@ -511,8 +473,9 @@ do { \
+ " .long 2b,9b\n" \
+ " .long 3b,9b\n" \
+ ".previous\n" \
+- : "=&r"(__dst), "=&r"(__src), "=&r"(size), "=&r"(__c) \
+- : "0"(to), "1"(from), "2"(size), "3"(size / 4) \
++ : "=&r" (__dst), "=&r" (__src), "=&r" (size), \
++ "=&r" (__c) \
++ : "0" (to), "1" (from), "2" (size), "3" (size / 4) \
+ : "r14", "memory"); \
+ } while (0)
+
+@@ -573,8 +536,9 @@ do { \
+ " .long 2b,7b\n" \
+ " .long 3b,7b\n" \
+ ".previous\n" \
+- : "=&r"(__dst), "=&r"(__src), "=&r"(size), "=&r"(__c) \
+- : "0"(to), "1"(from), "2"(size), "3"(size / 4) \
++ : "=&r" (__dst), "=&r" (__src), "=&r" (size), \
++ "=&r" (__c) \
++ : "0" (to), "1" (from), "2" (size), "3" (size / 4) \
+ : "r14", "memory"); \
+ } while (0)
+
+@@ -676,7 +640,7 @@ unsigned long __generic_copy_from_user(v
+ #define copy_from_user(to,from,n) \
+ ({ \
+ might_sleep(); \
+-__generic_copy_from_user((to),(from),(n)); \
++ __generic_copy_from_user((to),(from),(n)); \
+ })
+
+ long __must_check strncpy_from_user(char *dst, const char __user *src,
+diff -upr linux-2.6.16.orig/include/asm-mips/bitops.h linux-2.6.16-026test015/include/asm-mips/bitops.h
+--- linux-2.6.16.orig/include/asm-mips/bitops.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/asm-mips/bitops.h 2006-07-04 14:41:36.000000000 +0400
+@@ -654,7 +654,12 @@ static inline unsigned long fls(unsigned
+ {
+ #ifdef CONFIG_32BIT
+ #ifdef CONFIG_CPU_MIPS32
+- __asm__ ("clz %0, %1" : "=r" (word) : "r" (word));
++ __asm__ (
++ " .set mips32 \n"
++ " clz %0, %1 \n"
++ " .set mips0 \n"
++ : "=r" (word)
++ : "r" (word));
+
+ return 32 - word;
+ #else
+@@ -678,7 +683,12 @@ static inline unsigned long fls(unsigned
+ #ifdef CONFIG_64BIT
+ #ifdef CONFIG_CPU_MIPS64
+
+- __asm__ ("dclz %0, %1" : "=r" (word) : "r" (word));
++ __asm__ (
++ " .set mips64 \n"
++ " dclz %0, %1 \n"
++ " .set mips0 \n"
++ : "=r" (word)
++ : "r" (word));
+
+ return 64 - word;
+ #else
+diff -upr linux-2.6.16.orig/include/asm-mips/byteorder.h linux-2.6.16-026test015/include/asm-mips/byteorder.h
+--- linux-2.6.16.orig/include/asm-mips/byteorder.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/asm-mips/byteorder.h 2006-07-04 14:41:36.000000000 +0400
+@@ -19,7 +19,9 @@
+ static __inline__ __attribute_const__ __u16 ___arch__swab16(__u16 x)
+ {
+ __asm__(
++ " .set mips32r2 \n"
+ " wsbh %0, %1 \n"
++ " .set mips0 \n"
+ : "=r" (x)
+ : "r" (x));
+
+@@ -30,8 +32,10 @@ static __inline__ __attribute_const__ __
+ static __inline__ __attribute_const__ __u32 ___arch__swab32(__u32 x)
+ {
+ __asm__(
++ " .set mips32r2 \n"
+ " wsbh %0, %1 \n"
+ " rotr %0, %0, 16 \n"
++ " .set mips0 \n"
+ : "=r" (x)
+ : "r" (x));
+
+diff -upr linux-2.6.16.orig/include/asm-mips/interrupt.h linux-2.6.16-026test015/include/asm-mips/interrupt.h
+--- linux-2.6.16.orig/include/asm-mips/interrupt.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/asm-mips/interrupt.h 2006-07-04 14:41:36.000000000 +0400
+@@ -20,7 +20,9 @@ __asm__ (
+ " .set reorder \n"
+ " .set noat \n"
+ #ifdef CONFIG_CPU_MIPSR2
++ " .set mips32r2 \n"
+ " ei \n"
++ " .set mips0 \n"
+ #else
+ " mfc0 $1,$12 \n"
+ " ori $1,0x1f \n"
+@@ -63,7 +65,9 @@ __asm__ (
+ " .set push \n"
+ " .set noat \n"
+ #ifdef CONFIG_CPU_MIPSR2
++ " .set mips32r2 \n"
+ " di \n"
++ " .set mips0 \n"
+ #else
+ " mfc0 $1,$12 \n"
+ " ori $1,0x1f \n"
+@@ -103,8 +107,10 @@ __asm__ (
+ " .set reorder \n"
+ " .set noat \n"
+ #ifdef CONFIG_CPU_MIPSR2
++ " .set mips32r2 \n"
+ " di \\result \n"
+ " andi \\result, 1 \n"
++ " .set mips0 \n"
+ #else
+ " mfc0 \\result, $12 \n"
+ " ori $1, \\result, 0x1f \n"
+@@ -133,9 +139,11 @@ __asm__ (
+ * Slow, but doesn't suffer from a relativly unlikely race
+ * condition we're having since days 1.
+ */
++ " .set mips32r2 \n"
+ " beqz \\flags, 1f \n"
+ " di \n"
+ " ei \n"
++ " .set mips0 \n"
+ "1: \n"
+ #elif defined(CONFIG_CPU_MIPSR2)
+ /*
+diff -upr linux-2.6.16.orig/include/asm-mips/pgtable.h linux-2.6.16-026test015/include/asm-mips/pgtable.h
+--- linux-2.6.16.orig/include/asm-mips/pgtable.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/asm-mips/pgtable.h 2006-07-04 14:41:36.000000000 +0400
+@@ -70,7 +70,15 @@ extern unsigned long zero_page_mask;
+ #define ZERO_PAGE(vaddr) \
+ (virt_to_page(empty_zero_page + (((unsigned long)(vaddr)) & zero_page_mask)))
+
+-#define __HAVE_ARCH_MULTIPLE_ZERO_PAGE
++#define __HAVE_ARCH_MOVE_PTE
++#define move_pte(pte, prot, old_addr, new_addr) \
++({ \
++ pte_t newpte = (pte); \
++ if (pte_present(pte) && pfn_valid(pte_pfn(pte)) && \
++ pte_page(pte) == ZERO_PAGE(old_addr)) \
++ newpte = mk_pte(ZERO_PAGE(new_addr), (prot)); \
++ newpte; \
++})
+
+ extern void paging_init(void);
+
+diff -upr linux-2.6.16.orig/include/asm-mips/r4kcache.h linux-2.6.16-026test015/include/asm-mips/r4kcache.h
+--- linux-2.6.16.orig/include/asm-mips/r4kcache.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/asm-mips/r4kcache.h 2006-07-04 14:41:36.000000000 +0400
+@@ -37,7 +37,7 @@
+ " cache %0, %1 \n" \
+ " .set pop \n" \
+ : \
+- : "i" (op), "m" (*(unsigned char *)(addr)))
++ : "i" (op), "R" (*(unsigned char *)(addr)))
+
+ static inline void flush_icache_line_indexed(unsigned long addr)
+ {
+diff -upr linux-2.6.16.orig/include/asm-powerpc/floppy.h linux-2.6.16-026test015/include/asm-powerpc/floppy.h
+--- linux-2.6.16.orig/include/asm-powerpc/floppy.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/asm-powerpc/floppy.h 2006-07-04 14:41:36.000000000 +0400
+@@ -35,6 +35,7 @@
+ #ifdef CONFIG_PCI
+
+ #include <linux/pci.h>
++#include <asm/ppc-pci.h> /* for ppc64_isabridge_dev */
+
+ #define fd_dma_setup(addr,size,mode,io) powerpc_fd_dma_setup(addr,size,mode,io)
+
+@@ -52,12 +53,12 @@ static __inline__ int powerpc_fd_dma_set
+ if (bus_addr
+ && (addr != prev_addr || size != prev_size || dir != prev_dir)) {
+ /* different from last time -- unmap prev */
+- pci_unmap_single(NULL, bus_addr, prev_size, prev_dir);
++ pci_unmap_single(ppc64_isabridge_dev, bus_addr, prev_size, prev_dir);
+ bus_addr = 0;
+ }
+
+ if (!bus_addr) /* need to map it */
+- bus_addr = pci_map_single(NULL, addr, size, dir);
++ bus_addr = pci_map_single(ppc64_isabridge_dev, addr, size, dir);
+
+ /* remember this one as prev */
+ prev_addr = addr;
+diff -upr linux-2.6.16.orig/include/asm-powerpc/pgalloc.h linux-2.6.16-026test015/include/asm-powerpc/pgalloc.h
+--- linux-2.6.16.orig/include/asm-powerpc/pgalloc.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/asm-powerpc/pgalloc.h 2006-07-04 14:41:37.000000000 +0400
+@@ -33,7 +33,8 @@ extern kmem_cache_t *pgtable_cache[];
+
+ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
+ {
+- return kmem_cache_alloc(pgtable_cache[PGD_CACHE_NUM], GFP_KERNEL);
++ return kmem_cache_alloc(pgtable_cache[PGD_CACHE_NUM],
++ GFP_KERNEL_UBC | __GFP_SOFT_UBC);
+ }
+
+ static inline void pgd_free(pgd_t *pgd)
+@@ -48,7 +49,7 @@ static inline void pgd_free(pgd_t *pgd)
+ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
+ {
+ return kmem_cache_alloc(pgtable_cache[PUD_CACHE_NUM],
+- GFP_KERNEL|__GFP_REPEAT);
++ GFP_KERNEL_UBC|__GFP_SOFT_UBC|__GFP_REPEAT);
+ }
+
+ static inline void pud_free(pud_t *pud)
+@@ -84,7 +85,7 @@ static inline void pmd_populate_kernel(s
+ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
+ {
+ return kmem_cache_alloc(pgtable_cache[PMD_CACHE_NUM],
+- GFP_KERNEL|__GFP_REPEAT);
++ GFP_KERNEL_UBC|__GFP_SOFT_UBC|__GFP_REPEAT);
+ }
+
+ static inline void pmd_free(pmd_t *pmd)
+@@ -92,17 +93,21 @@ static inline void pmd_free(pmd_t *pmd)
+ kmem_cache_free(pgtable_cache[PMD_CACHE_NUM], pmd);
+ }
+
++static inline pte_t *__pte_alloc(gfp_t flags)
++{
++ return kmem_cache_alloc(pgtable_cache[PTE_CACHE_NUM], flags);
++}
++
+ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
+ unsigned long address)
+ {
+- return kmem_cache_alloc(pgtable_cache[PTE_CACHE_NUM],
+- GFP_KERNEL|__GFP_REPEAT);
++ return __pte_alloc(GFP_KERNEL | __GFP_REPEAT);
+ }
+
+ static inline struct page *pte_alloc_one(struct mm_struct *mm,
+ unsigned long address)
+ {
+- return virt_to_page(pte_alloc_one_kernel(mm, address));
++ return virt_to_page(__pte_alloc(GFP_KERNEL_UBC | __GFP_SOFT_UBC));
+ }
+
+ static inline void pte_free_kernel(pte_t *pte)
+diff -upr linux-2.6.16.orig/include/asm-powerpc/unistd.h linux-2.6.16-026test015/include/asm-powerpc/unistd.h
+--- linux-2.6.16.orig/include/asm-powerpc/unistd.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/asm-powerpc/unistd.h 2006-07-04 14:41:37.000000000 +0400
+@@ -301,8 +301,12 @@
+ #define __NR_pselect6 280
+ #define __NR_ppoll 281
+ #define __NR_unshare 282
+-
+-#define __NR_syscalls 283
++#define __NR_getluid 410
++#define __NR_setluid 411
++#define __NR_setublimit 412
++#define __NR_ubstat 413
++
++#define NR_syscalls 414
+
+ #ifdef __KERNEL__
+ #define __NR__exit __NR_exit
+diff -upr linux-2.6.16.orig/include/asm-s390/pgalloc.h linux-2.6.16-026test015/include/asm-s390/pgalloc.h
+--- linux-2.6.16.orig/include/asm-s390/pgalloc.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/asm-s390/pgalloc.h 2006-07-04 14:41:37.000000000 +0400
+@@ -34,12 +34,12 @@ static inline pgd_t *pgd_alloc(struct mm
+ int i;
+
+ #ifndef __s390x__
+- pgd = (pgd_t *) __get_free_pages(GFP_KERNEL,1);
++ pgd = (pgd_t *) __get_free_pages(GFP_KERNEL_UBC | __GFP_SOFT_UBC, 1);
+ if (pgd != NULL)
+ for (i = 0; i < USER_PTRS_PER_PGD; i++)
+ pmd_clear(pmd_offset(pgd + i, i*PGDIR_SIZE));
+ #else /* __s390x__ */
+- pgd = (pgd_t *) __get_free_pages(GFP_KERNEL,2);
++ pgd = (pgd_t *) __get_free_pages(GFP_KERNEL_UBC | __GFP_SOFT_UBC, 2);
+ if (pgd != NULL)
+ for (i = 0; i < PTRS_PER_PGD; i++)
+ pgd_clear(pgd + i);
+@@ -72,7 +72,7 @@ static inline pmd_t * pmd_alloc_one(stru
+ pmd_t *pmd;
+ int i;
+
+- pmd = (pmd_t *) __get_free_pages(GFP_KERNEL, 2);
++ pmd = (pmd_t *) __get_free_pages(GFP_KERNEL_UBC | __GFP_SOFT_UBC, 2);
+ if (pmd != NULL) {
+ for (i=0; i < PTRS_PER_PMD; i++)
+ pmd_clear(pmd+i);
+@@ -118,16 +118,13 @@ pmd_populate(struct mm_struct *mm, pmd_t
+ pmd_populate_kernel(mm, pmd, (pte_t *)((page-mem_map) << PAGE_SHIFT));
+ }
+
+-/*
+- * page table entry allocation/free routines.
+- */
+-static inline pte_t *
+-pte_alloc_one_kernel(struct mm_struct *mm, unsigned long vmaddr)
++static inline pte_t *pte_alloc(struct mm_struct *mm, unsigned long vmaddr,
++ gfp_t mask)
+ {
+ pte_t *pte;
+ int i;
+
+- pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
++ pte = (pte_t *)__get_free_page(mask);
+ if (pte != NULL) {
+ for (i=0; i < PTRS_PER_PTE; i++) {
+ pte_clear(mm, vmaddr, pte+i);
+@@ -137,10 +134,20 @@ pte_alloc_one_kernel(struct mm_struct *m
+ return pte;
+ }
+
++/*
++ * page table entry allocation/free routines.
++ */
++static inline pte_t *
++pte_alloc_one_kernel(struct mm_struct *mm, unsigned long vmaddr)
++{
++ return pte_alloc(mm, vmaddr, GFP_KERNEL | __GFP_REPEAT);
++}
++
+ static inline struct page *
+ pte_alloc_one(struct mm_struct *mm, unsigned long vmaddr)
+ {
+- pte_t *pte = pte_alloc_one_kernel(mm, vmaddr);
++ pte_t *pte = pte_alloc(mm, vmaddr, GFP_KERNEL_UBC | __GFP_SOFT_UBC |
++ __GFP_REPEAT);
+ if (pte)
+ return virt_to_page(pte);
+ return 0;
+diff -upr linux-2.6.16.orig/include/asm-sh64/pgalloc.h linux-2.6.16-026test015/include/asm-sh64/pgalloc.h
+--- linux-2.6.16.orig/include/asm-sh64/pgalloc.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/asm-sh64/pgalloc.h 2006-07-04 14:41:38.000000000 +0400
+@@ -173,7 +173,7 @@ static inline void set_pgdir(unsigned lo
+ pgd_t *pgd;
+
+ read_lock(&tasklist_lock);
+- for_each_process(p) {
++ for_each_process_all(p) {
+ if (!p->mm)
+ continue;
+ *pgd_offset(p->mm,address) = entry;
+diff -upr linux-2.6.16.orig/include/asm-sparc64/dma-mapping.h linux-2.6.16-026test015/include/asm-sparc64/dma-mapping.h
+--- linux-2.6.16.orig/include/asm-sparc64/dma-mapping.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/asm-sparc64/dma-mapping.h 2006-07-04 14:41:36.000000000 +0400
+@@ -4,7 +4,146 @@
+ #include <linux/config.h>
+
+ #ifdef CONFIG_PCI
+-#include <asm-generic/dma-mapping.h>
++
++/* we implement the API below in terms of the existing PCI one,
++ * so include it */
++#include <linux/pci.h>
++/* need struct page definitions */
++#include <linux/mm.h>
++
++static inline int
++dma_supported(struct device *dev, u64 mask)
++{
++ BUG_ON(dev->bus != &pci_bus_type);
++
++ return pci_dma_supported(to_pci_dev(dev), mask);
++}
++
++static inline int
++dma_set_mask(struct device *dev, u64 dma_mask)
++{
++ BUG_ON(dev->bus != &pci_bus_type);
++
++ return pci_set_dma_mask(to_pci_dev(dev), dma_mask);
++}
++
++static inline void *
++dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
++ gfp_t flag)
++{
++ BUG_ON(dev->bus != &pci_bus_type);
++
++ return __pci_alloc_consistent(to_pci_dev(dev), size, dma_handle, flag);
++}
++
++static inline void
++dma_free_coherent(struct device *dev, size_t size, void *cpu_addr,
++ dma_addr_t dma_handle)
++{
++ BUG_ON(dev->bus != &pci_bus_type);
++
++ pci_free_consistent(to_pci_dev(dev), size, cpu_addr, dma_handle);
++}
++
++static inline dma_addr_t
++dma_map_single(struct device *dev, void *cpu_addr, size_t size,
++ enum dma_data_direction direction)
++{
++ BUG_ON(dev->bus != &pci_bus_type);
++
++ return pci_map_single(to_pci_dev(dev), cpu_addr, size, (int)direction);
++}
++
++static inline void
++dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
++ enum dma_data_direction direction)
++{
++ BUG_ON(dev->bus != &pci_bus_type);
++
++ pci_unmap_single(to_pci_dev(dev), dma_addr, size, (int)direction);
++}
++
++static inline dma_addr_t
++dma_map_page(struct device *dev, struct page *page,
++ unsigned long offset, size_t size,
++ enum dma_data_direction direction)
++{
++ BUG_ON(dev->bus != &pci_bus_type);
++
++ return pci_map_page(to_pci_dev(dev), page, offset, size, (int)direction);
++}
++
++static inline void
++dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
++ enum dma_data_direction direction)
++{
++ BUG_ON(dev->bus != &pci_bus_type);
++
++ pci_unmap_page(to_pci_dev(dev), dma_address, size, (int)direction);
++}
++
++static inline int
++dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
++ enum dma_data_direction direction)
++{
++ BUG_ON(dev->bus != &pci_bus_type);
++
++ return pci_map_sg(to_pci_dev(dev), sg, nents, (int)direction);
++}
++
++static inline void
++dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nhwentries,
++ enum dma_data_direction direction)
++{
++ BUG_ON(dev->bus != &pci_bus_type);
++
++ pci_unmap_sg(to_pci_dev(dev), sg, nhwentries, (int)direction);
++}
++
++static inline void
++dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
++ enum dma_data_direction direction)
++{
++ BUG_ON(dev->bus != &pci_bus_type);
++
++ pci_dma_sync_single_for_cpu(to_pci_dev(dev), dma_handle,
++ size, (int)direction);
++}
++
++static inline void
++dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size,
++ enum dma_data_direction direction)
++{
++ BUG_ON(dev->bus != &pci_bus_type);
++
++ pci_dma_sync_single_for_device(to_pci_dev(dev), dma_handle,
++ size, (int)direction);
++}
++
++static inline void
++dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
++ enum dma_data_direction direction)
++{
++ BUG_ON(dev->bus != &pci_bus_type);
++
++ pci_dma_sync_sg_for_cpu(to_pci_dev(dev), sg, nelems, (int)direction);
++}
++
++static inline void
++dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
++ enum dma_data_direction direction)
++{
++ BUG_ON(dev->bus != &pci_bus_type);
++
++ pci_dma_sync_sg_for_device(to_pci_dev(dev), sg, nelems, (int)direction);
++}
++
++static inline int
++dma_mapping_error(dma_addr_t dma_addr)
++{
++ return pci_dma_mapping_error(dma_addr);
++}
++
+ #else
+
+ struct device;
+diff -upr linux-2.6.16.orig/include/asm-sparc64/pci.h linux-2.6.16-026test015/include/asm-sparc64/pci.h
+--- linux-2.6.16.orig/include/asm-sparc64/pci.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/asm-sparc64/pci.h 2006-07-04 14:41:36.000000000 +0400
+@@ -44,7 +44,9 @@ struct pci_dev;
+ /* Allocate and map kernel buffer using consistent mode DMA for a device.
+ * hwdev should be valid struct pci_dev pointer for PCI devices.
+ */
+-extern void *pci_alloc_consistent(struct pci_dev *hwdev, size_t size, dma_addr_t *dma_handle);
++extern void *__pci_alloc_consistent(struct pci_dev *hwdev, size_t size, dma_addr_t *dma_handle, gfp_t gfp);
++#define pci_alloc_consistent(DEV,SZ,HANDLE) \
++ __pci_alloc_consistent(DEV,SZ,HANDLE,GFP_ATOMIC)
+
+ /* Free and unmap a consistent DMA buffer.
+ * cpu_addr is what was returned from pci_alloc_consistent,
+diff -upr linux-2.6.16.orig/include/asm-sparc64/pgtable.h linux-2.6.16-026test015/include/asm-sparc64/pgtable.h
+--- linux-2.6.16.orig/include/asm-sparc64/pgtable.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/asm-sparc64/pgtable.h 2006-07-04 14:41:36.000000000 +0400
+@@ -335,6 +335,23 @@ static inline void set_pte_at(struct mm_
+ #define pte_clear(mm,addr,ptep) \
+ set_pte_at((mm), (addr), (ptep), __pte(0UL))
+
++#ifdef DCACHE_ALIASING_POSSIBLE
++#define __HAVE_ARCH_MOVE_PTE
++#define move_pte(pte, prot, old_addr, new_addr) \
++({ \
++ pte_t newpte = (pte); \
++ if (pte_present(pte)) { \
++ unsigned long this_pfn = pte_pfn(pte); \
++ \
++ if (pfn_valid(this_pfn) && \
++ (((old_addr) ^ (new_addr)) & (1 << 13))) \
++ flush_dcache_page_all(current->mm, \
++ pfn_to_page(this_pfn)); \
++ } \
++ newpte; \
++})
++#endif
++
+ extern pgd_t swapper_pg_dir[2048];
+ extern pmd_t swapper_low_pmd_dir[2048];
+
+diff -upr linux-2.6.16.orig/include/asm-x86_64/cpufeature.h linux-2.6.16-026test015/include/asm-x86_64/cpufeature.h
+--- linux-2.6.16.orig/include/asm-x86_64/cpufeature.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/asm-x86_64/cpufeature.h 2006-07-04 14:41:36.000000000 +0400
+@@ -64,6 +64,7 @@
+ #define X86_FEATURE_REP_GOOD (3*32+ 4) /* rep microcode works well on this CPU */
+ #define X86_FEATURE_CONSTANT_TSC (3*32+5) /* TSC runs at constant rate */
+ #define X86_FEATURE_SYNC_RDTSC (3*32+6) /* RDTSC syncs CPU core */
++#define X86_FEATURE_FXSAVE_LEAK (3*32+7) /* FIP/FOP/FDP leaks through FXSAVE */
+
+ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
+ #define X86_FEATURE_XMM3 (4*32+ 0) /* Streaming SIMD Extensions-3 */
+diff -upr linux-2.6.16.orig/include/asm-x86_64/i387.h linux-2.6.16-026test015/include/asm-x86_64/i387.h
+--- linux-2.6.16.orig/include/asm-x86_64/i387.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/asm-x86_64/i387.h 2006-07-04 14:41:36.000000000 +0400
+@@ -72,6 +72,23 @@ extern int set_fpregs(struct task_struct
+ #define set_fpu_swd(t,val) ((t)->thread.i387.fxsave.swd = (val))
+ #define set_fpu_fxsr_twd(t,val) ((t)->thread.i387.fxsave.twd = (val))
+
++#define X87_FSW_ES (1 << 7) /* Exception Summary */
++
++/* AMD CPUs don't save/restore FDP/FIP/FOP unless an exception
++ is pending. Clear the x87 state here by setting it to fixed
++ values. The kernel data segment can sometimes be 0 and sometimes
++ the new user value. Both should be ok.
++ Use the PDA as the safe address because it should already be in L1. */
++static inline void clear_fpu_state(struct i387_fxsave_struct *fx)
++{
++ if (unlikely(fx->swd & X87_FSW_ES))
++ asm volatile("fnclex");
++ alternative_input(ASM_NOP8 ASM_NOP2,
++ " emms\n" /* clear stack tags */
++ " fildl %%gs:0", /* load to clear state */
++ X86_FEATURE_FXSAVE_LEAK);
++}
++
+ static inline int restore_fpu_checking(struct i387_fxsave_struct *fx)
+ {
+ int err;
+@@ -119,6 +136,7 @@ static inline int save_i387_checking(str
+ #endif
+ if (unlikely(err))
+ __clear_user(fx, sizeof(struct i387_fxsave_struct));
++ /* No need to clear here because the caller clears USED_MATH */
+ return err;
+ }
+
+@@ -149,7 +167,7 @@ static inline void __fxsave_clear(struct
+ "i" (offsetof(__typeof__(*tsk),
+ thread.i387.fxsave)));
+ #endif
+- __asm__ __volatile__("fnclex");
++ clear_fpu_state(&tsk->thread.i387.fxsave);
+ }
+
+ static inline void kernel_fpu_begin(void)
+diff -upr linux-2.6.16.orig/include/asm-x86_64/mman.h linux-2.6.16-026test015/include/asm-x86_64/mman.h
+--- linux-2.6.16.orig/include/asm-x86_64/mman.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/asm-x86_64/mman.h 2006-07-04 14:41:37.000000000 +0400
+@@ -12,6 +12,7 @@
+ #define MAP_NORESERVE 0x4000 /* don't check for reservations */
+ #define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */
+ #define MAP_NONBLOCK 0x10000 /* do not block on IO */
++#define MAP_EXECPRIO 0x20000 /* soft ubc charge */
+
+ #define MCL_CURRENT 1 /* lock all current mappings */
+ #define MCL_FUTURE 2 /* lock all future mappings */
+diff -upr linux-2.6.16.orig/include/asm-x86_64/nmi.h linux-2.6.16-026test015/include/asm-x86_64/nmi.h
+--- linux-2.6.16.orig/include/asm-x86_64/nmi.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/asm-x86_64/nmi.h 2006-07-04 14:41:37.000000000 +0400
+@@ -24,6 +24,9 @@ void set_nmi_callback(nmi_callback_t cal
+ * Remove the handler previously set.
+ */
+ void unset_nmi_callback(void);
++
++void set_nmi_ipi_callback(nmi_callback_t callback);
++void unset_nmi_ipi_callback(void);
+
+ #ifdef CONFIG_PM
+
+diff -upr linux-2.6.16.orig/include/asm-x86_64/pgalloc.h linux-2.6.16-026test015/include/asm-x86_64/pgalloc.h
+--- linux-2.6.16.orig/include/asm-x86_64/pgalloc.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/asm-x86_64/pgalloc.h 2006-07-04 14:41:37.000000000 +0400
+@@ -31,12 +31,14 @@ static inline void pmd_free(pmd_t *pmd)
+
+ static inline pmd_t *pmd_alloc_one (struct mm_struct *mm, unsigned long addr)
+ {
+- return (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
++ return (pmd_t *)get_zeroed_page(GFP_KERNEL_UBC|__GFP_REPEAT|
++ __GFP_SOFT_UBC);
+ }
+
+ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
+ {
+- return (pud_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
++ return (pud_t *)get_zeroed_page(GFP_KERNEL_UBC|__GFP_REPEAT|
++ __GFP_SOFT_UBC);
+ }
+
+ static inline void pud_free (pud_t *pud)
+@@ -48,7 +50,8 @@ static inline void pud_free (pud_t *pud)
+ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
+ {
+ unsigned boundary;
+- pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
++ pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL_UBC|__GFP_REPEAT|
++ __GFP_SOFT_UBC);
+ if (!pgd)
+ return NULL;
+ /*
+@@ -77,7 +80,8 @@ static inline pte_t *pte_alloc_one_kerne
+
+ static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
+ {
+- void *p = (void *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
++ void *p = (void *)get_zeroed_page(GFP_KERNEL_UBC|__GFP_REPEAT|
++ __GFP_SOFT_UBC);
+ if (!p)
+ return NULL;
+ return virt_to_page(p);
+diff -upr linux-2.6.16.orig/include/asm-x86_64/processor.h linux-2.6.16-026test015/include/asm-x86_64/processor.h
+--- linux-2.6.16.orig/include/asm-x86_64/processor.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/asm-x86_64/processor.h 2006-07-04 14:41:39.000000000 +0400
+@@ -167,7 +167,7 @@ static inline void clear_in_cr4 (unsigne
+ /* This decides where the kernel will search for a free chunk of vm
+ * space during mmap's.
+ */
+-#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? 0xc0000000 : 0xFFFFe000)
++#define IA32_PAGE_OFFSET 0xc0000000
+
+ #define TASK_SIZE (test_thread_flag(TIF_IA32) ? IA32_PAGE_OFFSET : TASK_SIZE64)
+ #define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? IA32_PAGE_OFFSET : TASK_SIZE64)
+diff -upr linux-2.6.16.orig/include/asm-x86_64/segment.h linux-2.6.16-026test015/include/asm-x86_64/segment.h
+--- linux-2.6.16.orig/include/asm-x86_64/segment.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/asm-x86_64/segment.h 2006-07-04 14:41:39.000000000 +0400
+@@ -3,29 +3,28 @@
+
+ #include <asm/cache.h>
+
+-#define __KERNEL_CS 0x10
+-#define __KERNEL_DS 0x18
+-
+-#define __KERNEL32_CS 0x38
+-
++#define GDT_ENTRY_BOOT_CS 2
++#define __BOOT_CS (GDT_ENTRY_BOOT_CS * 8)
++#define GDT_ENTRY_BOOT_DS 3
++#define __BOOT_DS (GDT_ENTRY_BOOT_DS * 8)
++#define GDT_ENTRY_TSS 4 /* needs two entries */
+ /*
+ * we cannot use the same code segment descriptor for user and kernel
+ * -- not even in the long flat mode, because of different DPL /kkeil
+ * The segment offset needs to contain a RPL. Grr. -AK
+ * GDT layout to get 64bit syscall right (sysret hardcodes gdt offsets)
+ */
++#define GDT_ENTRY_TLS_MIN 6
++#define GDT_ENTRY_TLS_MAX 8
+
+-#define __USER32_CS 0x23 /* 4*8+3 */
+-#define __USER_DS 0x2b /* 5*8+3 */
+-#define __USER_CS 0x33 /* 6*8+3 */
++#define GDT_ENTRY_LDT 9 /* needs two entries */
++#define __KERNEL32_CS 0x58 /* 11*8 */
++#define __KERNEL_CS 0x60 /* 12*8 */
++#define __KERNEL_DS 0x68 /* 13*8 */
++#define __USER32_CS 0x73 /* 14*8+3 */
++#define __USER_DS 0x7b /* 15*8+3 */
+ #define __USER32_DS __USER_DS
+-
+-#define GDT_ENTRY_TLS 1
+-#define GDT_ENTRY_TSS 8 /* needs two entries */
+-#define GDT_ENTRY_LDT 10 /* needs two entries */
+-#define GDT_ENTRY_TLS_MIN 12
+-#define GDT_ENTRY_TLS_MAX 14
+-/* 15 free */
++#define __USER_CS 0x83 /* 16*8+3 */
+
+ #define GDT_ENTRY_TLS_ENTRIES 3
+
+@@ -37,7 +36,7 @@
+ #define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3)
+
+ #define IDT_ENTRIES 256
+-#define GDT_ENTRIES 16
++#define GDT_ENTRIES 32
+ #define GDT_SIZE (GDT_ENTRIES * 8)
+ #define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
+
+diff -upr linux-2.6.16.orig/include/asm-x86_64/signal.h linux-2.6.16-026test015/include/asm-x86_64/signal.h
+--- linux-2.6.16.orig/include/asm-x86_64/signal.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/asm-x86_64/signal.h 2006-07-04 14:41:39.000000000 +0400
+@@ -23,11 +23,6 @@ typedef struct {
+ unsigned long sig[_NSIG_WORDS];
+ } sigset_t;
+
+-
+-struct pt_regs;
+-asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset);
+-
+-
+ #else
+ /* Here we must cater to libcs that poke about in kernel headers. */
+
+diff -upr linux-2.6.16.orig/include/asm-x86_64/thread_info.h linux-2.6.16-026test015/include/asm-x86_64/thread_info.h
+--- linux-2.6.16.orig/include/asm-x86_64/thread_info.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/asm-x86_64/thread_info.h 2006-07-04 14:41:39.000000000 +0400
+@@ -74,7 +74,7 @@ static inline struct thread_info *stack_
+
+ /* thread information allocation */
+ #define alloc_thread_info(tsk) \
+- ((struct thread_info *) __get_free_pages(GFP_KERNEL,THREAD_ORDER))
++ ((struct thread_info *) __get_free_pages(GFP_KERNEL_UBC,THREAD_ORDER))
+ #define free_thread_info(ti) free_pages((unsigned long) (ti), THREAD_ORDER)
+
+ #else /* !__ASSEMBLY__ */
+@@ -101,11 +101,13 @@ static inline struct thread_info *stack_
+ #define TIF_IRET 5 /* force IRET */
+ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
+ #define TIF_SECCOMP 8 /* secure computing */
++#define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal() */
+ #define TIF_POLLING_NRFLAG 16 /* true if poll_idle() is polling TIF_NEED_RESCHED */
+ #define TIF_IA32 17 /* 32bit process */
+ #define TIF_FORK 18 /* ret_from_fork */
+ #define TIF_ABI_PENDING 19
+-#define TIF_MEMDIE 20
++#define TIF_FREEZE 20
++#define TIF_MEMDIE 21
+
+ #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE)
+ #define _TIF_NOTIFY_RESUME (1<<TIF_NOTIFY_RESUME)
+@@ -115,6 +117,7 @@ static inline struct thread_info *stack_
+ #define _TIF_IRET (1<<TIF_IRET)
+ #define _TIF_SYSCALL_AUDIT (1<<TIF_SYSCALL_AUDIT)
+ #define _TIF_SECCOMP (1<<TIF_SECCOMP)
++#define _TIF_RESTORE_SIGMASK (1<<TIF_RESTORE_SIGMASK)
+ #define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG)
+ #define _TIF_IA32 (1<<TIF_IA32)
+ #define _TIF_FORK (1<<TIF_FORK)
+diff -upr linux-2.6.16.orig/include/asm-x86_64/unistd.h linux-2.6.16-026test015/include/asm-x86_64/unistd.h
+--- linux-2.6.16.orig/include/asm-x86_64/unistd.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/asm-x86_64/unistd.h 2006-07-04 14:41:39.000000000 +0400
+@@ -605,8 +605,26 @@ __SYSCALL(__NR_pselect6, sys_ni_syscall)
+ __SYSCALL(__NR_ppoll, sys_ni_syscall) /* for now */
+ #define __NR_unshare 272
+ __SYSCALL(__NR_unshare, sys_unshare)
+-
+-#define __NR_syscall_max __NR_unshare
++#define __NR_getluid 500
++__SYSCALL(__NR_getluid, sys_getluid)
++#define __NR_setluid 501
++__SYSCALL(__NR_setluid, sys_setluid)
++#define __NR_setublimit 502
++__SYSCALL(__NR_setublimit, sys_setublimit)
++#define __NR_ubstat 503
++__SYSCALL(__NR_ubstat, sys_ubstat)
++#define __NR_fairsched_mknod 504 /* FairScheduler syscalls */
++__SYSCALL(__NR_fairsched_mknod, sys_fairsched_mknod)
++#define __NR_fairsched_rmnod 505
++__SYSCALL(__NR_fairsched_rmnod, sys_fairsched_rmnod)
++#define __NR_fairsched_chwt 506
++__SYSCALL(__NR_fairsched_chwt, sys_fairsched_chwt)
++#define __NR_fairsched_mvpr 507
++__SYSCALL(__NR_fairsched_mvpr, sys_fairsched_mvpr)
++#define __NR_fairsched_rate 508
++__SYSCALL(__NR_fairsched_rate, sys_fairsched_rate)
++
++#define __NR_syscall_max __NR_fairsched_rate
+
+ #ifndef __NO_STUBS
+
+@@ -645,6 +663,7 @@ do { \
+ #define __ARCH_WANT_SYS_RT_SIGACTION
+ #define __ARCH_WANT_SYS_TIME
+ #define __ARCH_WANT_COMPAT_SYS_TIME
++#define __ARCH_WANT_SYS_RT_SIGSUSPEND
+ #endif
+
+ #ifndef __KERNEL_SYSCALLS__
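
The new syscalls (500-508) have no glibc wrappers, so user space would invoke them through syscall(2). A hedged sketch follows; the numbers come from the hunk above, the argument lists from the sys_fairsched_* prototypes added elsewhere in this patch (linux/fairsched.h), and the concrete luid/weight values are arbitrary placeholders:

/* Hedged sketch: calling the OpenVZ syscalls added above by number. */
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#define __NR_setluid		501
#define __NR_fairsched_mknod	504

int main(void)
{
	/* Tag the calling process with user beancounter id 1001. */
	if (syscall(__NR_setluid, 1001UL) < 0)
		perror("setluid");

	/* Create a fair-scheduler node under parent 0 with weight 500. */
	if (syscall(__NR_fairsched_mknod, 0U, 500U, 1001U) < 0)
		perror("fairsched_mknod");

	return 0;
}
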
+diff -upr linux-2.6.16.orig/include/linux/aio.h linux-2.6.16-026test015/include/linux/aio.h
+--- linux-2.6.16.orig/include/linux/aio.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/aio.h 2006-07-04 14:41:39.000000000 +0400
+@@ -247,4 +247,8 @@ static inline struct kiocb *list_kiocb(s
+ extern unsigned long aio_nr;
+ extern unsigned long aio_max_nr;
+
++void wait_for_all_aios(struct kioctx *ctx);
++extern kmem_cache_t *kioctx_cachep;
++extern void aio_kick_handler(void *);
++
+ #endif /* __LINUX__AIO_H */
+diff -upr linux-2.6.16.orig/include/linux/binfmts.h linux-2.6.16-026test015/include/linux/binfmts.h
+--- linux-2.6.16.orig/include/linux/binfmts.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/binfmts.h 2006-07-04 14:41:37.000000000 +0400
+@@ -2,6 +2,7 @@
+ #define _LINUX_BINFMTS_H
+
+ #include <linux/capability.h>
++#include <linux/fs.h>
+
+ struct pt_regs;
+
+@@ -28,6 +29,7 @@ struct linux_binprm{
+ int sh_bang;
+ struct file * file;
+ int e_uid, e_gid;
++ struct exec_perm perm;
+ kernel_cap_t cap_inheritable, cap_permitted, cap_effective;
+ void *security;
+ int argc, envc;
+diff -upr linux-2.6.16.orig/include/linux/capability.h linux-2.6.16-026test015/include/linux/capability.h
+--- linux-2.6.16.orig/include/linux/capability.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/capability.h 2006-07-04 14:41:38.000000000 +0400
+@@ -146,12 +146,9 @@ typedef __u32 kernel_cap_t;
+
+ #define CAP_NET_BROADCAST 11
+
+-/* Allow interface configuration */
+ /* Allow administration of IP firewall, masquerading and accounting */
+ /* Allow setting debug option on sockets */
+ /* Allow modification of routing tables */
+-/* Allow setting arbitrary process / process group ownership on
+- sockets */
+ /* Allow binding to any address for transparent proxying */
+ /* Allow setting TOS (type of service) */
+ /* Allow setting promiscuous mode */
+@@ -200,24 +197,19 @@ typedef __u32 kernel_cap_t;
+
+ /* Allow configuration of the secure attention key */
+ /* Allow administration of the random device */
+-/* Allow examination and configuration of disk quotas */
+ /* Allow configuring the kernel's syslog (printk behaviour) */
+ /* Allow setting the domainname */
+ /* Allow setting the hostname */
+ /* Allow calling bdflush() */
+-/* Allow mount() and umount(), setting up new smb connection */
++/* Allow setting up new smb connection */
+ /* Allow some autofs root ioctls */
+ /* Allow nfsservctl */
+ /* Allow VM86_REQUEST_IRQ */
+ /* Allow to read/write pci config on alpha */
+ /* Allow irix_prctl on mips (setstacksize) */
+ /* Allow flushing all cache on m68k (sys_cacheflush) */
+-/* Allow removing semaphores */
+-/* Used instead of CAP_CHOWN to "chown" IPC message queues, semaphores
+- and shared memory */
+ /* Allow locking/unlocking of shared memory segment */
+ /* Allow turning swap on/off */
+-/* Allow forged pids on socket credentials passing */
+ /* Allow setting readahead and flushing buffers on block devices */
+ /* Allow setting geometry in floppy driver */
+ /* Allow turning DMA on/off in xd driver */
+@@ -235,6 +227,8 @@ typedef __u32 kernel_cap_t;
+ arbitrary SCSI commands */
+ /* Allow setting encryption key on loopback filesystem */
+ /* Allow setting zone reclaim policy */
++/* Modify data journaling mode on ext3 filesystem (uses journaling
++ resources) */
+
+ #define CAP_SYS_ADMIN 21
+
+@@ -254,8 +248,6 @@ typedef __u32 kernel_cap_t;
+ /* Override resource limits. Set resource limits. */
+ /* Override quota limits. */
+ /* Override reserved space on ext2 filesystem */
+-/* Modify data journaling mode on ext3 filesystem (uses journaling
+- resources) */
+ /* NOTE: ext2 honors fsuid when checking for resource overrides, so
+ you can override using fsuid too */
+ /* Override size restrictions on IPC message queues */
+@@ -288,7 +280,52 @@ typedef __u32 kernel_cap_t;
+
+ #define CAP_AUDIT_CONTROL 30
+
++/*
++ * Important note: the VZ capabilities below intersect with CAP_AUDIT_*;
++ * this is intentional, for compatibility reasons, and is harmless.
++ * Both VZ and Audit/SELinux caps are disabled in VPSs.
++ */
++
++/* Allow access to all information. Otherwise some structures are hidden to
++   keep the Virtual Environments on the same node from interacting with each
++   other */
++#define CAP_SETVEID 29
++
++#define CAP_VE_ADMIN 30
++
+ #ifdef __KERNEL__
++
++#include <linux/config.h>
++
++#ifdef CONFIG_VE
++
++/* Replacement for CAP_NET_ADMIN:
++ delegated rights to the Virtual environment of its network administration.
++ For now the following rights have been delegated:
++
++ Allow setting arbitrary process / process group ownership on sockets
++ Allow interface configuration
++ */
++#define CAP_VE_NET_ADMIN CAP_VE_ADMIN
++
++/* Replacement for CAP_SYS_ADMIN:
++ delegated rights to the Virtual environment of its administration.
++ For now the following rights have been delegated:
++ */
++/* Allow mount/umount/remount */
++/* Allow examination and configuration of disk quotas */
++/* Allow removing semaphores */
++/* Used instead of CAP_CHOWN to "chown" IPC message queues, semaphores
++ and shared memory */
++/* Allow locking/unlocking of shared memory segment */
++/* Allow forged pids on socket credentials passing */
++
++#define CAP_VE_SYS_ADMIN CAP_VE_ADMIN
++#else
++#define CAP_VE_NET_ADMIN CAP_NET_ADMIN
++#define CAP_VE_SYS_ADMIN CAP_SYS_ADMIN
++#endif
++
+ /*
+ * Bounding set
+ */
+@@ -352,9 +389,14 @@ static inline kernel_cap_t cap_invert(ke
+ #define cap_issubset(a,set) (!(cap_t(a) & ~cap_t(set)))
+
+ #define cap_clear(c) do { cap_t(c) = 0; } while(0)
++#ifndef CONFIG_VE
+ #define cap_set_full(c) do { cap_t(c) = ~0; } while(0)
++#else
++#define cap_set_full(c) \
++ do {cap_t(c) = ve_is_super(get_exec_env()) ? ~0 : \
++ get_exec_env()->cap_default; } while(0)
++#endif
+ #define cap_mask(c,mask) do { cap_t(c) &= cap_t(mask); } while(0)
+-
+ #define cap_is_fs_cap(c) (CAP_TO_MASK(c) & CAP_FS_MASK)
+
+ extern int capable(int cap);
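
To show how the delegated capabilities are intended to be used, a small hedged kernel-style sketch (an illustration, not code from the patch): inside a VE the check is written against the CAP_VE_* alias, and because the #else branch maps the alias back to the stock capability, the same line compiles unchanged when CONFIG_VE is off.

/* Hedged sketch: a permission check against the delegated capability.
 * With CONFIG_VE unset, CAP_VE_NET_ADMIN is just CAP_NET_ADMIN. */
#include <linux/capability.h>
#include <linux/errno.h>

static int example_set_iface_flags(void)
{
	if (!capable(CAP_VE_NET_ADMIN))
		return -EPERM;
	/* ... interface configuration delegated to the VE admin ... */
	return 0;
}
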
+diff -upr linux-2.6.16.orig/include/linux/coda_linux.h linux-2.6.16-026test015/include/linux/coda_linux.h
+--- linux-2.6.16.orig/include/linux/coda_linux.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/coda_linux.h 2006-07-04 14:41:37.000000000 +0400
+@@ -38,7 +38,8 @@ extern struct file_operations coda_ioctl
+ int coda_open(struct inode *i, struct file *f);
+ int coda_flush(struct file *f);
+ int coda_release(struct inode *i, struct file *f);
+-int coda_permission(struct inode *inode, int mask, struct nameidata *nd);
++int coda_permission(struct inode *inode, int mask, struct nameidata *nd,
++ struct exec_perm *);
+ int coda_revalidate_inode(struct dentry *);
+ int coda_getattr(struct vfsmount *, struct dentry *, struct kstat *);
+ int coda_setattr(struct dentry *, struct iattr *);
+diff -upr linux-2.6.16.orig/include/linux/compat.h linux-2.6.16-026test015/include/linux/compat.h
+--- linux-2.6.16.orig/include/linux/compat.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/compat.h 2006-07-04 14:41:39.000000000 +0400
+@@ -181,5 +181,7 @@ static inline int compat_timespec_compar
+ return lhs->tv_nsec - rhs->tv_nsec;
+ }
+
++extern long compat_nanosleep_restart(struct restart_block *restart);
++
+ #endif /* CONFIG_COMPAT */
+ #endif /* _LINUX_COMPAT_H */
+diff -upr linux-2.6.16.orig/include/linux/cpt_image.h linux-2.6.16-026test015/include/linux/cpt_image.h
+--- linux-2.6.16.orig/include/linux/cpt_image.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/include/linux/cpt_image.h 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,1453 @@
++/*
++ *
++ * include/linux/cpt_image.h
++ *
++ * Copyright (C) 2000-2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __CPT_IMAGE_H_
++#define __CPT_IMAGE_H_ 1
++
++#define CPT_NULL (~0ULL)
++#define CPT_NOINDEX (~0U)
++
++/*
++ * Image file layout.
++ *
++ * - major header
++ * - sections[]
++ *
++ * Each section is:
++ * - section header
++ * - array of objects
++ *
++ * All data records are arch independent, 64 bit aligned.
++ */
++
++enum _cpt_object_type
++{
++ CPT_OBJ_TASK = 0,
++ CPT_OBJ_MM,
++ CPT_OBJ_FS,
++ CPT_OBJ_FILES,
++ CPT_OBJ_FILE,
++ CPT_OBJ_SIGHAND_STRUCT,
++ CPT_OBJ_SIGNAL_STRUCT,
++ CPT_OBJ_TTY,
++ CPT_OBJ_SOCKET,
++ CPT_OBJ_SYSVSEM_UNDO,
++ CPT_OBJ_NAMESPACE,
++ CPT_OBJ_SYSV_SHM,
++ CPT_OBJ_INODE,
++ CPT_OBJ_UBC,
++ CPT_OBJ_SLM_SGREG,
++ CPT_OBJ_SLM_REGOBJ,
++ CPT_OBJ_SLM_MM,
++ CPT_OBJ_MAX,
++ /* The objects above are stored in memory while checkpointing */
++
++ CPT_OBJ_VMA = 1024,
++ CPT_OBJ_FILEDESC,
++ CPT_OBJ_SIGHANDLER,
++ CPT_OBJ_SIGINFO,
++ CPT_OBJ_LASTSIGINFO,
++ CPT_OBJ_SYSV_SEM,
++ CPT_OBJ_SKB,
++ CPT_OBJ_FLOCK,
++ CPT_OBJ_OPENREQ,
++ CPT_OBJ_VFSMOUNT,
++ CPT_OBJ_TRAILER,
++ CPT_OBJ_SYSVSEM_UNDO_REC,
++ CPT_OBJ_NET_DEVICE,
++ CPT_OBJ_NET_IFADDR,
++ CPT_OBJ_NET_ROUTE,
++ CPT_OBJ_NET_CONNTRACK,
++ CPT_OBJ_NET_CONNTRACK_EXPECT,
++ CPT_OBJ_AIO_CONTEXT,
++ CPT_OBJ_VEINFO,
++ CPT_OBJ_EPOLL,
++ CPT_OBJ_EPOLL_FILE,
++ CPT_OBJ_SKFILTER,
++ CPT_OBJ_SIGALTSTACK,
++ CPT_OBJ_SOCK_MCADDR,
++
++ CPT_OBJ_X86_REGS = 4096,
++ CPT_OBJ_X86_64_REGS,
++ CPT_OBJ_PAGES,
++ CPT_OBJ_COPYPAGES,
++ CPT_OBJ_REMAPPAGES,
++ CPT_OBJ_LAZYPAGES,
++ CPT_OBJ_NAME,
++ CPT_OBJ_BITS,
++ CPT_OBJ_REF,
++};
++
++#define CPT_ALIGN(n) (((n)+7)&~7)
++
++struct cpt_major_hdr
++{
++ __u8 cpt_signature[4]; /* Magic number */
++ __u16 cpt_hdrlen; /* Length of this header */
++ __u16 cpt_image_version; /* Format of this file; mbz */
++ __u16 cpt_os_arch; /* Architecture */
++#define CPT_OS_ARCH_I386 0
++#define CPT_OS_ARCH_EMT64 1
++#define CPT_OS_ARCH_IA64 2
++ __u16 __cpt_pad1;
++ __u32 cpt_os_version; /* Version of kernel, where image was done */
++ __u32 cpt_os_features; /* Kernel features: SMP etc. */
++ __u16 cpt_pagesize; /* Page size used by OS */
++ __u16 cpt_hz; /* HZ used by OS */
++ __u64 cpt_start_jiffies64; /* Jiffies */
++ __u32 cpt_start_sec; /* Seconds */
++ __u32 cpt_start_nsec; /* Nanoseconds */
++ __u32 cpt_cpu_caps[4]; /* CPU capabilities */
++ __u32 cpt_kernel_config[4]; /* Kernel config */
++ __u64 cpt_iptables_mask; /* Used netfilter modules */
++} __attribute__ ((aligned (8)));
++
++#define CPT_SIGNATURE0 0x79
++#define CPT_SIGNATURE1 0x1c
++#define CPT_SIGNATURE2 0x01
++#define CPT_SIGNATURE3 0x63
++
++#define CPT_CPU_X86_CMOV 0
++#define CPT_CPU_X86_FXSR 1
++#define CPT_CPU_X86_SSE 2
++#define CPT_CPU_X86_SSE2 3
++#define CPT_CPU_X86_MMX 4
++#define CPT_CPU_X86_3DNOW 5
++#define CPT_CPU_X86_3DNOW2 6
++#define CPT_CPU_X86_SEP 7
++#define CPT_CPU_X86_EMT64 8
++#define CPT_CPU_X86_IA64 9
++
++#define CPT_KERNEL_CONFIG_PAE 0
++
++struct cpt_section_hdr
++{
++ __u64 cpt_next;
++ __u32 cpt_section;
++ __u16 cpt_hdrlen;
++ __u16 cpt_align;
++} __attribute__ ((aligned (8)));
++
++enum
++{
++ CPT_SECT_ERROR, /* Error section, content is string */
++ CPT_SECT_VEINFO,
++ CPT_SECT_FILES, /* Files. Content is array of file objects */
++ CPT_SECT_TASKS,
++ CPT_SECT_MM,
++ CPT_SECT_FILES_STRUCT,
++ CPT_SECT_FS,
++ CPT_SECT_SIGHAND_STRUCT,
++ CPT_SECT_TTY,
++ CPT_SECT_SOCKET,
++ CPT_SECT_NAMESPACE,
++ CPT_SECT_SYSVSEM_UNDO,
++ CPT_SECT_INODE, /* Inodes with i->i_nlink==0 and
++ * deleted dentries with inodes not
++ * referenced inside dumped process.
++ */
++ CPT_SECT_SYSV_SHM,
++ CPT_SECT_SYSV_SEM,
++ CPT_SECT_ORPHANS,
++ CPT_SECT_NET_DEVICE,
++ CPT_SECT_NET_IFADDR,
++ CPT_SECT_NET_ROUTE,
++ CPT_SECT_NET_IPTABLES,
++ CPT_SECT_NET_CONNTRACK,
++ CPT_SECT_NET_CONNTRACK_VE0,
++ CPT_SECT_UTSNAME,
++ CPT_SECT_TRAILER,
++ CPT_SECT_UBC,
++ CPT_SECT_SLM_SGREGS,
++ CPT_SECT_SLM_REGOBJS,
++/* Due to silly mistake we cannot index sections beyond this value */
++#define CPT_SECT_MAX_INDEX (CPT_SECT_SLM_REGOBJS+1)
++ CPT_SECT_EPOLL,
++ CPT_SECT_MAX
++};
++
++struct cpt_major_tail
++{
++ __u64 cpt_next;
++ __u32 cpt_object;
++ __u16 cpt_hdrlen;
++ __u16 cpt_content;
++
++ __u32 cpt_lazypages;
++ __u32 cpt_64bit;
++ __u64 cpt_sections[CPT_SECT_MAX_INDEX];
++ __u32 cpt_nsect;
++ __u8 cpt_signature[4]; /* Magic number */
++} __attribute__ ((aligned (8)));
++
++
++/* Common object header. */
++struct cpt_object_hdr
++{
++ __u64 cpt_next;
++ __u32 cpt_object;
++ __u16 cpt_hdrlen;
++ __u16 cpt_content;
++} __attribute__ ((aligned (8)));
++
++enum _cpt_content_type {
++ CPT_CONTENT_VOID,
++ CPT_CONTENT_ARRAY,
++ CPT_CONTENT_DATA,
++ CPT_CONTENT_NAME,
++
++ CPT_CONTENT_STACK,
++ CPT_CONTENT_X86_FPUSTATE_OLD,
++ CPT_CONTENT_X86_FPUSTATE,
++ CPT_CONTENT_MM_CONTEXT,
++ CPT_CONTENT_SEMARRAY,
++ CPT_CONTENT_SEMUNDO,
++ CPT_CONTENT_NLMARRAY,
++ CPT_CONTENT_MAX
++};
++
++/* CPT_OBJ_BITS: encode array of bytes */
++struct cpt_obj_bits
++{
++ __u64 cpt_next;
++ __u32 cpt_object;
++ __u16 cpt_hdrlen;
++ __u16 cpt_content;
++
++ __u32 cpt_size;
++ __u32 __cpt_pad1;
++} __attribute__ ((aligned (8)));
++
++/* CPT_OBJ_REF: a reference to another object */
++struct cpt_obj_ref
++{
++ __u64 cpt_next;
++ __u32 cpt_object;
++ __u16 cpt_hdrlen;
++ __u16 cpt_content;
++
++ __u64 cpt_pos;
++} __attribute__ ((aligned (8)));
++
++/* CPT_OBJ_VEINFO: various ve specific data */
++struct cpt_veinfo_image
++{
++ __u64 cpt_next;
++ __u32 cpt_object;
++ __u16 cpt_hdrlen;
++ __u16 cpt_content;
++
++ /* ipc ctls */
++ __u32 shm_ctl_max;
++ __u32 shm_ctl_all;
++ __u32 shm_ctl_mni;
++ __u32 msg_ctl_max;
++ __u32 msg_ctl_mni;
++ __u32 msg_ctl_mnb;
++ __u32 sem_ctl_arr[4];
++
++ /* start time */
++ __u64 start_timespec_delta;
++ __u64 start_jiffies_delta;
++} __attribute__ ((aligned (8)));
++
++/* CPT_OBJ_FILE: one struct file */
++struct cpt_file_image
++{
++ __u64 cpt_next;
++ __u32 cpt_object;
++ __u16 cpt_hdrlen;
++ __u16 cpt_content;
++
++ __u32 cpt_flags;
++ __u32 cpt_mode;
++ __u64 cpt_pos;
++ __u32 cpt_uid;
++ __u32 cpt_gid;
++
++ __u32 cpt_i_mode;
++ __u32 cpt_lflags;
++#define CPT_DENTRY_DELETED 1
++#define CPT_DENTRY_ROOT 2
++#define CPT_DENTRY_CLONING 4
++#define CPT_DENTRY_PROC 8
++#define CPT_DENTRY_EPOLL 0x10
++ __u64 cpt_inode;
++ __u64 cpt_priv;
++
++ __u32 cpt_fown_fd;
++ __u32 cpt_fown_pid;
++ __u32 cpt_fown_uid;
++ __u32 cpt_fown_euid;
++ __u32 cpt_fown_signo;
++ __u32 __cpt_pad1;
++} __attribute__ ((aligned (8)));
++/* Followed by file name, encoded as CPT_OBJ_NAME */
++
++struct cpt_epoll_image
++{
++ __u64 cpt_next;
++ __u32 cpt_object;
++ __u16 cpt_hdrlen;
++ __u16 cpt_content;
++
++ __u64 cpt_file;
++} __attribute__ ((aligned (8)));
++/* Followed by array of struct cpt_epoll_file */
++
++struct cpt_epoll_file_image
++{
++ __u64 cpt_next;
++ __u32 cpt_object;
++ __u16 cpt_hdrlen;
++ __u16 cpt_content;
++
++ __u64 cpt_file;
++ __u32 cpt_fd;
++ __u32 cpt_events;
++ __u64 cpt_data;
++ __u32 cpt_revents;
++ __u32 cpt_ready;
++} __attribute__ ((aligned (8)));
++
++
++/* CPT_OBJ_FILEDESC: one file descriptor */
++struct cpt_fd_image {
++ __u64 cpt_next;
++ __u32 cpt_object;
++ __u16 cpt_hdrlen;
++ __u16 cpt_content;
++
++ __u32 cpt_fd;
++ __u32 cpt_flags;
++#define CPT_FD_FLAG_CLOSEEXEC 1
++ __u64 cpt_file;
++} __attribute__ ((aligned (8)));
++
++/* CPT_OBJ_FILES: one files_struct */
++struct cpt_files_struct_image {
++ __u64 cpt_next;
++ __u32 cpt_object;
++ __u16 cpt_hdrlen;
++ __u16 cpt_content;
++
++ __u32 cpt_index;
++ __u32 cpt_max_fds;
++ __u32 cpt_next_fd;
++ __u32 __cpt_pad1;
++} __attribute__ ((aligned (8)));
++/* Followed by array of cpt_fd_image */
++
++/* CPT_OBJ_FS: one fs_struct */
++struct cpt_fs_struct_image {
++ __u64 cpt_next;
++ __u32 cpt_object;
++ __u16 cpt_hdrlen;
++ __u16 cpt_content;
++
++ __u32 cpt_umask;
++ __u32 __cpt_pad1;
++} __attribute__ ((aligned (8)));
++/* Followed by two/three CPT_OBJ_FILENAME for root, pwd and, optionally, altroot */
++
++/* CPT_OBJ_INODE: one struct inode */
++struct cpt_inode_image
++{
++ __u64 cpt_next;
++ __u32 cpt_object;
++ __u16 cpt_hdrlen;
++ __u16 cpt_content;
++
++ __u64 cpt_dev;
++ __u64 cpt_ino;
++ __u32 cpt_mode;
++ __u32 cpt_nlink;
++ __u32 cpt_uid;
++ __u32 cpt_gid;
++ __u64 cpt_rdev;
++ __u64 cpt_size;
++ __u64 cpt_blksize;
++ __u64 cpt_atime;
++ __u64 cpt_mtime;
++ __u64 cpt_ctime;
++ __u64 cpt_blocks;
++ __u32 cpt_sb;
++ __u32 __cpt_pad1;
++} __attribute__ ((aligned (8)));
++
++/* CPT_OBJ_VFSMOUNT: one vfsmount */
++struct cpt_vfsmount_image {
++ __u64 cpt_next;
++ __u32 cpt_object;
++ __u16 cpt_hdrlen;
++ __u16 cpt_content;
++
++ __u32 cpt_mntflags;
++ __u32 cpt_flags;
++} __attribute__ ((aligned (8)));
++
++
++struct cpt_flock_image
++{
++ __u64 cpt_next;
++ __u32 cpt_object;
++ __u16 cpt_hdrlen;
++ __u16 cpt_content;
++
++ __u32 cpt_owner;
++ __u32 cpt_pid;
++ __u64 cpt_start;
++ __u64 cpt_end;
++ __u32 cpt_flags;
++ __u32 cpt_type;
++} __attribute__ ((aligned (8)));
++
++
++struct cpt_tty_image
++{
++ __u64 cpt_next;
++ __u32 cpt_object;
++ __u16 cpt_hdrlen;
++ __u16 cpt_content;
++
++ __u64 cpt_flags;
++ __u32 cpt_link;
++ __u32 cpt_index;
++ __u32 cpt_drv_type;
++ __u32 cpt_drv_subtype;
++ __u32 cpt_drv_flags;
++ __u8 cpt_packet;
++ __u8 cpt_stopped;
++ __u8 cpt_hw_stopped;
++ __u8 cpt_flow_stopped;
++
++ __u32 cpt_canon_data;
++ __u32 cpt_canon_head;
++ __u32 cpt_canon_column;
++ __u32 cpt_column;
++ __u8 cpt_ctrl_status;
++ __u8 cpt_erasing;
++ __u8 cpt_lnext;
++ __u8 cpt_icanon;
++ __u8 cpt_raw;
++ __u8 cpt_real_raw;
++ __u8 cpt_closing;
++ __u8 __cpt_pad1;
++ __u16 cpt_minimum_to_wake;
++ __u16 __cpt_pad2;
++ __u32 cpt_pgrp;
++ __u32 cpt_session;
++ __u32 cpt_c_line;
++ __u8 cpt_name[64];
++ __u16 cpt_ws_row;
++ __u16 cpt_ws_col;
++ __u16 cpt_ws_prow;
++ __u16 cpt_ws_pcol;
++ __u8 cpt_c_cc[32];
++ __u32 cpt_c_iflag;
++ __u32 cpt_c_oflag;
++ __u32 cpt_c_cflag;
++ __u32 cpt_c_lflag;
++ __u32 cpt_read_flags[4096/32];
++} __attribute__ ((aligned (8)));
++
++struct cpt_sock_image
++{
++ __u64 cpt_next;
++ __u32 cpt_object;
++ __u16 cpt_hdrlen;
++ __u16 cpt_content;
++
++ __u64 cpt_file;
++ __u32 cpt_parent;
++ __u32 cpt_index;
++
++ __u64 cpt_ssflags;
++ __u16 cpt_type;
++ __u16 cpt_family;
++ __u8 cpt_sstate;
++ __u8 cpt_passcred;
++ __u8 cpt_state;
++ __u8 cpt_reuse;
++
++ __u8 cpt_zapped;
++ __u8 cpt_shutdown;
++ __u8 cpt_userlocks;
++ __u8 cpt_no_check;
++ __u8 cpt_debug;
++ __u8 cpt_rcvtstamp;
++ __u8 cpt_localroute;
++ __u8 cpt_protocol;
++
++ __u32 cpt_err;
++ __u32 cpt_err_soft;
++
++ __u16 cpt_max_ack_backlog;
++ __u16 __cpt_pad1;
++ __u32 cpt_priority;
++
++ __u32 cpt_rcvlowat;
++ __u32 cpt_bound_dev_if;
++
++ __u64 cpt_rcvtimeo;
++ __u64 cpt_sndtimeo;
++ __u32 cpt_rcvbuf;
++ __u32 cpt_sndbuf;
++ __u64 cpt_flags;
++ __u64 cpt_lingertime;
++ __u32 cpt_peer_pid;
++ __u32 cpt_peer_uid;
++
++ __u32 cpt_peer_gid;
++ __u32 cpt_laddrlen;
++ __u32 cpt_laddr[128/4];
++ __u32 cpt_raddrlen;
++ __u32 cpt_raddr[128/4];
++ /* AF_UNIX */
++ __u32 cpt_peer;
++
++ __u8 cpt_socketpair;
++ __u8 cpt_deleted;
++ __u16 __cpt_pad4;
++ __u32 __cpt_pad5;
++/*
++ struct sk_filter *sk_filter;
++ */
++
++ __u64 cpt_stamp;
++ __u32 cpt_daddr;
++ __u16 cpt_dport;
++ __u16 cpt_sport;
++
++ __u32 cpt_saddr;
++ __u32 cpt_rcv_saddr;
++
++ __u32 cpt_uc_ttl;
++ __u32 cpt_tos;
++
++ __u32 cpt_cmsg_flags;
++ __u32 cpt_mc_index;
++
++ __u32 cpt_mc_addr;
++/*
++ struct ip_options *opt;
++ */
++ __u8 cpt_hdrincl;
++ __u8 cpt_mc_ttl;
++ __u8 cpt_mc_loop;
++ __u8 cpt_pmtudisc;
++
++ __u8 cpt_recverr;
++ __u8 cpt_freebind;
++ __u16 cpt_idcounter;
++ __u32 cpt_cork_flags;
++
++ __u32 cpt_cork_fragsize;
++ __u32 cpt_cork_length;
++ __u32 cpt_cork_addr;
++ __u32 cpt_cork_saddr;
++ __u32 cpt_cork_daddr;
++ __u32 cpt_cork_oif;
++
++ __u32 cpt_udp_pending;
++ __u32 cpt_udp_corkflag;
++ __u16 cpt_udp_encap;
++ __u16 cpt_udp_len;
++ __u32 __cpt_pad7;
++
++ __u64 cpt_saddr6[2];
++ __u64 cpt_rcv_saddr6[2];
++ __u64 cpt_daddr6[2];
++ __u32 cpt_flow_label6;
++ __u32 cpt_frag_size6;
++ __u32 cpt_hop_limit6;
++ __u32 cpt_mcast_hops6;
++
++ __u32 cpt_mcast_oif6;
++ __u8 cpt_rxopt6;
++ __u8 cpt_mc_loop6;
++ __u8 cpt_recverr6;
++ __u8 cpt_sndflow6;
++
++ __u8 cpt_pmtudisc6;
++ __u8 cpt_ipv6only6;
++ __u8 cpt_mapped;
++ __u8 __cpt_pad8;
++ __u32 cpt_pred_flags;
++
++ __u32 cpt_rcv_nxt;
++ __u32 cpt_snd_nxt;
++
++ __u32 cpt_snd_una;
++ __u32 cpt_snd_sml;
++
++ __u32 cpt_rcv_tstamp;
++ __u32 cpt_lsndtime;
++
++ __u8 cpt_tcp_header_len;
++ __u8 cpt_ack_pending;
++ __u8 cpt_quick;
++ __u8 cpt_pingpong;
++ __u8 cpt_blocked;
++ __u8 __cpt_pad9;
++ __u16 __cpt_pad10;
++
++ __u32 cpt_ato;
++ __u32 cpt_ack_timeout;
++
++ __u32 cpt_lrcvtime;
++ __u16 cpt_last_seg_size;
++ __u16 cpt_rcv_mss;
++
++ __u32 cpt_snd_wl1;
++ __u32 cpt_snd_wnd;
++
++ __u32 cpt_max_window;
++ __u32 cpt_pmtu_cookie;
++
++ __u32 cpt_mss_cache;
++ __u16 cpt_mss_cache_std;
++ __u16 cpt_mss_clamp;
++
++ __u16 cpt_ext_header_len;
++ __u16 cpt_ext2_header_len;
++ __u8 cpt_ca_state;
++ __u8 cpt_retransmits;
++ __u8 cpt_reordering;
++ __u8 cpt_frto_counter;
++
++ __u32 cpt_frto_highmark;
++ __u8 cpt_adv_cong;
++ __u8 cpt_defer_accept;
++ __u8 cpt_backoff;
++ __u8 __cpt_pad11;
++
++ __u32 cpt_srtt;
++ __u32 cpt_mdev;
++
++ __u32 cpt_mdev_max;
++ __u32 cpt_rttvar;
++
++ __u32 cpt_rtt_seq;
++ __u32 cpt_rto;
++
++ __u32 cpt_packets_out;
++ __u32 cpt_left_out;
++
++ __u32 cpt_retrans_out;
++ __u32 cpt_snd_ssthresh;
++
++ __u32 cpt_snd_cwnd;
++ __u16 cpt_snd_cwnd_cnt;
++ __u16 cpt_snd_cwnd_clamp;
++
++ __u32 cpt_snd_cwnd_used;
++ __u32 cpt_snd_cwnd_stamp;
++
++ __u32 cpt_timeout;
++ __u32 cpt_ka_timeout;
++
++ __u32 cpt_rcv_wnd;
++ __u32 cpt_rcv_wup;
++
++ __u32 cpt_write_seq;
++ __u32 cpt_pushed_seq;
++
++ __u32 cpt_copied_seq;
++ __u8 cpt_tstamp_ok;
++ __u8 cpt_wscale_ok;
++ __u8 cpt_sack_ok;
++ __u8 cpt_saw_tstamp;
++
++ __u8 cpt_snd_wscale;
++ __u8 cpt_rcv_wscale;
++ __u8 cpt_nonagle;
++ __u8 cpt_keepalive_probes;
++ __u32 cpt_rcv_tsval;
++
++ __u32 cpt_rcv_tsecr;
++ __u32 cpt_ts_recent;
++
++ __u64 cpt_ts_recent_stamp;
++ __u16 cpt_user_mss;
++ __u8 cpt_dsack;
++ __u8 cpt_eff_sacks;
++ __u32 cpt_sack_array[2*5];
++ __u32 cpt_window_clamp;
++
++ __u32 cpt_rcv_ssthresh;
++ __u8 cpt_probes_out;
++ __u8 cpt_num_sacks;
++ __u16 cpt_advmss;
++
++ __u8 cpt_syn_retries;
++ __u8 cpt_ecn_flags;
++ __u16 cpt_prior_ssthresh;
++ __u32 cpt_lost_out;
++
++ __u32 cpt_sacked_out;
++ __u32 cpt_fackets_out;
++
++ __u32 cpt_high_seq;
++ __u32 cpt_retrans_stamp;
++
++ __u32 cpt_undo_marker;
++ __u32 cpt_undo_retrans;
++
++ __u32 cpt_urg_seq;
++ __u16 cpt_urg_data;
++ __u8 cpt_pending;
++ __u8 cpt_urg_mode;
++
++ __u32 cpt_snd_up;
++ __u32 cpt_keepalive_time;
++
++ __u32 cpt_keepalive_intvl;
++ __u32 cpt_linger2;
++
++ __u32 cpt_rcvrtt_rtt;
++ __u32 cpt_rcvrtt_seq;
++
++ __u32 cpt_rcvrtt_time;
++ __u32 __cpt_pad12;
++} __attribute__ ((aligned (8)));
++
++struct cpt_sockmc_image {
++ __u64 cpt_next;
++ __u32 cpt_object;
++ __u16 cpt_hdrlen;
++ __u16 cpt_content;
++
++ __u16 cpt_family;
++ __u16 cpt_mode;
++ __u32 cpt_ifindex;
++ __u32 cpt_mcaddr[4];
++} __attribute__ ((aligned (8)));
++/* Followed by array of source addresses, each zero padded to 16 bytes */
++
++struct cpt_openreq_image
++{
++ __u64 cpt_next;
++ __u32 cpt_object;
++ __u16 cpt_hdrlen;
++ __u16 cpt_content;
++
++ __u32 cpt_rcv_isn;
++ __u32 cpt_snt_isn;
++
++ __u16 cpt_rmt_port;
++ __u16 cpt_mss;
++ __u8 cpt_family;
++ __u8 cpt_retrans;
++ __u8 cpt_snd_wscale;
++ __u8 cpt_rcv_wscale;
++
++ __u8 cpt_tstamp_ok;
++ __u8 cpt_sack_ok;
++ __u8 cpt_wscale_ok;
++ __u8 cpt_ecn_ok;
++ __u8 cpt_acked;
++ __u8 __cpt_pad1;
++ __u16 __cpt_pad2;
++
++ __u32 cpt_window_clamp;
++ __u32 cpt_rcv_wnd;
++ __u32 cpt_ts_recent;
++ __u32 cpt_iif;
++ __u64 cpt_expires;
++
++ __u64 cpt_loc_addr[2];
++ __u64 cpt_rmt_addr[2];
++/*
++ struct ip_options *opt;
++ */
++
++} __attribute__ ((aligned (8)));
++
++struct cpt_skb_image
++{
++ __u64 cpt_next;
++ __u32 cpt_object;
++ __u16 cpt_hdrlen;
++ __u16 cpt_content;
++
++ __u32 cpt_owner;
++ __u32 cpt_queue;
++#define CPT_SKB_NQ 0
++#define CPT_SKB_RQ 1
++#define CPT_SKB_WQ 2
++#define CPT_SKB_OFOQ 3
++
++ __u64 cpt_stamp;
++ __u32 cpt_len;
++ __u32 cpt_hspace;
++ __u32 cpt_tspace;
++ __u32 cpt_h;
++ __u32 cpt_nh;
++ __u32 cpt_mac;
++
++ __u64 cpt_cb[5];
++ __u32 cpt_mac_len;
++ __u32 cpt_csum;
++ __u8 cpt_local_df;
++ __u8 cpt_pkt_type;
++ __u8 cpt_ip_summed;
++ __u8 __cpt_pad1;
++ __u32 cpt_priority;
++ __u16 cpt_protocol;
++ __u16 cpt_security;
++ __u16 cpt_tso_segs;
++ __u16 cpt_tso_size;
++} __attribute__ ((aligned (8)));
++
++
++struct cpt_sysvshm_image
++{
++ __u64 cpt_next;
++ __u32 cpt_object;
++ __u16 cpt_hdrlen;
++ __u16 cpt_content;
++
++ __u64 cpt_key;
++ __u64 cpt_uid;
++ __u64 cpt_gid;
++ __u64 cpt_cuid;
++ __u64 cpt_cgid;
++ __u64 cpt_mode;
++ __u64 cpt_seq;
++
++ __u32 cpt_id;
++ __u32 cpt_mlockuser;
++ __u64 cpt_segsz;
++ __u64 cpt_atime;
++ __u64 cpt_ctime;
++ __u64 cpt_dtime;
++ __u64 cpt_creator;
++ __u64 cpt_last;
++} __attribute__ ((aligned (8)));
++
++
++struct cpt_sysvsem_image
++{
++ __u64 cpt_next;
++ __u32 cpt_object;
++ __u16 cpt_hdrlen;
++ __u16 cpt_content;
++
++ __u64 cpt_key;
++ __u64 cpt_uid;
++ __u64 cpt_gid;
++ __u64 cpt_cuid;
++ __u64 cpt_cgid;
++ __u64 cpt_mode;
++ __u64 cpt_seq;
++ __u32 cpt_id;
++ __u32 __cpt_pad1;
++
++ __u64 cpt_otime;
++ __u64 cpt_ctime;
++} __attribute__ ((aligned (8)));
++/* Content is array of pairs semval/sempid */
++
++struct cpt_sysvsem_undo_image
++{
++ __u64 cpt_next;
++ __u32 cpt_object;
++ __u16 cpt_hdrlen;
++ __u16 cpt_content;
++
++ __u32 cpt_id;
++ __u32 cpt_nsem;
++} __attribute__ ((aligned (8)));
++
++
++struct cpt_mm_image {
++ __u64 cpt_next;
++ __u32 cpt_object;
++ __u16 cpt_hdrlen;
++ __u16 cpt_content;
++
++ __u64 cpt_start_code;
++ __u64 cpt_end_code;
++ __u64 cpt_start_data;
++ __u64 cpt_end_data;
++ __u64 cpt_start_brk;
++ __u64 cpt_brk;
++ __u64 cpt_start_stack;
++ __u64 cpt_start_arg;
++ __u64 cpt_end_arg;
++ __u64 cpt_start_env;
++ __u64 cpt_end_env;
++ __u64 cpt_def_flags;
++ __u64 cpt_mmub;
++ __u8 cpt_dumpable;
++ __u8 cpt_vps_dumpable;
++ __u8 cpt_used_hugetlb;
++ __u8 __cpt_pad;
++} __attribute__ ((aligned (8)));
++
++struct cpt_page_block
++{
++ __u64 cpt_next;
++ __u32 cpt_object;
++ __u16 cpt_hdrlen;
++ __u16 cpt_content;
++
++ __u64 cpt_start;
++ __u64 cpt_end;
++} __attribute__ ((aligned (8)));
++
++struct cpt_remappage_block
++{
++ __u64 cpt_next;
++ __u32 cpt_object;
++ __u16 cpt_hdrlen;
++ __u16 cpt_content;
++
++ __u64 cpt_start;
++ __u64 cpt_end;
++ __u64 cpt_pgoff;
++} __attribute__ ((aligned (8)));
++
++struct cpt_copypage_block
++{
++ __u64 cpt_next;
++ __u32 cpt_object;
++ __u16 cpt_hdrlen;
++ __u16 cpt_content;
++
++ __u64 cpt_start;
++ __u64 cpt_end;
++ __u64 cpt_source;
++} __attribute__ ((aligned (8)));
++
++struct cpt_lazypage_block
++{
++ __u64 cpt_next;
++ __u32 cpt_object;
++ __u16 cpt_hdrlen;
++ __u16 cpt_content;
++
++ __u64 cpt_start;
++ __u64 cpt_end;
++ __u64 cpt_index;
++} __attribute__ ((aligned (8)));
++
++struct cpt_vma_image
++{
++ __u64 cpt_next;
++ __u32 cpt_object;
++ __u16 cpt_hdrlen;
++ __u16 cpt_content;
++
++ __u64 cpt_file;
++ __u32 cpt_type;
++#define CPT_VMA_TYPE_0 0
++#define CPT_VMA_TYPE_SHM 1
++ __u32 cpt_anonvma;
++ __u64 cpt_anonvmaid;
++
++ __u64 cpt_start;
++ __u64 cpt_end;
++ __u64 cpt_flags;
++ __u64 cpt_pgprot;
++ __u64 cpt_pgoff;
++} __attribute__ ((aligned (8)));
++
++struct cpt_aio_ctx_image {
++ __u64 cpt_next;
++ __u32 cpt_object;
++ __u16 cpt_hdrlen;
++ __u16 cpt_content;
++
++ __u32 cpt_max_reqs;
++ __u32 cpt_ring_pages;
++ __u32 cpt_tail;
++ __u32 cpt_nr;
++ __u64 cpt_mmap_base;
++ /* Data (io_event's) and struct aio_ring are stored in user space VM */
++} __attribute__ ((aligned (8)));
++
++
++/* Format of MM section.
++ *
++ * It is array of MM objects (mm_struct). Each MM object is
++ * header, encoding mm_struct, followed by array of VMA objects.
++ * Each VMA consists of VMA header, encoding vm_area_struct, and
++ * if the VMA contains copied pages, the header is followed by
++ * array of tuples start-end each followed by data.
++ *
++ * ATTN: no block/page alignment. Only 64bit alignment. This might not be good?
++ */
++
++struct cpt_restart_block {
++ __u64 fn;
++#define CPT_RBL_0 0
++#define CPT_RBL_NANOSLEEP 1
++#define CPT_RBL_COMPAT_NANOSLEEP 2
++ __u64 arg0;
++ __u64 arg1;
++ __u64 arg2;
++ __u64 arg3;
++} __attribute__ ((aligned (8)));
++
++struct cpt_siginfo_image {
++ __u64 cpt_next;
++ __u32 cpt_object;
++ __u16 cpt_hdrlen;
++ __u16 cpt_content;
++
++ __u32 cpt_qflags;
++ __u32 cpt_signo;
++ __u32 cpt_errno;
++ __u32 cpt_code;
++
++ __u64 cpt_sigval;
++ __u32 cpt_pid;
++ __u32 cpt_uid;
++ __u64 cpt_utime;
++ __u64 cpt_stime;
++
++ __u64 cpt_user;
++} __attribute__ ((aligned (8)));
++
++/* Portable presentations for segment registers */
++
++#define CPT_SEG_ZERO 0
++#define CPT_SEG_TLS1 1
++#define CPT_SEG_TLS2 2
++#define CPT_SEG_TLS3 3
++#define CPT_SEG_USER32_DS 4
++#define CPT_SEG_USER32_CS 5
++#define CPT_SEG_USER64_DS 6
++#define CPT_SEG_USER64_CS 7
++#define CPT_SEG_LDT 256
++
++struct cpt_x86_regs
++{
++ __u64 cpt_next;
++ __u32 cpt_object;
++ __u16 cpt_hdrlen;
++ __u16 cpt_content;
++
++ __u32 cpt_debugreg[8];
++ __u32 cpt_fs;
++ __u32 cpt_gs;
++
++ __u32 cpt_ebx;
++ __u32 cpt_ecx;
++ __u32 cpt_edx;
++ __u32 cpt_esi;
++ __u32 cpt_edi;
++ __u32 cpt_ebp;
++ __u32 cpt_eax;
++ __u32 cpt_xds;
++ __u32 cpt_xes;
++ __u32 cpt_orig_eax;
++ __u32 cpt_eip;
++ __u32 cpt_xcs;
++ __u32 cpt_eflags;
++ __u32 cpt_esp;
++ __u32 cpt_xss;
++ __u32 cpt_pad;
++};
++
++struct cpt_x86_64_regs
++{
++ __u64 cpt_next;
++ __u32 cpt_object;
++ __u16 cpt_hdrlen;
++ __u16 cpt_content;
++
++ __u64 cpt_debugreg[8];
++
++ __u64 cpt_fsbase;
++ __u64 cpt_gsbase;
++ __u32 cpt_fsindex;
++ __u32 cpt_gsindex;
++ __u32 cpt_ds;
++ __u32 cpt_es;
++
++ __u64 cpt_r15;
++ __u64 cpt_r14;
++ __u64 cpt_r13;
++ __u64 cpt_r12;
++ __u64 cpt_rbp;
++ __u64 cpt_rbx;
++ __u64 cpt_r11;
++ __u64 cpt_r10;
++ __u64 cpt_r9;
++ __u64 cpt_r8;
++ __u64 cpt_rax;
++ __u64 cpt_rcx;
++ __u64 cpt_rdx;
++ __u64 cpt_rsi;
++ __u64 cpt_rdi;
++ __u64 cpt_orig_rax;
++ __u64 cpt_rip;
++ __u64 cpt_cs;
++ __u64 cpt_eflags;
++ __u64 cpt_rsp;
++ __u64 cpt_ss;
++};
++
++struct cpt_task_image {
++ __u64 cpt_next;
++ __u32 cpt_object;
++ __u16 cpt_hdrlen;
++ __u16 cpt_content;
++
++ __u64 cpt_state;
++ __u64 cpt_flags;
++ __u64 cpt_ptrace;
++ __u32 cpt_prio;
++ __u32 cpt_static_prio;
++ __u32 cpt_policy;
++ __u32 cpt_rt_priority;
++
++ /* struct thread_info */
++ __u64 cpt_exec_domain;
++ __u64 cpt_thrflags;
++ __u64 cpt_thrstatus;
++ __u64 cpt_addr_limit;
++
++ __u64 cpt_personality;
++
++ __u64 cpt_mm;
++ __u64 cpt_files;
++ __u64 cpt_fs;
++ __u64 cpt_signal;
++ __u64 cpt_sighand;
++ __u64 cpt_sigblocked;
++ __u64 cpt_sigrblocked;
++ __u64 cpt_sigpending;
++ __u64 cpt_namespace;
++ __u64 cpt_sysvsem_undo;
++ __u32 cpt_pid;
++ __u32 cpt_tgid;
++ __u32 cpt_ppid;
++ __u32 cpt_rppid;
++ __u32 cpt_pgrp;
++ __u32 cpt_session;
++ __u32 cpt_old_pgrp;
++ __u32 __cpt_pad;
++ __u32 cpt_leader;
++ __u8 cpt_pn_state;
++ __u8 cpt_stopped_state;
++ __u8 cpt_sigsuspend_state;
++ __u8 cpt_64bit;
++ __u64 cpt_set_tid;
++ __u64 cpt_clear_tid;
++ __u32 cpt_exit_code;
++ __u32 cpt_exit_signal;
++ __u32 cpt_pdeath_signal;
++ __u32 cpt_user;
++ __u32 cpt_uid;
++ __u32 cpt_euid;
++ __u32 cpt_suid;
++ __u32 cpt_fsuid;
++ __u32 cpt_gid;
++ __u32 cpt_egid;
++ __u32 cpt_sgid;
++ __u32 cpt_fsgid;
++ __u32 cpt_ngids;
++ __u32 cpt_gids[32];
++ __u32 __cpt_pad2;
++ __u64 cpt_ecap;
++ __u64 cpt_icap;
++ __u64 cpt_pcap;
++ __u8 cpt_comm[16];
++ __u64 cpt_tls[3];
++ struct cpt_restart_block cpt_restart;
++ __u64 cpt_it_real_value; /* V0: jiffies, V1: nsec */
++ __u64 cpt_it_real_incr; /* V0: jiffies, V1: nsec */
++ __u64 cpt_it_prof_value;
++ __u64 cpt_it_prof_incr;
++ __u64 cpt_it_virt_value;
++ __u64 cpt_it_virt_incr;
++
++ __u16 cpt_used_math;
++ __u8 cpt_keepcap;
++ __u8 cpt_did_exec;
++ __u32 cpt_ptrace_message;
++
++ __u64 cpt_utime;
++ __u64 cpt_stime;
++ __u64 cpt_starttime; /* V0: jiffies, V1: timespec */
++ __u64 cpt_nvcsw;
++ __u64 cpt_nivcsw;
++ __u64 cpt_min_flt;
++ __u64 cpt_maj_flt;
++
++ __u64 cpt_sigsuspend_blocked;
++ __u64 cpt_cutime, cpt_cstime;
++ __u64 cpt_cnvcsw, cpt_cnivcsw;
++ __u64 cpt_cmin_flt, cpt_cmaj_flt;
++
++#define CPT_RLIM_NLIMITS 16
++ __u64 cpt_rlim_cur[CPT_RLIM_NLIMITS];
++ __u64 cpt_rlim_max[CPT_RLIM_NLIMITS];
++
++ __u64 cpt_task_ub;
++ __u64 cpt_exec_ub;
++ __u64 cpt_mm_ub;
++ __u64 cpt_fork_sub;
++} __attribute__ ((aligned (8)));
++
++struct cpt_sigaltstack_image {
++ __u64 cpt_next;
++ __u32 cpt_object;
++ __u16 cpt_hdrlen;
++ __u16 cpt_content;
++
++ __u64 cpt_stack;
++ __u32 cpt_stacksize;
++ __u32 __cpt_pad1;
++} __attribute__ ((aligned (8)));
++
++struct cpt_signal_image {
++ __u64 cpt_next;
++ __u32 cpt_object;
++ __u16 cpt_hdrlen;
++ __u16 cpt_content;
++
++ __u32 cpt_leader;
++ __u8 cpt_pgrp_type;
++ __u8 cpt_old_pgrp_type;
++ __u8 cpt_session_type;
++#define CPT_PGRP_NORMAL 0
++#define CPT_PGRP_ORPHAN 1
++#define CPT_PGRP_STRAY 2
++ __u8 __cpt_pad1;
++ __u64 cpt_pgrp;
++ __u64 cpt_old_pgrp;
++ __u64 cpt_session;
++ __u64 cpt_sigpending;
++ __u64 cpt_ctty;
++
++ __u32 cpt_curr_target;
++ __u32 cpt_group_exit;
++ __u32 cpt_group_exit_code;
++ __u32 cpt_group_exit_task;
++ __u32 cpt_notify_count;
++ __u32 cpt_group_stop_count;
++ __u32 cpt_stop_state;
++ __u32 __cpt_pad2;
++
++ __u64 cpt_utime, cpt_stime, cpt_cutime, cpt_cstime;
++ __u64 cpt_nvcsw, cpt_nivcsw, cpt_cnvcsw, cpt_cnivcsw;
++ __u64 cpt_min_flt, cpt_maj_flt, cpt_cmin_flt, cpt_cmaj_flt;
++
++ __u64 cpt_rlim_cur[CPT_RLIM_NLIMITS];
++ __u64 cpt_rlim_max[CPT_RLIM_NLIMITS];
++} __attribute__ ((aligned (8)));
++/* Followed by list of posix timers. */
++
++struct cpt_sighand_image {
++ __u64 cpt_next;
++ __u32 cpt_object;
++ __u16 cpt_hdrlen;
++ __u16 cpt_content;
++
++} __attribute__ ((aligned (8)));
++/* Followed by list of sighandles. */
++
++struct cpt_sighandler_image {
++ __u64 cpt_next;
++ __u32 cpt_object;
++ __u16 cpt_hdrlen;
++ __u16 cpt_content;
++
++ __u32 cpt_signo;
++ __u32 __cpt_pad1;
++ __u64 cpt_handler;
++ __u64 cpt_restorer;
++ __u64 cpt_flags;
++ __u64 cpt_mask;
++} __attribute__ ((aligned (8)));
++
++struct cpt_netdev_image {
++ __u64 cpt_next;
++ __u32 cpt_object;
++ __u16 cpt_hdrlen;
++ __u16 cpt_content;
++
++ __u32 cpt_index;
++ __u32 cpt_flags;
++ __u8 cpt_name[16];
++} __attribute__ ((aligned (8)));
++
++struct cpt_ifaddr_image {
++ __u64 cpt_next;
++ __u32 cpt_object;
++ __u16 cpt_hdrlen;
++ __u16 cpt_content;
++
++ __u32 cpt_index;
++ __u8 cpt_family;
++ __u8 cpt_masklen;
++ __u8 cpt_flags;
++ __u8 cpt_scope;
++ __u32 cpt_address[4];
++ __u32 cpt_peer[4];
++ __u32 cpt_broadcast[4];
++ __u8 cpt_label[16];
++} __attribute__ ((aligned (8)));
++
++struct cpt_ipct_tuple
++{
++ __u32 cpt_src;
++ __u16 cpt_srcport;
++ __u16 __cpt_pad1;
++
++ __u32 cpt_dst;
++ __u16 cpt_dstport;
++ __u8 cpt_protonum;
++ __u8 cpt_dir; /* TEMPORARY HACK TO VALIDATE CODE */
++} __attribute__ ((aligned (8)));
++
++struct cpt_nat_manip
++{
++ __u8 cpt_direction;
++ __u8 cpt_hooknum;
++ __u8 cpt_maniptype;
++ __u8 __cpt_pad1;
++
++ __u32 cpt_manip_addr;
++ __u16 cpt_manip_port;
++ __u16 __cpt_pad2;
++ __u32 __cpt_pad3;
++} __attribute__ ((aligned (8)));
++
++struct cpt_nat_seq
++{
++ __u32 cpt_correction_pos;
++ __u32 cpt_offset_before;
++ __u32 cpt_offset_after;
++ __u32 __cpt_pad1;
++} __attribute__ ((aligned (8)));
++
++struct cpt_ip_connexpect_image
++{
++ __u64 cpt_next;
++ __u32 cpt_object;
++ __u16 cpt_hdrlen;
++ __u16 cpt_content;
++
++ __u64 cpt_timeout;
++ __u32 cpt_sibling_conntrack; /* Index of child conntrack */
++ __u32 cpt_seq; /* id in 2.6.15 */
++
++ struct cpt_ipct_tuple cpt_ct_tuple; /* NU 2.6.15 */
++ struct cpt_ipct_tuple cpt_tuple;
++ struct cpt_ipct_tuple cpt_mask;
++
++ /* union ip_conntrack_expect_help. Used by ftp, irc, amanda */
++ __u32 cpt_help[3]; /* NU 2.6.15 */
++ __u16 cpt_manip_proto;
++ __u8 cpt_dir;
++ __u8 cpt_flags;
++} __attribute__ ((aligned (8)));
++
++struct cpt_ip_conntrack_image
++{
++ __u64 cpt_next;
++ __u32 cpt_object;
++ __u16 cpt_hdrlen;
++ __u16 cpt_content;
++
++ struct cpt_ipct_tuple cpt_tuple[2];
++ __u64 cpt_status;
++ __u64 cpt_timeout;
++ __u32 cpt_index;
++ __u8 cpt_ct_helper;
++ __u8 cpt_nat_helper;
++ __u16 cpt_pad1;
++
++ /* union ip_conntrack_proto. Used by tcp and icmp. */
++ __u32 cpt_proto_data[12];
++
++ /* union ip_conntrack_help. Used by ftp and pptp helper.
++ * We do not support pptp...
++ */
++ __u32 cpt_help_data[6];
++
++ /* nat info */
++ __u32 cpt_initialized; /* NU 2.6.15 */
++ __u32 cpt_num_manips; /* NU 2.6.15 */
++ struct cpt_nat_manip cpt_nat_manips[6]; /* NU 2.6.15 */
++
++ struct cpt_nat_seq cpt_nat_seq[2];
++
++ __u32 cpt_masq_index;
++ __u32 cpt_id;
++ __u32 cpt_mark;
++} __attribute__ ((aligned (8)));
++
++struct cpt_beancounter_image {
++ __u64 cpt_next;
++ __u32 cpt_object;
++ __u16 cpt_hdrlen;
++ __u16 cpt_content;
++
++ __u64 cpt_parent;
++ __u32 cpt_id;
++ __u32 __cpt_pad;
++ __u64 cpt_parms[32 * 6 * 2];
++} __attribute__ ((aligned (8)));
++
++#ifdef __KERNEL__
++
++static inline void *cpt_ptr_import(__u64 ptr)
++{
++ return (void*)(unsigned long)ptr;
++}
++
++static inline __u64 cpt_ptr_export(void __user *ptr)
++{
++ return (__u64)(unsigned long)ptr;
++}
++
++static inline void cpt_sigset_import(sigset_t *sig, __u64 ptr)
++{
++ memcpy(sig, &ptr, sizeof(*sig));
++}
++
++static inline __u64 cpt_sigset_export(sigset_t *sig)
++{
++ return *(__u64*)sig;
++}
++
++static inline __u64 cpt_timespec_export(struct timespec *tv)
++{
++ return (((u64)tv->tv_sec) << 32) + tv->tv_nsec;
++}
++
++static inline void cpt_timespec_import(struct timespec *tv, __u64 val)
++{
++ tv->tv_sec = val>>32;
++ tv->tv_nsec = (val&0xFFFFFFFF);
++}
++
++static inline __u64 cpt_timeval_export(struct timeval *tv)
++{
++ return (((u64)tv->tv_sec) << 32) + tv->tv_usec;
++}
++
++static inline void cpt_timeval_import(struct timeval *tv, __u64 val)
++{
++ tv->tv_sec = val>>32;
++ tv->tv_usec = (val&0xFFFFFFFF);
++}
++
++#endif
++
++#endif /* __CPT_IMAGE_H_ */
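
Since cpt_image.h only declares the on-disk records, a hedged reader sketch may help to make the layout comment concrete: it validates the cpt_major_hdr magic and peeks at the first section header, which the layout comment says follows the major header directly. It assumes the patched header has been installed for userspace builds; walking further sections via cpt_next is left out because the header does not spell out that field's exact semantics.

/* Hedged sketch: sanity-check a CPT image file header. */
#include <stdio.h>
#include <linux/types.h>
#include <linux/cpt_image.h>	/* the header added above, assumed installed */

int main(int argc, char **argv)
{
	struct cpt_major_hdr hdr;
	struct cpt_section_hdr sec;
	FILE *f;

	if (argc != 2 || !(f = fopen(argv[1], "rb")))
		return 1;

	if (fread(&hdr, sizeof(hdr), 1, f) != 1 ||
	    hdr.cpt_signature[0] != CPT_SIGNATURE0 ||
	    hdr.cpt_signature[1] != CPT_SIGNATURE1 ||
	    hdr.cpt_signature[2] != CPT_SIGNATURE2 ||
	    hdr.cpt_signature[3] != CPT_SIGNATURE3) {
		fprintf(stderr, "not a CPT image\n");
		return 1;
	}
	printf("image version %u, arch %u, page size %u, HZ %u\n",
	       hdr.cpt_image_version, hdr.cpt_os_arch,
	       hdr.cpt_pagesize, hdr.cpt_hz);

	/* The first section header starts right after the major header. */
	if (fseek(f, hdr.cpt_hdrlen, SEEK_SET) == 0 &&
	    fread(&sec, sizeof(sec), 1, f) == 1)
		printf("first section: id %u, hdrlen %u\n",
		       sec.cpt_section, sec.cpt_hdrlen);

	fclose(f);
	return 0;
}
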
+diff -upr linux-2.6.16.orig/include/linux/cpt_ioctl.h linux-2.6.16-026test015/include/linux/cpt_ioctl.h
+--- linux-2.6.16.orig/include/linux/cpt_ioctl.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/include/linux/cpt_ioctl.h 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,41 @@
++/*
++ *
++ * include/linux/cpt_ioctl.h
++ *
++ * Copyright (C) 2000-2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef _CPT_IOCTL_H_
++#define _CPT_IOCTL_H_ 1
++
++#include <linux/types.h>
++#include <linux/ioctl.h>
++
++#define CPTCTLTYPE '-'
++#define CPT_SET_DUMPFD _IOW(CPTCTLTYPE, 1, int)
++#define CPT_SET_STATUSFD _IOW(CPTCTLTYPE, 2, int)
++#define CPT_SET_LOCKFD _IOW(CPTCTLTYPE, 3, int)
++#define CPT_SET_VEID _IOW(CPTCTLTYPE, 4, int)
++#define CPT_SUSPEND _IO(CPTCTLTYPE, 5)
++#define CPT_DUMP _IO(CPTCTLTYPE, 6)
++#define CPT_UNDUMP _IO(CPTCTLTYPE, 7)
++#define CPT_RESUME _IO(CPTCTLTYPE, 8)
++#define CPT_KILL _IO(CPTCTLTYPE, 9)
++#define CPT_JOIN_CONTEXT _IO(CPTCTLTYPE, 10)
++#define CPT_GET_CONTEXT _IOW(CPTCTLTYPE, 11, unsigned int)
++#define CPT_PUT_CONTEXT _IO(CPTCTLTYPE, 12)
++#define CPT_SET_PAGEINFDIN _IOW(CPTCTLTYPE, 13, int)
++#define CPT_SET_PAGEINFDOUT _IOW(CPTCTLTYPE, 14, int)
++#define CPT_PAGEIND _IO(CPTCTLTYPE, 15)
++#define CPT_VMPREP _IOW(CPTCTLTYPE, 16, int)
++#define CPT_SET_LAZY _IOW(CPTCTLTYPE, 17, int)
++#define CPT_SET_CPU_FLAGS _IOW(CPTCTLTYPE, 18, unsigned int)
++#define CPT_TEST_CAPS _IOW(CPTCTLTYPE, 19, unsigned int)
++#define CPT_TEST_VECAPS _IOW(CPTCTLTYPE, 20, unsigned int)
++#define CPT_SET_ERRORFD _IOW(CPTCTLTYPE, 21, int)
++
++#endif
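
The ioctls above imply a simple checkpoint sequence: bind a context to a VE and a dump file descriptor, then suspend, dump and resume. The sketch below is heavily hedged: the control-file path is a guess (the file is provided by the out-of-tree cpt module, not by this header), and passing the VE id and fd directly in the ioctl argument is an assumption about the module's calling convention.

/* Hedged sketch of a checkpoint sequence using the ioctls above.
 * CPT_CTL_PATH and the value-in-arg calling convention are assumptions. */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/cpt_ioctl.h>	/* the header added above, assumed installed */

#define CPT_CTL_PATH "/proc/cpt"	/* hypothetical path */

int main(void)
{
	int ctl = open(CPT_CTL_PATH, O_RDWR);
	int dump = open("dumpfile", O_WRONLY | O_CREAT | O_TRUNC, 0600);

	if (ctl < 0 || dump < 0)
		return 1;

	ioctl(ctl, CPT_SET_VEID, 101);		/* container to checkpoint */
	ioctl(ctl, CPT_SET_DUMPFD, dump);	/* where the image goes */

	if (ioctl(ctl, CPT_SUSPEND, 0) == 0 && ioctl(ctl, CPT_DUMP, 0) == 0)
		printf("dump written\n");

	ioctl(ctl, CPT_RESUME, 0);		/* let the container run again */
	close(dump);
	close(ctl);
	return 0;
}
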
+diff -upr linux-2.6.16.orig/include/linux/cpu.h linux-2.6.16-026test015/include/linux/cpu.h
+--- linux-2.6.16.orig/include/linux/cpu.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/cpu.h 2006-07-04 14:41:36.000000000 +0400
+@@ -32,7 +32,7 @@ struct cpu {
+ };
+
+ extern int register_cpu(struct cpu *, int, struct node *);
+-extern struct sys_device *get_cpu_sysdev(int cpu);
++extern struct sys_device *get_cpu_sysdev(unsigned cpu);
+ #ifdef CONFIG_HOTPLUG_CPU
+ extern void unregister_cpu(struct cpu *, struct node *);
+ #endif
+diff -upr linux-2.6.16.orig/include/linux/cpumask.h linux-2.6.16-026test015/include/linux/cpumask.h
+--- linux-2.6.16.orig/include/linux/cpumask.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/cpumask.h 2006-07-04 14:41:36.000000000 +0400
+@@ -408,6 +408,7 @@ extern cpumask_t cpu_present_map;
+ })
+
+ #define for_each_cpu(cpu) for_each_cpu_mask((cpu), cpu_possible_map)
++#define for_each_possible_cpu(cpu) for_each_cpu_mask((cpu), cpu_possible_map)
+ #define for_each_online_cpu(cpu) for_each_cpu_mask((cpu), cpu_online_map)
+ #define for_each_present_cpu(cpu) for_each_cpu_mask((cpu), cpu_present_map)
+
+diff -upr linux-2.6.16.orig/include/linux/dcache.h linux-2.6.16-026test015/include/linux/dcache.h
+--- linux-2.6.16.orig/include/linux/dcache.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/dcache.h 2006-07-04 14:41:38.000000000 +0400
+@@ -9,6 +9,8 @@
+ #include <linux/cache.h>
+ #include <linux/rcupdate.h>
+
++#include <ub/ub_dcache.h>
++
+ struct nameidata;
+ struct vfsmount;
+
+@@ -111,6 +113,9 @@ struct dentry {
+ struct dcookie_struct *d_cookie; /* cookie, if any */
+ #endif
+ int d_mounted;
++#ifdef CONFIG_USER_RESOURCE
++ struct dentry_beancounter dentry_bc;
++#endif
+ unsigned char d_iname[DNAME_INLINE_LEN_MIN]; /* small names */
+ };
+
+@@ -161,7 +166,11 @@ d_iput: no no no yes
+
+ #define DCACHE_REFERENCED 0x0008 /* Recently used, don't discard. */
+ #define DCACHE_UNHASHED 0x0010
++#define DCACHE_VIRTUAL 0x0100 /* ve accessible */
++
++extern void mark_tree_virtual(struct vfsmount *m, struct dentry *d);
+
++extern kmem_cache_t *dentry_cache;
+ extern spinlock_t dcache_lock;
+
+ /**
+@@ -215,7 +224,7 @@ extern struct dentry * d_alloc_anon(stru
+ extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
+ extern void shrink_dcache_sb(struct super_block *);
+ extern void shrink_dcache_parent(struct dentry *);
+-extern void shrink_dcache_anon(struct hlist_head *);
++extern void shrink_dcache_anon(struct super_block *);
+ extern int d_invalidate(struct dentry *);
+
+ /* only used at mount-time */
+@@ -277,6 +286,7 @@ extern struct dentry * __d_lookup(struct
+ /* validate "insecure" dentry pointer */
+ extern int d_validate(struct dentry *, struct dentry *);
+
++extern int d_root_check(struct dentry *, struct vfsmount *);
+ extern char * d_path(struct dentry *, struct vfsmount *, char *, int);
+
+ /* Allocation counts.. */
+@@ -297,6 +307,8 @@ extern char * d_path(struct dentry *, st
+ static inline struct dentry *dget(struct dentry *dentry)
+ {
+ if (dentry) {
++ if (ub_dget_testone(dentry))
++ BUG();
+ BUG_ON(!atomic_read(&dentry->d_count));
+ atomic_inc(&dentry->d_count);
+ }
+@@ -340,6 +352,8 @@ extern struct dentry *lookup_create(stru
+
+ extern int sysctl_vfs_cache_pressure;
+
++extern int check_area_access_ve(struct dentry *, struct vfsmount *);
++extern int check_area_execute_ve(struct dentry *, struct vfsmount *);
+ #endif /* __KERNEL__ */
+
+ #endif /* __LINUX_DCACHE_H */
+diff -upr linux-2.6.16.orig/include/linux/devpts_fs.h linux-2.6.16-026test015/include/linux/devpts_fs.h
+--- linux-2.6.16.orig/include/linux/devpts_fs.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/devpts_fs.h 2006-07-04 14:41:38.000000000 +0400
+@@ -21,6 +21,15 @@ int devpts_pty_new(struct tty_struct *tt
+ struct tty_struct *devpts_get_tty(int number); /* get tty structure */
+ void devpts_pty_kill(int number); /* unlink */
+
++struct devpts_config {
++ int setuid;
++ int setgid;
++ uid_t uid;
++ gid_t gid;
++ umode_t mode;
++};
++
++extern struct devpts_config devpts_config;
+ #else
+
+ /* Dummy stubs in the no-pty case */
+diff -upr linux-2.6.16.orig/include/linux/elfcore.h linux-2.6.16-026test015/include/linux/elfcore.h
+--- linux-2.6.16.orig/include/linux/elfcore.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/elfcore.h 2006-07-04 14:41:39.000000000 +0400
+@@ -7,6 +7,8 @@
+ #include <linux/user.h>
+ #include <linux/ptrace.h>
+
++extern int sysctl_at_vsyscall;
++
+ struct elf_siginfo
+ {
+ int si_signo; /* signal number */
+diff -upr linux-2.6.16.orig/include/linux/eventpoll.h linux-2.6.16-026test015/include/linux/eventpoll.h
+--- linux-2.6.16.orig/include/linux/eventpoll.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/eventpoll.h 2006-07-04 14:41:39.000000000 +0400
+@@ -85,6 +85,91 @@ static inline void eventpoll_release(str
+ eventpoll_release_file(file);
+ }
+
++struct epoll_filefd {
++ struct file *file;
++ int fd;
++};
++
++/*
++ * This structure is stored inside the "private_data" member of the file
++ * structure and represents the main data structure for the eventpoll
++ * interface.
++ */
++struct eventpoll {
++ /* Protect the this structure access */
++ rwlock_t lock;
++
++ /*
++ * This semaphore is used to ensure that files are not removed
++ * while epoll is using them. This is read-held during the event
++ * collection loop and it is write-held during the file cleanup
++ * path, the epoll file exit code and the ctl operations.
++ */
++ struct rw_semaphore sem;
++
++ /* Wait queue used by sys_epoll_wait() */
++ wait_queue_head_t wq;
++
++ /* Wait queue used by file->poll() */
++ wait_queue_head_t poll_wait;
++
++ /* List of ready file descriptors */
++ struct list_head rdllist;
++
++ /* RB-Tree root used to store monitored fd structs */
++ struct rb_root rbr;
++};
++
++/*
++ * Each file descriptor added to the eventpoll interface will
++ * have an entry of this type linked to the hash.
++ */
++struct epitem {
++ /* RB-Tree node used to link this structure to the eventpoll rb-tree */
++ struct rb_node rbn;
++
++ /* List header used to link this structure to the eventpoll ready list */
++ struct list_head rdllink;
++
++ /* The file descriptor information this item refers to */
++ struct epoll_filefd ffd;
++
++ /* Number of active wait queue attached to poll operations */
++ int nwait;
++
++ /* List containing poll wait queues */
++ struct list_head pwqlist;
++
++ /* The "container" of this item */
++ struct eventpoll *ep;
++
++ /* The structure that describe the interested events and the source fd */
++ struct epoll_event event;
++
++ /*
++ * Used to keep track of the usage count of the structure. This prevents
++ * the structure from disappearing from underneath our processing.
++ */
++ atomic_t usecnt;
++
++ /* List header used to link this item to the "struct file" items list */
++ struct list_head fllink;
++
++ /* List header used to link the item to the transfer list */
++ struct list_head txlink;
++
++ /*
++ * This is used during the collection/transfer of events to userspace
++ * to pin items empty events set.
++ */
++ unsigned int revents;
++};
++
++extern struct semaphore epsem;
++struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd);
++int ep_insert(struct eventpoll *ep, struct epoll_event *event,
++ struct file *tfile, int fd);
++void ep_release_epitem(struct epitem *epi);
+
+ #else
+
+diff -upr linux-2.6.16.orig/include/linux/fairsched.h linux-2.6.16-026test015/include/linux/fairsched.h
+--- linux-2.6.16.orig/include/linux/fairsched.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/include/linux/fairsched.h 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,118 @@
++#ifndef __LINUX_FAIRSCHED_H__
++#define __LINUX_FAIRSCHED_H__
++
++/*
++ * Fair Scheduler
++ *
++ * Copyright (C) 2000-2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/cache.h>
++#include <asm/timex.h>
++
++#define FAIRSCHED_HAS_CPU_BINDING 0
++
++typedef struct { cycles_t t; } fschtag_t;
++typedef struct { unsigned long d; } fschdur_t;
++typedef struct { cycles_t v; } fschvalue_t;
++
++struct vcpu_scheduler;
++
++struct fairsched_node {
++ struct list_head runlist;
++
++ /*
++ * Fair Scheduler fields
++ *
++ * nr_running >= nr_ready (!= if delayed)
++ */
++ fschtag_t start_tag;
++ int nr_ready;
++ int nr_runnable;
++ int nr_pcpu;
++
++ /*
++ * Rate limitator fields
++ */
++ cycles_t last_updated_at;
++ fschvalue_t value; /* leaky function value */
++ cycles_t delay; /* removed from schedule till */
++ unsigned char delayed;
++
++ /*
++ * Configuration
++ *
++ * Read-only most of the time.
++ */
++ unsigned weight ____cacheline_aligned_in_smp;
++ /* fairness weight */
++ unsigned char rate_limited;
++ unsigned rate; /* max CPU share */
++ fschtag_t max_latency;
++ unsigned min_weight;
++
++ struct list_head nodelist;
++ int id;
++#ifdef CONFIG_VE
++ struct ve_struct *owner_env;
++#endif
++ struct vcpu_scheduler *vsched;
++};
++
++#ifdef CONFIG_FAIRSCHED
++
++#define FSCHWEIGHT_MAX ((1 << 16) - 1)
++#define FSCHRATE_SHIFT 10
++
++/*
++ * Fairsched nodes used in boot process.
++ */
++extern struct fairsched_node fairsched_init_node;
++extern struct fairsched_node fairsched_idle_node;
++
++/*
++ * For proc output.
++ */
++extern unsigned fairsched_nr_cpus;
++extern void fairsched_cpu_online_map(int id, cpumask_t *mask);
++
++/* I hope vsched_id is always equal to fairsched node id --SAW */
++#define task_fairsched_node_id(p) task_vsched_id(p)
++
++/*
++ * Core functions.
++ */
++extern void fairsched_incrun(struct fairsched_node *node);
++extern void fairsched_decrun(struct fairsched_node *node);
++extern void fairsched_inccpu(struct fairsched_node *node);
++extern void fairsched_deccpu(struct fairsched_node *node);
++extern struct fairsched_node *fairsched_schedule(
++ struct fairsched_node *prev_node,
++ struct fairsched_node *cur_node,
++ int cur_node_active,
++ cycles_t time);
++
++/*
++ * Management functions.
++ */
++void fairsched_init_early(void);
++asmlinkage int sys_fairsched_mknod(unsigned int parent, unsigned int weight,
++ unsigned int newid);
++asmlinkage int sys_fairsched_rmnod(unsigned int id);
++asmlinkage int sys_fairsched_mvpr(pid_t pid, unsigned int nodeid);
++
++#else /* CONFIG_FAIRSCHED */
++
++#define task_fairsched_node_id(p) 0
++#define fairsched_incrun(p) do { } while (0)
++#define fairsched_decrun(p) do { } while (0)
++#define fairsched_deccpu(p) do { } while (0)
++#define fairsched_cpu_online_map(id, mask) do { *(mask) = cpu_online_map; } while (0)
++
++#endif /* CONFIG_FAIRSCHED */
++
++#endif /* __LINUX_FAIRSCHED_H__ */
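Editor's note: the management syscalls declared above (sys_fairsched_mknod/rmnod/mvpr) create a
scheduling node with a given weight, destroy it, and move a task under it. A hedged userspace
sketch follows; the __NR_fairsched_* numbers are arch-specific and come from the patched unistd.h
(not shown here), so the values below are placeholders, not real numbers:

#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

/* Placeholders: substitute the real __NR_fairsched_* values from the
 * patched asm/unistd.h for your architecture. */
#ifndef __NR_fairsched_mknod
#define __NR_fairsched_mknod 500   /* hypothetical */
#define __NR_fairsched_mvpr  501   /* hypothetical */
#endif

int main(void)
{
	unsigned parent = 0, weight = 500, newid = 1000;
	long node = syscall(__NR_fairsched_mknod, parent, weight, newid);
	if (node < 0) { perror("fairsched_mknod"); return 1; }

	/* Put the current process under the new node's CPU weight. */
	if (syscall(__NR_fairsched_mvpr, (pid_t)getpid(), newid) < 0)
		perror("fairsched_mvpr");
	return 0;
}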
+diff -upr linux-2.6.16.orig/include/linux/faudit.h linux-2.6.16-026test015/include/linux/faudit.h
+--- linux-2.6.16.orig/include/linux/faudit.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/include/linux/faudit.h 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,46 @@
++/*
++ * include/linux/faudit.h
++ *
++ * Copyright (C) 2005 SWSoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __FAUDIT_H_
++#define __FAUDIT_H_
++
++#include <linux/config.h>
++#include <linux/virtinfo.h>
++
++struct vfsmount;
++struct dentry;
++struct super_block;
++struct kstatfs;
++struct kstat;
++struct pt_regs;
++
++struct faudit_regs_arg {
++ int err;
++ struct pt_regs *regs;
++};
++
++struct faudit_stat_arg {
++ int err;
++ struct vfsmount *mnt;
++ struct dentry *dentry;
++ struct kstat *stat;
++};
++
++struct faudit_statfs_arg {
++ int err;
++ struct super_block *sb;
++ struct kstatfs *stat;
++};
++
++#define VIRTINFO_FAUDIT (0)
++#define VIRTINFO_FAUDIT_STAT (VIRTINFO_FAUDIT + 0)
++#define VIRTINFO_FAUDIT_STATFS (VIRTINFO_FAUDIT + 1)
++
++#endif
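Editor's note: faudit events are delivered through the virtinfo notifier machinery
(linux/virtinfo.h, added elsewhere in this patch), with the VIRTINFO_FAUDIT_* command selecting
which argument structure is passed. A rough sketch of a consumer; the real callback prototype and
registration helpers live in virtinfo.h and are an assumption here, only the cmd/data handling is
grounded in this header:

/* Sketch: only the dispatch on cmd/data below follows from faudit.h. */
static void handle_faudit(unsigned long cmd, void *data)
{
	if (cmd == VIRTINFO_FAUDIT_STAT) {
		struct faudit_stat_arg *arg = data;
		/* arg->mnt/arg->dentry identify the object being stat()ed;
		 * a handler fills arg->stat and reports via arg->err */
		arg->err = 0;
	} else if (cmd == VIRTINFO_FAUDIT_STATFS) {
		struct faudit_statfs_arg *arg = data;
		arg->err = 0;
	}
}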
+diff -upr linux-2.6.16.orig/include/linux/fb.h linux-2.6.16-026test015/include/linux/fb.h
+--- linux-2.6.16.orig/include/linux/fb.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/fb.h 2006-07-04 14:41:36.000000000 +0400
+@@ -839,12 +839,10 @@ struct fb_info {
+ #define FB_LEFT_POS(bpp) (32 - bpp)
+ #define FB_SHIFT_HIGH(val, bits) ((val) >> (bits))
+ #define FB_SHIFT_LOW(val, bits) ((val) << (bits))
+-#define FB_BIT_NR(b) (7 - (b))
+ #else
+ #define FB_LEFT_POS(bpp) (0)
+ #define FB_SHIFT_HIGH(val, bits) ((val) << (bits))
+ #define FB_SHIFT_LOW(val, bits) ((val) >> (bits))
+-#define FB_BIT_NR(b) (b)
+ #endif
+
+ /*
+diff -upr linux-2.6.16.orig/include/linux/fs.h linux-2.6.16-026test015/include/linux/fs.h
+--- linux-2.6.16.orig/include/linux/fs.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/fs.h 2006-07-04 14:41:39.000000000 +0400
+@@ -7,6 +7,7 @@
+ */
+
+ #include <linux/config.h>
++#include <linux/ve_owner.h>
+ #include <linux/limits.h>
+ #include <linux/ioctl.h>
+
+@@ -64,6 +65,7 @@ extern int dir_notify_enable;
+ #define FMODE_LSEEK 4
+ #define FMODE_PREAD 8
+ #define FMODE_PWRITE FMODE_PREAD /* These go hand in hand */
++#define FMODE_QUOTACTL 4
+
+ #define RW_MASK 1
+ #define RWA_MASK 2
+@@ -83,6 +85,7 @@ extern int dir_notify_enable;
+ /* public flags for file_system_type */
+ #define FS_REQUIRES_DEV 1
+ #define FS_BINARY_MOUNTDATA 2
++#define FS_VIRTUALIZED 64 /* Can mount this fstype inside ve */
+ #define FS_REVAL_DOT 16384 /* Check the paths ".", ".." for staleness */
+ #define FS_ODD_RENAME 32768 /* Temporary stuff; will go away as soon
+ * as nfs_rename() will be cleaned up
+@@ -297,6 +300,9 @@ struct iattr {
+ * Includes for diskquotas.
+ */
+ #include <linux/quota.h>
++#if defined(CONFIG_VZ_QUOTA) || defined(CONFIG_VZ_QUOTA_MODULE)
++#include <linux/vzquota_qlnk.h>
++#endif
+
+ /**
+ * enum positive_aop_returns - aop return codes with specific semantics
+@@ -493,6 +499,9 @@ struct inode {
+ #ifdef CONFIG_QUOTA
+ struct dquot *i_dquot[MAXQUOTAS];
+ #endif
++#if defined(CONFIG_VZ_QUOTA) || defined(CONFIG_VZ_QUOTA_MODULE)
++ struct vz_quota_ilink i_qlnk;
++#endif
+ /* These three should probably be a union */
+ struct list_head i_devices;
+ struct pipe_inode_info *i_pipe;
+@@ -527,6 +536,8 @@ struct inode {
+ #endif
+ };
+
++extern kmem_cache_t *inode_cachep;
++
+ /*
+ * NOTE: in a 32bit arch with a preemptable kernel and
+ * an UP compile the i_size_read/write must be atomic
+@@ -588,6 +599,20 @@ static inline unsigned imajor(struct ino
+
+ extern struct block_device *I_BDEV(struct inode *inode);
+
++struct exec_perm {
++ umode_t mode;
++ uid_t uid, gid;
++ int set;
++};
++
++static inline void set_exec_perm(struct exec_perm *perm, struct inode *ino)
++{
++ perm->set = 1;
++ perm->mode = ino->i_mode;
++ perm->uid = ino->i_uid;
++ perm->gid = ino->i_gid;
++}
++
+ struct fown_struct {
+ rwlock_t lock; /* protects pid, uid, euid fields */
+ int pid; /* pid or -pgrp where SIGIO should be sent */
+@@ -646,7 +671,10 @@ struct file {
+ spinlock_t f_ep_lock;
+ #endif /* #ifdef CONFIG_EPOLL */
+ struct address_space *f_mapping;
++ struct ve_struct *owner_env;
+ };
++DCL_VE_OWNER_PROTO(FILP, struct file, owner_env)
++
+ extern spinlock_t files_lock;
+ #define file_list_lock() spin_lock(&files_lock);
+ #define file_list_unlock() spin_unlock(&files_lock);
+@@ -710,6 +738,9 @@ struct file_lock {
+ struct file *fl_file;
+ unsigned char fl_flags;
+ unsigned char fl_type;
++#ifdef CONFIG_USER_RESOURCE
++ unsigned char fl_charged;
++#endif
+ loff_t fl_start;
+ loff_t fl_end;
+
+@@ -902,7 +933,7 @@ static inline void unlock_super(struct s
+ /*
+ * VFS helper functions..
+ */
+-extern int vfs_permission(struct nameidata *, int);
++extern int vfs_permission(struct nameidata *, int, struct exec_perm *);
+ extern int vfs_create(struct inode *, struct dentry *, int, struct nameidata *);
+ extern int vfs_mkdir(struct inode *, struct dentry *, int);
+ extern int vfs_mknod(struct inode *, struct dentry *, int, dev_t);
+@@ -1041,7 +1072,8 @@ struct inode_operations {
+ void * (*follow_link) (struct dentry *, struct nameidata *);
+ void (*put_link) (struct dentry *, struct nameidata *, void *);
+ void (*truncate) (struct inode *);
+- int (*permission) (struct inode *, int, struct nameidata *);
++ int (*permission) (struct inode *, int, struct nameidata *,
++ struct exec_perm *);
+ int (*setattr) (struct dentry *, struct iattr *);
+ int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *);
+ int (*setxattr) (struct dentry *, const char *,const void *,size_t,int);
+@@ -1089,6 +1121,8 @@ struct super_operations {
+
+ ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
+ ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
++
++ struct inode *(*get_quota_root)(struct super_block *);
+ };
+
+ /* Inode state bits. Protected by inode_lock. */
+@@ -1246,8 +1280,14 @@ struct file_system_type {
+ struct module *owner;
+ struct file_system_type * next;
+ struct list_head fs_supers;
++ struct ve_struct *owner_env;
+ };
+
++DCL_VE_OWNER_PROTO(FSTYPE, struct file_system_type, owner_env)
++
++void get_filesystem(struct file_system_type *fs);
++void put_filesystem(struct file_system_type *fs);
++
+ struct super_block *get_sb_bdev(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *data,
+ int (*fill_super)(struct super_block *, void *, int));
+@@ -1285,6 +1325,7 @@ extern struct vfsmount *kern_mount(struc
+ extern int may_umount_tree(struct vfsmount *);
+ extern int may_umount(struct vfsmount *);
+ extern void umount_tree(struct vfsmount *, int, struct list_head *);
++#define kern_umount mntput
+ extern void release_mounts(struct list_head *);
+ extern long do_mount(char *, char *, char *, unsigned long, void *);
+ extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
+@@ -1292,6 +1333,7 @@ extern void mnt_set_mountpoint(struct vf
+ struct vfsmount *);
+
+ extern int vfs_statfs(struct super_block *, struct kstatfs *);
++extern int faudit_statfs(struct super_block *, struct kstatfs *);
+
+ /* /sys/fs */
+ extern struct subsystem fs_subsys;
+@@ -1383,6 +1425,7 @@ extern int bd_claim(struct block_device
+ extern void bd_release(struct block_device *);
+
+ /* fs/char_dev.c */
++#define CHRDEV_MAJOR_HASH_SIZE 255
+ extern int alloc_chrdev_region(dev_t *, unsigned, unsigned, const char *);
+ extern int register_chrdev_region(dev_t, unsigned, const char *);
+ extern int register_chrdev(unsigned int, const char *,
+@@ -1390,25 +1433,17 @@ extern int register_chrdev(unsigned int,
+ extern int unregister_chrdev(unsigned int, const char *);
+ extern void unregister_chrdev_region(dev_t, unsigned);
+ extern int chrdev_open(struct inode *, struct file *);
+-extern int get_chrdev_list(char *);
+-extern void *acquire_chrdev_list(void);
+-extern int count_chrdev_list(void);
+-extern void *get_next_chrdev(void *);
+-extern int get_chrdev_info(void *, int *, char **);
+-extern void release_chrdev_list(void *);
++extern void chrdev_show(struct seq_file *,off_t);
+
+ /* fs/block_dev.c */
++#define BLKDEV_MAJOR_HASH_SIZE 255
+ #define BDEVNAME_SIZE 32 /* Largest string for a blockdev identifier */
+ extern const char *__bdevname(dev_t, char *buffer);
+ extern const char *bdevname(struct block_device *bdev, char *buffer);
+-extern struct block_device *lookup_bdev(const char *);
++extern struct block_device *lookup_bdev(const char *, int mode);
+ extern struct block_device *open_bdev_excl(const char *, int, void *);
+ extern void close_bdev_excl(struct block_device *);
+-extern void *acquire_blkdev_list(void);
+-extern int count_blkdev_list(void);
+-extern void *get_next_blkdev(void *);
+-extern int get_blkdev_info(void *, int *, char **);
+-extern void release_blkdev_list(void *);
++extern void blkdev_show(struct seq_file *,off_t);
+
+ extern void init_special_inode(struct inode *, umode_t, dev_t);
+
+@@ -1433,7 +1468,7 @@ extern int fs_may_remount_ro(struct supe
+ #define bio_data_dir(bio) ((bio)->bi_rw & 1)
+
+ extern int check_disk_change(struct block_device *);
+-extern int invalidate_inodes(struct super_block *);
++extern int invalidate_inodes(struct super_block *, int);
+ extern int __invalidate_device(struct block_device *);
+ extern int invalidate_partition(struct gendisk *, int);
+ unsigned long invalidate_mapping_pages(struct address_space *mapping,
+@@ -1463,9 +1498,10 @@ extern int do_remount_sb(struct super_bl
+ void *data, int force);
+ extern sector_t bmap(struct inode *, sector_t);
+ extern int notify_change(struct dentry *, struct iattr *);
+-extern int permission(struct inode *, int, struct nameidata *);
++extern int permission(struct inode *, int, struct nameidata *,
++ struct exec_perm *);
+ extern int generic_permission(struct inode *, int,
+- int (*check_acl)(struct inode *, int));
++ int (*check_acl)(struct inode *, int), struct exec_perm *);
+
+ extern int get_write_access(struct inode *);
+ extern int deny_write_access(struct file *);
+@@ -1484,7 +1520,9 @@ extern int open_namei(int dfd, const cha
+ extern int may_open(struct nameidata *, int, int);
+
+ extern int kernel_read(struct file *, unsigned long, char *, unsigned long);
+-extern struct file * open_exec(const char *);
++
++struct linux_binprm;
++extern struct file * open_exec(const char *, struct linux_binprm *);
+
+ /* fs/dcache.c -- generic fs support functions */
+ extern int is_subdir(struct dentry *, struct dentry *);
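Editor's note: the extra struct exec_perm argument threaded through permission()/vfs_permission()
lets the exec path snapshot the inode's mode/uid/gid at the same moment the access check is made,
instead of re-reading them later (set_exec_perm() above does the copy). A minimal sketch of the
calling pattern, with locking and most error handling omitted; the helper name is hypothetical:

static int check_exec_and_sample(struct inode *inode, struct nameidata *nd,
				 struct exec_perm *perm)
{
	int err;

	perm->set = 0;                       /* nothing sampled yet */
	err = permission(inode, MAY_EXEC, nd, perm);
	if (err)
		return err;
	if (perm->set) {
		/* perm->mode / perm->uid / perm->gid were captured together
		 * with the permission check and can now seed the binprm
		 * credentials without a second racy read of the inode */
	}
	return 0;
}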
+diff -upr linux-2.6.16.orig/include/linux/genhd.h linux-2.6.16-026test015/include/linux/genhd.h
+--- linux-2.6.16.orig/include/linux/genhd.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/genhd.h 2006-07-04 14:41:38.000000000 +0400
+@@ -421,6 +421,7 @@ static inline struct block_device *bdget
+ return bdget(MKDEV(disk->major, disk->first_minor) + index);
+ }
+
++extern struct subsystem block_subsys;
+ #endif
+
+ #endif
+diff -upr linux-2.6.16.orig/include/linux/gfp.h linux-2.6.16-026test015/include/linux/gfp.h
+--- linux-2.6.16.orig/include/linux/gfp.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/gfp.h 2006-07-04 14:41:37.000000000 +0400
+@@ -47,6 +47,8 @@ struct vm_area_struct;
+ #define __GFP_ZERO ((__force gfp_t)0x8000u)/* Return zeroed page on success */
+ #define __GFP_NOMEMALLOC ((__force gfp_t)0x10000u) /* Don't use emergency reserves */
+ #define __GFP_HARDWALL ((__force gfp_t)0x20000u) /* Enforce hardwall cpuset memory allocs */
++#define __GFP_UBC ((__force gfp_t)0x40000u)/* charge kmem in buddy and slab */
++#define __GFP_SOFT_UBC ((__force gfp_t)0x80000u)/* use soft charging */
+
+ #define __GFP_BITS_SHIFT 20 /* Room for 20 __GFP_FOO bits */
+ #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
+@@ -55,14 +57,17 @@ struct vm_area_struct;
+ #define GFP_LEVEL_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS| \
+ __GFP_COLD|__GFP_NOWARN|__GFP_REPEAT| \
+ __GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP| \
+- __GFP_NOMEMALLOC|__GFP_HARDWALL)
++ __GFP_NOMEMALLOC|__GFP_HARDWALL| \
++ __GFP_UBC|__GFP_SOFT_UBC)
+
+ /* GFP_ATOMIC means both !wait (__GFP_WAIT not set) and use emergency pool */
+ #define GFP_ATOMIC (__GFP_HIGH)
+ #define GFP_NOIO (__GFP_WAIT)
+ #define GFP_NOFS (__GFP_WAIT | __GFP_IO)
+ #define GFP_KERNEL (__GFP_WAIT | __GFP_IO | __GFP_FS)
++#define GFP_KERNEL_UBC (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_UBC)
+ #define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL)
++#define GFP_USER_UBC (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL | __GFP_UBC)
+ #define GFP_HIGHUSER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL | \
+ __GFP_HIGHMEM)
+
+diff -upr linux-2.6.16.orig/include/linux/hrtimer.h linux-2.6.16-026test015/include/linux/hrtimer.h
+--- linux-2.6.16.orig/include/linux/hrtimer.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/hrtimer.h 2006-07-04 14:41:39.000000000 +0400
+@@ -140,4 +140,9 @@ extern void hrtimer_run_queues(void);
+ /* Bootup initialization: */
+ extern void __init hrtimers_init(void);
+
++extern long nanosleep_restart(struct restart_block *restart);
++
++extern ktime_t schedule_hrtimer(struct hrtimer *timer,
++ const enum hrtimer_mode mode);
++
+ #endif
+diff -upr linux-2.6.16.orig/include/linux/i2o.h linux-2.6.16-026test015/include/linux/i2o.h
+--- linux-2.6.16.orig/include/linux/i2o.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/i2o.h 2006-07-04 14:41:36.000000000 +0400
+@@ -1116,8 +1116,11 @@ static inline struct i2o_message *i2o_ms
+
+ mmsg->mfa = readl(c->in_port);
+ if (unlikely(mmsg->mfa >= c->in_queue.len)) {
++ u32 mfa = mmsg->mfa;
++
+ mempool_free(mmsg, c->in_msg.mempool);
+- if(mmsg->mfa == I2O_QUEUE_EMPTY)
++
++ if (mfa == I2O_QUEUE_EMPTY)
+ return ERR_PTR(-EBUSY);
+ return ERR_PTR(-EFAULT);
+ }
+diff -upr linux-2.6.16.orig/include/linux/inetdevice.h linux-2.6.16-026test015/include/linux/inetdevice.h
+--- linux-2.6.16.orig/include/linux/inetdevice.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/inetdevice.h 2006-07-04 14:41:38.000000000 +0400
+@@ -34,6 +34,12 @@ struct ipv4_devconf
+ };
+
+ extern struct ipv4_devconf ipv4_devconf;
++extern struct ipv4_devconf ipv4_devconf_dflt;
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++#define ve_ipv4_devconf (*(get_exec_env()->_ipv4_devconf))
++#else
++#define ve_ipv4_devconf ipv4_devconf
++#endif
+
+ struct in_device
+ {
+@@ -60,29 +66,29 @@ struct in_device
+ };
+
+ #define IN_DEV_FORWARD(in_dev) ((in_dev)->cnf.forwarding)
+-#define IN_DEV_MFORWARD(in_dev) (ipv4_devconf.mc_forwarding && (in_dev)->cnf.mc_forwarding)
+-#define IN_DEV_RPFILTER(in_dev) (ipv4_devconf.rp_filter && (in_dev)->cnf.rp_filter)
+-#define IN_DEV_SOURCE_ROUTE(in_dev) (ipv4_devconf.accept_source_route && (in_dev)->cnf.accept_source_route)
+-#define IN_DEV_BOOTP_RELAY(in_dev) (ipv4_devconf.bootp_relay && (in_dev)->cnf.bootp_relay)
+-
+-#define IN_DEV_LOG_MARTIANS(in_dev) (ipv4_devconf.log_martians || (in_dev)->cnf.log_martians)
+-#define IN_DEV_PROXY_ARP(in_dev) (ipv4_devconf.proxy_arp || (in_dev)->cnf.proxy_arp)
+-#define IN_DEV_SHARED_MEDIA(in_dev) (ipv4_devconf.shared_media || (in_dev)->cnf.shared_media)
+-#define IN_DEV_TX_REDIRECTS(in_dev) (ipv4_devconf.send_redirects || (in_dev)->cnf.send_redirects)
+-#define IN_DEV_SEC_REDIRECTS(in_dev) (ipv4_devconf.secure_redirects || (in_dev)->cnf.secure_redirects)
++#define IN_DEV_MFORWARD(in_dev) (ve_ipv4_devconf.mc_forwarding && (in_dev)->cnf.mc_forwarding)
++#define IN_DEV_RPFILTER(in_dev) (ve_ipv4_devconf.rp_filter && (in_dev)->cnf.rp_filter)
++#define IN_DEV_SOURCE_ROUTE(in_dev) (ve_ipv4_devconf.accept_source_route && (in_dev)->cnf.accept_source_route)
++#define IN_DEV_BOOTP_RELAY(in_dev) (ve_ipv4_devconf.bootp_relay && (in_dev)->cnf.bootp_relay)
++
++#define IN_DEV_LOG_MARTIANS(in_dev) (ve_ipv4_devconf.log_martians || (in_dev)->cnf.log_martians)
++#define IN_DEV_PROXY_ARP(in_dev) (ve_ipv4_devconf.proxy_arp || (in_dev)->cnf.proxy_arp)
++#define IN_DEV_SHARED_MEDIA(in_dev) (ve_ipv4_devconf.shared_media || (in_dev)->cnf.shared_media)
++#define IN_DEV_TX_REDIRECTS(in_dev) (ve_ipv4_devconf.send_redirects || (in_dev)->cnf.send_redirects)
++#define IN_DEV_SEC_REDIRECTS(in_dev) (ve_ipv4_devconf.secure_redirects || (in_dev)->cnf.secure_redirects)
+ #define IN_DEV_IDTAG(in_dev) ((in_dev)->cnf.tag)
+ #define IN_DEV_MEDIUM_ID(in_dev) ((in_dev)->cnf.medium_id)
+ #define IN_DEV_PROMOTE_SECONDARIES(in_dev) (ipv4_devconf.promote_secondaries || (in_dev)->cnf.promote_secondaries)
+
+ #define IN_DEV_RX_REDIRECTS(in_dev) \
+ ((IN_DEV_FORWARD(in_dev) && \
+- (ipv4_devconf.accept_redirects && (in_dev)->cnf.accept_redirects)) \
++ (ve_ipv4_devconf.accept_redirects && (in_dev)->cnf.accept_redirects)) \
+ || (!IN_DEV_FORWARD(in_dev) && \
+- (ipv4_devconf.accept_redirects || (in_dev)->cnf.accept_redirects)))
++ (ve_ipv4_devconf.accept_redirects || (in_dev)->cnf.accept_redirects)))
+
+-#define IN_DEV_ARPFILTER(in_dev) (ipv4_devconf.arp_filter || (in_dev)->cnf.arp_filter)
+-#define IN_DEV_ARP_ANNOUNCE(in_dev) (max(ipv4_devconf.arp_announce, (in_dev)->cnf.arp_announce))
+-#define IN_DEV_ARP_IGNORE(in_dev) (max(ipv4_devconf.arp_ignore, (in_dev)->cnf.arp_ignore))
++#define IN_DEV_ARPFILTER(in_dev) (ve_ipv4_devconf.arp_filter || (in_dev)->cnf.arp_filter)
++#define IN_DEV_ARP_ANNOUNCE(in_dev) (max(ve_ipv4_devconf.arp_announce, (in_dev)->cnf.arp_announce))
++#define IN_DEV_ARP_IGNORE(in_dev) (max(ve_ipv4_devconf.arp_ignore, (in_dev)->cnf.arp_ignore))
+
+ struct in_ifaddr
+ {
+@@ -113,6 +119,7 @@ extern u32 inet_select_addr(const struc
+ extern u32 inet_confirm_addr(const struct net_device *dev, u32 dst, u32 local, int scope);
+ extern struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, u32 prefix, u32 mask);
+ extern void inet_forward_change(void);
++extern void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, int destroy);
+
+ static __inline__ int inet_ifa_match(u32 addr, struct in_ifaddr *ifa)
+ {
+@@ -180,6 +187,10 @@ static inline void in_dev_put(struct in_
+ #define __in_dev_put(idev) atomic_dec(&(idev)->refcnt)
+ #define in_dev_hold(idev) atomic_inc(&(idev)->refcnt)
+
++struct ve_struct;
++extern int devinet_sysctl_init(struct ve_struct *);
++extern void devinet_sysctl_fini(struct ve_struct *);
++extern void devinet_sysctl_free(struct ve_struct *);
+ #endif /* __KERNEL__ */
+
+ static __inline__ __u32 inet_make_mask(int logmask)
+diff -upr linux-2.6.16.orig/include/linux/ipv6.h linux-2.6.16-026test015/include/linux/ipv6.h
+--- linux-2.6.16.orig/include/linux/ipv6.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/ipv6.h 2006-07-04 14:41:39.000000000 +0400
+@@ -415,12 +415,13 @@ static inline struct raw6_sock *raw6_sk(
+ #define inet_v6_ipv6only(__sk) 0
+ #endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
+
+-#define INET6_MATCH(__sk, __hash, __saddr, __daddr, __ports, __dif)\
++#define INET6_MATCH(__sk, __hash, __saddr, __daddr, __ports, __dif,__ve)\
+ (((__sk)->sk_hash == (__hash)) && \
+ ((*((__u32 *)&(inet_sk(__sk)->dport))) == (__ports)) && \
+ ((__sk)->sk_family == AF_INET6) && \
+ ipv6_addr_equal(&inet6_sk(__sk)->daddr, (__saddr)) && \
+ ipv6_addr_equal(&inet6_sk(__sk)->rcv_saddr, (__daddr)) && \
++ ve_accessible_strict(VE_OWNER_SK(__sk), (__ve)) && \
+ (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
+
+ #endif /* __KERNEL__ */
+diff -upr linux-2.6.16.orig/include/linux/jbd.h linux-2.6.16-026test015/include/linux/jbd.h
+--- linux-2.6.16.orig/include/linux/jbd.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/jbd.h 2006-07-04 14:41:37.000000000 +0400
+@@ -245,10 +245,15 @@ typedef struct journal_superblock_s
+ #define J_ASSERT(assert) \
+ do { \
+ if (!(assert)) { \
++ unsigned long stack; \
+ printk (KERN_EMERG \
+ "Assertion failure in %s() at %s:%d: \"%s\"\n", \
+ __FUNCTION__, __FILE__, __LINE__, # assert); \
+- BUG(); \
++ printk("Stack=%p current=%p pid=%d ve=%d comm='%s'\n", \
++ &stack, current, current->pid, \
++ get_exec_env()->veid, \
++ current->comm); \
++ dump_stack(); \
+ } \
+ } while (0)
+
+diff -upr linux-2.6.16.orig/include/linux/jiffies.h linux-2.6.16-026test015/include/linux/jiffies.h
+--- linux-2.6.16.orig/include/linux/jiffies.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/jiffies.h 2006-07-04 14:41:39.000000000 +0400
+@@ -74,6 +74,7 @@
+ */
+ extern u64 __jiffy_data jiffies_64;
+ extern unsigned long volatile __jiffy_data jiffies;
++extern unsigned long cycles_per_jiffy, cycles_per_clock;
+
+ #if (BITS_PER_LONG < 64)
+ u64 get_jiffies_64(void);
+diff -upr linux-2.6.16.orig/include/linux/kdev_t.h linux-2.6.16-026test015/include/linux/kdev_t.h
+--- linux-2.6.16.orig/include/linux/kdev_t.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/kdev_t.h 2006-07-04 14:41:38.000000000 +0400
+@@ -87,6 +87,57 @@ static inline unsigned sysv_minor(u32 de
+ return dev & 0x3ffff;
+ }
+
++#define UNNAMED_MAJOR_COUNT 16
++
++#if UNNAMED_MAJOR_COUNT > 1
++
++extern int unnamed_dev_majors[UNNAMED_MAJOR_COUNT];
++
++static inline dev_t make_unnamed_dev(int idx)
++{
++ /*
++ * Here we transfer bits from 8 to 8+log2(UNNAMED_MAJOR_COUNT) of the
++ * unnamed device index into major number.
++ */
++ return MKDEV(unnamed_dev_majors[(idx >> 8) & (UNNAMED_MAJOR_COUNT - 1)],
++ idx & ~((UNNAMED_MAJOR_COUNT - 1) << 8));
++}
++
++static inline int unnamed_dev_idx(dev_t dev)
++{
++ int i;
++ for (i = 0; i < UNNAMED_MAJOR_COUNT &&
++ MAJOR(dev) != unnamed_dev_majors[i]; i++);
++ return MINOR(dev) | (i << 8);
++}
++
++static inline int is_unnamed_dev(dev_t dev)
++{
++ int i;
++ for (i = 0; i < UNNAMED_MAJOR_COUNT &&
++ MAJOR(dev) != unnamed_dev_majors[i]; i++);
++ return i < UNNAMED_MAJOR_COUNT;
++}
++
++#else /* UNNAMED_MAJOR_COUNT */
++
++static inline dev_t make_unnamed_dev(int idx)
++{
++ return MKDEV(0, idx);
++}
++
++static inline int unnamed_dev_idx(dev_t dev)
++{
++ return MINOR(dev);
++}
++
++static inline int is_unnamed_dev(dev_t dev)
++{
++ return MAJOR(dev) == 0;
++}
++
++#endif /* UNNAMED_MAJOR_COUNT */
++
+
+ #else /* __KERNEL__ */
+
+diff -upr linux-2.6.16.orig/include/linux/kernel.h linux-2.6.16-026test015/include/linux/kernel.h
+--- linux-2.6.16.orig/include/linux/kernel.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/kernel.h 2006-07-04 14:41:38.000000000 +0400
+@@ -132,6 +132,9 @@ asmlinkage int vprintk(const char *fmt,
+ __attribute__ ((format (printf, 1, 0)));
+ asmlinkage int printk(const char * fmt, ...)
+ __attribute__ ((format (printf, 1, 2)));
++asmlinkage int ve_printk(int, const char * fmt, ...)
++ __attribute__ ((format (printf, 2, 3)));
++void prepare_printk(void);
+ #else
+ static inline int vprintk(const char *s, va_list args)
+ __attribute__ ((format (printf, 1, 0)));
+@@ -139,8 +142,16 @@ static inline int vprintk(const char *s,
+ static inline int printk(const char *s, ...)
+ __attribute__ ((format (printf, 1, 2)));
+ static inline int printk(const char *s, ...) { return 0; }
++static inline int ve_printk(int d, const char *s, ...)
++ __attribute__ ((format (printf, 2, 3)));
++static inline int ve_printk(int d, const char *s, ...) { return 0; }
++#define prepare_printk() do { } while (0)
+ #endif
+
++#define VE0_LOG 1
++#define VE_LOG 2
++#define VE_LOG_BOTH (VE0_LOG | VE_LOG)
++
+ unsigned long int_sqrt(unsigned long);
+
+ static inline int __attribute_pure__ long_log2(unsigned long x)
+@@ -159,9 +170,14 @@ static inline unsigned long __attribute_
+ extern int printk_ratelimit(void);
+ extern int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst);
+
++extern int console_silence_loglevel;
++
+ static inline void console_silent(void)
+ {
+- console_loglevel = 0;
++ if (console_loglevel > console_silence_loglevel) {
++ printk(KERN_EMERG "console shuts up ...\n");
++ console_loglevel = 0;
++ }
+ }
+
+ static inline void console_verbose(void)
+@@ -171,10 +187,13 @@ static inline void console_verbose(void)
+ }
+
+ extern void bust_spinlocks(int yes);
++extern void wake_up_klogd(void);
+ extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */
+ extern __deprecated_for_modules int panic_timeout;
+ extern int panic_on_oops;
++extern int decode_call_traces;
+ extern int tainted;
++extern int kernel_text_csum_broken;
+ extern const char *print_tainted(void);
+ extern void add_taint(unsigned);
+
+diff -upr linux-2.6.16.orig/include/linux/kmem_cache.h linux-2.6.16-026test015/include/linux/kmem_cache.h
+--- linux-2.6.16.orig/include/linux/kmem_cache.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/include/linux/kmem_cache.h 2006-07-04 14:41:37.000000000 +0400
+@@ -0,0 +1,199 @@
++#ifndef __KMEM_CACHE_H__
++#define __KMEM_CACHE_H__
++#include <linux/threads.h>
++#include <linux/smp.h>
++#include <linux/spinlock.h>
++#include <linux/list.h>
++#include <linux/mm.h>
++#include <asm/atomic.h>
++
++/*
++ * DEBUG - 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL,
++ * SLAB_RED_ZONE & SLAB_POISON.
++ * 0 for faster, smaller code (especially in the critical paths).
++ *
++ * STATS - 1 to collect stats for /proc/slabinfo.
++ * 0 for faster, smaller code (especially in the critical paths).
++ *
++ * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
++ */
++
++#ifdef CONFIG_DEBUG_SLAB
++#define SLAB_DEBUG 1
++#define SLAB_STATS 1
++#define SLAB_FORCED_DEBUG 1
++#else
++#define SLAB_DEBUG 0
++#define SLAB_STATS 0
++#define SLAB_FORCED_DEBUG 0
++#endif
++
++/*
++ * struct array_cache
++ *
++ * Purpose:
++ * - LIFO ordering, to hand out cache-warm objects from _alloc
++ * - reduce the number of linked list operations
++ * - reduce spinlock operations
++ *
++ * The limit is stored in the per-cpu structure to reduce the data cache
++ * footprint.
++ *
++ */
++struct array_cache {
++ unsigned int avail;
++ unsigned int limit;
++ unsigned int batchcount;
++ unsigned int touched;
++ spinlock_t lock;
++ void *entry[0]; /*
++ * Must have this definition in here for the proper
++ * alignment of array_cache. Also simplifies accessing
++ * the entries.
++ * [0] is for gcc 2.95. It should really be [].
++ */
++};
++
++/* bootstrap: The caches do not work without cpuarrays anymore,
++ * but the cpuarrays are allocated from the generic caches...
++ */
++#define BOOT_CPUCACHE_ENTRIES 1
++struct arraycache_init {
++ struct array_cache cache;
++ void *entries[BOOT_CPUCACHE_ENTRIES];
++};
++
++/*
++ * The slab lists for all objects.
++ */
++struct kmem_list3 {
++ struct list_head slabs_partial; /* partial list first, better asm code */
++ struct list_head slabs_full;
++ struct list_head slabs_free;
++ unsigned long free_objects;
++ unsigned long next_reap;
++ int free_touched;
++ unsigned int free_limit;
++ unsigned int colour_next; /* Per-node cache coloring */
++ spinlock_t list_lock;
++ struct array_cache *shared; /* shared per node */
++ struct array_cache **alien; /* on other nodes */
++};
++
++/*
++ * struct kmem_cache
++ *
++ * manages a cache.
++ */
++
++struct kmem_cache {
++/* 1) per-cpu data, touched during every alloc/free */
++ struct array_cache *array[NR_CPUS];
++ unsigned int batchcount;
++ unsigned int limit;
++ unsigned int shared;
++ unsigned int buffer_size;
++/* 2) touched by every alloc & free from the backend */
++ struct kmem_list3 *nodelists[MAX_NUMNODES];
++ unsigned int flags; /* constant flags */
++ unsigned int num; /* # of objs per slab */
++ spinlock_t spinlock;
++
++/* 3) cache_grow/shrink */
++ /* order of pgs per slab (2^n) */
++ unsigned int gfporder;
++
++ /* force GFP flags, e.g. GFP_DMA */
++ gfp_t gfpflags;
++
++ size_t colour; /* cache colouring range */
++ unsigned int colour_off; /* colour offset */
++ struct kmem_cache *slabp_cache;
++ unsigned int slab_size;
++ unsigned int dflags; /* dynamic flags */
++
++ /* constructor func */
++ void (*ctor) (void *, struct kmem_cache *, unsigned long);
++
++ /* de-constructor func */
++ void (*dtor) (void *, struct kmem_cache *, unsigned long);
++
++/* 4) cache creation/removal */
++ const char *name;
++ struct list_head next;
++
++/* 5) statistics */
++#if SLAB_STATS
++ unsigned long num_active;
++ unsigned long num_allocations;
++ unsigned long high_mark;
++ unsigned long grown;
++ unsigned long reaped;
++ unsigned long errors;
++ unsigned long max_freeable;
++ unsigned long node_allocs;
++ unsigned long node_frees;
++ atomic_t allochit;
++ atomic_t allocmiss;
++ atomic_t freehit;
++ atomic_t freemiss;
++#endif
++#if SLAB_DEBUG
++ /*
++ * If debugging is enabled, then the allocator can add additional
++ * fields and/or padding to every object. buffer_size contains the total
++ * object size including these internal fields, the following two
++ * variables contain the offset to the user object and its size.
++ */
++ int obj_offset;
++ int obj_size;
++#endif
++#ifdef CONFIG_USER_RESOURCE
++ unsigned int objuse;
++#endif
++};
++
++#define CFLGS_OFF_SLAB (0x80000000UL)
++#define CFLGS_ENVIDS (0x04000000UL)
++#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB)
++#define ENVIDS(x) ((x)->flags & CFLGS_ENVIDS)
++#define kmem_mark_nocharge(c) do { (c)->flags |= SLAB_NO_CHARGE; } while (0)
++
++struct slab;
++/* Functions for storing/retrieving the cachep and or slab from the
++ * global 'mem_map'. These are used to find the slab an obj belongs to.
++ * With kfree(), these are used to find the cache which an obj belongs to.
++ */
++static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
++{
++ page->lru.next = (struct list_head *)cache;
++}
++
++static inline struct kmem_cache *page_get_cache(struct page *page)
++{
++ return (struct kmem_cache *)page->lru.next;
++}
++
++static inline void page_set_slab(struct page *page, struct slab *slab)
++{
++ page->lru.prev = (struct list_head *)slab;
++}
++
++static inline struct slab *page_get_slab(struct page *page)
++{
++ return (struct slab *)page->lru.prev;
++}
++
++static inline struct kmem_cache *virt_to_cache(const void *obj)
++{
++ struct page *page = virt_to_page(obj);
++ return page_get_cache(page);
++}
++
++static inline struct slab *virt_to_slab(const void *obj)
++{
++ struct page *page = virt_to_page(obj);
++ return page_get_slab(page);
++}
++
++#endif
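Editor's note: the helpers above stash the owning kmem_cache and slab descriptor in the otherwise
unused lru pointers of a slab page, which is how a free path can recover the right cache from
nothing but an object address. A condensed sketch of that lookup; the function name is hypothetical:

/* Mirrors the free-side lookup that kfree() performs internally. */
static void example_release(const void *obj)
{
	struct kmem_cache *cachep = virt_to_cache(obj); /* page->lru.next */
	struct slab *slabp = virt_to_slab(obj);         /* page->lru.prev */

	/* the allocator would now push obj back onto slabp's bufctl free
	 * list and update cachep's per-cpu array_cache */
	(void)cachep;
	(void)slabp;
}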
+diff -upr linux-2.6.16.orig/include/linux/kmem_slab.h linux-2.6.16-026test015/include/linux/kmem_slab.h
+--- linux-2.6.16.orig/include/linux/kmem_slab.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/include/linux/kmem_slab.h 2006-07-04 14:41:36.000000000 +0400
+@@ -0,0 +1,71 @@
++#ifndef __KMEM_SLAB_H__
++#define __KMEM_SLAB_H__
++
++/*
++ * kmem_bufctl_t:
++ *
++ * Bufctls are used for linking objects within a slab via
++ * linked offsets.
++ *
++ * This implementation relies on "struct page" for locating the cache &
++ * slab an object belongs to.
++ * This allows the bufctl structure to be small (one int), but limits
++ * the number of objects a slab (not a cache) can contain when off-slab
++ * bufctls are used. The limit is the size of the largest general cache
++ * that does not use off-slab slabs.
++ * For 32bit archs with 4 kB pages, this is 56.
++ * This is not serious, as it is only for large objects, when it is unwise
++ * to have too many per slab.
++ * Note: This limit can be raised by introducing a general cache whose size
++ * is less than 512 (PAGE_SIZE<<3), but greater than 256.
++ */
++
++typedef unsigned int kmem_bufctl_t;
++#define BUFCTL_END (((kmem_bufctl_t)(~0U))-0)
++#define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1)
++#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-2)
++
++/*
++ * struct slab
++ *
++ * Manages the objs in a slab. Placed either at the beginning of mem allocated
++ * for a slab, or allocated from a general cache.
++ * Slabs are chained into three lists: fully used, partial, fully free slabs.
++ */
++struct slab {
++ struct list_head list;
++ unsigned long colouroff;
++ void *s_mem; /* including colour offset */
++ unsigned int inuse; /* num of objs active in slab */
++ kmem_bufctl_t free;
++ unsigned short nodeid;
++};
++
++/*
++ * struct slab_rcu
++ *
++ * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
++ * arrange for kmem_freepages to be called via RCU. This is useful if
++ * we need to approach a kernel structure obliquely, from its address
++ * obtained without the usual locking. We can lock the structure to
++ * stabilize it and check it's still at the given address, only if we
++ * can be sure that the memory has not been meanwhile reused for some
++ * other kind of object (which our subsystem's lock might corrupt).
++ *
++ * rcu_read_lock before reading the address, then rcu_read_unlock after
++ * taking the spinlock within the structure expected at that address.
++ *
++ * We assume struct slab_rcu can overlay struct slab when destroying.
++ */
++struct slab_rcu {
++ struct rcu_head head;
++ struct kmem_cache *cachep;
++ void *addr;
++};
++
++static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
++{
++ return (kmem_bufctl_t *) (slabp + 1);
++}
++
++#endif
+diff -upr linux-2.6.16.orig/include/linux/list.h linux-2.6.16-026test015/include/linux/list.h
+--- linux-2.6.16.orig/include/linux/list.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/list.h 2006-07-04 14:41:38.000000000 +0400
+@@ -325,6 +325,9 @@ static inline void list_splice_init(stru
+ #define list_entry(ptr, type, member) \
+ container_of(ptr, type, member)
+
++#define list_first_entry(ptr, type, member) \
++ container_of((ptr)->next, type, member)
++
+ /**
+ * list_for_each - iterate over a list
+ * @pos: the &struct list_head to use as a loop counter.
+@@ -411,6 +414,20 @@ static inline void list_splice_init(stru
+ pos = list_entry(pos->member.next, typeof(*pos), member))
+
+ /**
++ * list_for_each_entry_continue_reverse - iterate backwards over list of given
++ * type continuing after existing point
++ * @pos: the type * to use as a loop counter.
++ * @head: the head for your list.
++ * @member: the name of the list_struct within the struct.
++ */
++#define list_for_each_entry_continue_reverse(pos, head, member) \
++ for (pos = list_entry(pos->member.prev, typeof(*pos), member), \
++ prefetch(pos->member.prev); \
++ &pos->member != (head); \
++ pos = list_entry(pos->member.prev, typeof(*pos), member), \
++ prefetch(pos->member.prev))
++
++/**
+ * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry
+ * @pos: the type * to use as a loop counter.
+ * @n: another type * to use as temporary storage
+diff -upr linux-2.6.16.orig/include/linux/major.h linux-2.6.16-026test015/include/linux/major.h
+--- linux-2.6.16.orig/include/linux/major.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/major.h 2006-07-04 14:41:38.000000000 +0400
+@@ -165,4 +165,7 @@
+
+ #define VIOTAPE_MAJOR 230
+
++#define UNNAMED_EXTRA_MAJOR 130
++#define UNNAMED_EXTRA_MAJOR_COUNT 120
++
+ #endif
+diff -upr linux-2.6.16.orig/include/linux/mm.h linux-2.6.16-026test015/include/linux/mm.h
+--- linux-2.6.16.orig/include/linux/mm.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/mm.h 2006-07-04 14:41:39.000000000 +0400
+@@ -41,6 +41,27 @@ extern int sysctl_legacy_va_layout;
+
+ #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n))
+
++#include <linux/mm_counter.h>
++
++#ifdef CONFIG_USER_RESOURCE
++#define set_vma_rss(vma, v) set_mm_counter(vma, vm_rss, v)
++#define get_vma_rss(vma) get_mm_counter(vma, vm_rss)
++#define inc_vma_rss(vma) inc_mm_counter(vma, vm_rss)
++#define dec_vma_rss(vma) dec_mm_counter(vma, vm_rss)
++#define add_vma_rss(vma, v) add_mm_counter(vma, vm_rss, v)
++#define sub_vma_rss(vma, v) do { \
++ if (unlikely(dec_mm_counter_chk(vma, vm_rss, v))) \
++ warn_bad_rss(vma, v); \
++ } while (0)
++#else
++#define set_vma_rss(vma, v) do { } while (0)
++#define get_vma_rss(vma) (0)
++#define inc_vma_rss(vma) do { } while (0)
++#define dec_vma_rss(vma) do { } while (0)
++#define add_vma_rss(vma, v) do { } while (0)
++#define sub_vma_rss(vma, v) do { } while (0)
++#endif
++
+ /*
+ * Linux kernel virtual memory manager primitives.
+ * The idea being to have a "virtual" mm in the same way
+@@ -111,6 +132,9 @@ struct vm_area_struct {
+ #ifdef CONFIG_NUMA
+ struct mempolicy *vm_policy; /* NUMA policy for the VMA */
+ #endif
++#ifdef CONFIG_USER_RESOURCE
++ mm_counter_t _vm_rss;
++#endif
+ };
+
+ /*
+@@ -229,10 +253,9 @@ struct page {
+ unsigned long private; /* Mapping-private opaque data:
+ * usually used for buffer_heads
+ * if PagePrivate set; used for
+- * swp_entry_t if PageSwapCache.
+- * When page is free, this
++ * swp_entry_t if PageSwapCache;
+ * indicates order in the buddy
+- * system.
++ * system if PG_buddy is set.
+ */
+ struct address_space *mapping; /* If low bit clear, points to
+ * inode address_space, or NULL.
+@@ -264,6 +287,12 @@ struct page {
+ void *virtual; /* Kernel virtual address (NULL if
+ not kmapped, ie. highmem) */
+ #endif /* WANT_PAGE_VIRTUAL */
++#ifdef CONFIG_USER_RESOURCE
++ union {
++ struct user_beancounter *page_ub;
++ struct page_beancounter *page_pb;
++ } bc;
++#endif
+ };
+
+ #define page_private(page) ((page)->private)
+@@ -636,16 +665,9 @@ struct page *shmem_nopage(struct vm_area
+ int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new);
+ struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
+ unsigned long addr);
+-int shmem_lock(struct file *file, int lock, struct user_struct *user);
+ #else
+ #define shmem_nopage filemap_nopage
+
+-static inline int shmem_lock(struct file *file, int lock,
+- struct user_struct *user)
+-{
+- return 0;
+-}
+-
+ static inline int shmem_set_policy(struct vm_area_struct *vma,
+ struct mempolicy *new)
+ {
+@@ -706,7 +728,9 @@ void free_pgd_range(struct mmu_gather **
+ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *start_vma,
+ unsigned long floor, unsigned long ceiling);
+ int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
+- struct vm_area_struct *vma);
++ struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
++int __copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *vma,
++ unsigned long addr, size_t size);
+ int zeromap_page_range(struct vm_area_struct *vma, unsigned long from,
+ unsigned long size, pgprot_t prot);
+ void unmap_mapping_range(struct address_space *mapping,
+diff -upr linux-2.6.16.orig/include/linux/mm_counter.h linux-2.6.16-026test015/include/linux/mm_counter.h
+--- linux-2.6.16.orig/include/linux/mm_counter.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/include/linux/mm_counter.h 2006-07-04 14:41:37.000000000 +0400
+@@ -0,0 +1,32 @@
++#ifndef __MM_COUNTER_H_
++#define __MM_COUNTER_H_
++#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
++/*
++ * The mm counters are not protected by its page_table_lock,
++ * so must be incremented atomically.
++ */
++#define set_mm_counter(mm, member, value) atomic_long_set(&(mm)->_##member, value)
++#define get_mm_counter(mm, member) ((unsigned long)atomic_long_read(&(mm)->_##member))
++#define add_mm_counter(mm, member, value) atomic_long_add(value, &(mm)->_##member)
++#define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member)
++#define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member)
++#define dec_mm_counter_chk(mm, member, value) \
++ atomic_long_add_negative(-(value), &(mm)->_##member)
++typedef atomic_long_t mm_counter_t;
++
++#else /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
++/*
++ * The mm counters are protected by its page_table_lock,
++ * so can be incremented directly.
++ */
++#define set_mm_counter(mm, member, value) (mm)->_##member = (value)
++#define get_mm_counter(mm, member) ((mm)->_##member)
++#define add_mm_counter(mm, member, value) (mm)->_##member += (value)
++#define inc_mm_counter(mm, member) (mm)->_##member++
++#define dec_mm_counter(mm, member) (mm)->_##member--
++#define dec_mm_counter_chk(mm, member, value) \
++ (((mm)->_##member -= (value)) < 0)
++typedef unsigned long mm_counter_t;
++
++#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
++#endif
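Editor's note: these accessors hide whether the counter is a plain long (page_table_lock
protected) or an atomic_long_t (split-ptlock configurations); the per-VMA RSS helpers added to
mm.h above (inc_vma_rss() and friends) are built on exactly this interface. A sketch of the usage
pattern, with a hypothetical structure whose field "_example" is accessed via the member name
"example", the way the macros expect:

struct counted {
	mm_counter_t _example;
};

static void account_pages(struct counted *c, unsigned long npages)
{
	set_mm_counter(c, example, 0);
	add_mm_counter(c, example, npages);
	if (dec_mm_counter_chk(c, example, npages)) {
		/* counter went negative: accounting imbalance */
	}
}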
+diff -upr linux-2.6.16.orig/include/linux/mount.h linux-2.6.16-026test015/include/linux/mount.h
+--- linux-2.6.16.orig/include/linux/mount.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/mount.h 2006-07-04 14:41:38.000000000 +0400
+@@ -47,6 +47,7 @@ struct vfsmount {
+ struct vfsmount *mnt_master; /* slave is on master->mnt_slave_list */
+ struct namespace *mnt_namespace; /* containing namespace */
+ int mnt_pinned;
++ unsigned owner;
+ };
+
+ static inline struct vfsmount *mntget(struct vfsmount *mnt)
+diff -upr linux-2.6.16.orig/include/linux/msg.h linux-2.6.16-026test015/include/linux/msg.h
+--- linux-2.6.16.orig/include/linux/msg.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/msg.h 2006-07-04 14:41:39.000000000 +0400
+@@ -92,6 +92,8 @@ struct msg_queue {
+ struct list_head q_senders;
+ };
+
++int sysvipc_walk_msg(int (*func)(int, struct msg_queue*, void *), void *arg);
++
+ #endif /* __KERNEL__ */
+
+ #endif /* _LINUX_MSG_H */
+diff -upr linux-2.6.16.orig/include/linux/namei.h linux-2.6.16-026test015/include/linux/namei.h
+--- linux-2.6.16.orig/include/linux/namei.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/namei.h 2006-07-04 14:41:38.000000000 +0400
+@@ -48,12 +48,15 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LA
+ #define LOOKUP_PARENT 16
+ #define LOOKUP_NOALT 32
+ #define LOOKUP_REVAL 64
++#define LOOKUP_STRICT 128 /* no symlinks or other filesystems */
++
+ /*
+ * Intent data
+ */
+ #define LOOKUP_OPEN (0x0100)
+ #define LOOKUP_CREATE (0x0200)
+ #define LOOKUP_ACCESS (0x0400)
++#define LOOKUP_NOAREACHECK (0x0800) /* no area check on lookup */
+
+ extern int FASTCALL(__user_walk(const char __user *, unsigned, struct nameidata *));
+ extern int FASTCALL(__user_walk_fd(int dfd, const char __user *, unsigned, struct nameidata *));
+diff -upr linux-2.6.16.orig/include/linux/namespace.h linux-2.6.16-026test015/include/linux/namespace.h
+--- linux-2.6.16.orig/include/linux/namespace.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/namespace.h 2006-07-04 14:41:38.000000000 +0400
+@@ -13,6 +13,8 @@ struct namespace {
+ int event;
+ };
+
++extern struct rw_semaphore namespace_sem;
++
+ extern int copy_namespace(int, struct task_struct *);
+ extern void __put_namespace(struct namespace *namespace);
+ extern struct namespace *dup_namespace(struct task_struct *, struct fs_struct *);
+diff -upr linux-2.6.16.orig/include/linux/netdevice.h linux-2.6.16-026test015/include/linux/netdevice.h
+--- linux-2.6.16.orig/include/linux/netdevice.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/netdevice.h 2006-07-04 14:41:39.000000000 +0400
+@@ -37,6 +37,7 @@
+ #include <linux/config.h>
+ #include <linux/device.h>
+ #include <linux/percpu.h>
++#include <linux/ctype.h>
+
+ struct divert_blk;
+ struct vlan_group;
+@@ -233,6 +234,11 @@ enum netdev_state_t
+ __LINK_STATE_LINKWATCH_PENDING
+ };
+
++struct netdev_bc {
++ struct user_beancounter *exec_ub, *owner_ub;
++};
++
++#define netdev_bc(dev) (&(dev)->dev_bc)
+
+ /*
+ * This structure holds at boot time configured netdevice settings. They
+@@ -309,6 +315,8 @@ struct net_device
+ #define NETIF_F_TSO 2048 /* Can offload TCP/IP segmentation */
+ #define NETIF_F_LLTX 4096 /* LockLess TX */
+ #define NETIF_F_UFO 8192 /* Can offload UDP Large Send*/
++#define NETIF_F_VIRTUAL 0x40000000 /* can be registered in ve */
++#define NETIF_F_VENET 0x80000000 /* Device is VENET device */
+
+ struct net_device *next_sched;
+
+@@ -431,6 +439,7 @@ struct net_device
+ enum { NETREG_UNINITIALIZED=0,
+ NETREG_REGISTERING, /* called register_netdevice */
+ NETREG_REGISTERED, /* completed register todo */
++ NETREG_REGISTER_ERR, /* register todo failed */
+ NETREG_UNREGISTERING, /* called unregister_netdevice */
+ NETREG_UNREGISTERED, /* completed unregister todo */
+ NETREG_RELEASED, /* called free_netdev */
+@@ -500,8 +509,18 @@ struct net_device
+ struct divert_blk *divert;
+ #endif /* CONFIG_NET_DIVERT */
+
++ unsigned orig_mtu; /* MTU value before move to VE */
++ struct ve_struct *owner_env; /* Owner VE of the interface */
++ struct netdev_bc dev_bc;
++
+ /* class/net/name entry */
+ struct class_device class_dev;
++
++#ifdef CONFIG_VE
++ /* Entry in the global device list, used to keep track of name
++ * assignments */
++ struct list_head dev_global_list_entry;
++#endif
+ };
+
+ #define NETDEV_ALIGN 32
+@@ -535,9 +554,23 @@ struct packet_type {
+ #include <linux/notifier.h>
+
+ extern struct net_device loopback_dev; /* The loopback */
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++#define loopback_dev (*get_exec_env()->_loopback_dev)
++#define ve0_loopback (*get_ve0()->_loopback_dev)
++#define dev_base (get_exec_env()->_net_dev_base)
++#define visible_dev_head(x) (&(x)->_net_dev_head)
++#define visible_dev_index_head(x) (&(x)->_net_dev_index_head)
++#else
+ extern struct net_device *dev_base; /* All devices */
++#define ve0_loopback loopback_dev
++#define visible_dev_head(x) NULL
++#define visible_dev_index_head(x) NULL
++#endif
+ extern rwlock_t dev_base_lock; /* Device list lock */
+
++struct hlist_head *dev_name_hash(const char *name, struct ve_struct *env);
++struct hlist_head *dev_index_hash(int ifindex, struct ve_struct *env);
++
+ extern int netdev_boot_setup_check(struct net_device *dev);
+ extern unsigned long netdev_boot_base(const char *prefix, int unit);
+ extern struct net_device *dev_getbyhwaddr(unsigned short type, char *hwaddr);
+@@ -554,6 +587,7 @@ extern int dev_alloc_name(struct net_de
+ extern int dev_open(struct net_device *dev);
+ extern int dev_close(struct net_device *dev);
+ extern int dev_queue_xmit(struct sk_buff *skb);
++extern int dev_set_mtu(struct net_device *dev, int new_mtu);
+ extern int register_netdevice(struct net_device *dev);
+ extern int unregister_netdevice(struct net_device *dev);
+ extern void free_netdev(struct net_device *dev);
+@@ -951,6 +985,18 @@ extern void dev_seq_stop(struct seq_file
+
+ extern void linkwatch_run_queue(void);
+
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++static inline int ve_is_dev_movable(struct net_device *dev)
++{
++ return !(dev->features & NETIF_F_VIRTUAL);
++}
++#else
++static inline int ve_is_dev_movable(struct net_device *dev)
++{
++ return 0;
++}
++#endif
++
+ #endif /* __KERNEL__ */
+
+ #endif /* _LINUX_DEV_H */
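Editor's note: NETIF_F_VIRTUAL appears to mark devices that exist per-VE (venet, loopback), and
ve_is_dev_movable() is its inverse, so only devices without the flag — real hardware — would be
candidates for handing from the host to a container. A one-line sketch of the check a move
operation might perform; the wrapper name is hypothetical and the policy reading is inferred from
the flag names:

static int can_move_to_ve(struct net_device *dev)
{
	/* true for physical devices; per-VE devices such as venet are
	 * created inside the VE rather than moved into it */
	return ve_is_dev_movable(dev);
}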
+diff -upr linux-2.6.16.orig/include/linux/netfilter/nf_conntrack_ftp.h linux-2.6.16-026test015/include/linux/netfilter/nf_conntrack_ftp.h
+--- linux-2.6.16.orig/include/linux/netfilter/nf_conntrack_ftp.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/netfilter/nf_conntrack_ftp.h 2006-07-04 14:41:39.000000000 +0400
+@@ -32,13 +32,22 @@ struct ip_conntrack_expect;
+
+ /* For NAT to hook in when we find a packet which describes what other
+ * connection we should expect. */
+-extern unsigned int (*ip_nat_ftp_hook)(struct sk_buff **pskb,
++typedef unsigned int (*ip_nat_helper_ftp_hook)(struct sk_buff **pskb,
+ enum ip_conntrack_info ctinfo,
+ enum ip_ct_ftp_type type,
+ unsigned int matchoff,
+ unsigned int matchlen,
+ struct ip_conntrack_expect *exp,
+ u32 *seq);
++extern ip_nat_helper_ftp_hook ip_nat_ftp_hook;
++#ifdef CONFIG_VE_IPTABLES
++#include <linux/sched.h>
++#define ve_ip_nat_ftp_hook \
++ ((ip_nat_helper_ftp_hook) \
++ (get_exec_env()->_ip_conntrack->_ip_nat_ftp_hook))
++#else
++#define ve_ip_nat_ftp_hook ip_nat_ftp_hook
++#endif
+ #endif /* __KERNEL__ */
+
+ #endif /* _NF_CONNTRACK_FTP_H */
+diff -upr linux-2.6.16.orig/include/linux/netfilter/x_tables.h linux-2.6.16-026test015/include/linux/netfilter/x_tables.h
+--- linux-2.6.16.orig/include/linux/netfilter/x_tables.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/netfilter/x_tables.h 2006-07-04 14:41:39.000000000 +0400
+@@ -80,12 +80,19 @@ struct xt_counters_info
+
+ #ifdef __KERNEL__
+
++#include <linux/config.h>
+ #include <linux/netdevice.h>
+
+ #define ASSERT_READ_LOCK(x)
+ #define ASSERT_WRITE_LOCK(x)
+ #include <linux/netfilter_ipv4/listhelp.h>
+
++#ifdef CONFIG_COMPAT
++#define COMPAT_TO_USER 1
++#define COMPAT_FROM_USER -1
++#define COMPAT_CALC_SIZE 0
++#endif
++
+ struct xt_match
+ {
+ struct list_head list;
+@@ -118,6 +125,10 @@ struct xt_match
+ /* Called when entry of this type deleted. */
+ void (*destroy)(void *matchinfo, unsigned int matchinfosize);
+
++#ifdef CONFIG_COMPAT
++ /* Called when userspace align differs from kernel space one */
++ int (*compat)(void *match, void **dstptr, int *size, int convert);
++#endif
+ /* Set this to THIS_MODULE if you are a module, otherwise NULL */
+ struct module *me;
+ };
+@@ -154,6 +165,10 @@ struct xt_target
+ /* Called when entry of this type deleted. */
+ void (*destroy)(void *targinfo, unsigned int targinfosize);
+
++#ifdef CONFIG_COMPAT
++ /* Called when userspace align differs from kernel space one */
++ int (*compat)(void *target, void **dstptr, int *size, int convert);
++#endif
+ /* Set this to THIS_MODULE if you are a module, otherwise NULL */
+ struct module *me;
+ };
+@@ -211,6 +226,10 @@ extern int xt_register_table(struct xt_t
+ struct xt_table_info *bootstrap,
+ struct xt_table_info *newinfo);
+ extern void *xt_unregister_table(struct xt_table *table);
++extern struct xt_table *virt_xt_register_table(struct xt_table *table,
++ struct xt_table_info *bootstrap,
++ struct xt_table_info *newinfo);
++extern void *virt_xt_unregister_table(struct xt_table *table);
+
+ extern struct xt_table_info *xt_replace_table(struct xt_table *table,
+ unsigned int num_counters,
+@@ -233,6 +252,34 @@ extern void xt_proto_fini(int af);
+ extern struct xt_table_info *xt_alloc_table_info(unsigned int size);
+ extern void xt_free_table_info(struct xt_table_info *info);
+
++#ifdef CONFIG_COMPAT
++#include <net/compat.h>
++
++/* FIXME: this works only on 32 bit tasks
++ * need to change whole approach in order to calculate align as function of
++ * current task alignment */
++
++struct compat_xt_counters
++{
++ u_int32_t cnt[4];
++};
++
++struct compat_xt_counters_info
++{
++ char name[XT_TABLE_MAXNAMELEN];
++ compat_uint_t num_counters;
++ struct compat_xt_counters counters[0];
++};
++
++#define COMPAT_XT_ALIGN(s) (((s) + (__alignof__(struct compat_xt_counters)-1)) \
++ & ~(__alignof__(struct compat_xt_counters)-1))
++
++extern int ipt_match_align_compat(void *match, void **dstptr,
++ int *size, int off, int convert);
++extern int ipt_target_align_compat(void *target, void **dstptr,
++ int *size, int off, int convert);
++
++#endif /* CONFIG_COMPAT */
+ #endif /* __KERNEL__ */
+
+ #endif /* _X_TABLES_H */
+diff -upr linux-2.6.16.orig/include/linux/netfilter/xt_conntrack.h linux-2.6.16-026test015/include/linux/netfilter/xt_conntrack.h
+--- linux-2.6.16.orig/include/linux/netfilter/xt_conntrack.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/netfilter/xt_conntrack.h 2006-07-04 14:41:36.000000000 +0400
+@@ -5,6 +5,7 @@
+ #ifndef _XT_CONNTRACK_H
+ #define _XT_CONNTRACK_H
+
++#include <linux/config.h>
+ #include <linux/netfilter/nf_conntrack_tuple_common.h>
+ #include <linux/in.h>
+
+@@ -60,4 +61,21 @@ struct xt_conntrack_info
+ /* Inverse flags */
+ u_int8_t invflags;
+ };
++
++#ifdef CONFIG_COMPAT
++struct compat_xt_conntrack_info
++{
++ compat_uint_t statemask, statusmask;
++
++ struct ip_conntrack_tuple tuple[IP_CT_DIR_MAX];
++ struct in_addr sipmsk[IP_CT_DIR_MAX], dipmsk[IP_CT_DIR_MAX];
++
++ compat_ulong_t expires_min, expires_max;
++
++ /* Flags word */
++ u_int8_t flags;
++ /* Inverse flags */
++ u_int8_t invflags;
++};
++#endif
+ #endif /*_XT_CONNTRACK_H*/
+diff -upr linux-2.6.16.orig/include/linux/netfilter/xt_helper.h linux-2.6.16-026test015/include/linux/netfilter/xt_helper.h
+--- linux-2.6.16.orig/include/linux/netfilter/xt_helper.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/netfilter/xt_helper.h 2006-07-04 14:41:36.000000000 +0400
+@@ -1,8 +1,17 @@
+ #ifndef _XT_HELPER_H
+ #define _XT_HELPER_H
+
++#include <linux/config.h>
++
+ struct xt_helper_info {
+ int invert;
+ char name[30];
+ };
++
++#ifdef CONFIG_COMPAT
++struct compat_xt_helper_info {
++ compat_int_t invert;
++ char name[30];
++};
++#endif
+ #endif /* _XT_HELPER_H */
+diff -upr linux-2.6.16.orig/include/linux/netfilter/xt_limit.h linux-2.6.16-026test015/include/linux/netfilter/xt_limit.h
+--- linux-2.6.16.orig/include/linux/netfilter/xt_limit.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/netfilter/xt_limit.h 2006-07-04 14:41:36.000000000 +0400
+@@ -1,6 +1,8 @@
+ #ifndef _XT_RATE_H
+ #define _XT_RATE_H
+
++#include <linux/config.h>
++
+ /* timings are in milliseconds. */
+ #define XT_LIMIT_SCALE 10000
+
+@@ -18,4 +20,19 @@ struct xt_rateinfo {
+ /* Ugly, ugly fucker. */
+ struct xt_rateinfo *master;
+ };
++
++#ifdef CONFIG_COMPAT
++struct compat_xt_rateinfo {
++ u_int32_t avg; /* Average secs between packets * scale */
++ u_int32_t burst; /* Period multiplier for upper limit. */
++
++ /* Used internally by the kernel */
++ compat_ulong_t prev;
++ u_int32_t credit;
++ u_int32_t credit_cap, cost;
++
++ /* Ugly, ugly fucker. */
++ compat_uptr_t master;
++};
++#endif
+ #endif /*_XT_RATE_H*/
+diff -upr linux-2.6.16.orig/include/linux/netfilter/xt_state.h linux-2.6.16-026test015/include/linux/netfilter/xt_state.h
+--- linux-2.6.16.orig/include/linux/netfilter/xt_state.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/netfilter/xt_state.h 2006-07-04 14:41:36.000000000 +0400
+@@ -1,6 +1,8 @@
+ #ifndef _XT_STATE_H
+ #define _XT_STATE_H
+
++#include <linux/config.h>
++
+ #define XT_STATE_BIT(ctinfo) (1 << ((ctinfo)%IP_CT_IS_REPLY+1))
+ #define XT_STATE_INVALID (1 << 0)
+
+@@ -10,4 +12,11 @@ struct xt_state_info
+ {
+ unsigned int statemask;
+ };
++
++#ifdef CONFIG_COMPAT
++struct compat_xt_state_info
++{
++ compat_uint_t statemask;
++};
++#endif
+ #endif /*_XT_STATE_H*/
+diff -upr linux-2.6.16.orig/include/linux/netfilter.h linux-2.6.16-026test015/include/linux/netfilter.h
+--- linux-2.6.16.orig/include/linux/netfilter.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/netfilter.h 2006-07-04 14:41:39.000000000 +0400
+@@ -107,12 +107,21 @@ struct nf_info
+ int nf_register_hook(struct nf_hook_ops *reg);
+ void nf_unregister_hook(struct nf_hook_ops *reg);
+
++int virt_nf_register_hook(struct nf_hook_ops *reg);
++int virt_nf_unregister_hook(struct nf_hook_ops *reg);
++
+ /* Functions to register get/setsockopt ranges (non-inclusive). You
+ need to check permissions yourself! */
+ int nf_register_sockopt(struct nf_sockopt_ops *reg);
+ void nf_unregister_sockopt(struct nf_sockopt_ops *reg);
+
++#ifdef CONFIG_VE_IPTABLES
++#define ve_nf_hooks \
++ ((struct list_head (*)[NF_MAX_HOOKS])(get_exec_env()->_nf_hooks))
++#else
+ extern struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS];
++#define ve_nf_hooks nf_hooks
++#endif
+
+ /* those NF_LOG_* defines and struct nf_loginfo are legacy definitios that will
+ * disappear once iptables is replaced with pkttables. Please DO NOT use them
+@@ -190,7 +199,7 @@ static inline int nf_hook_thresh(int pf,
+ if (!cond)
+ return 1;
+ #ifndef CONFIG_NETFILTER_DEBUG
+- if (list_empty(&nf_hooks[pf][hook]))
++ if (list_empty(&ve_nf_hooks[pf][hook]))
+ return 1;
+ #endif
+ return nf_hook_slow(pf, hook, pskb, indev, outdev, okfn, thresh);
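The ve_nf_hooks macro above is the central indirection of the per-VE netfilter support: existing code keeps indexing what looks like the global nf_hooks table, but with CONFIG_VE_IPTABLES every access is routed through get_exec_env() to a hook table owned by the current container. A rough stand-alone sketch of that pattern, with invented stand-in types rather than the real ve_struct and list_head, could be:

#include <stdio.h>

#define NPROTO       4
#define NF_MAX_HOOKS 8

struct env {                             /* stand-in for struct ve_struct */
        int hooks[NPROTO][NF_MAX_HOOKS];
};

static struct env env0, guest;           /* host environment and one container */
static struct env *current_env = &env0;

static struct env *get_exec_env(void) { return current_env; }

/* Every access goes through the owning environment, as ve_nf_hooks does. */
#define ve_hooks (get_exec_env()->hooks)

int main(void)
{
        ve_hooks[1][2] = 42;             /* lands in env0's table */
        current_env = &guest;
        ve_hooks[1][2] = 7;              /* lands in the container's table */
        current_env = &env0;

        printf("host=%d container=%d\n", ve_hooks[1][2], guest.hooks[1][2]);
        return 0;
}

The same #define trick recurs throughout the patch (ve_ip_ct_protos, ve_ip_conntrack_hash, the per-VE sysctl timeouts further down): the symbol keeps its old name, only its storage moves into the execution environment.
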
+diff -upr linux-2.6.16.orig/include/linux/netfilter_ipv4/ip_conntrack.h linux-2.6.16-026test015/include/linux/netfilter_ipv4/ip_conntrack.h
+--- linux-2.6.16.orig/include/linux/netfilter_ipv4/ip_conntrack.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/netfilter_ipv4/ip_conntrack.h 2006-07-04 14:41:39.000000000 +0400
+@@ -71,6 +71,11 @@ do { \
+
+ struct ip_conntrack_helper;
+
++#ifdef CONFIG_VE_IPTABLES
++#include <linux/ve.h>
++#include <linux/ve_owner.h>
++#endif
++
+ struct ip_conntrack
+ {
+ /* Usage count in here is 1 for hash table/destruct timer, 1 per skb,
+@@ -122,8 +127,15 @@ struct ip_conntrack
+ /* Traversed often, so hopefully in different cacheline to top */
+ /* These are my tuples; original and reply */
+ struct ip_conntrack_tuple_hash tuplehash[IP_CT_DIR_MAX];
++#ifdef CONFIG_VE_IPTABLES
++ struct ve_struct *ct_owner_env;
++#endif
+ };
+
++#ifdef CONFIG_VE_IPTABLES
++DCL_VE_OWNER_PROTO(CT, struct ip_conntrack, ct_owner_env)
++#endif
++
+ struct ip_conntrack_expect
+ {
+ /* Internal linked list (global expectation list) */
+@@ -232,7 +244,15 @@ extern void ip_conntrack_tcp_update(stru
+ enum ip_conntrack_dir dir);
+
+ /* Call me when a conntrack is destroyed. */
++#ifdef CONFIG_VE_IPTABLES
++#include <linux/sched.h>
++#define ve_ip_conntrack_destroyed \
++ (get_exec_env()->_ip_conntrack->_ip_conntrack_destroyed)
++#else
+ extern void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack);
++#define ve_ip_conntrack_destroyed ip_conntrack_destroyed
++#endif
++
+
+ /* Fake conntrack entry for untracked connections */
+ extern struct ip_conntrack ip_conntrack_untracked;
+@@ -261,7 +281,7 @@ extern void ip_conntrack_proto_put(struc
+ extern void ip_ct_remove_expectations(struct ip_conntrack *ct);
+
+ extern struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *,
+- struct ip_conntrack_tuple *);
++ struct ip_conntrack_tuple *, struct user_beancounter *);
+
+ extern void ip_conntrack_free(struct ip_conntrack *ct);
+
+@@ -270,6 +290,8 @@ extern void ip_conntrack_hash_insert(str
+ extern struct ip_conntrack_expect *
+ __ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple);
+
++extern void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp);
++
+ extern struct ip_conntrack_expect *
+ ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple);
+
+@@ -291,6 +313,7 @@ static inline int is_dying(struct ip_con
+ }
+
+ extern unsigned int ip_conntrack_htable_size;
++extern int ip_conntrack_disable_ve0;
+
+ #define CONNTRACK_STAT_INC(count) (__get_cpu_var(ip_conntrack_stat).count++)
+
+@@ -341,6 +364,9 @@ ip_conntrack_event_cache(enum ip_conntra
+ struct ip_conntrack *ct = (struct ip_conntrack *)skb->nfct;
+ struct ip_conntrack_ecache *ecache;
+
++ if (!ve_is_super(get_exec_env()))
++ return;
++
+ local_bh_disable();
+ ecache = &__get_cpu_var(ip_conntrack_ecache);
+ if (ct != ecache->ct)
+@@ -352,7 +378,7 @@ ip_conntrack_event_cache(enum ip_conntra
+ static inline void ip_conntrack_event(enum ip_conntrack_events event,
+ struct ip_conntrack *ct)
+ {
+- if (is_confirmed(ct) && !is_dying(ct))
++ if (is_confirmed(ct) && !is_dying(ct) && ve_is_super(get_exec_env()))
+ notifier_call_chain(&ip_conntrack_chain, event, ct);
+ }
+
+@@ -360,7 +386,8 @@ static inline void
+ ip_conntrack_expect_event(enum ip_conntrack_expect_events event,
+ struct ip_conntrack_expect *exp)
+ {
+- notifier_call_chain(&ip_conntrack_expect_chain, event, exp);
++ if (ve_is_super(get_exec_env()))
++ notifier_call_chain(&ip_conntrack_expect_chain, event, exp);
+ }
+ #else /* CONFIG_IP_NF_CONNTRACK_EVENTS */
+ static inline void ip_conntrack_event_cache(enum ip_conntrack_events event,
+diff -upr linux-2.6.16.orig/include/linux/netfilter_ipv4/ip_conntrack_core.h linux-2.6.16-026test015/include/linux/netfilter_ipv4/ip_conntrack_core.h
+--- linux-2.6.16.orig/include/linux/netfilter_ipv4/ip_conntrack_core.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/netfilter_ipv4/ip_conntrack_core.h 2006-07-04 14:41:39.000000000 +0400
+@@ -3,7 +3,6 @@
+ #include <linux/netfilter.h>
+
+ #define MAX_IP_CT_PROTO 256
+-extern struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
+
+ /* This header is used to share core functionality between the
+ standalone connection tracking module, and the compatibility layer's use
+@@ -54,8 +53,26 @@ static inline int ip_conntrack_confirm(s
+
+ extern void ip_ct_unlink_expect(struct ip_conntrack_expect *exp);
+
++#ifdef CONFIG_VE_IPTABLES
++#include <linux/sched.h>
++#define ve_ip_ct_protos \
++ (get_exec_env()->_ip_conntrack->_ip_ct_protos)
++#define ve_ip_conntrack_hash \
++ (get_exec_env()->_ip_conntrack->_ip_conntrack_hash)
++#define ve_ip_conntrack_expect_list \
++ (get_exec_env()->_ip_conntrack->_ip_conntrack_expect_list)
++#define ve_ip_conntrack_vmalloc \
++ (get_exec_env()->_ip_conntrack->_ip_conntrack_vmalloc)
++#else
++extern struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
+ extern struct list_head *ip_conntrack_hash;
+ extern struct list_head ip_conntrack_expect_list;
++#define ve_ip_ct_protos ip_ct_protos
++#define ve_ip_conntrack_hash ip_conntrack_hash
++#define ve_ip_conntrack_expect_list ip_conntrack_expect_list
++#define ve_ip_conntrack_vmalloc ip_conntrack_vmalloc
++#endif /* CONFIG_VE_IPTABLES */
++
+ extern rwlock_t ip_conntrack_lock;
+ #endif /* _IP_CONNTRACK_CORE_H */
+
+diff -upr linux-2.6.16.orig/include/linux/netfilter_ipv4/ip_conntrack_helper.h linux-2.6.16-026test015/include/linux/netfilter_ipv4/ip_conntrack_helper.h
+--- linux-2.6.16.orig/include/linux/netfilter_ipv4/ip_conntrack_helper.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/netfilter_ipv4/ip_conntrack_helper.h 2006-07-04 14:41:39.000000000 +0400
+@@ -31,6 +31,9 @@ struct ip_conntrack_helper
+ extern int ip_conntrack_helper_register(struct ip_conntrack_helper *);
+ extern void ip_conntrack_helper_unregister(struct ip_conntrack_helper *);
+
++extern int virt_ip_conntrack_helper_register(struct ip_conntrack_helper *);
++extern void virt_ip_conntrack_helper_unregister(struct ip_conntrack_helper *);
++
+ /* Allocate space for an expectation: this is mandatory before calling
+ ip_conntrack_expect_related. You will have to call put afterwards. */
+ extern struct ip_conntrack_expect *
+@@ -41,4 +44,5 @@ extern void ip_conntrack_expect_put(stru
+ extern int ip_conntrack_expect_related(struct ip_conntrack_expect *exp);
+ extern void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp);
+
++extern struct list_head helpers;
+ #endif /*_IP_CONNTRACK_HELPER_H*/
+diff -upr linux-2.6.16.orig/include/linux/netfilter_ipv4/ip_conntrack_irc.h linux-2.6.16-026test015/include/linux/netfilter_ipv4/ip_conntrack_irc.h
+--- linux-2.6.16.orig/include/linux/netfilter_ipv4/ip_conntrack_irc.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/netfilter_ipv4/ip_conntrack_irc.h 2006-07-04 14:41:39.000000000 +0400
+@@ -14,16 +14,26 @@
+ #ifndef _IP_CONNTRACK_IRC_H
+ #define _IP_CONNTRACK_IRC_H
+
++#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
++
+ /* This structure exists only once per master */
+ struct ip_ct_irc_master {
+ };
+
+ #ifdef __KERNEL__
+-extern unsigned int (*ip_nat_irc_hook)(struct sk_buff **pskb,
+- enum ip_conntrack_info ctinfo,
+- unsigned int matchoff,
+- unsigned int matchlen,
+- struct ip_conntrack_expect *exp);
++typedef unsigned int (*ip_nat_helper_irc_hook)(struct sk_buff **,
++ enum ip_conntrack_info, unsigned int, unsigned int,
++ struct ip_conntrack_expect *);
++
++extern ip_nat_helper_irc_hook ip_nat_irc_hook;
++#ifdef CONFIG_VE_IPTABLES
++#include <linux/sched.h>
++#define ve_ip_nat_irc_hook \
++ ((ip_nat_helper_irc_hook) \
++ (get_exec_env()->_ip_conntrack->_ip_nat_irc_hook))
++#else
++#define ve_ip_nat_irc_hook ip_nat_irc_hook
++#endif
+
+ #define IRC_PORT 6667
+
+diff -upr linux-2.6.16.orig/include/linux/netfilter_ipv4/ip_conntrack_protocol.h linux-2.6.16-026test015/include/linux/netfilter_ipv4/ip_conntrack_protocol.h
+--- linux-2.6.16.orig/include/linux/netfilter_ipv4/ip_conntrack_protocol.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/netfilter_ipv4/ip_conntrack_protocol.h 2006-07-04 14:41:39.000000000 +0400
+@@ -67,6 +67,7 @@ struct ip_conntrack_protocol
+ /* Protocol registration. */
+ extern int ip_conntrack_protocol_register(struct ip_conntrack_protocol *proto);
+ extern void ip_conntrack_protocol_unregister(struct ip_conntrack_protocol *proto);
++
+ /* Existing built-in protocols */
+ extern struct ip_conntrack_protocol ip_conntrack_protocol_tcp;
+ extern struct ip_conntrack_protocol ip_conntrack_protocol_udp;
+@@ -74,6 +75,41 @@ extern struct ip_conntrack_protocol ip_c
+ extern struct ip_conntrack_protocol ip_conntrack_generic_protocol;
+ extern int ip_conntrack_protocol_tcp_init(void);
+
++#if defined(CONFIG_VE_IPTABLES) && defined(CONFIG_SYSCTL)
++#include <linux/sched.h>
++#define ve_ip_ct_tcp_timeouts \
++ (get_exec_env()->_ip_conntrack->_ip_ct_tcp_timeouts)
++#define ve_ip_ct_udp_timeout \
++ (get_exec_env()->_ip_conntrack->_ip_ct_udp_timeout)
++#define ve_ip_ct_udp_timeout_stream \
++ (get_exec_env()->_ip_conntrack->_ip_ct_udp_timeout_stream)
++#define ve_ip_ct_icmp_timeout \
++ (get_exec_env()->_ip_conntrack->_ip_ct_icmp_timeout)
++#define ve_ip_ct_generic_timeout \
++ (get_exec_env()->_ip_conntrack->_ip_ct_generic_timeout)
++#define ve_ip_ct_log_invalid \
++ (get_exec_env()->_ip_conntrack->_ip_ct_log_invalid)
++#define ve_ip_ct_tcp_timeout_max_retrans \
++ (get_exec_env()->_ip_conntrack->_ip_ct_tcp_timeout_max_retrans)
++#define ve_ip_ct_tcp_loose \
++ (get_exec_env()->_ip_conntrack->_ip_ct_tcp_loose)
++#define ve_ip_ct_tcp_be_liberal \
++ (get_exec_env()->_ip_conntrack->_ip_ct_tcp_be_liberal)
++#define ve_ip_ct_tcp_max_retrans \
++ (get_exec_env()->_ip_conntrack->_ip_ct_tcp_max_retrans)
++#else
++#define ve_ip_ct_tcp_timeouts *tcp_timeouts
++#define ve_ip_ct_udp_timeout ip_ct_udp_timeout
++#define ve_ip_ct_udp_timeout_stream ip_ct_udp_timeout_stream
++#define ve_ip_ct_icmp_timeout ip_ct_icmp_timeout
++#define ve_ip_ct_generic_timeout ip_ct_generic_timeout
++#define ve_ip_ct_log_invalid ip_ct_log_invalid
++#define ve_ip_ct_tcp_timeout_max_retrans ip_ct_tcp_timeout_max_retrans
++#define ve_ip_ct_tcp_loose ip_ct_tcp_loose
++#define ve_ip_ct_tcp_be_liberal ip_ct_tcp_be_liberal
++#define ve_ip_ct_tcp_max_retrans ip_ct_tcp_max_retrans
++#endif
++
+ /* Log invalid packets */
+ extern unsigned int ip_ct_log_invalid;
+
+@@ -85,10 +121,10 @@ extern int ip_ct_port_nfattr_to_tuple(st
+ #ifdef CONFIG_SYSCTL
+ #ifdef DEBUG_INVALID_PACKETS
+ #define LOG_INVALID(proto) \
+- (ip_ct_log_invalid == (proto) || ip_ct_log_invalid == IPPROTO_RAW)
++ (ve_ip_ct_log_invalid == (proto) || ve_ip_ct_log_invalid == IPPROTO_RAW)
+ #else
+ #define LOG_INVALID(proto) \
+- ((ip_ct_log_invalid == (proto) || ip_ct_log_invalid == IPPROTO_RAW) \
++ ((ve_ip_ct_log_invalid == (proto) || ve_ip_ct_log_invalid == IPPROTO_RAW) \
+ && net_ratelimit())
+ #endif
+ #else
+diff -upr linux-2.6.16.orig/include/linux/netfilter_ipv4/ip_nat.h linux-2.6.16-026test015/include/linux/netfilter_ipv4/ip_nat.h
+--- linux-2.6.16.orig/include/linux/netfilter_ipv4/ip_nat.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/netfilter_ipv4/ip_nat.h 2006-07-04 14:41:39.000000000 +0400
+@@ -1,5 +1,6 @@
+ #ifndef _IP_NAT_H
+ #define _IP_NAT_H
++#include <linux/config.h>
+ #include <linux/netfilter_ipv4.h>
+ #include <linux/netfilter_ipv4/ip_conntrack_tuple.h>
+
+@@ -72,10 +73,29 @@ extern unsigned int ip_nat_setup_info(st
+ extern int ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple,
+ const struct ip_conntrack *ignored_conntrack);
+
++extern void ip_nat_hash_conntrack(struct ip_conntrack *conntrack);
++
+ /* Calculate relative checksum. */
+ extern u_int16_t ip_nat_cheat_check(u_int32_t oldvalinv,
+ u_int32_t newval,
+ u_int16_t oldcheck);
++
++#ifdef CONFIG_COMPAT
++#include <net/compat.h>
++
++struct compat_ip_nat_range
++{
++ compat_uint_t flags;
++ u_int32_t min_ip, max_ip;
++ union ip_conntrack_manip_proto min, max;
++};
++
++struct compat_ip_nat_multi_range
++{
++ compat_uint_t rangesize;
++ struct compat_ip_nat_range range[1];
++};
++#endif
+ #else /* !__KERNEL__: iptables wants this to compile. */
+ #define ip_nat_multi_range ip_nat_multi_range_compat
+ #endif /*__KERNEL__*/
+diff -upr linux-2.6.16.orig/include/linux/netfilter_ipv4/ip_nat_rule.h linux-2.6.16-026test015/include/linux/netfilter_ipv4/ip_nat_rule.h
+--- linux-2.6.16.orig/include/linux/netfilter_ipv4/ip_nat_rule.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/netfilter_ipv4/ip_nat_rule.h 2006-07-04 14:41:39.000000000 +0400
+@@ -6,7 +6,7 @@
+
+ #ifdef __KERNEL__
+
+-extern int ip_nat_rule_init(void) __init;
++extern int ip_nat_rule_init(void);
+ extern void ip_nat_rule_cleanup(void);
+ extern int ip_nat_rule_find(struct sk_buff **pskb,
+ unsigned int hooknum,
+diff -upr linux-2.6.16.orig/include/linux/netfilter_ipv4/ip_tables.h linux-2.6.16-026test015/include/linux/netfilter_ipv4/ip_tables.h
+--- linux-2.6.16.orig/include/linux/netfilter_ipv4/ip_tables.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/netfilter_ipv4/ip_tables.h 2006-07-04 14:41:39.000000000 +0400
+@@ -16,6 +16,7 @@
+ #define _IPTABLES_H
+
+ #ifdef __KERNEL__
++#include <linux/config.h>
+ #include <linux/if.h>
+ #include <linux/types.h>
+ #include <linux/in.h>
+@@ -330,7 +331,7 @@ extern void ipt_init(void) __init;
+ //#define ipt_register_table(tbl, repl) xt_register_table(AF_INET, tbl, repl)
+ //#define ipt_unregister_table(tbl) xt_unregister_table(AF_INET, tbl)
+
+-extern int ipt_register_table(struct ipt_table *table,
++extern struct ipt_table *ipt_register_table(struct ipt_table *table,
+ const struct ipt_replace *repl);
+ extern void ipt_unregister_table(struct ipt_table *table);
+
+@@ -364,5 +365,62 @@ extern unsigned int ipt_do_table(struct
+ void *userdata);
+
+ #define IPT_ALIGN(s) XT_ALIGN(s)
++
++#ifdef CONFIG_COMPAT
++#include <net/compat.h>
++
++struct compat_ipt_getinfo
++{
++ char name[IPT_TABLE_MAXNAMELEN];
++ compat_uint_t valid_hooks;
++ compat_uint_t hook_entry[NF_IP_NUMHOOKS];
++ compat_uint_t underflow[NF_IP_NUMHOOKS];
++ compat_uint_t num_entries;
++ compat_uint_t size;
++};
++
++struct compat_ipt_entry
++{
++ struct ipt_ip ip;
++ compat_uint_t nfcache;
++ u_int16_t target_offset;
++ u_int16_t next_offset;
++ compat_uint_t comefrom;
++ struct compat_xt_counters counters;
++ unsigned char elems[0];
++};
++
++struct compat_ipt_entry_match
++{
++ union {
++ struct {
++ u_int16_t match_size;
++ char name[IPT_FUNCTION_MAXNAMELEN];
++ } user;
++ u_int16_t match_size;
++ } u;
++ unsigned char data[0];
++};
++
++struct compat_ipt_entry_target
++{
++ union {
++ struct {
++ u_int16_t target_size;
++ char name[IPT_FUNCTION_MAXNAMELEN];
++ } user;
++ u_int16_t target_size;
++ } u;
++ unsigned char data[0];
++};
++
++#define COMPAT_IPT_ALIGN(s) COMPAT_XT_ALIGN(s)
++
++extern int ipt_match_align_compat(void *match, void **dstptr,
++ int *size, int off, int convert);
++extern int ipt_target_align_compat(void *target, void **dstptr,
++ int *size, int off, int convert);
++
++#endif /* CONFIG_COMPAT */
+ #endif /*__KERNEL__*/
+ #endif /* _IPTABLES_H */
+diff -upr linux-2.6.16.orig/include/linux/netfilter_ipv6/ip6_tables.h linux-2.6.16-026test015/include/linux/netfilter_ipv6/ip6_tables.h
+--- linux-2.6.16.orig/include/linux/netfilter_ipv6/ip6_tables.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/netfilter_ipv6/ip6_tables.h 2006-07-04 14:41:39.000000000 +0400
+@@ -340,7 +340,7 @@ extern void ip6t_init(void) __init;
+ #define ip6t_register_match(match) xt_register_match(AF_INET6, match)
+ #define ip6t_unregister_match(match) xt_unregister_match(AF_INET6, match)
+
+-extern int ip6t_register_table(struct ip6t_table *table,
++extern struct ip6t_table *ip6t_register_table(struct ip6t_table *table,
+ const struct ip6t_replace *repl);
+ extern void ip6t_unregister_table(struct ip6t_table *table);
+ extern unsigned int ip6t_do_table(struct sk_buff **pskb,
+diff -upr linux-2.6.16.orig/include/linux/nfcalls.h linux-2.6.16-026test015/include/linux/nfcalls.h
+--- linux-2.6.16.orig/include/linux/nfcalls.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/include/linux/nfcalls.h 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,254 @@
++/*
++ * include/linux/nfcalls.h
++ *
++ * Copyright (C) 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef _LINUX_NFCALLS_H
++#define _LINUX_NFCALLS_H
++
++#include <linux/rcupdate.h>
++
++#ifdef CONFIG_MODULES
++extern struct module no_module;
++
++#define DECL_KSYM_MODULE(name) \
++ extern struct module *vz_mod_##name
++#define DECL_KSYM_CALL(type, name, args) \
++ extern type (*vz_##name) args
++
++#define INIT_KSYM_MODULE(name) \
++ struct module *vz_mod_##name = &no_module; \
++ EXPORT_SYMBOL(vz_mod_##name)
++#define INIT_KSYM_CALL(type, name, args) \
++ type (*vz_##name) args; \
++ EXPORT_SYMBOL(vz_##name)
++
++#define __KSYMERRCALL(err, type, mod, name, args) \
++({ \
++ type ret = (type)err; \
++ if (!__vzksym_module_get(vz_mod_##mod)) { \
++ if (vz_##name) \
++ ret = ((*vz_##name)args); \
++ __vzksym_module_put(vz_mod_##mod); \
++ } \
++ ret; \
++})
++#define __KSYMSAFECALL_VOID(mod, name, args) \
++do { \
++ if (!__vzksym_module_get(vz_mod_##mod)) { \
++ if (vz_##name) \
++ ((*vz_##name)args); \
++ __vzksym_module_put(vz_mod_##mod); \
++ } \
++} while (0)
++#else
++#define DECL_KSYM_CALL(type, name, args) \
++ extern type name args
++#define INIT_KSYM_MODULE(name)
++#define INIT_KSYM_CALL(type, name, args) \
++ type name args
++#define __KSYMERRCALL(err, type, mod, name, args) ((*name)args)
++#define __KSYMSAFECALL_VOID(mod, name, args) ((*name)args)
++#endif
++
++#define KSYMERRCALL(err, mod, name, args) \
++ __KSYMERRCALL(err, int, mod, name, args)
++#define KSYMSAFECALL(type, mod, name, args) \
++ __KSYMERRCALL(0, type, mod, name, args)
++#define KSYMSAFECALL_VOID(mod, name, args) \
++ __KSYMSAFECALL_VOID(mod, name, args)
++
++#if defined(CONFIG_VE) && defined(CONFIG_MODULES)
++/* should be called _after_ KSYMRESOLVE's */
++#define KSYMMODRESOLVE(name) \
++ __vzksym_modresolve(&vz_mod_##name, THIS_MODULE)
++#define KSYMMODUNRESOLVE(name) \
++ __vzksym_modunresolve(&vz_mod_##name)
++
++#define KSYMRESOLVE(name) \
++ vz_##name = &name
++#define KSYMUNRESOLVE(name) \
++ vz_##name = NULL
++#else
++#define KSYMRESOLVE(name) do { } while (0)
++#define KSYMUNRESOLVE(name) do { } while (0)
++#define KSYMMODRESOLVE(name) do { } while (0)
++#define KSYMMODUNRESOLVE(name) do { } while (0)
++#endif
++
++#ifdef CONFIG_MODULES
++static inline void __vzksym_modresolve(struct module **modp, struct module *mod)
++{
++ /*
++ * we want to be sure that pointer updates are visible first:
++ * 1. wmb() is here only to be on the safe side
++ * (note, no rmb() in KSYMSAFECALL)
++ * 2. synchronize_sched() guarantees that updates are visible
++ * on all cpus and allows us to remove rmb() in KSYMSAFECALL
++ */
++ wmb(); synchronize_sched();
++ *modp = mod;
++ /* just to be sure, our changes are visible as soon as possible */
++ wmb(); synchronize_sched();
++}
++
++static inline void __vzksym_modunresolve(struct module **modp)
++{
++ /*
++ * try_module_get() in KSYMSAFECALL should fail at this moment since
++ * THIS_MODULE in in unloading state (we should be called from fini),
++ * no need to syncronize pointers/ve_module updates.
++ */
++ *modp = &no_module;
++ /*
++ * synchronize_sched() guarantees here that we see
++ * updated module pointer before the module really gets away
++ */
++ synchronize_sched();
++}
++
++static inline int __vzksym_module_get(struct module *mod)
++{
++ /*
++ * we want to avoid rmb(), so use synchronize_sched() in KSYMUNRESOLVE
++ * and smp_read_barrier_depends() here...
++ */
++ smp_read_barrier_depends(); /* for module loading */
++ if (!try_module_get(mod))
++ return -EBUSY;
++
++ return 0;
++}
++
++static inline void __vzksym_module_put(struct module *mod)
++{
++ module_put(mod);
++}
++#endif
++
++#if defined(CONFIG_VE)
++#ifdef CONFIG_MODULES
++DECL_KSYM_MODULE(x_tables);
++DECL_KSYM_MODULE(xt_tcpudp);
++DECL_KSYM_MODULE(ip_tables);
++DECL_KSYM_MODULE(ip6_tables);
++DECL_KSYM_MODULE(iptable_filter);
++DECL_KSYM_MODULE(ip6table_filter);
++DECL_KSYM_MODULE(iptable_mangle);
++DECL_KSYM_MODULE(ip6table_mangle);
++DECL_KSYM_MODULE(xt_limit);
++DECL_KSYM_MODULE(ipt_multiport);
++DECL_KSYM_MODULE(ip6t_multiport);
++DECL_KSYM_MODULE(ipt_tos);
++DECL_KSYM_MODULE(ipt_TOS);
++DECL_KSYM_MODULE(ipt_REJECT);
++DECL_KSYM_MODULE(ip6t_REJECT);
++DECL_KSYM_MODULE(ipt_TCPMSS);
++DECL_KSYM_MODULE(xt_tcpmss);
++DECL_KSYM_MODULE(ipt_ttl);
++DECL_KSYM_MODULE(ipt_LOG);
++DECL_KSYM_MODULE(ip6t_LOG);
++DECL_KSYM_MODULE(xt_length);
++DECL_KSYM_MODULE(ip_conntrack);
++DECL_KSYM_MODULE(ip_conntrack_ftp);
++DECL_KSYM_MODULE(ip_conntrack_irc);
++DECL_KSYM_MODULE(xt_conntrack);
++DECL_KSYM_MODULE(xt_state);
++DECL_KSYM_MODULE(xt_helper);
++DECL_KSYM_MODULE(ip_nat);
++DECL_KSYM_MODULE(iptable_nat);
++DECL_KSYM_MODULE(ip_nat_ftp);
++DECL_KSYM_MODULE(ip_nat_irc);
++DECL_KSYM_MODULE(ipt_REDIRECT);
++#endif
++
++struct sk_buff;
++
++DECL_KSYM_CALL(int, init_netfilter, (void));
++DECL_KSYM_CALL(int, init_xtables, (void));
++DECL_KSYM_CALL(int, init_xt_tcpudp, (void));
++DECL_KSYM_CALL(int, init_iptables, (void));
++DECL_KSYM_CALL(int, init_ip6tables, (void));
++DECL_KSYM_CALL(int, init_iptable_filter, (void));
++DECL_KSYM_CALL(int, init_ip6table_filter, (void));
++DECL_KSYM_CALL(int, init_iptable_mangle, (void));
++DECL_KSYM_CALL(int, init_ip6table_mangle, (void));
++DECL_KSYM_CALL(int, init_xt_limit, (void));
++DECL_KSYM_CALL(int, init_iptable_multiport, (void));
++DECL_KSYM_CALL(int, init_ip6table_multiport, (void));
++DECL_KSYM_CALL(int, init_iptable_tos, (void));
++DECL_KSYM_CALL(int, init_iptable_TOS, (void));
++DECL_KSYM_CALL(int, init_iptable_REJECT, (void));
++DECL_KSYM_CALL(int, init_ip6table_REJECT, (void));
++DECL_KSYM_CALL(int, init_iptable_TCPMSS, (void));
++DECL_KSYM_CALL(int, init_xt_tcpmss, (void));
++DECL_KSYM_CALL(int, init_iptable_ttl, (void));
++DECL_KSYM_CALL(int, init_iptable_LOG, (void));
++DECL_KSYM_CALL(int, init_ip6table_LOG, (void));
++DECL_KSYM_CALL(int, init_xt_length, (void));
++DECL_KSYM_CALL(int, init_iptable_conntrack, (void));
++DECL_KSYM_CALL(int, init_iptable_ftp, (void));
++DECL_KSYM_CALL(int, init_iptable_irc, (void));
++DECL_KSYM_CALL(int, init_xt_conntrack_match, (void));
++DECL_KSYM_CALL(int, init_xt_state, (void));
++DECL_KSYM_CALL(int, init_xt_helper, (void));
++DECL_KSYM_CALL(int, ip_nat_init, (void));
++DECL_KSYM_CALL(int, init_iptable_nat, (void));
++DECL_KSYM_CALL(int, init_iptable_nat_ftp, (void));
++DECL_KSYM_CALL(int, init_iptable_nat_irc, (void));
++DECL_KSYM_CALL(int, init_iptable_REDIRECT, (void));
++DECL_KSYM_CALL(void, fini_iptable_nat_irc, (void));
++DECL_KSYM_CALL(void, fini_iptable_nat_ftp, (void));
++DECL_KSYM_CALL(void, fini_iptable_nat, (void));
++DECL_KSYM_CALL(void, ip_nat_cleanup, (void));
++DECL_KSYM_CALL(void, fini_xt_helper, (void));
++DECL_KSYM_CALL(void, fini_xt_state, (void));
++DECL_KSYM_CALL(void, fini_xt_conntrack_match, (void));
++DECL_KSYM_CALL(void, fini_iptable_irc, (void));
++DECL_KSYM_CALL(void, fini_iptable_ftp, (void));
++DECL_KSYM_CALL(void, fini_iptable_conntrack, (void));
++DECL_KSYM_CALL(void, fini_xt_length, (void));
++DECL_KSYM_CALL(void, fini_ip6table_LOG, (void));
++DECL_KSYM_CALL(void, fini_iptable_LOG, (void));
++DECL_KSYM_CALL(void, fini_iptable_ttl, (void));
++DECL_KSYM_CALL(void, fini_xt_tcpmss, (void));
++DECL_KSYM_CALL(void, fini_iptable_TCPMSS, (void));
++DECL_KSYM_CALL(void, fini_ip6table_REJECT, (void));
++DECL_KSYM_CALL(void, fini_iptable_REJECT, (void));
++DECL_KSYM_CALL(void, fini_iptable_TOS, (void));
++DECL_KSYM_CALL(void, fini_iptable_tos, (void));
++DECL_KSYM_CALL(void, fini_ip6table_multiport, (void));
++DECL_KSYM_CALL(void, fini_iptable_multiport, (void));
++DECL_KSYM_CALL(void, fini_xt_limit, (void));
++DECL_KSYM_CALL(void, fini_iptable_filter, (void));
++DECL_KSYM_CALL(void, fini_ip6table_filter, (void));
++DECL_KSYM_CALL(void, fini_iptable_mangle, (void));
++DECL_KSYM_CALL(void, fini_ip6table_mangle, (void));
++DECL_KSYM_CALL(void, fini_ip6tables, (void));
++DECL_KSYM_CALL(void, fini_iptables, (void));
++DECL_KSYM_CALL(void, fini_xt_tcpudp, (void));
++DECL_KSYM_CALL(void, fini_xtables, (void));
++DECL_KSYM_CALL(void, fini_netfilter, (void));
++DECL_KSYM_CALL(void, fini_iptable_REDIRECT, (void));
++
++#include <linux/netfilter/x_tables.h>
++
++DECL_KSYM_CALL(void, ipt_flush_table, (struct xt_table *table));
++DECL_KSYM_CALL(void, ip6t_flush_table, (struct xt_table *table));
++#endif /* CONFIG_VE */
++
++#ifdef CONFIG_VE_CALLS_MODULE
++DECL_KSYM_MODULE(vzmon);
++DECL_KSYM_CALL(int, real_get_device_perms_ve,
++ (int dev_type, dev_t dev, int access_mode));
++DECL_KSYM_CALL(void, real_do_env_cleanup, (struct ve_struct *env));
++DECL_KSYM_CALL(void, real_do_env_free, (struct ve_struct *env));
++DECL_KSYM_CALL(void, real_update_load_avg_ve, (void));
++#endif
++
++#endif /* _LINUX_NFCALLS_H */
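nfcalls.h above wires optional iptables modules to the core kernel through pairs of pointers: vz_mod_<name> tracks the providing module and vz_<name> the function itself, and the KSYMSAFECALL/KSYMERRCALL wrappers pin the module before calling through the pointer so it cannot be unloaded mid-call. A much-reduced userspace sketch of the same idea (plain function pointers, a counter standing in for try_module_get(), and none of the memory-ordering care of the real macros) might be:

#include <stdio.h>

static int (*vz_init_iptables)(void);    /* NULL until the provider "loads" */
static int provider_pins;                /* stands in for a module refcount */

static int real_init_iptables(void) { return 1234; }

static void provider_load(void)   { vz_init_iptables = real_init_iptables; }
static void provider_unload(void) { vz_init_iptables = NULL; }

/* KSYMERRCALL-style helper: return a default when nothing is registered. */
static int call_init_iptables(int dflt)
{
        int ret = dflt;

        provider_pins++;                 /* "try_module_get()" */
        if (vz_init_iptables)
                ret = vz_init_iptables();
        provider_pins--;                 /* "module_put()" */
        return ret;
}

int main(void)
{
        printf("before load:  %d\n", call_init_iptables(-1));   /* -1 */
        provider_load();
        printf("after load:   %d\n", call_init_iptables(-1));   /* 1234 */
        provider_unload();
        printf("after unload: %d\n", call_init_iptables(-1));   /* -1 */
        printf("pins outstanding: %d\n", provider_pins);        /* 0 */
        return 0;
}
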
+diff -upr linux-2.6.16.orig/include/linux/nfs_fs.h linux-2.6.16-026test015/include/linux/nfs_fs.h
+--- linux-2.6.16.orig/include/linux/nfs_fs.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/nfs_fs.h 2006-07-04 14:41:37.000000000 +0400
+@@ -296,7 +296,7 @@ extern struct inode *nfs_fhget(struct su
+ extern int nfs_refresh_inode(struct inode *, struct nfs_fattr *);
+ extern int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr);
+ extern int nfs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
+-extern int nfs_permission(struct inode *, int, struct nameidata *);
++extern int nfs_permission(struct inode *, int, struct nameidata *, struct exec_perm *);
+ extern int nfs_access_get_cached(struct inode *, struct rpc_cred *, struct nfs_access_entry *);
+ extern void nfs_access_add_cache(struct inode *, struct nfs_access_entry *);
+ extern int nfs_open(struct inode *, struct file *);
+diff -upr linux-2.6.16.orig/include/linux/notifier.h linux-2.6.16-026test015/include/linux/notifier.h
+--- linux-2.6.16.orig/include/linux/notifier.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/notifier.h 2006-07-04 14:41:39.000000000 +0400
+@@ -27,8 +27,9 @@ extern int notifier_call_chain(struct no
+
+ #define NOTIFY_DONE 0x0000 /* Don't care */
+ #define NOTIFY_OK 0x0001 /* Suits me */
++#define NOTIFY_FAIL 0x0002 /* Reject */
+ #define NOTIFY_STOP_MASK 0x8000 /* Don't call further */
+-#define NOTIFY_BAD (NOTIFY_STOP_MASK|0x0002) /* Bad/Veto action */
++#define NOTIFY_BAD (NOTIFY_STOP_MASK|NOTIFY_FAIL) /* Bad/Veto action */
+ /*
+ * Clean way to return from the notifier and stop further calls.
+ */
+diff -upr linux-2.6.16.orig/include/linux/page-flags.h linux-2.6.16-026test015/include/linux/page-flags.h
+--- linux-2.6.16.orig/include/linux/page-flags.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/page-flags.h 2006-07-04 14:41:36.000000000 +0400
+@@ -74,7 +74,9 @@
+ #define PG_mappedtodisk 16 /* Has blocks allocated on-disk */
+ #define PG_reclaim 17 /* To be reclaimed asap */
+ #define PG_nosave_free 18 /* Free, should not be written */
+-#define PG_uncached 19 /* Page has been mapped as uncached */
++#define PG_buddy 19 /* Page is free, on buddy lists */
++
++#define PG_uncached 20 /* Page has been mapped as uncached */
+
+ /*
+ * Global page accounting. One instance per CPU. Only unsigned longs are
+@@ -319,6 +321,10 @@ extern void __mod_page_state_offset(unsi
+ #define SetPageNosaveFree(page) set_bit(PG_nosave_free, &(page)->flags)
+ #define ClearPageNosaveFree(page) clear_bit(PG_nosave_free, &(page)->flags)
+
++#define PageBuddy(page) test_bit(PG_buddy, &(page)->flags)
++#define __SetPageBuddy(page) __set_bit(PG_buddy, &(page)->flags)
++#define __ClearPageBuddy(page) __clear_bit(PG_buddy, &(page)->flags)
++
+ #define PageMappedToDisk(page) test_bit(PG_mappedtodisk, &(page)->flags)
+ #define SetPageMappedToDisk(page) set_bit(PG_mappedtodisk, &(page)->flags)
+ #define ClearPageMappedToDisk(page) clear_bit(PG_mappedtodisk, &(page)->flags)
+diff -upr linux-2.6.16.orig/include/linux/pid.h linux-2.6.16-026test015/include/linux/pid.h
+--- linux-2.6.16.orig/include/linux/pid.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/pid.h 2006-07-04 14:41:38.000000000 +0400
+@@ -1,6 +1,18 @@
+ #ifndef _LINUX_PID_H
+ #define _LINUX_PID_H
+
++#define VPID_BIT 10
++#define VPID_DIV (1<<VPID_BIT)
++
++#ifdef CONFIG_VE
++#define __is_virtual_pid(pid) ((pid) & VPID_DIV)
++#define is_virtual_pid(pid) \
++ (__is_virtual_pid(pid) || ((pid)==1 && !ve_is_super(get_exec_env())))
++#else
++#define __is_virtual_pid(pid) 0
++#define is_virtual_pid(pid) 0
++#endif
++
+ enum pid_type
+ {
+ PIDTYPE_PID,
+@@ -15,6 +27,9 @@ struct pid
+ /* Try to keep pid_chain in the same cacheline as nr for find_pid */
+ int nr;
+ struct hlist_node pid_chain;
++#ifdef CONFIG_VE
++ int vnr;
++#endif
+ /* list of pids with the same nr, only one of them is in the hash */
+ struct list_head pid_list;
+ };
+@@ -40,16 +55,89 @@ extern int alloc_pidmap(void);
+ extern void FASTCALL(free_pidmap(int));
+ extern void switch_exec_pids(struct task_struct *leader, struct task_struct *thread);
+
+-#define do_each_task_pid(who, type, task) \
+- if ((task = find_task_by_pid_type(type, who))) { \
++#ifndef CONFIG_VE
++
++#define vpid_to_pid(pid) (pid)
++#define __vpid_to_pid(pid) (pid)
++#define pid_type_to_vpid(type, pid) (pid)
++#define __pid_type_to_vpid(type, pid) (pid)
++
++#define comb_vpid_to_pid(pid) (pid)
++#define comb_pid_to_vpid(pid) (pid)
++
++#else
++
++struct ve_struct;
++extern void free_vpid(int vpid, struct ve_struct *ve);
++extern int alloc_vpid(int pid, int vpid);
++extern int vpid_to_pid(int pid);
++extern int __vpid_to_pid(int pid);
++extern pid_t pid_type_to_vpid(int type, pid_t pid);
++extern pid_t _pid_type_to_vpid(int type, pid_t pid);
++
++static inline int comb_vpid_to_pid(int vpid)
++{
++ int pid = vpid;
++
++ if (vpid > 0) {
++ pid = vpid_to_pid(vpid);
++ if (unlikely(pid < 0))
++ return 0;
++ } else if (vpid < 0) {
++ pid = vpid_to_pid(-vpid);
++ if (unlikely(pid < 0))
++ return 0;
++ pid = -pid;
++ }
++ return pid;
++}
++
++static inline int comb_pid_to_vpid(int pid)
++{
++ int vpid = pid;
++
++ if (pid > 0) {
++ vpid = pid_type_to_vpid(PIDTYPE_PID, pid);
++ if (unlikely(vpid < 0))
++ return 0;
++ } else if (pid < 0) {
++ vpid = pid_type_to_vpid(PIDTYPE_PGID, -pid);
++ if (unlikely(vpid < 0))
++ return 0;
++ vpid = -vpid;
++ }
++ return vpid;
++}
++#endif
++
++#define do_each_task_pid_all(who, type, task) \
++ if ((task = find_task_by_pid_type_all(type, who))) { \
+ prefetch((task)->pids[type].pid_list.next); \
+ do {
+
+-#define while_each_task_pid(who, type, task) \
++#define while_each_task_pid_all(who, type, task) \
+ } while (task = pid_task((task)->pids[type].pid_list.next,\
+ type), \
+ prefetch((task)->pids[type].pid_list.next), \
+ hlist_unhashed(&(task)->pids[type].pid_chain)); \
+ } \
+
++#ifndef CONFIG_VE
++#define __do_each_task_pid_ve(who, type, task, owner) \
++ do_each_task_pid_all(who, type, task)
++#define __while_each_task_pid_ve(who, type, task, owner) \
++ while_each_task_pid_all(who, type, task)
++#else /* CONFIG_VE */
++#define __do_each_task_pid_ve(who, type, task, owner) \
++ do_each_task_pid_all(who, type, task) \
++ if (ve_accessible(VE_TASK_INFO(task)->owner_env, owner))
++#define __while_each_task_pid_ve(who, type, task, owner) \
++ while_each_task_pid_all(who, type, task)
++#endif /* CONFIG_VE */
++
++#define do_each_task_pid_ve(who, type, task) \
++ __do_each_task_pid_ve(who, type, task, get_exec_env());
++#define while_each_task_pid_ve(who, type, task) \
++ __while_each_task_pid_ve(who, type, task, get_exec_env());
++
+ #endif /* _LINUX_PID_H */
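Judging from the macros above, a PID handed out inside a container carries the VPID_DIV marker (bit 10), so is_virtual_pid() can tell container-local ids from host-wide ones without any lookup; pid 1 of a non-host environment is additionally special-cased in the real macro. A small illustration of just the bit test, with made-up example values:

#include <stdio.h>

#define VPID_BIT 10
#define VPID_DIV (1 << VPID_BIT)          /* 1024 */

static int is_virtual_pid(int pid)
{
        return (pid & VPID_DIV) != 0;
}

int main(void)
{
        int host_pid = 271;               /* ordinary global PID */
        int ve_pid   = VPID_DIV | 3;      /* "pid 3" as seen inside a VE */

        printf("%d -> %s\n", host_pid,
               is_virtual_pid(host_pid) ? "virtual" : "global");
        printf("%d -> %s\n", ve_pid,
               is_virtual_pid(ve_pid) ? "virtual" : "global");
        return 0;
}

comb_vpid_to_pid()/comb_pid_to_vpid() above then apply the translation while preserving the sign convention used for process groups (negative values).
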
+diff -upr linux-2.6.16.orig/include/linux/proc_fs.h linux-2.6.16-026test015/include/linux/proc_fs.h
+--- linux-2.6.16.orig/include/linux/proc_fs.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/proc_fs.h 2006-07-04 14:41:38.000000000 +0400
+@@ -78,7 +78,7 @@ struct kcore_list {
+ struct vmcore {
+ struct list_head list;
+ unsigned long long paddr;
+- unsigned long size;
++ unsigned long long size;
+ loff_t offset;
+ };
+
+@@ -86,8 +86,14 @@ struct vmcore {
+
+ extern struct proc_dir_entry proc_root;
+ extern struct proc_dir_entry *proc_root_fs;
++#ifdef CONFIG_VE
++#include <linux/sched.h>
++#define proc_net (get_exec_env()->_proc_net)
++#define proc_net_stat (get_exec_env()->_proc_net_stat)
++#else
+ extern struct proc_dir_entry *proc_net;
+ extern struct proc_dir_entry *proc_net_stat;
++#endif
+ extern struct proc_dir_entry *proc_bus;
+ extern struct proc_dir_entry *proc_root_driver;
+ extern struct proc_dir_entry *proc_root_kcore;
+@@ -98,8 +104,8 @@ extern void proc_misc_init(void);
+ struct mm_struct;
+
+ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *);
+-struct dentry *proc_pid_unhash(struct task_struct *p);
+-void proc_pid_flush(struct dentry *proc_dentry);
++void proc_pid_unhash(struct task_struct *p, struct dentry * [2]);
++void proc_pid_flush(struct dentry *proc_dentry[2]);
+ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir);
+ unsigned long task_vsize(struct mm_struct *);
+ int task_statm(struct mm_struct *, int *, int *, int *, int *);
+@@ -107,7 +113,11 @@ char *task_mem(struct mm_struct *, char
+
+ extern struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode,
+ struct proc_dir_entry *parent);
++extern struct proc_dir_entry *create_proc_glob_entry(const char *name,
++ mode_t mode,
++ struct proc_dir_entry *parent);
+ extern void remove_proc_entry(const char *name, struct proc_dir_entry *parent);
++extern void remove_proc_glob_entry(const char *name, struct proc_dir_entry *parent);
+
+ extern struct vfsmount *proc_mnt;
+ extern int proc_fill_super(struct super_block *,void *,int);
+@@ -194,6 +204,15 @@ static inline struct proc_dir_entry *pro
+ return res;
+ }
+
++static inline struct proc_dir_entry *proc_glob_fops_create(const char *name,
++ mode_t mode, struct file_operations *fops)
++{
++ struct proc_dir_entry *res = create_proc_glob_entry(name, mode, NULL);
++ if (res)
++ res->proc_fops = fops;
++ return res;
++}
++
+ static inline void proc_net_remove(const char *name)
+ {
+ remove_proc_entry(name,proc_net);
+@@ -206,16 +225,21 @@ static inline void proc_net_remove(const
+ #define proc_bus NULL
+
+ #define proc_net_fops_create(name, mode, fops) ({ (void)(mode), NULL; })
++#define proc_glob_fops_create(name, mode, fops) ({ (void)(mode), NULL; })
+ #define proc_net_create(name, mode, info) ({ (void)(mode), NULL; })
+ static inline void proc_net_remove(const char *name) {}
+
+-static inline struct dentry *proc_pid_unhash(struct task_struct *p) { return NULL; }
+-static inline void proc_pid_flush(struct dentry *proc_dentry) { }
++static inline struct dentry *proc_pid_unhash(struct task_struct *p,
++ struct dentry *d[2]) { return NULL; }
++static inline void proc_pid_flush(struct dentry *proc_dentry[2]) { }
+
+ static inline struct proc_dir_entry *create_proc_entry(const char *name,
+ mode_t mode, struct proc_dir_entry *parent) { return NULL; }
++static inline struct proc_dir_entry *create_proc_glob_entry(const char *name,
++ mode_t mode, struct proc_dir_entry *parent) { return NULL; }
+
+ #define remove_proc_entry(name, parent) do {} while (0)
++#define remove_proc_glob_entry(name, parent) do {} while (0)
+
+ static inline struct proc_dir_entry *proc_symlink(const char *name,
+ struct proc_dir_entry *parent,const char *dest) {return NULL;}
+@@ -266,4 +290,18 @@ static inline struct proc_dir_entry *PDE
+ return PROC_I(inode)->pde;
+ }
+
++static inline struct proc_dir_entry * de_get(struct proc_dir_entry *de)
++{
++ if (de)
++ atomic_inc(&de->count);
++ return de;
++}
++
++extern void de_put(struct proc_dir_entry *);
++
++#define LPDE(inode) (PROC_I((inode))->pde)
++#ifdef CONFIG_VE
++#define GPDE(inode) (*(struct proc_dir_entry **)(&(inode)->i_pipe))
++#endif
++
+ #endif /* _LINUX_PROC_FS_H */
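The de_get()/de_put() helpers added above are a plain reference-count pair for proc_dir_entry: de_get() bumps the count, and de_put() (defined elsewhere in the patch) releases the entry when the last reference drops. A toy stand-alone version of that get/put discipline, with invented types and an explicit free in place of the real teardown, looks like:

#include <stdio.h>
#include <stdlib.h>
#include <stdatomic.h>

struct entry {
        atomic_int count;
        const char *name;
};

static struct entry *entry_get(struct entry *e)
{
        if (e)
                atomic_fetch_add(&e->count, 1);   /* take a reference */
        return e;
}

static void entry_put(struct entry *e)
{
        if (e && atomic_fetch_sub(&e->count, 1) == 1) {
                /* last reference dropped: safe to release */
                printf("freeing %s\n", e->name);
                free(e);
        }
}

int main(void)
{
        struct entry *e = malloc(sizeof(*e));

        atomic_init(&e->count, 1);        /* creator holds one reference */
        e->name = "example";

        entry_get(e);                     /* a second user pins it */
        entry_put(e);                     /* ...and drops it */
        entry_put(e);                     /* creator's put frees it here */
        return 0;
}
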
+diff -upr linux-2.6.16.orig/include/linux/quota.h linux-2.6.16-026test015/include/linux/quota.h
+--- linux-2.6.16.orig/include/linux/quota.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/quota.h 2006-07-04 14:41:39.000000000 +0400
+@@ -37,7 +37,6 @@
+
+ #include <linux/errno.h>
+ #include <linux/types.h>
+-#include <linux/spinlock.h>
+
+ #define __DQUOT_VERSION__ "dquot_6.5.1"
+ #define __DQUOT_NUM_VERSION__ 6*10000+5*100+1
+@@ -45,8 +44,6 @@
+ typedef __kernel_uid32_t qid_t; /* Type in which we store ids in memory */
+ typedef __u64 qsize_t; /* Type in which we store sizes */
+
+-extern spinlock_t dq_data_lock;
+-
+ /* Size of blocks in which are counted size limits */
+ #define QUOTABLOCK_BITS 10
+ #define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS)
+@@ -133,6 +130,10 @@ struct if_dqinfo {
+
+ #ifdef __KERNEL__
+
++#include <linux/spinlock.h>
++
++extern spinlock_t dq_data_lock;
++
+ #include <linux/dqblk_xfs.h>
+ #include <linux/dqblk_v1.h>
+ #include <linux/dqblk_v2.h>
+@@ -242,6 +243,8 @@ struct quota_format_ops {
+ int (*release_dqblk)(struct dquot *dquot); /* Called when last reference to dquot is being dropped */
+ };
+
++struct inode;
++struct iattr;
+ /* Operations working with dquots */
+ struct dquot_operations {
+ int (*initialize) (struct inode *, int);
+@@ -256,9 +259,11 @@ struct dquot_operations {
+ int (*release_dquot) (struct dquot *); /* Quota is going to be deleted from disk */
+ int (*mark_dirty) (struct dquot *); /* Dquot is marked dirty */
+ int (*write_info) (struct super_block *, int); /* Write of quota "superblock" */
++ int (*rename) (struct inode *, struct inode *, struct inode *);
+ };
+
+ /* Operations handling requests from userspace */
++struct v2_disk_dqblk;
+ struct quotactl_ops {
+ int (*quota_on)(struct super_block *, int, int, char *);
+ int (*quota_off)(struct super_block *, int);
+@@ -271,6 +276,9 @@ struct quotactl_ops {
+ int (*set_xstate)(struct super_block *, unsigned int, int);
+ int (*get_xquota)(struct super_block *, int, qid_t, struct fs_disk_quota *);
+ int (*set_xquota)(struct super_block *, int, qid_t, struct fs_disk_quota *);
++#ifdef CONFIG_QUOTA_COMPAT
++ int (*get_quoti)(struct super_block *, int, unsigned int, struct v2_disk_dqblk *);
++#endif
+ };
+
+ struct quota_format_type {
+@@ -291,6 +299,10 @@ struct quota_info {
+ struct inode *files[MAXQUOTAS]; /* inodes of quotafiles */
+ struct mem_dqinfo info[MAXQUOTAS]; /* Information for each quota type */
+ struct quota_format_ops *ops[MAXQUOTAS]; /* Operations for each type */
++#if defined(CONFIG_VZ_QUOTA) || defined(CONFIG_VZ_QUOTA_MODULE)
++ struct vz_quota_master *vzdq_master;
++ int vzdq_count;
++#endif
+ };
+
+ /* Inline would be better but we need to dereference super_block which is not defined yet */
+diff -upr linux-2.6.16.orig/include/linux/quotaops.h linux-2.6.16-026test015/include/linux/quotaops.h
+--- linux-2.6.16.orig/include/linux/quotaops.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/quotaops.h 2006-07-04 14:41:39.000000000 +0400
+@@ -171,6 +171,19 @@ static __inline__ int DQUOT_TRANSFER(str
+ return 0;
+ }
+
++static __inline__ int DQUOT_RENAME(struct inode *inode,
++ struct inode *old_dir, struct inode *new_dir)
++{
++ struct dquot_operations *q_op;
++
++ q_op = inode->i_sb->dq_op;
++ if (q_op && q_op->rename) {
++ if (q_op->rename(inode, old_dir, new_dir) == NO_QUOTA)
++ return 1;
++ }
++ return 0;
++}
++
+ /* The following two functions cannot be called inside a transaction */
+ #define DQUOT_SYNC(sb) sync_dquots(sb, -1)
+
+@@ -197,6 +210,7 @@ static __inline__ int DQUOT_OFF(struct s
+ #define DQUOT_SYNC(sb) do { } while(0)
+ #define DQUOT_OFF(sb) do { } while(0)
+ #define DQUOT_TRANSFER(inode, iattr) (0)
++#define DQUOT_RENAME(inode, old_dir, new_dir) (0)
+ static inline int DQUOT_PREALLOC_SPACE_NODIRTY(struct inode *inode, qsize_t nr)
+ {
+ inode_add_bytes(inode, nr);
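DQUOT_RENAME above follows the usual optional-hook dispatch: if the filesystem's dquot_operations table provides a rename callback and that callback answers NO_QUOTA, the wrapper reports failure (1); with no hook, renames pass through unchanged. A small sketch of the same dispatch, where the types and the NO_QUOTA value are stand-ins rather than the kernel's definitions:

#include <stdio.h>

#define NO_QUOTA 1
#define QUOTA_OK 0

struct dq_ops {
        int (*rename)(int inode, int old_dir, int new_dir);  /* may be NULL */
};

static int deny_cross_dir(int inode, int old_dir, int new_dir)
{
        (void)inode;
        return old_dir == new_dir ? QUOTA_OK : NO_QUOTA;
}

/* Returns 1 when the quota layer vetoes the rename, 0 otherwise. */
static int dquot_rename(const struct dq_ops *ops, int ino, int od, int nd)
{
        if (ops && ops->rename && ops->rename(ino, od, nd) == NO_QUOTA)
                return 1;
        return 0;
}

int main(void)
{
        struct dq_ops with_hook    = { .rename = deny_cross_dir };
        struct dq_ops without_hook = { .rename = NULL };

        printf("same dir:  %d\n", dquot_rename(&with_hook, 5, 10, 10));    /* 0 */
        printf("cross dir: %d\n", dquot_rename(&with_hook, 5, 10, 11));    /* 1 */
        printf("no hook:   %d\n", dquot_rename(&without_hook, 5, 10, 11)); /* 0 */
        return 0;
}
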
+diff -upr linux-2.6.16.orig/include/linux/raid/raid1.h linux-2.6.16-026test015/include/linux/raid/raid1.h
+--- linux-2.6.16.orig/include/linux/raid/raid1.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/raid/raid1.h 2006-07-04 14:41:36.000000000 +0400
+@@ -130,6 +130,6 @@ struct r1bio_s {
+ * with failure when last write completes (and all failed).
+ * Record that bi_end_io was called with this flag...
+ */
+-#define R1BIO_Returned 4
++#define R1BIO_Returned 6
+
+ #endif
+diff -upr linux-2.6.16.orig/include/linux/reiserfs_xattr.h linux-2.6.16-026test015/include/linux/reiserfs_xattr.h
+--- linux-2.6.16.orig/include/linux/reiserfs_xattr.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/reiserfs_xattr.h 2006-07-04 14:41:37.000000000 +0400
+@@ -42,7 +42,8 @@ int reiserfs_removexattr(struct dentry *
+ int reiserfs_delete_xattrs(struct inode *inode);
+ int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs);
+ int reiserfs_xattr_init(struct super_block *sb, int mount_flags);
+-int reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd);
++int reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd,
++ struct exec_perm *);
+
+ int reiserfs_xattr_del(struct inode *, const char *);
+ int reiserfs_xattr_get(const struct inode *, const char *, void *, size_t);
+diff -upr linux-2.6.16.orig/include/linux/rmap.h linux-2.6.16-026test015/include/linux/rmap.h
+--- linux-2.6.16.orig/include/linux/rmap.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/rmap.h 2006-07-04 14:41:39.000000000 +0400
+@@ -74,6 +74,7 @@ void page_add_anon_rmap(struct page *, s
+ void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
+ void page_add_file_rmap(struct page *);
+ void page_remove_rmap(struct page *);
++struct anon_vma *page_lock_anon_vma(struct page *page);
+
+ /**
+ * page_dup_rmap - duplicate pte mapping to a page
+diff -upr linux-2.6.16.orig/include/linux/rtc.h linux-2.6.16-026test015/include/linux/rtc.h
+--- linux-2.6.16.orig/include/linux/rtc.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/rtc.h 2006-07-04 14:41:36.000000000 +0400
+@@ -11,8 +11,6 @@
+ #ifndef _LINUX_RTC_H_
+ #define _LINUX_RTC_H_
+
+-#include <linux/interrupt.h>
+-
+ /*
+ * The struct used to pass data via the following ioctl. Similar to the
+ * struct tm in <time.h>, but it needs to be here so that the kernel
+@@ -95,6 +93,8 @@ struct rtc_pll_info {
+
+ #ifdef __KERNEL__
+
++#include <linux/interrupt.h>
++
+ typedef struct rtc_task {
+ void (*func)(void *private_data);
+ void *private_data;
+diff -upr linux-2.6.16.orig/include/linux/sched.h linux-2.6.16-026test015/include/linux/sched.h
+--- linux-2.6.16.orig/include/linux/sched.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/sched.h 2006-07-04 14:41:39.000000000 +0400
+@@ -38,7 +38,10 @@
+
+ #include <linux/auxvec.h> /* For AT_VECTOR_SIZE */
+
++#include <ub/ub_task.h>
++
+ struct exec_domain;
++struct ve_struct;
+
+ /*
+ * cloning flags:
+@@ -92,15 +95,34 @@ extern unsigned long avenrun[]; /* Load
+ load += n*(FIXED_1-exp); \
+ load >>= FSHIFT;
+
++#define LOAD_INT(x) ((x) >> FSHIFT)
++#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
++
+ extern unsigned long total_forks;
+ extern int nr_threads;
+ extern int last_pid;
+ DECLARE_PER_CPU(unsigned long, process_counts);
+ extern int nr_processes(void);
++
++extern unsigned long nr_sleeping(void);
++extern unsigned long nr_stopped(void);
++extern unsigned long nr_zombie;
++extern atomic_t nr_dead;
+ extern unsigned long nr_running(void);
+ extern unsigned long nr_uninterruptible(void);
+ extern unsigned long nr_iowait(void);
+
++#ifdef CONFIG_VE
++struct ve_struct;
++extern unsigned long nr_running_ve(struct ve_struct *);
++extern unsigned long nr_iowait_ve(struct ve_struct *);
++extern unsigned long nr_uninterruptible_ve(struct ve_struct *);
++#else
++#define nr_running_ve(ve) 0
++#define nr_iowait_ve(ve) 0
++#define nr_uninterruptible_ve(ve) 0
++#endif
++
+ #include <linux/time.h>
+ #include <linux/param.h>
+ #include <linux/resource.h>
+@@ -189,6 +211,8 @@ extern cpumask_t nohz_cpu_mask;
+
+ extern void show_state(void);
+ extern void show_regs(struct pt_regs *);
++extern void smp_show_regs(struct pt_regs *, void *);
++extern void show_vsched(void);
+
+ /*
+ * TASK is a pointer to the task whose backtrace we want to see (or NULL for current
+@@ -252,31 +276,7 @@ arch_get_unmapped_area_topdown(struct fi
+ extern void arch_unmap_area(struct mm_struct *, unsigned long);
+ extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
+
+-#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+-/*
+- * The mm counters are not protected by its page_table_lock,
+- * so must be incremented atomically.
+- */
+-#define set_mm_counter(mm, member, value) atomic_long_set(&(mm)->_##member, value)
+-#define get_mm_counter(mm, member) ((unsigned long)atomic_long_read(&(mm)->_##member))
+-#define add_mm_counter(mm, member, value) atomic_long_add(value, &(mm)->_##member)
+-#define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member)
+-#define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member)
+-typedef atomic_long_t mm_counter_t;
+-
+-#else /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
+-/*
+- * The mm counters are protected by its page_table_lock,
+- * so can be incremented directly.
+- */
+-#define set_mm_counter(mm, member, value) (mm)->_##member = (value)
+-#define get_mm_counter(mm, member) ((mm)->_##member)
+-#define add_mm_counter(mm, member, value) (mm)->_##member += (value)
+-#define inc_mm_counter(mm, member) (mm)->_##member++
+-#define dec_mm_counter(mm, member) (mm)->_##member--
+-typedef unsigned long mm_counter_t;
+-
+-#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
++#include <linux/mm_counter.h>
+
+ #define get_mm_rss(mm) \
+ (get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss))
+@@ -332,6 +332,7 @@ struct mm_struct {
+ unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
+
+ unsigned dumpable:2;
++ unsigned vps_dumpable:1;
+ cpumask_t cpu_vm_mask;
+
+ /* Architecture-specific MM context */
+@@ -348,6 +349,9 @@ struct mm_struct {
+ /* aio bits */
+ rwlock_t ioctx_list_lock;
+ struct kioctx *ioctx_list;
++#ifdef CONFIG_USER_RESOURCE
++ struct user_beancounter *mm_ub;
++#endif
+ };
+
+ struct sighand_struct {
+@@ -364,6 +368,9 @@ static inline void sighand_free(struct s
+ call_rcu(&sp->rcu, sighand_free_cb);
+ }
+
++#include <linux/ve.h>
++#include <linux/ve_task.h>
++
+ /*
+ * NOTE! "signal_struct" does not have it's own
+ * locking, because a shared signal_struct always
+@@ -688,6 +695,8 @@ static inline void prefetch_stack(struct
+
+ struct audit_context; /* See audit.c */
+ struct mempolicy;
++struct vcpu_scheduler;
++struct vcpu_info;
+
+ struct task_struct {
+ volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
+@@ -701,6 +710,14 @@ struct task_struct {
+ #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
+ int oncpu;
+ #endif
++#ifdef CONFIG_SCHED_VCPU
++ struct vcpu_scheduler *vsched;
++ struct vcpu_info *vcpu;
++
++ /* id's are saved to avoid locking (e.g. on vsched->id access) */
++ int vsched_id;
++ int vcpu_id;
++#endif
+ int prio, static_prio;
+ struct list_head run_list;
+ prio_array_t *array;
+@@ -846,6 +863,11 @@ struct task_struct {
+
+ unsigned long ptrace_message;
+ siginfo_t *last_siginfo; /* For ptrace use. */
++
++/* state tracking for suspend */
++ __u8 pn_state;
++ __u8 stopped_state:1;
++
+ /*
+ * current io wait handle: wait queue entry to use for io waits
+ * If this thread is processing aio, this points at the waitqueue
+@@ -871,6 +893,16 @@ struct task_struct {
+ #endif
+ atomic_t fs_excl; /* holding fs exclusive resources */
+ struct rcu_head rcu;
++#ifdef CONFIG_USER_RESOURCE
++ struct task_beancounter task_bc;
++#endif
++#ifdef CONFIG_VE
++ struct ve_task_info ve_task_info;
++#endif
++#if defined(CONFIG_VZ_QUOTA) || defined(CONFIG_VZ_QUOTA_MODULE)
++ unsigned long magic;
++ struct inode *ino;
++#endif
+ };
+
+ static inline pid_t process_group(struct task_struct *tsk)
+@@ -929,6 +961,43 @@ static inline void put_task_struct(struc
+ #define PF_RANDOMIZE 0x00800000 /* randomize virtual address space */
+ #define PF_SWAPWRITE 0x01000000 /* Allowed to write to swap */
+
++#ifndef CONFIG_VE
++#define set_pn_state(tsk, state) do { } while(0)
++#define clear_pn_state(tsk) do { } while(0)
++#define set_stop_state(tsk) do { } while(0)
++#define clear_stop_state(tsk) do { } while(0)
++#else
++#define PN_STOP_TF 1 /* was not in 2.6.8 */
++#define PN_STOP_TF_RT 2 /* was not in 2.6.8 */
++#define PN_STOP_ENTRY 3
++#define PN_STOP_FORK 4
++#define PN_STOP_VFORK 5
++#define PN_STOP_SIGNAL 6
++#define PN_STOP_EXIT 7
++#define PN_STOP_EXEC 8
++#define PN_STOP_LEAVE 9
++
++static inline void set_pn_state(struct task_struct *tsk, int state)
++{
++ tsk->pn_state = state;
++}
++
++static inline void clear_pn_state(struct task_struct *tsk)
++{
++ tsk->pn_state = 0;
++}
++
++static inline void set_stop_state(struct task_struct *tsk)
++{
++ tsk->stopped_state = 1;
++}
++
++static inline void clear_stop_state(struct task_struct *tsk)
++{
++ tsk->stopped_state = 0;
++}
++#endif
++
+ /*
+ * Only the _current_ task can read/write to tsk->flags, but other
+ * tasks can access tsk->flags in readonly mode for example
+@@ -968,6 +1037,21 @@ static inline int set_cpus_allowed(task_
+ extern unsigned long long sched_clock(void);
+ extern unsigned long long current_sched_time(const task_t *current_task);
+
++static inline unsigned long cycles_to_clocks(cycles_t cycles)
++{
++ extern unsigned long cycles_per_clock;
++ do_div(cycles, cycles_per_clock);
++ return cycles;
++}
++
++static inline u64 cycles_to_jiffies(cycles_t cycles)
++{
++ extern unsigned long cycles_per_jiffy;
++ do_div(cycles, cycles_per_jiffy);
++ return cycles;
++}
++
++
+ /* sched_exec is called by processes performing an exec */
+ #ifdef CONFIG_SMP
+ extern void sched_exec(void);
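cycles_to_clocks() and cycles_to_jiffies() above are straight integer divisions of a 64-bit cycle counter by a calibration constant, performed in place with do_div(). The sketch below assumes a 1 GHz time-stamp counter and HZ=250, i.e. 4,000,000 cycles per jiffy, purely for illustration:

#include <stdio.h>
#include <stdint.h>

static uint64_t cycles_to_jiffies(uint64_t cycles, uint64_t cycles_per_jiffy)
{
        /* do_div() in the kernel performs this 64-bit division in place. */
        return cycles / cycles_per_jiffy;
}

int main(void)
{
        uint64_t cycles_per_jiffy = 4000000ULL;   /* assumed calibration */
        uint64_t cycles = 10000000000ULL;         /* ten seconds at 1 GHz */

        printf("%llu cycles -> %llu jiffies\n",
               (unsigned long long)cycles,
               (unsigned long long)cycles_to_jiffies(cycles, cycles_per_jiffy));
        return 0;
}
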
+@@ -1020,12 +1104,237 @@ extern struct task_struct init_task;
+
+ extern struct mm_struct init_mm;
+
+-#define find_task_by_pid(nr) find_task_by_pid_type(PIDTYPE_PID, nr)
+-extern struct task_struct *find_task_by_pid_type(int type, int pid);
++#define find_task_by_pid_all(nr) \
++ find_task_by_pid_type_all(PIDTYPE_PID, nr)
++extern struct task_struct *find_task_by_pid_type_all(int type, int pid);
+ extern void set_special_pids(pid_t session, pid_t pgrp);
+ extern void __set_special_pids(pid_t session, pid_t pgrp);
+
++#ifndef CONFIG_VE
++#define find_task_by_pid_ve find_task_by_pid_all
++
++#define get_exec_env() ((struct ve_struct *)NULL)
++#define set_exec_env(new_env) ((struct ve_struct *)NULL)
++
++#define ve_is_super(env) 1
++#define ve_accessible(target, owner) 1
++#define ve_accessible_strict(target, owner) 1
++#define ve_accessible_veid(target, owner) 1
++#define ve_accessible_strict_veid(target, owner) 1
++
++#define VEID(envid) 0
++#define get_ve0() NULL
++
++static inline pid_t virt_pid(struct task_struct *tsk)
++{
++ return tsk->pid;
++}
++
++static inline pid_t virt_tgid(struct task_struct *tsk)
++{
++ return tsk->tgid;
++}
++
++static inline pid_t virt_pgid(struct task_struct *tsk)
++{
++ return tsk->signal->pgrp;
++}
++
++static inline pid_t virt_sid(struct task_struct *tsk)
++{
++ return tsk->signal->session;
++}
++
++#define get_task_pid_ve(tsk, ve) get_task_pid(tsk)
++
++static inline pid_t get_task_pid(struct task_struct *tsk)
++{
++ return tsk->pid;
++}
++
++static inline pid_t get_task_tgid(struct task_struct *tsk)
++{
++ return tsk->tgid;
++}
++
++static inline pid_t get_task_pgid(struct task_struct *tsk)
++{
++ return tsk->signal->pgrp;
++}
++
++static inline pid_t get_task_sid(struct task_struct *tsk)
++{
++ return tsk->signal->session;
++}
++
++static inline void set_virt_pid(struct task_struct *tsk, pid_t pid)
++{
++}
++
++static inline void set_virt_tgid(struct task_struct *tsk, pid_t pid)
++{
++}
++
++static inline void set_virt_pgid(struct task_struct *tsk, pid_t pid)
++{
++}
++
++static inline void set_virt_sid(struct task_struct *tsk, pid_t pid)
++{
++}
++
++static inline pid_t get_task_ppid(struct task_struct *p)
++{
++ return pid_alive(p) ? p->group_leader->real_parent->tgid : 0;
++}
++
++#else /* CONFIG_VE */
++
++#include <asm/current.h>
++#include <linux/ve.h>
++
++extern struct ve_struct ve0;
++
++#define find_task_by_pid_ve(nr) \
++ find_task_by_pid_type_ve(PIDTYPE_PID, nr)
++
++extern struct task_struct *find_task_by_pid_type_ve(int type, int pid);
++
++#define get_ve0() (&ve0)
++#define VEID(envid) ((envid)->veid)
++
++#define get_exec_env() (VE_TASK_INFO(current)->exec_env)
++static inline struct ve_struct *set_exec_env(struct ve_struct *new_env)
++{
++ struct ve_struct *old_env;
++
++ old_env = VE_TASK_INFO(current)->exec_env;
++ VE_TASK_INFO(current)->exec_env = new_env;
++
++ return old_env;
++}
++
++#define ve_is_super(env) ((env) == get_ve0())
++#define ve_accessible_strict(target, owner) ((target) == (owner))
++static inline int ve_accessible(struct ve_struct *target,
++ struct ve_struct *owner) {
++ return ve_is_super(owner) || ve_accessible_strict(target, owner);
++}
++
++#define ve_accessible_strict_veid(target, owner) ((target) == (owner))
++static inline int ve_accessible_veid(envid_t target, envid_t owner)
++{
++ return get_ve0()->veid == owner ||
++ ve_accessible_strict_veid(target, owner);
++}
++
++static inline pid_t virt_pid(struct task_struct *tsk)
++{
++ return tsk->pids[PIDTYPE_PID].vnr;
++}
++
++static inline pid_t virt_tgid(struct task_struct *tsk)
++{
++ return tsk->pids[PIDTYPE_TGID].vnr;
++}
++
++static inline pid_t virt_pgid(struct task_struct *tsk)
++{
++ return tsk->pids[PIDTYPE_PGID].vnr;
++}
++
++static inline pid_t virt_sid(struct task_struct *tsk)
++{
++ return tsk->pids[PIDTYPE_SID].vnr;
++}
++
++static inline pid_t get_task_pid_ve(struct task_struct *tsk, struct ve_struct *env)
++{
++ return ve_is_super(env) ? tsk->pid : virt_pid(tsk);
++}
++
++static inline pid_t get_task_pid(struct task_struct *tsk)
++{
++ return get_task_pid_ve(tsk, get_exec_env());
++}
++
++static inline pid_t get_task_tgid(struct task_struct *tsk)
++{
++ return ve_is_super(get_exec_env()) ? tsk->tgid : virt_tgid(tsk);
++}
++
++static inline pid_t get_task_pgid(struct task_struct *tsk)
++{
++ return ve_is_super(get_exec_env()) ? tsk->signal->pgrp : virt_pgid(tsk);
++}
++
++static inline pid_t get_task_sid(struct task_struct *tsk)
++{
++ return ve_is_super(get_exec_env()) ? tsk->signal->session : virt_sid(tsk);
++}
++
++static inline void set_virt_pid(struct task_struct *tsk, pid_t pid)
++{
++ tsk->pids[PIDTYPE_PID].vnr = pid;
++}
++
++static inline void set_virt_tgid(struct task_struct *tsk, pid_t pid)
++{
++ tsk->pids[PIDTYPE_TGID].vnr = pid;
++}
++
++static inline void set_virt_pgid(struct task_struct *tsk, pid_t pid)
++{
++ tsk->pids[PIDTYPE_PGID].vnr = pid;
++}
++
++static inline void set_virt_sid(struct task_struct *tsk, pid_t pid)
++{
++ tsk->pids[PIDTYPE_SID].vnr = pid;
++}
++
++static inline pid_t get_task_ppid(struct task_struct *p)
++{
++ struct task_struct *parent;
++ struct ve_struct *env;
++
++ if (!pid_alive(p))
++ return 0;
++ env = get_exec_env();
++ if (get_task_pid_ve(p, env) == 1)
++ return 0;
++ parent = p->group_leader->real_parent;
++ return ve_accessible(VE_TASK_INFO(parent)->owner_env, env) ?
++ get_task_tgid(parent) : 1;
++}
++
++void ve_sched_get_cpu_stat(struct ve_struct *envid, cycles_t *idle,
++ cycles_t *strv, unsigned int cpu);
++void ve_sched_attach(struct ve_struct *envid);
++
++#endif /* CONFIG_VE */
++
++
++#ifdef CONFIG_VE
++extern cycles_t ve_sched_get_idle_time(struct ve_struct *, int);
++extern cycles_t ve_sched_get_iowait_time(struct ve_struct *, int);
++#else
++#define ve_sched_get_idle_time(ve, cpu) 0
++#define ve_sched_get_iowait_time(ve, cpu) 0
++#endif
++
++#ifdef CONFIG_SCHED_VCPU
++struct vcpu_scheduler;
++extern void fastcall vsched_cpu_online_map(struct vcpu_scheduler *sched,
++ cpumask_t *mask);
++#else
++#define vsched_cpu_online_map(vsched, mask) do { \
++ *mask = cpu_online_map; \
++ } while (0)
++#endif
++
+ /* per-UID process charging. */
++extern int set_user(uid_t new_ruid, int dumpclear);
+ extern struct user_struct * alloc_uid(uid_t);
+ static inline struct user_struct *get_uid(struct user_struct *u)
+ {
+@@ -1043,7 +1352,7 @@ extern int FASTCALL(wake_up_state(struct
+ extern int FASTCALL(wake_up_process(struct task_struct * tsk));
+ extern void FASTCALL(wake_up_new_task(struct task_struct * tsk,
+ unsigned long clone_flags));
+-#ifdef CONFIG_SMP
++#if defined(CONFIG_SMP) || defined (CONFIG_SCHED_VCPU)
+ extern void kick_process(struct task_struct *tsk);
+ #else
+ static inline void kick_process(struct task_struct *tsk) { }
+@@ -1161,12 +1470,19 @@ extern task_t *child_reaper;
+
+ extern int do_execve(char *, char __user * __user *, char __user * __user *, struct pt_regs *);
+ extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *);
++extern long do_fork_pid(unsigned long clone_flags,
++ unsigned long stack_start,
++ struct pt_regs *regs,
++ unsigned long stack_size,
++ int __user *parent_tidptr,
++ int __user *child_tidptr,
++ long pid0);
+ task_t *fork_idle(int);
+
+ extern void set_task_comm(struct task_struct *tsk, char *from);
+ extern void get_task_comm(char *to, struct task_struct *tsk);
+
+-#ifdef CONFIG_SMP
++#if defined(CONFIG_SMP) || defined (CONFIG_SCHED_VCPU)
+ extern void wait_task_inactive(task_t * p);
+ #else
+ #define wait_task_inactive(p) do { } while (0)
+@@ -1187,22 +1503,100 @@ extern void wait_task_inactive(task_t *
+ add_parent(p, (p)->parent); \
+ } while (0)
+
+-#define next_task(p) list_entry((p)->tasks.next, struct task_struct, tasks)
+-#define prev_task(p) list_entry((p)->tasks.prev, struct task_struct, tasks)
++#define next_task_all(p) list_entry((p)->tasks.next, struct task_struct, tasks)
++#define prev_task_all(p) list_entry((p)->tasks.prev, struct task_struct, tasks)
+
+-#define for_each_process(p) \
+- for (p = &init_task ; (p = next_task(p)) != &init_task ; )
++#define for_each_process_all(p) \
++ for (p = &init_task ; (p = next_task_all(p)) != &init_task ; )
+
+ /*
+ * Careful: do_each_thread/while_each_thread is a double loop so
+ * 'break' will not work as expected - use goto instead.
+ */
+-#define do_each_thread(g, t) \
+- for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do
++#define do_each_thread_all(g, t) \
++ for (g = t = &init_task ; (g = t = next_task_all(g)) != &init_task ; ) do
+
+-#define while_each_thread(g, t) \
++#define while_each_thread_all(g, t) \
+ while ((t = next_thread(t)) != g)
+
++#ifndef CONFIG_VE
++
++#define SET_VE_LINKS(p)
++#define REMOVE_VE_LINKS(p)
++#define for_each_process_ve(p) for_each_process_all(p)
++#define do_each_thread_ve(g, t) do_each_thread_all(g, t)
++#define while_each_thread_ve(g, t) while_each_thread_all(g, t)
++#define first_task_ve() next_task_ve(&init_task)
++#define __first_task_ve(owner) next_task_ve(&init_task)
++#define __next_task_ve(owner, p) next_task_ve(p)
++#define next_task_ve(p) \
++ (next_task_all(p) != &init_task ? next_task_all(p) : NULL)
++
++#else /* CONFIG_VE */
++
++#define SET_VE_LINKS(p) \
++ do { \
++ if (thread_group_leader(p)) \
++ list_add_tail(&VE_TASK_INFO(p)->vetask_list, \
++ &VE_TASK_INFO(p)->owner_env->vetask_lh); \
++ } while (0)
++
++#define REMOVE_VE_LINKS(p) \
++ do { \
++ if (thread_group_leader(p)) \
++ list_del(&VE_TASK_INFO(p)->vetask_list); \
++ } while(0)
++
++static inline task_t* __first_task_ve(struct ve_struct *ve)
++{
++ task_t *tsk;
++
++ if (unlikely(ve_is_super(ve))) {
++ tsk = next_task_all(&init_task);
++ if (tsk == &init_task)
++ tsk = NULL;
++ } else {
++		/* could probably return ve->init_entry, but this is clearer */
++ BUG_ON(list_empty(&ve->vetask_lh));
++ tsk = VE_TASK_LIST_2_TASK(ve->vetask_lh.next);
++ }
++ return tsk;
++}
++
++static inline task_t* __next_task_ve(struct ve_struct *ve, task_t *tsk)
++{
++ if (unlikely(ve_is_super(ve))) {
++ tsk = next_task_all(tsk);
++ if (tsk == &init_task)
++ tsk = NULL;
++ } else {
++ struct list_head *tmp;
++
++ BUG_ON(VE_TASK_INFO(tsk)->owner_env != ve);
++ tmp = VE_TASK_INFO(tsk)->vetask_list.next;
++ if (tmp == &ve->vetask_lh)
++ tsk = NULL;
++ else
++ tsk = VE_TASK_LIST_2_TASK(tmp);
++ }
++ return tsk;
++}
++
++#define first_task_ve() __first_task_ve(get_exec_env())
++#define next_task_ve(p) __next_task_ve(get_exec_env(), p)
++/* no one uses prev_task_ve(), copy next_task_ve() if needed */
++
++#define for_each_process_ve(p) \
++ for (p = first_task_ve(); p != NULL ; p = next_task_ve(p))
++
++#define do_each_thread_ve(g, t) \
++ for (g = t = first_task_ve() ; g != NULL; g = t = next_task_ve(g)) do
++
++#define while_each_thread_ve(g, t) \
++ while ((t = next_thread(t)) != g)
++
++#endif /* CONFIG_VE */
++
+ extern task_t * FASTCALL(next_thread(const task_t *p));
+
+ #define thread_group_leader(p) (p->pid == p->tgid)
+@@ -1348,28 +1742,63 @@ extern void signal_wake_up(struct task_s
+ */
+ #ifdef CONFIG_SMP
+
+-static inline unsigned int task_cpu(const struct task_struct *p)
++static inline unsigned int task_pcpu(const struct task_struct *p)
+ {
+ return task_thread_info(p)->cpu;
+ }
+
+-static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
++static inline void set_task_pcpu(struct task_struct *p, unsigned int cpu)
+ {
+ task_thread_info(p)->cpu = cpu;
+ }
+
+ #else
+
++static inline unsigned int task_pcpu(const struct task_struct *p)
++{
++ return 0;
++}
++
++static inline void set_task_pcpu(struct task_struct *p, unsigned int cpu)
++{
++}
++
++#endif /* CONFIG_SMP */
++
++#ifdef CONFIG_SCHED_VCPU
++
++static inline unsigned int task_vsched_id(const struct task_struct *p)
++{
++ return p->vsched_id;
++}
++
+ static inline unsigned int task_cpu(const struct task_struct *p)
+ {
++ return p->vcpu_id;
++}
++
++extern void set_task_cpu(struct task_struct *p, unsigned int vcpu);
++extern int vcpu_online(int cpu);
++
++#else
++
++static inline unsigned int task_vsched_id(const struct task_struct *p)
++{
+ return 0;
+ }
+
++static inline unsigned int task_cpu(const struct task_struct *p)
++{
++ return task_pcpu(p);
++}
++
+ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
+ {
++ set_task_pcpu(p, cpu);
+ }
+
+-#endif /* CONFIG_SMP */
++#define vcpu_online(cpu) cpu_online(cpu)
++#endif /* CONFIG_SCHED_VCPU */
+
+ #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
+ extern void arch_pick_mmap_layout(struct mm_struct *mm);
+@@ -1401,7 +1830,7 @@ static inline int frozen(struct task_str
+ */
+ static inline int freezing(struct task_struct *p)
+ {
+- return p->flags & PF_FREEZE;
++ return test_tsk_thread_flag(p, TIF_FREEZE);
+ }
+
+ /*
+@@ -1410,7 +1839,7 @@ static inline int freezing(struct task_s
+ */
+ static inline void freeze(struct task_struct *p)
+ {
+- p->flags |= PF_FREEZE;
++ set_tsk_thread_flag(p, TIF_FREEZE);
+ }
+
+ /*
+@@ -1431,7 +1860,8 @@ static inline int thaw_process(struct ta
+ */
+ static inline void frozen_process(struct task_struct *p)
+ {
+- p->flags = (p->flags & ~PF_FREEZE) | PF_FROZEN;
++ clear_tsk_thread_flag(p, TIF_FREEZE);
++ p->flags |= PF_FROZEN;
+ }
+
+ extern void refrigerator(void);
+diff -upr linux-2.6.16.orig/include/linux/sem.h linux-2.6.16-026test015/include/linux/sem.h
+--- linux-2.6.16.orig/include/linux/sem.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/sem.h 2006-07-04 14:41:39.000000000 +0400
+@@ -155,6 +155,9 @@ static inline void exit_sem(struct task_
+ }
+ #endif
+
++int sysvipc_walk_sem(int (*func)(int, struct sem_array*, void *), void *arg);
++int sysvipc_setup_sem(key_t key, int semid, size_t size, int semflg);
++
+ #endif /* __KERNEL__ */
+
+ #endif /* _LINUX_SEM_H */
+diff -upr linux-2.6.16.orig/include/linux/shm.h linux-2.6.16-026test015/include/linux/shm.h
+--- linux-2.6.16.orig/include/linux/shm.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/shm.h 2006-07-04 14:41:39.000000000 +0400
+@@ -86,6 +86,7 @@ struct shmid_kernel /* private to the ke
+ pid_t shm_cprid;
+ pid_t shm_lprid;
+ struct user_struct *mlock_user;
++ struct ipc_ids *_shm_ids;
+ };
+
+ /* shm_mode upper byte flags */
+@@ -104,6 +105,9 @@ static inline long do_shmat(int shmid, c
+ }
+ #endif
+
++int sysvipc_walk_shm(int (*func)(struct shmid_kernel*, void *), void *arg);
++struct file * sysvipc_setup_shm(key_t key, int shmid, size_t size, int shmflg);
++
+ #endif /* __KERNEL__ */
+
+ #endif /* _LINUX_SHM_H_ */
+diff -upr linux-2.6.16.orig/include/linux/shmem_fs.h linux-2.6.16-026test015/include/linux/shmem_fs.h
+--- linux-2.6.16.orig/include/linux/shmem_fs.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/shmem_fs.h 2006-07-04 14:41:37.000000000 +0400
+@@ -19,6 +19,9 @@ struct shmem_inode_info {
+ swp_entry_t i_direct[SHMEM_NR_DIRECT]; /* first blocks */
+ struct list_head swaplist; /* chain of maybes on swap */
+ struct inode vfs_inode;
++#ifdef CONFIG_USER_RESOURCE
++ struct user_beancounter *shmi_ub;
++#endif
+ };
+
+ struct shmem_sb_info {
+diff -upr linux-2.6.16.orig/include/linux/signal.h linux-2.6.16-026test015/include/linux/signal.h
+--- linux-2.6.16.orig/include/linux/signal.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/signal.h 2006-07-04 14:41:39.000000000 +0400
+@@ -3,6 +3,7 @@
+
+ #include <linux/list.h>
+ #include <linux/spinlock.h>
++#include <linux/slab.h>
+ #include <asm/signal.h>
+ #include <asm/siginfo.h>
+
+@@ -41,6 +42,9 @@ struct sigqueue {
+ int flags;
+ siginfo_t info;
+ struct user_struct *user;
++#ifdef CONFIG_USER_RESOURCE
++ struct user_beancounter *sig_ub;
++#endif
+ };
+
+ /* flags values. */
+@@ -263,6 +267,8 @@ extern int sigprocmask(int, sigset_t *,
+ struct pt_regs;
+ extern int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, struct pt_regs *regs, void *cookie);
+
++extern kmem_cache_t *sigqueue_cachep;
++
+ #endif /* __KERNEL__ */
+
+ #endif /* _LINUX_SIGNAL_H */
+diff -upr linux-2.6.16.orig/include/linux/skbuff.h linux-2.6.16-026test015/include/linux/skbuff.h
+--- linux-2.6.16.orig/include/linux/skbuff.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/skbuff.h 2006-07-04 14:41:38.000000000 +0400
+@@ -19,6 +19,7 @@
+ #include <linux/compiler.h>
+ #include <linux/time.h>
+ #include <linux/cache.h>
++#include <linux/ve_owner.h>
+
+ #include <asm/atomic.h>
+ #include <asm/types.h>
+@@ -211,6 +212,8 @@ enum {
+ * @tc_verd: traffic control verdict
+ */
+
++#include <ub/ub_sk.h>
++
+ struct sk_buff {
+ /* These two members must be first. */
+ struct sk_buff *next;
+@@ -294,13 +297,18 @@ struct sk_buff {
+ *data,
+ *tail,
+ *end;
++ struct skb_beancounter skb_bc;
++ struct ve_struct *owner_env;
+ };
+
++DCL_VE_OWNER_PROTO(SKB, struct sk_buff, owner_env)
++
+ #ifdef __KERNEL__
+ /*
+ * Handling routines are only of interest to the kernel
+ */
+ #include <linux/slab.h>
++#include <ub/ub_net.h>
+
+ #include <asm/system.h>
+
+@@ -1007,6 +1015,8 @@ static inline int pskb_trim(struct sk_bu
+ */
+ static inline void skb_orphan(struct sk_buff *skb)
+ {
++ ub_skb_uncharge(skb);
++
+ if (skb->destructor)
+ skb->destructor(skb);
+ skb->destructor = NULL;
+diff -upr linux-2.6.16.orig/include/linux/slab.h linux-2.6.16-026test015/include/linux/slab.h
+--- linux-2.6.16.orig/include/linux/slab.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/slab.h 2006-07-04 14:41:37.000000000 +0400
+@@ -48,6 +48,26 @@ typedef struct kmem_cache kmem_cache_t;
+ #define SLAB_PANIC 0x00040000UL /* panic if kmem_cache_create() fails */
+ #define SLAB_DESTROY_BY_RCU 0x00080000UL /* defer freeing pages to RCU */
+
++/*
++ * allocation rules: __GFP_UBC 0
++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
++ * cache (SLAB_UBC) charge charge
++ * (usual caches: mm, vma, task_struct, ...)
++ *
++ * cache (SLAB_UBC | SLAB_NO_CHARGE) charge ---
++ * (ub_kmalloc) (kmalloc)
++ *
++ * cache (no UB flags) BUG() ---
++ * (nonub caches, mempools)
++ *
++ * pages charge ---
++ * (ub_vmalloc, (vmalloc,
++ * poll, fdsets, ...) non-ub allocs)
++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
++ */
++#define SLAB_UBC 0x20000000UL /* alloc space for ubs ... */
++#define SLAB_NO_CHARGE 0x40000000UL /* ... but don't charge */
++
+ /* flags passed to a constructor func */
+ #define SLAB_CTOR_CONSTRUCTOR 0x001UL /* if not set, then deconstructor */
+ #define SLAB_CTOR_ATOMIC 0x002UL /* tell constructor it can't sleep */
+@@ -108,6 +128,8 @@ found:
+ return __kmalloc(size, flags);
+ }
+
++#define ub_kmalloc(size, flags) kmalloc(size, ((flags) | __GFP_UBC))
++
+ extern void *kzalloc(size_t, gfp_t);
+
+ /**
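
As the allocation-rules table above spells out, ub_kmalloc() is simply kmalloc() with __GFP_UBC forced on, so the allocation is charged to a user beancounter. A minimal sketch of a charged allocation (the helper name and the size are invented for illustration):

	#include <linux/slab.h>

	static void *sample_charged_alloc(void)
	{
		/* identical to kmalloc() apart from the implied __GFP_UBC bit */
		return ub_kmalloc(128, GFP_KERNEL);
	}
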
+diff -upr linux-2.6.16.orig/include/linux/smp.h linux-2.6.16-026test015/include/linux/smp.h
+--- linux-2.6.16.orig/include/linux/smp.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/smp.h 2006-07-04 14:41:37.000000000 +0400
+@@ -10,6 +10,9 @@
+
+ extern void cpu_idle(void);
+
++struct pt_regs;
++typedef void (*smp_nmi_function)(struct pt_regs *regs, void *info);
++
+ #ifdef CONFIG_SMP
+
+ #include <linux/preempt.h>
+@@ -49,6 +52,8 @@ extern int __cpu_up(unsigned int cpunum)
+ */
+ extern void smp_cpus_done(unsigned int max_cpus);
+
++extern int smp_nmi_call_function(smp_nmi_function func, void *info, int wait);
++
+ /*
+ * Call a function on all other processors
+ */
+@@ -99,6 +104,12 @@ static inline void smp_send_reschedule(i
+ #define num_booting_cpus() 1
+ #define smp_prepare_boot_cpu() do {} while (0)
+
++static inline int smp_nmi_call_function(smp_nmi_function func,
++ void *info, int wait)
++{
++ return 0;
++}
++
+ #endif /* !SMP */
+
+ /*
+diff -upr linux-2.6.16.orig/include/linux/socket.h linux-2.6.16-026test015/include/linux/socket.h
+--- linux-2.6.16.orig/include/linux/socket.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/socket.h 2006-07-04 14:41:38.000000000 +0400
+@@ -300,6 +300,7 @@ extern int memcpy_toiovec(struct iovec *
+ extern int move_addr_to_user(void *kaddr, int klen, void __user *uaddr, int __user *ulen);
+ extern int move_addr_to_kernel(void __user *uaddr, int ulen, void *kaddr);
+ extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data);
++extern int vz_security_proto_check(int family, int type, int protocol);
+
+ #endif
+ #endif /* not kernel and not glibc */
+diff -upr linux-2.6.16.orig/include/linux/swap.h linux-2.6.16-026test015/include/linux/swap.h
+--- linux-2.6.16.orig/include/linux/swap.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/swap.h 2006-07-04 14:41:37.000000000 +0400
+@@ -80,6 +80,7 @@ struct address_space;
+ struct sysinfo;
+ struct writeback_control;
+ struct zone;
++struct user_beancounter;
+
+ /*
+ * A swap extent maps a range of a swapfile's PAGE_SIZE pages onto a range of
+@@ -119,6 +120,7 @@ enum {
+ /*
+ * The in-memory structure used to track swap areas.
+ */
++struct user_beancounter;
+ struct swap_info_struct {
+ unsigned int flags;
+ int prio; /* swap priority */
+@@ -136,6 +138,9 @@ struct swap_info_struct {
+ unsigned int max;
+ unsigned int inuse_pages;
+ int next; /* next entry on swap list */
++#ifdef CONFIG_USER_SWAP_ACCOUNTING
++ struct user_beancounter **swap_ubs;
++#endif
+ };
+
+ struct swap_list_t {
+@@ -240,7 +245,7 @@ extern long total_swap_pages;
+ extern unsigned int nr_swapfiles;
+ extern struct swap_info_struct swap_info[];
+ extern void si_swapinfo(struct sysinfo *);
+-extern swp_entry_t get_swap_page(void);
++extern swp_entry_t get_swap_page(struct user_beancounter *);
+ extern swp_entry_t get_swap_page_of_type(int type);
+ extern int swap_duplicate(swp_entry_t);
+ extern int valid_swaphandles(swp_entry_t, unsigned long *);
+@@ -253,7 +258,9 @@ extern int remove_exclusive_swap_page(st
+ struct backing_dev_info;
+
+ extern spinlock_t swap_lock;
+-extern int remove_vma_swap(struct vm_area_struct *vma, struct page *page);
++struct page_beancounter;
++extern int remove_vma_swap(struct vm_area_struct *vma, struct page *page,
++ struct page_beancounter **pb);
+
+ /* linux/mm/thrash.c */
+ extern struct mm_struct * swap_token_mm;
+@@ -310,7 +317,7 @@ static inline int remove_exclusive_swap_
+ return 0;
+ }
+
+-static inline swp_entry_t get_swap_page(void)
++static inline swp_entry_t get_swap_page(struct user_beancounter *ub)
+ {
+ swp_entry_t entry;
+ entry.val = 0;
+diff -upr linux-2.6.16.orig/include/linux/sysctl.h linux-2.6.16-026test015/include/linux/sysctl.h
+--- linux-2.6.16.orig/include/linux/sysctl.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/sysctl.h 2006-07-04 14:41:39.000000000 +0400
+@@ -148,6 +148,13 @@ enum
+ KERN_SPIN_RETRY=70, /* int: number of spinlock retries */
+ KERN_ACPI_VIDEO_FLAGS=71, /* int: flags for setting up video after ACPI sleep */
+ KERN_IA64_UNALIGNED=72, /* int: ia64 unaligned userland trap enable */
++ KERN_SILENCE_LEVEL=200, /* int: Console silence loglevel */
++ KERN_ALLOC_FAIL_WARN=201, /* int: whether we'll print "alloc failure" */
++ KERN_VIRT_PIDS=202, /* int: VE pids virtualization */
++ KERN_VIRT_OSRELEASE=205,/* virtualization of utsname.release */
++ KERN_FAIRSCHED_MAX_LATENCY=201, /* int: Max start_tag delta */
++ KERN_VCPU_SCHED_TIMESLICE=202,
++ KERN_VCPU_TIMESLICE=203,
+ };
+
+
+@@ -397,10 +404,12 @@ enum
+ NET_TCP_CONG_CONTROL=110,
+ NET_TCP_ABC=111,
+ NET_IPV4_IPFRAG_MAX_DIST=112,
++ NET_TCP_USE_SG=245,
+ };
+
+ enum {
+ NET_IPV4_ROUTE_FLUSH=1,
++ NET_IPV4_ROUTE_SRC_CHECK=188,
+ NET_IPV4_ROUTE_MIN_DELAY=2,
+ NET_IPV4_ROUTE_MAX_DELAY=3,
+ NET_IPV4_ROUTE_GC_THRESH=4,
+@@ -760,6 +769,12 @@ enum
+ FS_AIO_NR=18, /* current system-wide number of aio requests */
+ FS_AIO_MAX_NR=19, /* system-wide maximum number of aio requests */
+ FS_INOTIFY=20, /* inotify submenu */
++ FS_AT_VSYSCALL=21, /* int: to announce vsyscall data */
++};
++
++/* /proc/sys/debug */
++enum {
++ DBG_DECODE_CALLTRACES = 1, /* int: decode call traces on oops */
+ };
+
+ /* /proc/sys/fs/quota/ */
+@@ -900,6 +915,8 @@ extern int proc_doulongvec_minmax(ctl_ta
+ void __user *, size_t *, loff_t *);
+ extern int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int,
+ struct file *, void __user *, size_t *, loff_t *);
++extern int proc_doutsstring(ctl_table *table, int write, struct file *,
++ void __user *, size_t *, loff_t *);
+
+ extern int do_sysctl (int __user *name, int nlen,
+ void __user *oldval, size_t __user *oldlenp,
+@@ -954,6 +971,8 @@ extern ctl_handler sysctl_ms_jiffies;
+ */
+
+ /* A sysctl table is an array of struct ctl_table: */
++struct ve_struct;
++
+ struct ctl_table
+ {
+ int ctl_name; /* Binary ID */
+@@ -967,6 +986,7 @@ struct ctl_table
+ struct proc_dir_entry *de; /* /proc control block */
+ void *extra1;
+ void *extra2;
++ struct ve_struct *owner_env;
+ };
+
+ /* struct ctl_table_header is used to maintain dynamic lists of
+@@ -983,6 +1003,9 @@ struct ctl_table_header * register_sysct
+ int insert_at_head);
+ void unregister_sysctl_table(struct ctl_table_header * table);
+
++ctl_table *clone_sysctl_template(ctl_table *tmpl, int nr);
++void free_sysctl_clone(ctl_table *clone);
++
+ #else /* __KERNEL__ */
+
+ #endif /* __KERNEL__ */
+diff -upr linux-2.6.16.orig/include/linux/tty.h linux-2.6.16-026test015/include/linux/tty.h
+--- linux-2.6.16.orig/include/linux/tty.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/tty.h 2006-07-04 14:41:38.000000000 +0400
+@@ -238,8 +238,11 @@ struct tty_struct {
+ spinlock_t read_lock;
+ /* If the tty has a pending do_SAK, queue it here - akpm */
+ struct work_struct SAK_work;
++ struct ve_struct *owner_env;
+ };
+
++DCL_VE_OWNER_PROTO(TTY, struct tty_struct, owner_env)
++
+ /* tty magic number */
+ #define TTY_MAGIC 0x5401
+
+@@ -266,6 +269,7 @@ struct tty_struct {
+ #define TTY_PTY_LOCK 16 /* pty private */
+ #define TTY_NO_WRITE_SPLIT 17 /* Preserve write boundaries to driver */
+ #define TTY_HUPPED 18 /* Post driver->hangup() */
++#define TTY_CHARGED 19 /* Charged as ub resource */
+
+ #define TTY_WRITE_FLUSH(tty) tty_write_flush((tty))
+
+diff -upr linux-2.6.16.orig/include/linux/tty_driver.h linux-2.6.16-026test015/include/linux/tty_driver.h
+--- linux-2.6.16.orig/include/linux/tty_driver.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/tty_driver.h 2006-07-04 14:41:38.000000000 +0400
+@@ -115,6 +115,7 @@
+ * character to the device.
+ */
+
++#include <linux/ve_owner.h>
+ #include <linux/fs.h>
+ #include <linux/list.h>
+ #include <linux/cdev.h>
+@@ -214,9 +215,18 @@ struct tty_driver {
+ unsigned int set, unsigned int clear);
+
+ struct list_head tty_drivers;
++ struct ve_struct *owner_env;
+ };
+
++DCL_VE_OWNER_PROTO(TTYDRV, struct tty_driver, owner_env)
++
++#ifdef CONFIG_LEGACY_PTYS
++extern struct tty_driver *pty_driver;
++extern struct tty_driver *pty_slave_driver;
++#endif
++
+ extern struct list_head tty_drivers;
++extern rwlock_t tty_driver_guard;
+
+ struct tty_driver *alloc_tty_driver(int lines);
+ void put_tty_driver(struct tty_driver *driver);
+diff -upr linux-2.6.16.orig/include/linux/ve.h linux-2.6.16-026test015/include/linux/ve.h
+--- linux-2.6.16.orig/include/linux/ve.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/include/linux/ve.h 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,337 @@
++/*
++ * include/linux/ve.h
++ *
++ * Copyright (C) 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef _LINUX_VE_H
++#define _LINUX_VE_H
++
++#include <linux/config.h>
++
++#ifndef __ENVID_T_DEFINED__
++typedef unsigned envid_t;
++#define __ENVID_T_DEFINED__
++#endif
++
++#include <linux/types.h>
++#include <linux/capability.h>
++#include <linux/utsname.h>
++#include <linux/sysctl.h>
++#include <linux/vzstat.h>
++#include <linux/kobject.h>
++
++#ifdef VZMON_DEBUG
++# define VZTRACE(fmt,args...) \
++ printk(KERN_DEBUG fmt, ##args)
++#else
++# define VZTRACE(fmt,args...)
++#endif /* VZMON_DEBUG */
++
++struct tty_driver;
++struct devpts_config;
++struct task_struct;
++struct new_utsname;
++struct file_system_type;
++struct icmp_mib;
++struct ip_mib;
++struct tcp_mib;
++struct udp_mib;
++struct linux_mib;
++struct fib_info;
++struct fib_rule;
++struct veip_struct;
++struct ve_monitor;
++
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++struct fib_table;
++struct devcnfv4_struct;
++#ifdef CONFIG_VE_IPTABLES
++struct xt_af;
++struct xt_table;
++struct xt_target;
++struct ip_conntrack;
++typedef unsigned int (*ip_nat_helper_func)(void);
++struct ve_ip_conntrack {
++ struct list_head *_ip_conntrack_hash;
++ struct list_head _ip_conntrack_expect_list;
++ struct list_head _ip_conntrack_unconfirmed;
++ struct ip_conntrack_protocol ** _ip_ct_protos;
++ struct list_head _ip_conntrack_helpers;
++ int _ip_conntrack_max;
++ int _ip_conntrack_vmalloc;
++ atomic_t _ip_conntrack_count;
++ void (*_ip_conntrack_destroyed)(struct ip_conntrack *conntrack);
++#ifdef CONFIG_SYSCTL
++ unsigned long _ip_ct_tcp_timeouts[10];
++ unsigned long _ip_ct_udp_timeout;
++ unsigned long _ip_ct_udp_timeout_stream;
++ unsigned long _ip_ct_icmp_timeout;
++ unsigned long _ip_ct_generic_timeout;
++ unsigned int _ip_ct_log_invalid;
++ unsigned long _ip_ct_tcp_timeout_max_retrans;
++ int _ip_ct_tcp_loose;
++ int _ip_ct_tcp_be_liberal;
++ int _ip_ct_tcp_max_retrans;
++ struct ctl_table_header *_ip_ct_sysctl_header;
++ ctl_table *_ip_ct_net_table;
++ ctl_table *_ip_ct_ipv4_table;
++ ctl_table *_ip_ct_netfilter_table;
++ ctl_table *_ip_ct_sysctl_table;
++#endif /*CONFIG_SYSCTL*/
++
++ struct ip_nat_protocol **_ip_nat_protos;
++ ip_nat_helper_func _ip_nat_ftp_hook;
++ ip_nat_helper_func _ip_nat_irc_hook;
++ struct list_head *_ip_nat_bysource;
++ struct xt_table *_ip_nat_table;
++
++ /* resource accounting */
++ struct user_beancounter *ub;
++};
++#endif
++#endif
++
++#define UIDHASH_BITS_VE 6
++#define UIDHASH_SZ_VE (1 << UIDHASH_BITS_VE)
++
++struct ve_cpu_stats {
++ cycles_t idle_time;
++ cycles_t iowait_time;
++ cycles_t strt_idle_time;
++ cycles_t used_time;
++ seqcount_t stat_lock;
++ int nr_running;
++ int nr_unint;
++ int nr_iowait;
++ cputime64_t user;
++ cputime64_t nice;
++ cputime64_t system;
++} ____cacheline_aligned;
++
++struct ve_struct {
++ struct ve_struct *prev;
++ struct ve_struct *next;
++
++ envid_t veid;
++ struct task_struct *init_entry;
++ struct list_head vetask_lh;
++ kernel_cap_t cap_default;
++ atomic_t pcounter;
++ /* ref counter to ve from ipc */
++ atomic_t counter;
++ unsigned int class_id;
++ struct veip_struct *veip;
++ struct rw_semaphore op_sem;
++ int is_running;
++ int is_locked;
++ int virt_pids;
++ /* see vzcalluser.h for VE_FEATURE_XXX definitions */
++ __u64 features;
++
++/* VE's root */
++ struct vfsmount *fs_rootmnt;
++ struct dentry *fs_root;
++
++/* sysctl */
++ struct new_utsname *utsname;
++ struct list_head sysctl_lh;
++ struct ctl_table_header *kern_header;
++ struct ctl_table *kern_table;
++ struct ctl_table_header *quota_header;
++ struct ctl_table *quota_table;
++ struct file_system_type *proc_fstype;
++ struct vfsmount *proc_mnt;
++ struct proc_dir_entry *proc_root;
++ struct proc_dir_entry *proc_sys_root;
++ struct proc_dir_entry *_proc_net;
++ struct proc_dir_entry *_proc_net_stat;
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ struct proc_dir_entry *_proc_net_devsnmp6;
++#endif
++
++/* SYSV IPC */
++ struct ipc_ids *_shm_ids;
++ struct ipc_ids *_msg_ids;
++ struct ipc_ids *_sem_ids;
++ int _used_sems;
++ int _shm_tot;
++ size_t _shm_ctlmax;
++ size_t _shm_ctlall;
++ int _shm_ctlmni;
++ int _msg_ctlmax;
++ int _msg_ctlmni;
++ int _msg_ctlmnb;
++ int _sem_ctls[4];
++
++/* BSD pty's */
++ struct tty_driver *pty_driver;
++ struct tty_driver *pty_slave_driver;
++
++#ifdef CONFIG_UNIX98_PTYS
++ struct tty_driver *ptm_driver;
++ struct tty_driver *pts_driver;
++ struct idr *allocated_ptys;
++ struct file_system_type *devpts_fstype;
++ struct vfsmount *devpts_mnt;
++ struct dentry *devpts_root;
++ struct devpts_config *devpts_config;
++#endif
++
++ struct file_system_type *shmem_fstype;
++ struct vfsmount *shmem_mnt;
++#ifdef CONFIG_SYSFS
++ struct file_system_type *sysfs_fstype;
++ struct vfsmount *sysfs_mnt;
++ struct super_block *sysfs_sb;
++ struct sysfs_dirent *sysfs_root;
++#endif
++ struct subsystem *class_subsys;
++ struct subsystem *class_obj_subsys;
++ struct class *net_class;
++
++/* User uids hash */
++ struct list_head uidhash_table[UIDHASH_SZ_VE];
++
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++ struct hlist_head _net_dev_head;
++ struct hlist_head _net_dev_index_head;
++ struct net_device *_net_dev_base, **_net_dev_tail;
++ int ifindex;
++ struct net_device *_loopback_dev;
++ struct net_device *_venet_dev;
++ struct ipv4_devconf *_ipv4_devconf;
++ struct ipv4_devconf *_ipv4_devconf_dflt;
++ struct ctl_table_header *forward_header;
++ struct ctl_table *forward_table;
++
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ struct ipv6_devconf *_ipv6_devconf;
++ struct ipv6_devconf *_ipv6_devconf_dflt;
++#endif
++#endif
++ unsigned long rt_flush_required;
++
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ struct neigh_table *ve_nd_tbl;
++#endif
++ struct neigh_table *ve_arp_tbl;
++
++/* per VE CPU stats*/
++ struct timespec start_timespec;
++ u64 start_jiffies;
++ cycles_t start_cycles;
++ unsigned long avenrun[3]; /* loadavg data */
++
++ cycles_t cpu_used_ve;
++ struct kstat_lat_pcpu_struct sched_lat_ve;
++
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++ struct hlist_head *_fib_info_hash;
++ struct hlist_head *_fib_info_laddrhash;
++ int _fib_hash_size;
++ int _fib_info_cnt;
++
++ struct fib_rule *_local_rule;
++ struct fib_rule *_fib_rules;
++#ifdef CONFIG_IP_MULTIPLE_TABLES
++ /* XXX: why a magic constant? */
++ struct fib_table *_fib_tables[256]; /* RT_TABLE_MAX - for now */
++#else
++ struct fib_table *_main_table;
++ struct fib_table *_local_table;
++#endif
++ struct icmp_mib *_icmp_statistics[2];
++ struct ipstats_mib *_ip_statistics[2];
++ struct tcp_mib *_tcp_statistics[2];
++ struct udp_mib *_udp_statistics[2];
++ struct linux_mib *_net_statistics[2];
++ struct venet_stat *stat;
++#ifdef CONFIG_VE_IPTABLES
++/* core/netfilter.c virtualization */
++ void *_nf_hooks;
++ struct xt_table *_ve_ipt_filter_pf; /* packet_filter struct */
++ struct xt_table *_ve_ip6t_filter_pf;
++ struct xt_table *_ipt_mangle_table;
++ struct xt_table *_ip6t_mangle_table;
++ struct xt_af *_xt;
++ struct xt_target *_ipt_standard_target;
++ struct xt_target *_ip6t_standard_target;
++
++ __u64 _iptables_modules;
++ struct ve_ip_conntrack *_ip_conntrack;
++#endif /* CONFIG_VE_IPTABLES */
++
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ struct fib6_table *_fib6_table;
++ struct ipstats_mib *_ipv6_statistics[2];
++ struct icmpv6_mib *_icmpv6_statistics[2];
++ struct udp_mib *_udp_stats_in6[2];
++#endif
++#endif
++ wait_queue_head_t *_log_wait;
++ unsigned long *_log_start;
++ unsigned long *_log_end;
++ unsigned long *_logged_chars;
++ char *log_buf;
++#define VE_DEFAULT_LOG_BUF_LEN 4096
++
++ struct ve_cpu_stats ve_cpu_stats[NR_CPUS] ____cacheline_aligned;
++ unsigned long down_at;
++ struct list_head cleanup_list;
++
++ unsigned long jiffies_fixup;
++ unsigned char disable_net;
++ unsigned char sparse_vpid;
++ struct ve_monitor *monitor;
++ struct proc_dir_entry *monitor_proc;
++ unsigned long meminfo_val;
++};
++
++#define VE_CPU_STATS(ve, cpu) (&((ve)->ve_cpu_stats[(cpu)]))
++
++extern int nr_ve;
++
++#ifdef CONFIG_VE
++
++int get_device_perms_ve(int dev_type, dev_t dev, int access_mode);
++void do_env_cleanup(struct ve_struct *envid);
++void do_update_load_avg_ve(void);
++void do_env_free(struct ve_struct *ptr);
++
++#define ve_utsname (*get_exec_env()->utsname)
++
++static inline struct ve_struct *get_ve(struct ve_struct *ptr)
++{
++ if (ptr != NULL)
++ atomic_inc(&ptr->counter);
++ return ptr;
++}
++
++static inline void put_ve(struct ve_struct *ptr)
++{
++ if (ptr && atomic_dec_and_test(&ptr->counter)) {
++ if (atomic_read(&ptr->pcounter) > 0)
++ BUG();
++ if (ptr->is_running)
++ BUG();
++ do_env_free(ptr);
++ }
++}
++
++#ifdef CONFIG_FAIRSCHED
++#define ve_cpu_online_map(ve, mask) fairsched_cpu_online_map(ve->veid, mask)
++#else
++#define ve_cpu_online_map(ve, mask) do { *(mask) = cpu_online_map; } while (0)
++#endif
++#else /* CONFIG_VE */
++#define ve_utsname system_utsname
++#define get_ve(ve) (NULL)
++#define put_ve(ve) do { } while (0)
++#endif /* CONFIG_VE */
++
++#endif /* _LINUX_VE_H */
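
For orientation, get_ve()/put_ve() above implement plain reference counting on ve_struct->counter, with do_env_free() run once the last reference to a stopped VE is dropped. A hedged usage sketch (the function name is invented; get_exec_env() comes from the scheduler header changes earlier in this patch):

	#include <linux/sched.h>
	#include <linux/ve.h>

	static void sample_use_current_ve(void)
	{
		struct ve_struct *ve;

		ve = get_ve(get_exec_env());	/* take a counted reference on the current VE */
		/* ... ve can be dereferenced safely here ... */
		put_ve(ve);			/* drop it; the VE is freed once the count hits zero */
	}
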
+diff -upr linux-2.6.16.orig/include/linux/ve_owner.h linux-2.6.16-026test015/include/linux/ve_owner.h
+--- linux-2.6.16.orig/include/linux/ve_owner.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/include/linux/ve_owner.h 2006-07-04 14:41:37.000000000 +0400
+@@ -0,0 +1,32 @@
++/*
++ * include/linux/ve_owner.h
++ *
++ * Copyright (C) 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __VE_OWNER_H__
++#define __VE_OWNER_H__
++
++#include <linux/config.h>
++#include <linux/vmalloc.h>
++
++
++#define DCL_VE_OWNER(name, type, member)
++ /* prototype declares static inline functions */
++
++#define DCL_VE_OWNER_PROTO(name, type, member) \
++type; \
++static inline struct ve_struct *VE_OWNER_##name(const type *obj) \
++{ \
++ return obj->member; \
++} \
++static inline void SET_VE_OWNER_##name(type *obj, struct ve_struct *ve) \
++{ \
++ obj->member = ve; \
++}
++
++#endif /* __VE_OWNER_H__ */
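
For illustration, the DCL_VE_OWNER_PROTO(SKB, struct sk_buff, owner_env) invocation added to skbuff.h earlier in this patch expands to the following accessor pair:

	struct sk_buff;

	static inline struct ve_struct *VE_OWNER_SKB(const struct sk_buff *obj)
	{
		return obj->owner_env;
	}

	static inline void SET_VE_OWNER_SKB(struct sk_buff *obj, struct ve_struct *ve)
	{
		obj->owner_env = ve;
	}
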
+diff -upr linux-2.6.16.orig/include/linux/ve_proto.h linux-2.6.16-026test015/include/linux/ve_proto.h
+--- linux-2.6.16.orig/include/linux/ve_proto.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/include/linux/ve_proto.h 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,75 @@
++/*
++ * include/linux/ve_proto.h
++ *
++ * Copyright (C) 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __VE_H__
++#define __VE_H__
++
++#ifdef CONFIG_VE
++
++extern struct semaphore ve_call_guard;
++extern rwlock_t ve_call_lock;
++
++#ifdef CONFIG_SYSVIPC
++extern void prepare_ipc(void);
++extern int init_ve_ipc(struct ve_struct *);
++extern void fini_ve_ipc(struct ve_struct *);
++extern void ve_ipc_cleanup(void);
++#endif
++
++#ifdef CONFIG_UNIX98_PTYS
++extern struct tty_driver *ptm_driver; /* Unix98 pty masters; for /dev/ptmx */
++extern struct tty_driver *pts_driver; /* Unix98 pty slaves; for /dev/ptmx */
++#endif
++
++extern rwlock_t tty_driver_guard;
++
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++void ip_fragment_cleanup(struct ve_struct *envid);
++void tcp_v4_kill_ve_sockets(struct ve_struct *envid);
++struct fib_table * fib_hash_init(int id);
++int move_addr_to_kernel(void *uaddr, int ulen, void *kaddr);
++extern int main_loopback_init(struct net_device*);
++int venet_init(void);
++#endif
++
++extern struct ve_struct *ve_list_head;
++extern rwlock_t ve_list_guard;
++extern struct ve_struct *get_ve_by_id(envid_t);
++extern struct ve_struct *__find_ve_by_id(envid_t);
++
++struct env_create_param2;
++extern int real_env_create(envid_t veid, unsigned flags, u32 class_id,
++ struct env_create_param2 *data, int datalen);
++
++extern int do_setdevperms(envid_t veid, unsigned type,
++ dev_t dev, unsigned mask);
++
++#define VE_HOOK_INIT 0
++#define VE_HOOK_FINI 1
++#define VE_MAX_HOOKS 2
++
++typedef int ve_hookfn(unsigned int hooknum, void *data);
++
++struct ve_hook
++{
++ struct list_head list;
++ ve_hookfn *hook;
++ ve_hookfn *undo;
++ struct module *owner;
++ int hooknum;
++ /* Functions are called in ascending priority. */
++ int priority;
++};
++
++extern int ve_hook_register(struct ve_hook *vh);
++extern void ve_hook_unregister(struct ve_hook *vh);
++
++#endif
++#endif
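
A hedged sketch of how the ve_hook API above might be used; the handler, its registration point, and what the data argument carries for VE_HOOK_INIT are assumptions, not taken from the patch:

	#include <linux/module.h>
	#include <linux/ve_proto.h>

	static int sample_ve_init_hook(unsigned int hooknum, void *data)
	{
		/* called when a VE reaches the VE_HOOK_INIT point; 0 means success */
		return 0;
	}

	static struct ve_hook sample_hook = {
		.hook		= sample_ve_init_hook,
		.undo		= NULL,
		.owner		= THIS_MODULE,
		.hooknum	= VE_HOOK_INIT,
		.priority	= 0,
	};

	/* e.g. from module init/exit: */
	static int sample_attach(void)   { return ve_hook_register(&sample_hook); }
	static void sample_detach(void)  { ve_hook_unregister(&sample_hook); }
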
+diff -upr linux-2.6.16.orig/include/linux/ve_task.h linux-2.6.16-026test015/include/linux/ve_task.h
+--- linux-2.6.16.orig/include/linux/ve_task.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/include/linux/ve_task.h 2006-07-04 14:41:37.000000000 +0400
+@@ -0,0 +1,34 @@
++/*
++ * include/linux/ve_task.h
++ *
++ * Copyright (C) 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __VE_TASK_H__
++#define __VE_TASK_H__
++
++#include <linux/seqlock.h>
++
++struct ve_task_info {
++/* virtualization */
++ struct ve_struct *owner_env;
++ struct ve_struct *exec_env;
++ struct list_head vetask_list;
++ struct dentry *glob_proc_dentry;
++/* statistics: scheduling latency */
++ cycles_t sleep_time;
++ cycles_t sched_time;
++ cycles_t sleep_stamp;
++ cycles_t wakeup_stamp;
++ seqcount_t wakeup_lock;
++};
++
++#define VE_TASK_INFO(task) (&(task)->ve_task_info)
++#define VE_TASK_LIST_2_TASK(lh) \
++ list_entry(lh, struct task_struct, ve_task_info.vetask_list)
++
++#endif /* __VE_TASK_H__ */
+diff -upr linux-2.6.16.orig/include/linux/venet.h linux-2.6.16-026test015/include/linux/venet.h
+--- linux-2.6.16.orig/include/linux/venet.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/include/linux/venet.h 2006-07-04 14:41:37.000000000 +0400
+@@ -0,0 +1,70 @@
++/*
++ * include/linux/venet.h
++ *
++ * Copyright (C) 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef _VENET_H
++#define _VENET_H
++
++#include <linux/list.h>
++#include <linux/spinlock.h>
++#include <linux/vzcalluser.h>
++
++#define VEIP_HASH_SZ 512
++
++struct ve_struct;
++struct venet_stat;
++struct ip_entry_struct
++{
++ __u32 key[4];
++ int family;
++ struct ve_struct *active_env;
++ struct venet_stat *stat;
++ struct veip_struct *veip;
++ struct list_head ip_hash;
++ struct list_head ve_list;
++};
++
++struct veip_struct
++{
++ struct list_head src_lh;
++ struct list_head dst_lh;
++ struct list_head ip_lh;
++ struct list_head list;
++ envid_t veid;
++};
++
++/* veip_hash_lock should be taken for write by caller */
++void ip_entry_hash(struct ip_entry_struct *entry, struct veip_struct *veip);
++/* veip_hash_lock should be taken for write by caller */
++void ip_entry_unhash(struct ip_entry_struct *entry);
++/* veip_hash_lock should be taken for read by caller */
++struct ip_entry_struct *ip_entry_lookup(u32 addr);
++struct ip_entry_struct *venet_entry_lookup(u32 *addr, int family);
++
++/* veip_hash_lock should be taken for read by caller */
++struct veip_struct *veip_find(envid_t veid);
++/* veip_hash_lock should be taken for write by caller */
++struct veip_struct *veip_findcreate(envid_t veid);
++/* veip_hash_lock should be taken for write by caller */
++void veip_put(struct veip_struct *veip);
++
++int veip_start(struct ve_struct *ve);
++void veip_stop(struct ve_struct *ve);
++int veip_entry_add(struct ve_struct *ve, struct sockaddr *addr);
++int veip_entry_del(envid_t veid, struct sockaddr *addr);
++int venet_change_skb_owner(struct sk_buff *skb);
++
++extern struct list_head ip_entry_hash_table[];
++extern rwlock_t veip_hash_lock;
++
++#ifdef CONFIG_PROC_FS
++int veip_seq_show(struct seq_file *m, void *v);
++#endif
++
++#endif
+diff -upr linux-2.6.16.orig/include/linux/veprintk.h linux-2.6.16-026test015/include/linux/veprintk.h
+--- linux-2.6.16.orig/include/linux/veprintk.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/include/linux/veprintk.h 2006-07-04 14:41:38.000000000 +0400
+@@ -0,0 +1,38 @@
++/*
++ * include/linux/veprintk.h
++ *
++ * Copyright (C) 2006 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __VE_PRINTK_H__
++#define __VE_PRINTK_H__
++
++#ifdef CONFIG_VE
++
++#define ve_log_wait (*(get_exec_env()->_log_wait))
++#define ve_log_start (*(get_exec_env()->_log_start))
++#define ve_log_end (*(get_exec_env()->_log_end))
++#define ve_logged_chars (*(get_exec_env()->_logged_chars))
++#define ve_log_buf (get_exec_env()->log_buf)
++#define ve_log_buf_len (ve_is_super(get_exec_env()) ? \
++ log_buf_len : VE_DEFAULT_LOG_BUF_LEN)
++#define VE_LOG_BUF_MASK (ve_log_buf_len - 1)
++#define VE_LOG_BUF(idx) (ve_log_buf[(idx) & VE_LOG_BUF_MASK])
++
++#else
++
++#define ve_log_wait log_wait
++#define ve_log_start log_start
++#define ve_log_end log_end
++#define ve_logged_chars logged_chars
++#define ve_log_buf log_buf
++#define ve_log_buf_len log_buf_len
++#define VE_LOG_BUF_MASK LOG_BUF_MASK
++#define VE_LOG_BUF(idx) LOG_BUF(idx)
++
++#endif /* CONFIG_VE */
++#endif /* __VE_PRINTK_H__ */
+diff -upr linux-2.6.16.orig/include/linux/virtinfo.h linux-2.6.16-026test015/include/linux/virtinfo.h
+--- linux-2.6.16.orig/include/linux/virtinfo.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/include/linux/virtinfo.h 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,52 @@
++/*
++ * include/linux/virtinfo.h
++ *
++ * Copyright (C) 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __LINUX_VIRTINFO_H
++#define __LINUX_VIRTINFO_H
++
++#include <linux/kernel.h>
++#include <linux/page-flags.h>
++#include <linux/rwsem.h>
++#include <linux/notifier.h>
++
++struct vnotifier_block
++{
++ int (*notifier_call)(struct vnotifier_block *self,
++ unsigned long, void *, int);
++ struct vnotifier_block *next;
++ int priority;
++};
++
++void virtinfo_notifier_register(int type, struct vnotifier_block *nb);
++void virtinfo_notifier_unregister(int type, struct vnotifier_block *nb);
++int virtinfo_notifier_call(int type, unsigned long n, void *data);
++
++struct meminfo {
++ struct sysinfo si;
++ unsigned long active, inactive;
++ unsigned long cache, swapcache;
++ unsigned long committed_space;
++ unsigned long allowed;
++ struct page_state ps;
++ unsigned long vmalloc_total, vmalloc_used, vmalloc_largest;
++};
++
++#define VIRTINFO_MEMINFO 0
++#define VIRTINFO_ENOUGHMEM 1
++
++enum virt_info_types {
++ VITYPE_GENERAL,
++ VITYPE_FAUDIT,
++ VITYPE_QUOTA,
++
++ VIRT_TYPES
++};
++
++#endif /* __LINUX_VIRTINFO_H */
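
A hedged sketch of a virtinfo observer built on the notifier API above; the handler name, the use of VITYPE_GENERAL, the struct meminfo payload for VIRTINFO_MEMINFO, and the pass-through return convention are all assumptions made for the example:

	#include <linux/virtinfo.h>

	static int sample_meminfo_notify(struct vnotifier_block *self,
			unsigned long event, void *data, int old_ret)
	{
		if (event == VIRTINFO_MEMINFO) {
			struct meminfo *mi = data;	/* assumed payload for this event */
			(void)mi;	/* a real handler would inspect mi->si, mi->cache, ... */
		}
		return old_ret;		/* assumed: pass the previous handler's result through */
	}

	static struct vnotifier_block sample_nb = {
		.notifier_call	= sample_meminfo_notify,
		.priority	= 0,
	};

	static void sample_register(void)   { virtinfo_notifier_register(VITYPE_GENERAL, &sample_nb); }
	static void sample_unregister(void) { virtinfo_notifier_unregister(VITYPE_GENERAL, &sample_nb); }
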
+diff -upr linux-2.6.16.orig/include/linux/vmalloc.h linux-2.6.16-026test015/include/linux/vmalloc.h
+--- linux-2.6.16.orig/include/linux/vmalloc.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/linux/vmalloc.h 2006-07-04 14:41:37.000000000 +0400
+@@ -18,6 +18,10 @@
+ #define IOREMAP_MAX_ORDER (7 + PAGE_SHIFT) /* 128 pages */
+ #endif
+
++/* align size to 2^n page boundary */
++#define POWER2_PAGE_ALIGN(size) \
++ ((typeof(size))(1UL << (PAGE_SHIFT + get_order(size))))
++
+ struct vm_struct {
+ void *addr;
+ unsigned long size;
+@@ -32,10 +36,14 @@ struct vm_struct {
+ * Highlevel APIs for driver use
+ */
+ extern void *vmalloc(unsigned long size);
++extern void *ub_vmalloc(unsigned long size);
+ extern void *vmalloc_node(unsigned long size, int node);
++extern void *ub_vmalloc_node(unsigned long size, int node);
+ extern void *vmalloc_exec(unsigned long size);
+ extern void *vmalloc_32(unsigned long size);
+ extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot);
++extern void *vmalloc_best(unsigned long size);
++extern void *ub_vmalloc_best(unsigned long size);
+ extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask,
+ pgprot_t prot);
+ extern void *__vmalloc_node(unsigned long size, gfp_t gfp_mask,
+@@ -52,6 +60,9 @@ extern void vunmap(void *addr);
+ extern struct vm_struct *get_vm_area(unsigned long size, unsigned long flags);
+ extern struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
+ unsigned long start, unsigned long end);
++extern struct vm_struct * get_vm_area_best(unsigned long size,
++ unsigned long flags);
++extern void vprintstat(void);
+ extern struct vm_struct *get_vm_area_node(unsigned long size,
+ unsigned long flags, int node);
+ extern struct vm_struct *remove_vm_area(void *addr);
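
As a quick arithmetic check of the POWER2_PAGE_ALIGN() helper added above (assuming 4 KB pages, i.e. PAGE_SHIFT == 12):

	/*
	 * POWER2_PAGE_ALIGN(5000) == 1UL << (12 + get_order(5000))
	 *                         == 1UL << (12 + 1) == 8192,
	 * i.e. the size is rounded up to the next power-of-two number of pages.
	 */
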
+diff -upr linux-2.6.16.orig/include/linux/vsched.h linux-2.6.16-026test015/include/linux/vsched.h
+--- linux-2.6.16.orig/include/linux/vsched.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/include/linux/vsched.h 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,26 @@
++/*
++ * include/linux/vsched.h
++ *
++ * Copyright (C) 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __VSCHED_H__
++#define __VSCHED_H__
++
++#include <linux/config.h>
++#include <linux/cache.h>
++#include <linux/fairsched.h>
++#include <linux/sched.h>
++
++extern int vsched_create(int id, struct fairsched_node *node);
++extern int vsched_destroy(struct vcpu_scheduler *vsched);
++
++extern int vsched_mvpr(struct task_struct *p, struct vcpu_scheduler *vsched);
++
++extern int vcpu_online(int cpu);
++
++#endif
+diff -upr linux-2.6.16.orig/include/linux/vzcalluser.h linux-2.6.16-026test015/include/linux/vzcalluser.h
+--- linux-2.6.16.orig/include/linux/vzcalluser.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/include/linux/vzcalluser.h 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,228 @@
++/*
++ * include/linux/vzcalluser.h
++ *
++ * Copyright (C) 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef _LINUX_VZCALLUSER_H
++#define _LINUX_VZCALLUSER_H
++
++#include <linux/types.h>
++#include <linux/ioctl.h>
++
++#define KERN_VZ_PRIV_RANGE 51
++
++#ifndef __ENVID_T_DEFINED__
++typedef unsigned envid_t;
++#define __ENVID_T_DEFINED__
++#endif
++
++/*
++ * VE management ioctls
++ */
++
++struct vzctl_old_env_create {
++ envid_t veid;
++ unsigned flags;
++#define VE_CREATE 1 /* Create VE, VE_ENTER added automatically */
++#define VE_EXCLUSIVE 2 /* Fail if exists */
++#define VE_ENTER 4 /* Enter existing VE */
++#define VE_TEST 8 /* Test if VE exists */
++#define VE_LOCK 16 /* Do not allow entering created VE */
++#define VE_SKIPLOCK 32 /* Allow entering embrion VE */
++ __u32 addr;
++};
++
++struct vzctl_mark_env_to_down {
++ envid_t veid;
++};
++
++struct vzctl_setdevperms {
++ envid_t veid;
++ unsigned type;
++#define VE_USE_MAJOR 010 /* Test MAJOR supplied in rule */
++#define VE_USE_MINOR 030 /* Test MINOR supplied in rule */
++#define VE_USE_MASK 030 /* Testing mask, VE_USE_MAJOR|VE_USE_MINOR */
++ unsigned dev;
++ unsigned mask;
++};
++
++struct vzctl_ve_netdev {
++ envid_t veid;
++ int op;
++#define VE_NETDEV_ADD 1
++#define VE_NETDEV_DEL 2
++ char *dev_name;
++};
++
++struct vzctl_ve_meminfo {
++ envid_t veid;
++ unsigned long val;
++};
++
++/* these masks represent modules */
++#define VE_IP_IPTABLES_MOD (1U<<0)
++#define VE_IP_FILTER_MOD (1U<<1)
++#define VE_IP_MANGLE_MOD (1U<<2)
++#define VE_IP_MATCH_LIMIT_MOD (1U<<3)
++#define VE_IP_MATCH_MULTIPORT_MOD (1U<<4)
++#define VE_IP_MATCH_TOS_MOD (1U<<5)
++#define VE_IP_TARGET_TOS_MOD (1U<<6)
++#define VE_IP_TARGET_REJECT_MOD (1U<<7)
++#define VE_IP_TARGET_TCPMSS_MOD (1U<<8)
++#define VE_IP_MATCH_TCPMSS_MOD (1U<<9)
++#define VE_IP_MATCH_TTL_MOD (1U<<10)
++#define VE_IP_TARGET_LOG_MOD (1U<<11)
++#define VE_IP_MATCH_LENGTH_MOD (1U<<12)
++#define VE_IP_CONNTRACK_MOD (1U<<14)
++#define VE_IP_CONNTRACK_FTP_MOD (1U<<15)
++#define VE_IP_CONNTRACK_IRC_MOD (1U<<16)
++#define VE_IP_MATCH_CONNTRACK_MOD (1U<<17)
++#define VE_IP_MATCH_STATE_MOD (1U<<18)
++#define VE_IP_MATCH_HELPER_MOD (1U<<19)
++#define VE_IP_NAT_MOD (1U<<20)
++#define VE_IP_NAT_FTP_MOD (1U<<21)
++#define VE_IP_NAT_IRC_MOD (1U<<22)
++#define VE_IP_TARGET_REDIRECT_MOD (1U<<23)
++
++/* these masks represent modules with their dependences */
++#define VE_IP_IPTABLES (VE_IP_IPTABLES_MOD)
++#define VE_IP_FILTER (VE_IP_FILTER_MOD \
++ | VE_IP_IPTABLES)
++#define VE_IP_MANGLE (VE_IP_MANGLE_MOD \
++ | VE_IP_IPTABLES)
++#define VE_IP_MATCH_LIMIT (VE_IP_MATCH_LIMIT_MOD \
++ | VE_IP_IPTABLES)
++#define VE_IP_MATCH_MULTIPORT (VE_IP_MATCH_MULTIPORT_MOD \
++ | VE_IP_IPTABLES)
++#define VE_IP_MATCH_TOS (VE_IP_MATCH_TOS_MOD \
++ | VE_IP_IPTABLES)
++#define VE_IP_TARGET_TOS (VE_IP_TARGET_TOS_MOD \
++ | VE_IP_IPTABLES)
++#define VE_IP_TARGET_REJECT (VE_IP_TARGET_REJECT_MOD \
++ | VE_IP_IPTABLES)
++#define VE_IP_TARGET_TCPMSS (VE_IP_TARGET_TCPMSS_MOD \
++ | VE_IP_IPTABLES)
++#define VE_IP_MATCH_TCPMSS (VE_IP_MATCH_TCPMSS_MOD \
++ | VE_IP_IPTABLES)
++#define VE_IP_MATCH_TTL (VE_IP_MATCH_TTL_MOD \
++ | VE_IP_IPTABLES)
++#define VE_IP_TARGET_LOG (VE_IP_TARGET_LOG_MOD \
++ | VE_IP_IPTABLES)
++#define VE_IP_MATCH_LENGTH (VE_IP_MATCH_LENGTH_MOD \
++ | VE_IP_IPTABLES)
++#define VE_IP_CONNTRACK (VE_IP_CONNTRACK_MOD \
++ | VE_IP_IPTABLES)
++#define VE_IP_CONNTRACK_FTP (VE_IP_CONNTRACK_FTP_MOD \
++ | VE_IP_CONNTRACK)
++#define VE_IP_CONNTRACK_IRC (VE_IP_CONNTRACK_IRC_MOD \
++ | VE_IP_CONNTRACK)
++#define VE_IP_MATCH_CONNTRACK (VE_IP_MATCH_CONNTRACK_MOD \
++ | VE_IP_CONNTRACK)
++#define VE_IP_MATCH_STATE (VE_IP_MATCH_STATE_MOD \
++ | VE_IP_CONNTRACK)
++#define VE_IP_MATCH_HELPER (VE_IP_MATCH_HELPER_MOD \
++ | VE_IP_CONNTRACK)
++#define VE_IP_NAT (VE_IP_NAT_MOD \
++ | VE_IP_CONNTRACK)
++#define VE_IP_NAT_FTP (VE_IP_NAT_FTP_MOD \
++ | VE_IP_NAT | VE_IP_CONNTRACK_FTP)
++#define VE_IP_NAT_IRC (VE_IP_NAT_IRC_MOD \
++ | VE_IP_NAT | VE_IP_CONNTRACK_IRC)
++#define VE_IP_TARGET_REDIRECT (VE_IP_TARGET_REDIRECT_MOD \
++ | VE_IP_NAT)
++
++/* safe iptables mask to be used by default */
++#define VE_IP_DEFAULT \
++ (VE_IP_IPTABLES | \
++ VE_IP_FILTER | VE_IP_MANGLE | \
++ VE_IP_MATCH_LIMIT | VE_IP_MATCH_MULTIPORT | \
++ VE_IP_MATCH_TOS | VE_IP_TARGET_REJECT | \
++ VE_IP_TARGET_TCPMSS | VE_IP_MATCH_TCPMSS | \
++ VE_IP_MATCH_TTL | VE_IP_MATCH_LENGTH)
++
++#define VE_IPT_CMP(x,y) (((x) & (y)) == (y))
++
++struct vzctl_env_create_cid {
++ envid_t veid;
++ unsigned flags;
++ __u32 class_id;
++};
++
++struct vzctl_env_create {
++ envid_t veid;
++ unsigned flags;
++ __u32 class_id;
++};
++
++struct env_create_param {
++ __u64 iptables_mask;
++};
++
++#define VZCTL_ENV_CREATE_DATA_MINLEN sizeof(struct env_create_param)
++
++struct env_create_param2 {
++ __u64 iptables_mask;
++ __u64 feature_mask;
++#define VE_FEATURE_SYSFS (1ULL << 0)
++ __u32 total_vcpus; /* 0 - don't care, same as in host */
++};
++#define VZCTL_ENV_CREATE_DATA_MAXLEN sizeof(struct env_create_param2)
++
++typedef struct env_create_param2 env_create_param_t;
++
++struct vzctl_env_create_data {
++ envid_t veid;
++ unsigned flags;
++ __u32 class_id;
++ env_create_param_t *data;
++ int datalen;
++};
++
++struct vz_load_avg {
++ int val_int;
++ int val_frac;
++};
++
++struct vz_cpu_stat {
++ unsigned long user_jif;
++ unsigned long nice_jif;
++ unsigned long system_jif;
++ unsigned long uptime_jif;
++ __u64 idle_clk;
++ __u64 strv_clk;
++ __u64 uptime_clk;
++ struct vz_load_avg avenrun[3]; /* loadavg data */
++};
++
++struct vzctl_cpustatctl {
++ envid_t veid;
++ struct vz_cpu_stat *cpustat;
++};
++
++#define VZCTLTYPE '.'
++#define VZCTL_OLD_ENV_CREATE _IOW(VZCTLTYPE, 0, \
++ struct vzctl_old_env_create)
++#define VZCTL_MARK_ENV_TO_DOWN _IOW(VZCTLTYPE, 1, \
++ struct vzctl_mark_env_to_down)
++#define VZCTL_SETDEVPERMS _IOW(VZCTLTYPE, 2, \
++ struct vzctl_setdevperms)
++#define VZCTL_ENV_CREATE_CID _IOW(VZCTLTYPE, 4, \
++ struct vzctl_env_create_cid)
++#define VZCTL_ENV_CREATE _IOW(VZCTLTYPE, 5, \
++ struct vzctl_env_create)
++#define VZCTL_GET_CPU_STAT _IOW(VZCTLTYPE, 6, \
++ struct vzctl_cpustatctl)
++#define VZCTL_ENV_CREATE_DATA _IOW(VZCTLTYPE, 10, \
++ struct vzctl_env_create_data)
++#define VZCTL_VE_NETDEV _IOW(VZCTLTYPE, 11, \
++ struct vzctl_ve_netdev)
++#define VZCTL_VE_MEMINFO _IOW(VZCTLTYPE, 13, \
++ struct vzctl_ve_meminfo)
++
++
++#endif
+diff -upr linux-2.6.16.orig/include/linux/vzctl.h linux-2.6.16-026test015/include/linux/vzctl.h
+--- linux-2.6.16.orig/include/linux/vzctl.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/include/linux/vzctl.h 2006-07-04 14:41:37.000000000 +0400
+@@ -0,0 +1,30 @@
++/*
++ * include/linux/vzctl.h
++ *
++ * Copyright (C) 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef _LINUX_VZCTL_H
++#define _LINUX_VZCTL_H
++
++#include <linux/list.h>
++
++struct module;
++struct inode;
++struct file;
++struct vzioctlinfo {
++ unsigned type;
++ int (*func)(struct inode *, struct file *,
++ unsigned int, unsigned long);
++ struct module *owner;
++ struct list_head list;
++};
++
++extern void vzioctl_register(struct vzioctlinfo *inf);
++extern void vzioctl_unregister(struct vzioctlinfo *inf);
++
++#endif
+diff -upr linux-2.6.16.orig/include/linux/vzctl_quota.h linux-2.6.16-026test015/include/linux/vzctl_quota.h
+--- linux-2.6.16.orig/include/linux/vzctl_quota.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/include/linux/vzctl_quota.h 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,43 @@
++/*
++ * include/linux/vzctl_quota.h
++ *
++ * Copyright (C) 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __LINUX_VZCTL_QUOTA_H__
++#define __LINUX_VZCTL_QUOTA_H__
++
++/*
++ * Quota management ioctl
++ */
++
++struct vz_quota_stat;
++struct vzctl_quotactl {
++ int cmd;
++ unsigned int quota_id;
++ struct vz_quota_stat *qstat;
++ char *ve_root;
++};
++
++struct vzctl_quotaugidctl {
++ int cmd; /* subcommand */
++ unsigned int quota_id; /* quota id where it applies to */
++	unsigned int ugid_index;/* for reading statistics: index of the first
++				uid/gid record to read */
++ unsigned int ugid_size; /* size of ugid_buf array */
++ void *addr; /* user-level buffer */
++};
++
++#define VZDQCTLTYPE '+'
++#define VZCTL_QUOTA_CTL _IOWR(VZDQCTLTYPE, 1, \
++ struct vzctl_quotactl)
++#define VZCTL_QUOTA_NEW_CTL _IOWR(VZDQCTLTYPE, 2, \
++ struct vzctl_quotactl)
++#define VZCTL_QUOTA_UGID_CTL _IOWR(VZDQCTLTYPE, 3, \
++ struct vzctl_quotaugidctl)
++
++#endif /* __LINUX_VZCTL_QUOTA_H__ */
+diff -upr linux-2.6.16.orig/include/linux/vzctl_venet.h linux-2.6.16-026test015/include/linux/vzctl_venet.h
+--- linux-2.6.16.orig/include/linux/vzctl_venet.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/include/linux/vzctl_venet.h 2006-07-04 14:41:37.000000000 +0400
+@@ -0,0 +1,36 @@
++/*
++ * include/linux/vzctl_venet.h
++ *
++ * Copyright (C) 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef _VZCTL_VENET_H
++#define _VZCTL_VENET_H
++
++#include <linux/types.h>
++#include <linux/ioctl.h>
++
++#ifndef __ENVID_T_DEFINED__
++typedef unsigned envid_t;
++#define __ENVID_T_DEFINED__
++#endif
++
++struct vzctl_ve_ip_map {
++ envid_t veid;
++ int op;
++#define VE_IP_ADD 1
++#define VE_IP_DEL 2
++ struct sockaddr *addr;
++ int addrlen;
++};
++
++#define VENETCTLTYPE '('
++
++#define VENETCTL_VE_IP_MAP _IOW(VENETCTLTYPE, 3, \
++ struct vzctl_ve_ip_map)
++
++#endif
+diff -upr linux-2.6.16.orig/include/linux/vzctl_veth.h linux-2.6.16-026test015/include/linux/vzctl_veth.h
+--- linux-2.6.16.orig/include/linux/vzctl_veth.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/include/linux/vzctl_veth.h 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,40 @@
++/*
++ * include/linux/vzctl_veth.h
++ *
++ * Copyright (C) 2006 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef _VZCTL_VETH_H
++#define _VZCTL_VETH_H
++
++#include <linux/types.h>
++#include <linux/ioctl.h>
++
++#ifndef __ENVID_T_DEFINED__
++typedef unsigned envid_t;
++#define __ENVID_T_DEFINED__
++#endif
++
++struct vzctl_ve_hwaddr {
++ envid_t veid;
++ int op;
++#define VE_ETH_ADD 1
++#define VE_ETH_DEL 2
++ unsigned char dev_addr[6];
++ int addrlen;
++ char dev_name[16];
++ unsigned char dev_addr_ve[6];
++ int addrlen_ve;
++ char dev_name_ve[16];
++};
++
++#define VETHCTLTYPE '['
++
++#define VETHCTL_VE_HWADDR _IOW(VETHCTLTYPE, 3, \
++ struct vzctl_ve_hwaddr)
++
++#endif
+diff -upr linux-2.6.16.orig/include/linux/vzdq_tree.h linux-2.6.16-026test015/include/linux/vzdq_tree.h
+--- linux-2.6.16.orig/include/linux/vzdq_tree.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/include/linux/vzdq_tree.h 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,99 @@
++/*
++ *
++ * Copyright (C) 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ * This file contains Virtuozzo disk quota tree definition
++ */
++
++#ifndef _VZDQ_TREE_H
++#define _VZDQ_TREE_H
++
++#include <linux/list.h>
++#include <asm/string.h>
++
++typedef unsigned int quotaid_t;
++#define QUOTAID_BITS 32
++#define QUOTAID_BBITS 4
++#define QUOTAID_EBITS 8
++
++#if QUOTAID_EBITS % QUOTAID_BBITS
++#error Quota bit assumption failure
++#endif
++
++#define QUOTATREE_BSIZE (1 << QUOTAID_BBITS)
++#define QUOTATREE_BMASK (QUOTATREE_BSIZE - 1)
++#define QUOTATREE_DEPTH ((QUOTAID_BITS + QUOTAID_BBITS - 1) \
++ / QUOTAID_BBITS)
++#define QUOTATREE_EDEPTH ((QUOTAID_BITS + QUOTAID_EBITS - 1) \
++ / QUOTAID_EBITS)
++#define QUOTATREE_BSHIFT(lvl) ((QUOTATREE_DEPTH - (lvl) - 1) * QUOTAID_BBITS)
++
++/*
++ * Depth of keeping unused node (not inclusive).
++ * 0 means release all nodes including root,
++ * QUOTATREE_DEPTH means never release nodes.
++ * Current value: release all nodes strictly after QUOTATREE_EDEPTH
++ * (measured in external shift units).
++ */
++#define QUOTATREE_CDEPTH (QUOTATREE_DEPTH \
++ - 2 * QUOTATREE_DEPTH / QUOTATREE_EDEPTH \
++ + 1)
++
++/*
++ * Levels 0..(QUOTATREE_DEPTH-1) are tree nodes.
++ * On level i the maximal number of nodes is 2^(i*QUOTAID_BBITS),
++ * and each node contains 2^QUOTAID_BBITS pointers.
++ * Level 0 is a (single) tree root node.
++ *
++ * Nodes of level (QUOTATREE_DEPTH-1) contain pointers to caller's data.
++ * Nodes of lower levels contain pointers to nodes.
++ *
++ * A double pointer in the array of an i-level node, pointing to an (i+1)-level
++ * node (such as inside quotatree_find_state), is marked with level (i+1), not i.
++ * Level 0 double pointer is a pointer to root inside tree struct.
++ *
++ * The tree is permanent, i.e. all index blocks allocated are kept alive to
++ * preserve the block numbers in the quota file tree, so that its changes
++ * stay local.
++ */
++struct quotatree_node {
++ struct list_head list;
++ quotaid_t num;
++ void *blocks[QUOTATREE_BSIZE];
++};
++
++struct quotatree_level {
++ struct list_head usedlh, freelh;
++ quotaid_t freenum;
++};
++
++struct quotatree_tree {
++ struct quotatree_level levels[QUOTATREE_DEPTH];
++ struct quotatree_node *root;
++ unsigned int leaf_num;
++};
++
++struct quotatree_find_state {
++ void **block;
++ int level;
++};
++
++/* number of leaves (objects) and leaf level of the tree */
++#define QTREE_LEAFNUM(tree) ((tree)->leaf_num)
++#define QTREE_LEAFLVL(tree) (&(tree)->levels[QUOTATREE_DEPTH - 1])
++
++struct quotatree_tree *quotatree_alloc(void);
++void *quotatree_find(struct quotatree_tree *tree, quotaid_t id,
++ struct quotatree_find_state *st);
++int quotatree_insert(struct quotatree_tree *tree, quotaid_t id,
++ struct quotatree_find_state *st, void *data);
++void quotatree_remove(struct quotatree_tree *tree, quotaid_t id);
++void quotatree_free(struct quotatree_tree *tree, void (*dtor)(void *));
++void *quotatree_get_next(struct quotatree_tree *tree, quotaid_t id);
++void *quotatree_leaf_byindex(struct quotatree_tree *tree, unsigned int index);
++
++#endif /* _VZDQ_TREE_H */
++
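The header above only declares the tree API; a minimal sketch of the intended find-then-insert pattern (struct quotatree_find_state carries the position located by quotatree_find() into quotatree_insert()) could look like the following. This is illustrative, not part of the patch, and assumes the caller already holds whatever lock protects the tree (dq_sem in the quota code).

static void *tree_get_or_insert(struct quotatree_tree *tree,
				quotaid_t id, void *new_data)
{
	struct quotatree_find_state st;
	void *data;

	data = quotatree_find(tree, id, &st);
	if (data != NULL)
		return data;			/* already present */
	if (quotatree_insert(tree, id, &st, new_data) != 0)
		return NULL;			/* index block allocation failed */
	return new_data;
}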
+diff -upr linux-2.6.16.orig/include/linux/vzquota.h linux-2.6.16-026test015/include/linux/vzquota.h
+--- linux-2.6.16.orig/include/linux/vzquota.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/include/linux/vzquota.h 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,291 @@
++/*
++ *
++ * Copyright (C) 2001-2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ * This file contains Virtuozzo disk quota implementation
++ */
++
++#ifndef _VZDQUOTA_H
++#define _VZDQUOTA_H
++
++#include <linux/types.h>
++#include <linux/quota.h>
++
++/* vzquotactl syscall commands */
++#define VZ_DQ_CREATE 5 /* create quota master block */
++#define VZ_DQ_DESTROY 6 /* destroy qmblk */
++#define VZ_DQ_ON 7 /* mark dentry with already created qmblk */
++#define VZ_DQ_OFF 8 /* remove mark, don't destroy qmblk */
++#define VZ_DQ_SETLIMIT 9 /* set new limits */
++#define VZ_DQ_GETSTAT 10 /* get usage statistic */
++/* set of syscalls to maintain UGID quotas */
++#define VZ_DQ_UGID_GETSTAT 1 /* get usage/limits for ugid(s) */
++#define VZ_DQ_UGID_ADDSTAT 2 /* set usage/limits statistic for ugid(s) */
++#define VZ_DQ_UGID_GETGRACE 3 /* get expire times */
++#define VZ_DQ_UGID_SETGRACE 4 /* set expire times */
++#define VZ_DQ_UGID_GETCONFIG 5 /* get ugid_max limit, cnt, flags of qmblk */
++#define VZ_DQ_UGID_SETCONFIG 6 /* set ugid_max limit, flags of qmblk */
++#define VZ_DQ_UGID_SETLIMIT 7 /* set ugid B/I limits */
++#define VZ_DQ_UGID_SETINFO 8 /* set ugid info */
++
++/* common structure for vz and ugid quota */
++struct dq_stat {
++ /* blocks limits */
++ __u64 bhardlimit; /* absolute limit in bytes */
++ __u64 bsoftlimit; /* preferred limit in bytes */
++ time_t btime; /* time limit for excessive disk use */
++ __u64 bcurrent; /* current bytes count */
++ /* inodes limits */
++ __u32 ihardlimit; /* absolute limit on allocated inodes */
++ __u32 isoftlimit; /* preferred inode limit */
++ time_t itime; /* time limit for excessive inode use */
++ __u32 icurrent; /* current # allocated inodes */
++};
++
++/* One second resolution for grace times */
++#define CURRENT_TIME_SECONDS (get_seconds())
++
++/* Values for dq_info->flags */
++#define VZ_QUOTA_INODES 0x01 /* inodes limit warning printed */
++#define VZ_QUOTA_SPACE 0x02 /* space limit warning printed */
++
++struct dq_info {
++ time_t bexpire; /* expire timeout for excessive disk use */
++ time_t iexpire; /* expire timeout for excessive inode use */
++	unsigned flags;			/* see previous defines */
++};
++
++struct vz_quota_stat {
++ struct dq_stat dq_stat;
++ struct dq_info dq_info;
++};
++
++/* UID/GID interface record - for user-kernel level exchange */
++struct vz_quota_iface {
++ unsigned int qi_id; /* UID/GID this applies to */
++ unsigned int qi_type; /* USRQUOTA|GRPQUOTA */
++ struct dq_stat qi_stat; /* limits, options, usage stats */
++};
++
++/* values for flags and dq_flags */
++/* this flag is set if the userspace has been unable to provide usage
++ * information about all ugids
++ * if the flag is set, we don't allocate new UG quota blocks (their
++ * current usage is unknown) or free existing UG quota blocks (not to
++ * lose information that this block is ok) */
++#define VZDQUG_FIXED_SET 0x01
++/* permit to use ugid quota */
++#define VZDQUG_ON 0x02
++#define VZDQ_USRQUOTA 0x10
++#define VZDQ_GRPQUOTA 0x20
++#define VZDQ_NOACT 0x1000 /* not actual */
++#define VZDQ_NOQUOT 0x2000 /* not under quota tree */
++
++struct vz_quota_ugid_stat {
++ unsigned int limit; /* max amount of ugid records */
++ unsigned int count; /* amount of ugid records */
++ unsigned int flags;
++};
++
++struct vz_quota_ugid_setlimit {
++ unsigned int type; /* quota type (USR/GRP) */
++ unsigned int id; /* ugid */
++ struct if_dqblk dqb; /* limits info */
++};
++
++struct vz_quota_ugid_setinfo {
++ unsigned int type; /* quota type (USR/GRP) */
++ struct if_dqinfo dqi; /* grace info */
++};
++
++#ifdef __KERNEL__
++#include <linux/list.h>
++#include <asm/atomic.h>
++#include <asm/semaphore.h>
++#include <linux/time.h>
++#include <linux/vzquota_qlnk.h>
++#include <linux/vzdq_tree.h>
++
++/* Values for dq_info flags */
++#define VZ_QUOTA_INODES 0x01 /* inodes limit warning printed */
++#define VZ_QUOTA_SPACE 0x02 /* space limit warning printed */
++
++/* values for dq_state */
++#define VZDQ_STARTING 0 /* created, not turned on yet */
++#define VZDQ_WORKING 1 /* quota created, turned on */
++#define VZDQ_STOPING 2 /* created, turned on and off */
++
++/* master quota record - one per veid */
++struct vz_quota_master {
++ struct list_head dq_hash; /* next quota in hash list */
++ atomic_t dq_count; /* inode reference count */
++ unsigned int dq_flags; /* see VZDQUG_FIXED_SET */
++ unsigned int dq_state; /* see values above */
++ unsigned int dq_id; /* VEID this applies to */
++ struct dq_stat dq_stat; /* limits, grace, usage stats */
++ struct dq_info dq_info; /* grace times and flags */
++ spinlock_t dq_data_lock; /* for dq_stat */
++
++ struct semaphore dq_sem; /* semaphore to protect
++ ugid tree */
++
++ struct list_head dq_ilink_list; /* list of vz_quota_ilink */
++ struct quotatree_tree *dq_uid_tree; /* vz_quota_ugid tree for UIDs */
++ struct quotatree_tree *dq_gid_tree; /* vz_quota_ugid tree for GIDs */
++ unsigned int dq_ugid_count; /* amount of ugid records */
++ unsigned int dq_ugid_max; /* max amount of ugid records */
++ struct dq_info dq_ugid_info[MAXQUOTAS]; /* ugid grace times */
++
++ struct dentry *dq_root_dentry;/* dentry of fs tree */
++ struct vfsmount *dq_root_mnt; /* vfsmnt of this dentry */
++ struct super_block *dq_sb; /* superblock of our quota root */
++};
++
++/* UID/GID quota record - one per pair (quota_master, uid or gid) */
++struct vz_quota_ugid {
++ unsigned int qugid_id; /* UID/GID this applies to */
++ struct dq_stat qugid_stat; /* limits, options, usage stats */
++ int qugid_type; /* USRQUOTA|GRPQUOTA */
++ atomic_t qugid_count; /* reference count */
++};
++
++#define VZ_QUOTA_UGBAD ((struct vz_quota_ugid *)0xfeafea11)
++
++struct vz_quota_datast {
++ struct vz_quota_ilink qlnk;
++};
++
++#define VIRTINFO_QUOTA_GETSTAT 0
++#define VIRTINFO_QUOTA_ON 1
++#define VIRTINFO_QUOTA_OFF 2
++
++struct virt_info_quota {
++ struct super_block *super;
++ struct dq_stat *qstat;
++};
++
++/*
++ * Interface to VZ quota core
++ */
++#define INODE_QLNK(inode) (&(inode)->i_qlnk)
++#define QLNK_INODE(qlnk) container_of((qlnk), struct inode, i_qlnk)
++
++#define VZ_QUOTA_BAD ((struct vz_quota_master *)0xefefefef)
++
++#define VZ_QUOTAO_SETE 1
++#define VZ_QUOTAO_INIT 2
++#define VZ_QUOTAO_DESTR 3
++#define VZ_QUOTAO_SWAP 4
++#define VZ_QUOTAO_INICAL 5
++#define VZ_QUOTAO_DRCAL 6
++#define VZ_QUOTAO_QSET 7
++#define VZ_QUOTAO_TRANS 8
++#define VZ_QUOTAO_ACT 9
++#define VZ_QUOTAO_DTREE 10
++#define VZ_QUOTAO_DET 11
++#define VZ_QUOTAO_ON 12
++
++extern struct semaphore vz_quota_sem;
++void inode_qmblk_lock(struct super_block *sb);
++void inode_qmblk_unlock(struct super_block *sb);
++void qmblk_data_read_lock(struct vz_quota_master *qmblk);
++void qmblk_data_read_unlock(struct vz_quota_master *qmblk);
++void qmblk_data_write_lock(struct vz_quota_master *qmblk);
++void qmblk_data_write_unlock(struct vz_quota_master *qmblk);
++
++/* for quota operations */
++void vzquota_inode_init_call(struct inode *inode);
++void vzquota_inode_drop_call(struct inode *inode);
++int vzquota_inode_transfer_call(struct inode *, struct iattr *);
++struct vz_quota_master *vzquota_inode_data(struct inode *inode,
++ struct vz_quota_datast *);
++void vzquota_data_unlock(struct inode *inode, struct vz_quota_datast *);
++int vzquota_rename_check(struct inode *inode,
++ struct inode *old_dir, struct inode *new_dir);
++struct vz_quota_master *vzquota_inode_qmblk(struct inode *inode);
++/* for second-level quota */
++struct vz_quota_master *vzquota_find_qmblk(struct super_block *);
++/* for management operations */
++struct vz_quota_master *vzquota_alloc_master(unsigned int quota_id,
++ struct vz_quota_stat *qstat);
++void vzquota_free_master(struct vz_quota_master *);
++struct vz_quota_master *vzquota_find_master(unsigned int quota_id);
++int vzquota_on_qmblk(struct super_block *sb, struct inode *inode,
++ struct vz_quota_master *qmblk);
++int vzquota_off_qmblk(struct super_block *sb, struct vz_quota_master *qmblk);
++int vzquota_get_super(struct super_block *sb);
++void vzquota_put_super(struct super_block *sb);
++
++static inline struct vz_quota_master *qmblk_get(struct vz_quota_master *qmblk)
++{
++ if (!atomic_read(&qmblk->dq_count))
++ BUG();
++ atomic_inc(&qmblk->dq_count);
++ return qmblk;
++}
++
++static inline void __qmblk_put(struct vz_quota_master *qmblk)
++{
++ atomic_dec(&qmblk->dq_count);
++}
++
++static inline void qmblk_put(struct vz_quota_master *qmblk)
++{
++ if (!atomic_dec_and_test(&qmblk->dq_count))
++ return;
++ vzquota_free_master(qmblk);
++}
++
++extern struct list_head vzquota_hash_table[];
++extern int vzquota_hash_size;
++
++/*
++ * Interface to VZ UGID quota
++ */
++extern struct quotactl_ops vz_quotactl_operations;
++extern struct dquot_operations vz_quota_operations2;
++extern struct quota_format_type vz_quota_empty_v2_format;
++
++#define QUGID_TREE(qmblk, type) (((type) == USRQUOTA) ? \
++ qmblk->dq_uid_tree : \
++ qmblk->dq_gid_tree)
++
++#define VZDQUG_FIND_DONT_ALLOC 1
++#define VZDQUG_FIND_FAKE 2
++struct vz_quota_ugid *vzquota_find_ugid(struct vz_quota_master *qmblk,
++ unsigned int quota_id, int type, int flags);
++struct vz_quota_ugid *__vzquota_find_ugid(struct vz_quota_master *qmblk,
++ unsigned int quota_id, int type, int flags);
++struct vz_quota_ugid *vzquota_get_ugid(struct vz_quota_ugid *qugid);
++void vzquota_put_ugid(struct vz_quota_master *qmblk,
++ struct vz_quota_ugid *qugid);
++void vzquota_kill_ugid(struct vz_quota_master *qmblk);
++int vzquota_ugid_init(void);
++void vzquota_ugid_release(void);
++int vzquota_transfer_usage(struct inode *inode, int mask,
++ struct vz_quota_ilink *qlnk);
++
++struct vzctl_quotaugidctl;
++long do_vzquotaugidctl(struct vzctl_quotaugidctl *qub);
++
++/*
++ * Other VZ quota parts
++ */
++extern struct dquot_operations vz_quota_operations;
++
++long do_vzquotactl(int cmd, unsigned int quota_id,
++ struct vz_quota_stat *qstat, const char *ve_root);
++int vzquota_proc_init(void);
++void vzquota_proc_release(void);
++struct vz_quota_master *vzquota_find_qmblk(struct super_block *);
++extern struct semaphore vz_quota_sem;
++
++void vzaquota_init(void);
++void vzaquota_fini(void);
++
++#endif /* __KERNEL__ */
++
++#endif /* _VZDQUOTA_H */
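A minimal sketch of the reference-counting discipline implied by the qmblk_get()/qmblk_put() inlines above: take a reference for the duration of use, guard dq_stat with dq_data_lock, and let the final put free the master block. Illustrative only; error handling and the surrounding quota logic are omitted.

static void use_qmblk(struct vz_quota_master *qmblk)
{
	qmblk = qmblk_get(qmblk);	/* BUG()s if dq_count has already dropped to 0 */

	spin_lock(&qmblk->dq_data_lock);
	/* ... read or update qmblk->dq_stat here ... */
	spin_unlock(&qmblk->dq_data_lock);

	qmblk_put(qmblk);		/* frees the block on the last reference */
}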
+diff -upr linux-2.6.16.orig/include/linux/vzquota_qlnk.h linux-2.6.16-026test015/include/linux/vzquota_qlnk.h
+--- linux-2.6.16.orig/include/linux/vzquota_qlnk.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/include/linux/vzquota_qlnk.h 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,25 @@
++/*
++ * include/linux/vzquota_qlnk.h
++ *
++ * Copyright (C) 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef _VZDQUOTA_QLNK_H
++#define _VZDQUOTA_QLNK_H
++
++struct vz_quota_master;
++struct vz_quota_ugid;
++
++/* inode link, used to track inodes using quota via dq_ilink_list */
++struct vz_quota_ilink {
++ struct vz_quota_master *qmblk;
++ struct vz_quota_ugid *qugid[MAXQUOTAS];
++ struct list_head list;
++ unsigned char origin;
++};
++
++#endif /* _VZDQUOTA_QLNK_H */
+diff -upr linux-2.6.16.orig/include/linux/vzratelimit.h linux-2.6.16-026test015/include/linux/vzratelimit.h
+--- linux-2.6.16.orig/include/linux/vzratelimit.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/include/linux/vzratelimit.h 2006-07-04 14:41:37.000000000 +0400
+@@ -0,0 +1,28 @@
++/*
++ * include/linux/vzratelimit.h
++ *
++ * Copyright (C) 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __VZ_RATELIMIT_H__
++#define __VZ_RATELIMIT_H__
++
++/*
++ * Generic ratelimiting stuff.
++ */
++
++struct vz_rate_info {
++ int burst;
++ int interval; /* jiffy_t per event */
++ int bucket; /* kind of leaky bucket */
++ unsigned long last; /* last event */
++};
++
++/* Return true if rate limit permits. */
++int vz_ratelimit(struct vz_rate_info *p);
++
++#endif /* __VZ_RATELIMIT_H__ */
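A short illustration of how the leaky-bucket helper above might be used to throttle a log message; the burst and interval values are arbitrary examples, not values taken from the patch.

static struct vz_rate_info warn_rate = {
	.burst = 5,
	.interval = 10 * HZ,	/* roughly one event per 10 seconds, bursts up to 5 */
};

static void warn_limited(const char *what)
{
	if (vz_ratelimit(&warn_rate))
		printk(KERN_WARNING "vz: %s\n", what);
}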
+diff -upr linux-2.6.16.orig/include/linux/vzstat.h linux-2.6.16-026test015/include/linux/vzstat.h
+--- linux-2.6.16.orig/include/linux/vzstat.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/include/linux/vzstat.h 2006-07-04 14:41:37.000000000 +0400
+@@ -0,0 +1,182 @@
++/*
++ * include/linux/vzstat.h
++ *
++ * Copyright (C) 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __VZSTAT_H__
++#define __VZSTAT_H__
++
++struct swap_cache_info_struct {
++ unsigned long add_total;
++ unsigned long del_total;
++ unsigned long find_success;
++ unsigned long find_total;
++ unsigned long noent_race;
++ unsigned long exist_race;
++ unsigned long remove_race;
++};
++
++struct kstat_lat_snap_struct {
++ cycles_t maxlat, totlat;
++ unsigned long count;
++};
++struct kstat_lat_pcpu_snap_struct {
++ cycles_t maxlat, totlat;
++ unsigned long count;
++ seqcount_t lock;
++} ____cacheline_aligned_in_smp;
++
++struct kstat_lat_struct {
++ struct kstat_lat_snap_struct cur, last;
++ cycles_t avg[3];
++};
++struct kstat_lat_pcpu_struct {
++ struct kstat_lat_pcpu_snap_struct cur[NR_CPUS];
++ cycles_t max_snap;
++ struct kstat_lat_snap_struct last;
++ cycles_t avg[3];
++};
++
++struct kstat_perf_snap_struct {
++ cycles_t wall_tottime, cpu_tottime;
++ cycles_t wall_maxdur, cpu_maxdur;
++ unsigned long count;
++};
++struct kstat_perf_struct {
++ struct kstat_perf_snap_struct cur, last;
++};
++
++struct kstat_zone_avg {
++ unsigned long free_pages_avg[3],
++ nr_active_avg[3],
++ nr_inactive_avg[3];
++};
++
++#define KSTAT_ALLOCSTAT_NR 5
++
++struct kernel_stat_glob {
++ unsigned long nr_unint_avg[3];
++
++ unsigned long alloc_fails[KSTAT_ALLOCSTAT_NR];
++ struct kstat_lat_struct alloc_lat[KSTAT_ALLOCSTAT_NR];
++ struct kstat_lat_pcpu_struct sched_lat;
++ struct kstat_lat_struct swap_in;
++
++ struct kstat_perf_struct ttfp, cache_reap,
++ refill_inact, shrink_icache, shrink_dcache;
++
++ struct kstat_zone_avg zone_avg[3]; /* MAX_NR_ZONES */
++} ____cacheline_aligned;
++
++extern struct kernel_stat_glob kstat_glob ____cacheline_aligned;
++extern spinlock_t kstat_glb_lock;
++
++#ifdef CONFIG_VE
++#define KSTAT_PERF_ENTER(name) \
++ unsigned long flags; \
++ cycles_t start, sleep_time; \
++ \
++ start = get_cycles(); \
++ sleep_time = VE_TASK_INFO(current)->sleep_time; \
++
++#define KSTAT_PERF_LEAVE(name) \
++ spin_lock_irqsave(&kstat_glb_lock, flags); \
++ kstat_glob.name.cur.count++; \
++ start = get_cycles() - start; \
++ if (kstat_glob.name.cur.wall_maxdur < start) \
++ kstat_glob.name.cur.wall_maxdur = start;\
++ kstat_glob.name.cur.wall_tottime += start; \
++ start -= VE_TASK_INFO(current)->sleep_time - \
++ sleep_time; \
++ if (kstat_glob.name.cur.cpu_maxdur < start) \
++ kstat_glob.name.cur.cpu_maxdur = start; \
++ kstat_glob.name.cur.cpu_tottime += start; \
++ spin_unlock_irqrestore(&kstat_glb_lock, flags); \
++
++#else
++#define KSTAT_PERF_ENTER(name)
++#define KSTAT_PERF_LEAVE(name)
++#endif
++
++/*
++ * Add another statistics reading.
++ * Serialization is the caller's responsibility.
++ */
++static inline void KSTAT_LAT_ADD(struct kstat_lat_struct *p,
++ cycles_t dur)
++{
++ p->cur.count++;
++ if (p->cur.maxlat < dur)
++ p->cur.maxlat = dur;
++ p->cur.totlat += dur;
++}
++
++static inline void KSTAT_LAT_PCPU_ADD(struct kstat_lat_pcpu_struct *p, int cpu,
++ cycles_t dur)
++{
++ struct kstat_lat_pcpu_snap_struct *cur;
++
++ cur = &p->cur[cpu];
++ write_seqcount_begin(&cur->lock);
++ cur->count++;
++ if (cur->maxlat < dur)
++ cur->maxlat = dur;
++ cur->totlat += dur;
++ write_seqcount_end(&cur->lock);
++}
++
++/*
++ * Move current statistics to last, clear last.
++ * Serialization is the caller's responsibility.
++ */
++static inline void KSTAT_LAT_UPDATE(struct kstat_lat_struct *p)
++{
++ cycles_t m;
++ memcpy(&p->last, &p->cur, sizeof(p->last));
++ p->cur.maxlat = 0;
++ m = p->last.maxlat;
++ CALC_LOAD(p->avg[0], EXP_1, m)
++ CALC_LOAD(p->avg[1], EXP_5, m)
++ CALC_LOAD(p->avg[2], EXP_15, m)
++}
++
++static inline void KSTAT_LAT_PCPU_UPDATE(struct kstat_lat_pcpu_struct *p)
++{
++ unsigned i, cpu;
++ struct kstat_lat_pcpu_snap_struct snap, *cur;
++ cycles_t m;
++
++ memset(&p->last, 0, sizeof(p->last));
++ for (cpu = 0; cpu < NR_CPUS; cpu++) {
++ cur = &p->cur[cpu];
++ do {
++ i = read_seqcount_begin(&cur->lock);
++ memcpy(&snap, cur, sizeof(snap));
++ } while (read_seqcount_retry(&cur->lock, i));
++ /*
++		 * The read above and this update of maxlat are not atomic,
++ * but this is OK, since it happens rarely and losing
++ * a couple of peaks is not essential. xemul
++ */
++ cur->maxlat = 0;
++
++ p->last.count += snap.count;
++ p->last.totlat += snap.totlat;
++ if (p->last.maxlat < snap.maxlat)
++ p->last.maxlat = snap.maxlat;
++ }
++
++ m = (p->last.maxlat > p->max_snap ? p->last.maxlat : p->max_snap);
++ CALC_LOAD(p->avg[0], EXP_1, m);
++ CALC_LOAD(p->avg[1], EXP_5, m);
++ CALC_LOAD(p->avg[2], EXP_15, m);
++ /* reset max_snap to calculate it correctly next time */
++ p->max_snap = 0;
++}
++
++#endif /* __VZSTAT_H__ */
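As a hedged example of the latency accounting above, a caller could take a cycle timestamp before the measured operation and feed the difference to KSTAT_LAT_ADD() under kstat_glb_lock, e.g.:

static void account_alloc_latency(int alloc_type, cycles_t start)
{
	unsigned long flags;
	cycles_t dur = get_cycles() - start;

	spin_lock_irqsave(&kstat_glb_lock, flags);
	KSTAT_LAT_ADD(&kstat_glob.alloc_lat[alloc_type], dur);
	spin_unlock_irqrestore(&kstat_glb_lock, flags);
}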
+diff -upr linux-2.6.16.orig/include/net/addrconf.h linux-2.6.16-026test015/include/net/addrconf.h
+--- linux-2.6.16.orig/include/net/addrconf.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/net/addrconf.h 2006-07-04 14:41:39.000000000 +0400
+@@ -244,5 +244,14 @@ extern int if6_proc_init(void);
+ extern void if6_proc_exit(void);
+ #endif
+
++int addrconf_ifdown(struct net_device *dev, int how);
++int inet6_addr_add(int ifindex, struct in6_addr *pfx, int plen);
++
++#ifdef CONFIG_VE
++int addrconf_sysctl_init(struct ve_struct *ve);
++void addrconf_sysctl_fini(struct ve_struct *ve);
++void addrconf_sysctl_free(struct ve_struct *ve);
++#endif
++
+ #endif
+ #endif
+diff -upr linux-2.6.16.orig/include/net/af_unix.h linux-2.6.16-026test015/include/net/af_unix.h
+--- linux-2.6.16.orig/include/net/af_unix.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/net/af_unix.h 2006-07-04 14:41:38.000000000 +0400
+@@ -19,23 +19,37 @@ extern atomic_t unix_tot_inflight;
+
+ static inline struct sock *first_unix_socket(int *i)
+ {
++ struct sock *s;
++ struct ve_struct *ve;
++
++ ve = get_exec_env();
+ for (*i = 0; *i <= UNIX_HASH_SIZE; (*i)++) {
+- if (!hlist_empty(&unix_socket_table[*i]))
+- return __sk_head(&unix_socket_table[*i]);
++ for (s = sk_head(&unix_socket_table[*i]);
++ s != NULL && !ve_accessible(s->sk_owner_env, ve);
++ s = sk_next(s));
++ if (s != NULL)
++ return s;
+ }
+ return NULL;
+ }
+
+ static inline struct sock *next_unix_socket(int *i, struct sock *s)
+ {
+- struct sock *next = sk_next(s);
+- /* More in this chain? */
+- if (next)
+- return next;
++ struct ve_struct *ve;
++
++ ve = get_exec_env();
++ for (s = sk_next(s); s != NULL; s = sk_next(s)) {
++ if (!ve_accessible(s->sk_owner_env, ve))
++ continue;
++ return s;
++ }
+ /* Look for next non-empty chain. */
+ for ((*i)++; *i <= UNIX_HASH_SIZE; (*i)++) {
+- if (!hlist_empty(&unix_socket_table[*i]))
+- return __sk_head(&unix_socket_table[*i]);
++ for (s = sk_head(&unix_socket_table[*i]);
++ s != NULL && !ve_accessible(s->sk_owner_env, ve);
++ s = sk_next(s));
++ if (s != NULL)
++ return s;
+ }
+ return NULL;
+ }
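The rewritten iterators above skip sockets that are not visible from the calling VE. A minimal sketch of their use (locking of the unix socket table is assumed to be handled by the caller, as in the existing in-kernel users):

static int count_visible_unix_sockets(void)
{
	struct sock *s;
	int i, n = 0;

	for (s = first_unix_socket(&i); s != NULL; s = next_unix_socket(&i, s))
		n++;		/* only sockets accessible from this VE */
	return n;
}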
+diff -upr linux-2.6.16.orig/include/net/arp.h linux-2.6.16-026test015/include/net/arp.h
+--- linux-2.6.16.orig/include/net/arp.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/net/arp.h 2006-07-04 14:41:39.000000000 +0400
+@@ -7,7 +7,14 @@
+
+ #define HAVE_ARP_CREATE
+
+-extern struct neigh_table arp_tbl;
++#ifdef CONFIG_VE
++#define arp_tbl (*(get_exec_env()->ve_arp_tbl))
++extern int ve_arp_init(struct ve_struct *ve);
++extern void ve_arp_fini(struct ve_struct *ve);
++#else
++struct neigh_table global_arp_tbl;
++#define arp_tbl global_arp_tbl
++#endif
+
+ extern void arp_init(void);
+ extern int arp_rcv(struct sk_buff *skb, struct net_device *dev,
+diff -upr linux-2.6.16.orig/include/net/compat.h linux-2.6.16-026test015/include/net/compat.h
+--- linux-2.6.16.orig/include/net/compat.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/net/compat.h 2006-07-04 14:41:36.000000000 +0400
+@@ -23,6 +23,14 @@ struct compat_cmsghdr {
+ compat_int_t cmsg_type;
+ };
+
++#if defined(CONFIG_X86_64)
++#define is_current_32bits() (current_thread_info()->flags & _TIF_IA32)
++#elif defined(CONFIG_IA64)
++#define is_current_32bits() (IS_IA32_PROCESS(ia64_task_regs(current)))
++#else
++#define is_current_32bits() 0
++#endif
++
+ #else /* defined(CONFIG_COMPAT) */
+ #define compat_msghdr msghdr /* to avoid compiler warnings */
+ #endif /* defined(CONFIG_COMPAT) */
+diff -upr linux-2.6.16.orig/include/net/flow.h linux-2.6.16-026test015/include/net/flow.h
+--- linux-2.6.16.orig/include/net/flow.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/net/flow.h 2006-07-04 14:41:38.000000000 +0400
+@@ -10,6 +10,7 @@
+ #include <linux/in6.h>
+ #include <asm/atomic.h>
+
++struct ve_struct;
+ struct flowi {
+ int oif;
+ int iif;
+@@ -78,6 +79,9 @@ struct flowi {
+ #define fl_icmp_type uli_u.icmpt.type
+ #define fl_icmp_code uli_u.icmpt.code
+ #define fl_ipsec_spi uli_u.spi
++#ifdef CONFIG_VE
++ struct ve_struct *owner_env;
++#endif
+ } __attribute__((__aligned__(BITS_PER_LONG/8)));
+
+ #define FLOW_DIR_IN 0
+diff -upr linux-2.6.16.orig/include/net/icmp.h linux-2.6.16-026test015/include/net/icmp.h
+--- linux-2.6.16.orig/include/net/icmp.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/net/icmp.h 2006-07-04 14:41:38.000000000 +0400
+@@ -31,9 +31,14 @@ struct icmp_err {
+
+ extern struct icmp_err icmp_err_convert[];
+ DECLARE_SNMP_STAT(struct icmp_mib, icmp_statistics);
+-#define ICMP_INC_STATS(field) SNMP_INC_STATS(icmp_statistics, field)
+-#define ICMP_INC_STATS_BH(field) SNMP_INC_STATS_BH(icmp_statistics, field)
+-#define ICMP_INC_STATS_USER(field) SNMP_INC_STATS_USER(icmp_statistics, field)
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++#define ve_icmp_statistics (get_exec_env()->_icmp_statistics)
++#else
++#define ve_icmp_statistics icmp_statistics
++#endif
++#define ICMP_INC_STATS(field) SNMP_INC_STATS(ve_icmp_statistics, field)
++#define ICMP_INC_STATS_BH(field) SNMP_INC_STATS_BH(ve_icmp_statistics, field)
++#define ICMP_INC_STATS_USER(field) SNMP_INC_STATS_USER(ve_icmp_statistics, field)
+
+ struct dst_entry;
+ struct net_proto_family;
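With the per-VE redirection above, existing call sites keep using the same macros and the increment lands in the statistics of the currently executing VE. For instance (illustrative only; ICMP_MIB_OUTDESTUNREACHS is the usual SNMP field name assumed here):

static void count_dest_unreach(void)
{
	/* same macro as before the patch; the counter is now per-VE */
	ICMP_INC_STATS_BH(ICMP_MIB_OUTDESTUNREACHS);
}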
+diff -upr linux-2.6.16.orig/include/net/if_inet6.h linux-2.6.16-026test015/include/net/if_inet6.h
+--- linux-2.6.16.orig/include/net/if_inet6.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/net/if_inet6.h 2006-07-04 14:41:39.000000000 +0400
+@@ -194,7 +194,14 @@ struct inet6_dev
+ unsigned long tstamp; /* ipv6InterfaceTable update timestamp */
+ };
+
+-extern struct ipv6_devconf ipv6_devconf;
++extern struct ipv6_devconf global_ipv6_devconf;
++extern struct ipv6_devconf global_ipv6_devconf_dflt;
++
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++#define ve_ipv6_devconf (*(get_exec_env()->_ipv6_devconf))
++#else
++#define ve_ipv6_devconf global_ipv6_devconf
++#endif
+
+ static inline void ipv6_eth_mc_map(struct in6_addr *addr, char *buf)
+ {
+diff -upr linux-2.6.16.orig/include/net/inet6_hashtables.h linux-2.6.16-026test015/include/net/inet6_hashtables.h
+--- linux-2.6.16.orig/include/net/inet6_hashtables.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/net/inet6_hashtables.h 2006-07-04 14:41:39.000000000 +0400
+@@ -27,11 +27,13 @@ struct inet_hashinfo;
+
+ /* I have no idea if this is a good hash for v6 or not. -DaveM */
+ static inline unsigned int inet6_ehashfn(const struct in6_addr *laddr, const u16 lport,
+- const struct in6_addr *faddr, const u16 fport)
++ const struct in6_addr *faddr, const u16 fport,
++ const envid_t veid)
+ {
+ unsigned int hashent = (lport ^ fport);
+
+ hashent ^= (laddr->s6_addr32[3] ^ faddr->s6_addr32[3]);
++ hashent ^= (veid ^ (veid >> 16));
+ hashent ^= hashent >> 16;
+ hashent ^= hashent >> 8;
+ return hashent;
+@@ -45,7 +47,7 @@ static inline int inet6_sk_ehashfn(const
+ const struct in6_addr *faddr = &np->daddr;
+ const __u16 lport = inet->num;
+ const __u16 fport = inet->dport;
+- return inet6_ehashfn(laddr, lport, faddr, fport);
++ return inet6_ehashfn(laddr, lport, faddr, fport, VEID(VE_OWNER_SK(sk)));
+ }
+
+ static inline void __inet6_hash(struct inet_hashinfo *hashinfo,
+@@ -94,14 +96,15 @@ static inline struct sock *
+ /* Optimize here for direct hit, only listening connections can
+ * have wildcards anyways.
+ */
+- unsigned int hash = inet6_ehashfn(daddr, hnum, saddr, sport);
++ struct ve_struct *env = get_exec_env();
++ unsigned int hash = inet6_ehashfn(daddr, hnum, saddr, sport, VEID(env));
+ struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash);
+
+ prefetch(head->chain.first);
+ read_lock(&head->lock);
+ sk_for_each(sk, node, &head->chain) {
+ /* For IPV6 do the cheaper port and family tests first. */
+- if (INET6_MATCH(sk, hash, saddr, daddr, ports, dif))
++ if (INET6_MATCH(sk, hash, saddr, daddr, ports, dif, env))
+ goto hit; /* You sunk my battleship! */
+ }
+ /* Must check for a TIME_WAIT'er before going to listener hash. */
+@@ -114,6 +117,7 @@ static inline struct sock *
+
+ if (ipv6_addr_equal(&tw6->tw_v6_daddr, saddr) &&
+ ipv6_addr_equal(&tw6->tw_v6_rcv_saddr, daddr) &&
++ ve_accessible_strict(tw->tw_owner_env, VEID(env)) &&
+ (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dif))
+ goto hit;
+ }
+diff -upr linux-2.6.16.orig/include/net/inet_hashtables.h linux-2.6.16-026test015/include/net/inet_hashtables.h
+--- linux-2.6.16.orig/include/net/inet_hashtables.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/net/inet_hashtables.h 2006-07-04 14:41:38.000000000 +0400
+@@ -24,6 +24,7 @@
+ #include <linux/spinlock.h>
+ #include <linux/types.h>
+ #include <linux/wait.h>
++#include <linux/ve_owner.h>
+
+ #include <net/inet_connection_sock.h>
+ #include <net/inet_sock.h>
+@@ -75,11 +76,13 @@ struct inet_ehash_bucket {
+ * ports are created in O(1) time? I thought so. ;-) -DaveM
+ */
+ struct inet_bind_bucket {
++ struct ve_struct *owner_env;
+ unsigned short port;
+ signed short fastreuse;
+ struct hlist_node node;
+ struct hlist_head owners;
+ };
++DCL_VE_OWNER_PROTO(TB, struct inet_bind_bucket, owner_env)
+
+ #define inet_bind_bucket_for_each(tb, node, head) \
+ hlist_for_each_entry(tb, node, head, node)
+@@ -139,37 +142,43 @@ static inline struct inet_ehash_bucket *
+ extern struct inet_bind_bucket *
+ inet_bind_bucket_create(kmem_cache_t *cachep,
+ struct inet_bind_hashbucket *head,
+- const unsigned short snum);
++ const unsigned short snum,
++ struct ve_struct *env);
+ extern void inet_bind_bucket_destroy(kmem_cache_t *cachep,
+ struct inet_bind_bucket *tb);
+
+-static inline int inet_bhashfn(const __u16 lport, const int bhash_size)
++static inline int inet_bhashfn(const __u16 lport, const int bhash_size,
++ unsigned veid)
+ {
+- return lport & (bhash_size - 1);
++ return ((lport + (veid ^ (veid >> 16))) & (bhash_size - 1));
+ }
+
+ extern void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
+ const unsigned short snum);
+
+ /* These can have wildcards, don't try too hard. */
+-static inline int inet_lhashfn(const unsigned short num)
++static inline int inet_lhashfn(const unsigned short num, unsigned veid)
+ {
+- return num & (INET_LHTABLE_SIZE - 1);
++ return ((num + (veid ^ (veid >> 16))) & (INET_LHTABLE_SIZE - 1));
+ }
+
+ static inline int inet_sk_listen_hashfn(const struct sock *sk)
+ {
+- return inet_lhashfn(inet_sk(sk)->num);
++ return inet_lhashfn(inet_sk(sk)->num, VEID(VE_OWNER_SK(sk)));
+ }
+
+ /* Caller must disable local BH processing. */
+ static inline void __inet_inherit_port(struct inet_hashinfo *table,
+ struct sock *sk, struct sock *child)
+ {
+- const int bhash = inet_bhashfn(inet_sk(child)->num, table->bhash_size);
+- struct inet_bind_hashbucket *head = &table->bhash[bhash];
++ int bhash;
++ struct inet_bind_hashbucket *head;
+ struct inet_bind_bucket *tb;
+
++ bhash = inet_bhashfn(inet_sk(child)->num, table->bhash_size,
++ VEID(VE_OWNER_SK(child)));
++ head = &table->bhash[bhash];
++
+ spin_lock(&head->lock);
+ tb = inet_csk(sk)->icsk_bind_hash;
+ sk_add_bind_node(child, &tb->owners);
+@@ -275,7 +284,8 @@ static inline int inet_iif(const struct
+ extern struct sock *__inet_lookup_listener(const struct hlist_head *head,
+ const u32 daddr,
+ const unsigned short hnum,
+- const int dif);
++ const int dif,
++ struct ve_struct *env);
+
+ /* Optimize the common listener case. */
+ static inline struct sock *
+@@ -285,18 +295,21 @@ static inline struct sock *
+ {
+ struct sock *sk = NULL;
+ const struct hlist_head *head;
++ struct ve_struct *env;
+
++ env = get_exec_env();
+ read_lock(&hashinfo->lhash_lock);
+- head = &hashinfo->listening_hash[inet_lhashfn(hnum)];
++ head = &hashinfo->listening_hash[inet_lhashfn(hnum, VEID(env))];
+ if (!hlist_empty(head)) {
+ const struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
+
+ if (inet->num == hnum && !sk->sk_node.next &&
++ ve_accessible_strict(VE_OWNER_SK(sk), env) &&
+ (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
+ (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
+ !sk->sk_bound_dev_if)
+ goto sherry_cache;
+- sk = __inet_lookup_listener(head, daddr, hnum, dif);
++ sk = __inet_lookup_listener(head, daddr, hnum, dif, env);
+ }
+ if (sk) {
+ sherry_cache:
+@@ -323,25 +336,25 @@ sherry_cache:
+ #define INET_ADDR_COOKIE(__name, __saddr, __daddr) \
+ const __u64 __name = (((__u64)(__daddr)) << 32) | ((__u64)(__saddr));
+ #endif /* __BIG_ENDIAN */
+-#define INET_MATCH(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\
++#define INET_MATCH_ALLVE(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\
+ (((__sk)->sk_hash == (__hash)) && \
+ ((*((__u64 *)&(inet_sk(__sk)->daddr))) == (__cookie)) && \
+ ((*((__u32 *)&(inet_sk(__sk)->dport))) == (__ports)) && \
+ (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
+-#define INET_TW_MATCH(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\
++#define INET_TW_MATCH_ALLVE(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\
+ (((__sk)->sk_hash == (__hash)) && \
+ ((*((__u64 *)&(inet_twsk(__sk)->tw_daddr))) == (__cookie)) && \
+ ((*((__u32 *)&(inet_twsk(__sk)->tw_dport))) == (__ports)) && \
+ (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
+ #else /* 32-bit arch */
+ #define INET_ADDR_COOKIE(__name, __saddr, __daddr)
+-#define INET_MATCH(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif) \
++#define INET_MATCH_ALLVE(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif) \
+ (((__sk)->sk_hash == (__hash)) && \
+ (inet_sk(__sk)->daddr == (__saddr)) && \
+ (inet_sk(__sk)->rcv_saddr == (__daddr)) && \
+ ((*((__u32 *)&(inet_sk(__sk)->dport))) == (__ports)) && \
+ (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
+-#define INET_TW_MATCH(__sk, __hash,__cookie, __saddr, __daddr, __ports, __dif) \
++#define INET_TW_MATCH_ALLVE(__sk, __hash,__cookie, __saddr, __daddr, __ports, __dif) \
+ (((__sk)->sk_hash == (__hash)) && \
+ (inet_twsk(__sk)->tw_daddr == (__saddr)) && \
+ (inet_twsk(__sk)->tw_rcv_saddr == (__daddr)) && \
+@@ -349,6 +362,18 @@ sherry_cache:
+ (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
+ #endif /* 64-bit arch */
+
++#define INET_MATCH(__sk, __hash, __cookie, __saddr, \
++ __daddr, __ports, __dif, __ve) \
++ (INET_MATCH_ALLVE((__sk), (__hash), (__cookie), (__saddr), \
++ (__daddr), (__ports), (__dif)) \
++ && ve_accessible_strict(VE_OWNER_SK(__sk), (__ve)))
++
++#define INET_TW_MATCH(__sk, __hash, __cookie, __saddr, \
++ __daddr, __ports, __dif, __ve) \
++ (INET_TW_MATCH_ALLVE((__sk), (__hash), (__cookie), (__saddr), \
++ (__daddr), (__ports), (__dif)) \
++ && ve_accessible_strict(inet_twsk(__sk)->tw_owner_env, VEID(__ve)))
++
+ /*
+ * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so we need
+ * not check it for lookups anymore, thanks Alexey. -DaveM
+@@ -368,19 +393,25 @@ static inline struct sock *
+ /* Optimize here for direct hit, only listening connections can
+ * have wildcards anyways.
+ */
+- unsigned int hash = inet_ehashfn(daddr, hnum, saddr, sport);
+- struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash);
+-
++ unsigned int hash;
++ struct inet_ehash_bucket *head;
++ struct ve_struct *env;
++
++ env = get_exec_env();
++ hash = inet_ehashfn(daddr, hnum, saddr, sport, VEID(env));
++ head = inet_ehash_bucket(hashinfo, hash);
+ prefetch(head->chain.first);
+ read_lock(&head->lock);
+ sk_for_each(sk, node, &head->chain) {
+- if (INET_MATCH(sk, hash, acookie, saddr, daddr, ports, dif))
++ if (INET_MATCH(sk, hash, acookie, saddr, daddr,
++ ports, dif, env))
+ goto hit; /* You sunk my battleship! */
+ }
+
+ /* Must check for a TIME_WAIT'er before going to listener hash. */
+ sk_for_each(sk, node, &(head + hashinfo->ehash_size)->chain) {
+- if (INET_TW_MATCH(sk, hash, acookie, saddr, daddr, ports, dif))
++ if (INET_TW_MATCH(sk, hash, acookie, saddr, daddr,
++ ports, dif, env))
+ goto hit;
+ }
+ sk = NULL;
+diff -upr linux-2.6.16.orig/include/net/inet_sock.h linux-2.6.16-026test015/include/net/inet_sock.h
+--- linux-2.6.16.orig/include/net/inet_sock.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/net/inet_sock.h 2006-07-04 14:41:38.000000000 +0400
+@@ -171,9 +171,10 @@ static inline void inet_sk_copy_descenda
+ extern int inet_sk_rebuild_header(struct sock *sk);
+
+ static inline unsigned int inet_ehashfn(const __u32 laddr, const __u16 lport,
+- const __u32 faddr, const __u16 fport)
++ const __u32 faddr, const __u16 fport,
++ const envid_t veid)
+ {
+- unsigned int h = (laddr ^ lport) ^ (faddr ^ fport);
++ int h = (laddr ^ lport) ^ (faddr ^ fport) ^ (veid ^ (veid >> 16));
+ h ^= h >> 16;
+ h ^= h >> 8;
+ return h;
+@@ -186,8 +187,9 @@ static inline int inet_sk_ehashfn(const
+ const __u16 lport = inet->num;
+ const __u32 faddr = inet->daddr;
+ const __u16 fport = inet->dport;
++ envid_t veid = VEID(VE_OWNER_SK(sk));
+
+- return inet_ehashfn(laddr, lport, faddr, fport);
++ return inet_ehashfn(laddr, lport, faddr, fport, veid);
+ }
+
+ #endif /* _INET_SOCK_H */
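Folding the VE id into inet_ehashfn() (here and in the inet6 variant earlier) keeps identical 4-tuples from different containers in distinct, VE-specific hash chains. A small sketch of the resulting bucket selection, mirroring the power-of-two masking done by inet_ehash_bucket():

static unsigned int ve_ehash_bucket(struct inet_hashinfo *hinfo,
				    __u32 laddr, __u16 lport,
				    __u32 faddr, __u16 fport, envid_t veid)
{
	/* same 4-tuple, different veid => (almost always) a different bucket */
	return inet_ehashfn(laddr, lport, faddr, fport, veid) &
		(hinfo->ehash_size - 1);
}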
+diff -upr linux-2.6.16.orig/include/net/inet_timewait_sock.h linux-2.6.16-026test015/include/net/inet_timewait_sock.h
+--- linux-2.6.16.orig/include/net/inet_timewait_sock.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/net/inet_timewait_sock.h 2006-07-04 14:41:38.000000000 +0400
+@@ -134,6 +134,7 @@ struct inet_timewait_sock {
+ unsigned long tw_ttd;
+ struct inet_bind_bucket *tw_tb;
+ struct hlist_node tw_death_node;
++ envid_t tw_owner_env;
+ };
+
+ static inline void inet_twsk_add_node(struct inet_timewait_sock *tw,
+diff -upr linux-2.6.16.orig/include/net/ip.h linux-2.6.16-026test015/include/net/ip.h
+--- linux-2.6.16.orig/include/net/ip.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/net/ip.h 2006-07-04 14:41:38.000000000 +0400
+@@ -95,6 +95,7 @@ extern int ip_local_deliver(struct sk_b
+ extern int ip_mr_input(struct sk_buff *skb);
+ extern int ip_output(struct sk_buff *skb);
+ extern int ip_mc_output(struct sk_buff *skb);
++extern int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
+ extern int ip_do_nat(struct sk_buff *skb);
+ extern void ip_send_check(struct iphdr *ip);
+ extern int ip_queue_xmit(struct sk_buff *skb, int ipfragok);
+@@ -152,15 +153,25 @@ struct ipv4_config
+
+ extern struct ipv4_config ipv4_config;
+ DECLARE_SNMP_STAT(struct ipstats_mib, ip_statistics);
+-#define IP_INC_STATS(field) SNMP_INC_STATS(ip_statistics, field)
+-#define IP_INC_STATS_BH(field) SNMP_INC_STATS_BH(ip_statistics, field)
+-#define IP_INC_STATS_USER(field) SNMP_INC_STATS_USER(ip_statistics, field)
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++#define ve_ip_statistics (get_exec_env()->_ip_statistics)
++#else
++#define ve_ip_statistics ip_statistics
++#endif
++#define IP_INC_STATS(field) SNMP_INC_STATS(ve_ip_statistics, field)
++#define IP_INC_STATS_BH(field) SNMP_INC_STATS_BH(ve_ip_statistics, field)
++#define IP_INC_STATS_USER(field) SNMP_INC_STATS_USER(ve_ip_statistics, field)
+ DECLARE_SNMP_STAT(struct linux_mib, net_statistics);
+-#define NET_INC_STATS(field) SNMP_INC_STATS(net_statistics, field)
+-#define NET_INC_STATS_BH(field) SNMP_INC_STATS_BH(net_statistics, field)
+-#define NET_INC_STATS_USER(field) SNMP_INC_STATS_USER(net_statistics, field)
+-#define NET_ADD_STATS_BH(field, adnd) SNMP_ADD_STATS_BH(net_statistics, field, adnd)
+-#define NET_ADD_STATS_USER(field, adnd) SNMP_ADD_STATS_USER(net_statistics, field, adnd)
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++#define ve_net_statistics (get_exec_env()->_net_statistics)
++#else
++#define ve_net_statistics net_statistics
++#endif
++#define NET_INC_STATS(field) SNMP_INC_STATS(ve_net_statistics, field)
++#define NET_INC_STATS_BH(field) SNMP_INC_STATS_BH(ve_net_statistics, field)
++#define NET_INC_STATS_USER(field) SNMP_INC_STATS_USER(ve_net_statistics, field)
++#define NET_ADD_STATS_BH(field, adnd) SNMP_ADD_STATS_BH(ve_net_statistics, field, adnd)
++#define NET_ADD_STATS_USER(field, adnd) SNMP_ADD_STATS_USER(ve_net_statistics, field, adnd)
+
+ extern int sysctl_local_port_range[2];
+ extern int sysctl_ip_default_ttl;
+@@ -380,4 +391,11 @@ extern int ip_misc_proc_init(void);
+
+ extern struct ctl_table ipv4_table[];
+
++#ifdef CONFIG_SYSCTL
++extern int ipv4_sysctl_forward(ctl_table *ctl, int write, struct file * filp,
++ void __user *buffer, size_t *lenp, loff_t *ppos);
++extern int ipv4_sysctl_forward_strategy(ctl_table *table, int __user *name,
++ int nlen, void __user *oldval, size_t __user *oldlenp,
++ void __user *newval, size_t newlen, void **context);
++#endif
+ #endif /* _IP_H */
+diff -upr linux-2.6.16.orig/include/net/ip6_fib.h linux-2.6.16-026test015/include/net/ip6_fib.h
+--- linux-2.6.16.orig/include/net/ip6_fib.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/net/ip6_fib.h 2006-07-04 14:41:39.000000000 +0400
+@@ -78,6 +78,15 @@ struct rt6_info
+ u8 rt6i_protocol;
+ };
+
++struct fib6_table
++{
++ struct list_head list;
++ struct fib6_node root;
++ struct ve_struct *owner_env;
++};
++
++extern struct list_head fib6_table_list;
++
+ struct fib6_walker_t
+ {
+ struct fib6_walker_t *prev, *next;
+@@ -143,7 +152,7 @@ struct rt6_statistics {
+
+ typedef void (*f_pnode)(struct fib6_node *fn, void *);
+
+-extern struct fib6_node ip6_routing_table;
++extern struct fib6_node ve0_ip6_routing_table;
+
+ /*
+ * exported functions
+diff -upr linux-2.6.16.orig/include/net/ip6_route.h linux-2.6.16-026test015/include/net/ip6_route.h
+--- linux-2.6.16.orig/include/net/ip6_route.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/net/ip6_route.h 2006-07-04 14:41:39.000000000 +0400
+@@ -139,5 +139,10 @@ static inline int ipv6_unicast_destinati
+ return rt->rt6i_flags & RTF_LOCAL;
+ }
+
++#ifdef CONFIG_VE
++int init_ve_route6(struct ve_struct *ve);
++void fini_ve_route6(struct ve_struct *ve);
++#endif
++
+ #endif
+ #endif
+diff -upr linux-2.6.16.orig/include/net/ip_fib.h linux-2.6.16-026test015/include/net/ip_fib.h
+--- linux-2.6.16.orig/include/net/ip_fib.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/net/ip_fib.h 2006-07-04 14:41:38.000000000 +0400
+@@ -168,10 +168,22 @@ struct fib_table {
+ unsigned char tb_data[0];
+ };
+
++struct fn_zone;
++struct fn_hash
++{
++ struct fn_zone *fn_zones[33];
++ struct fn_zone *fn_zone_list;
++};
++
+ #ifndef CONFIG_IP_MULTIPLE_TABLES
+
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++#define ip_fib_local_table get_exec_env()->_local_table
++#define ip_fib_main_table get_exec_env()->_main_table
++#else
+ extern struct fib_table *ip_fib_local_table;
+ extern struct fib_table *ip_fib_main_table;
++#endif
+
+ static inline struct fib_table *fib_get_table(int id)
+ {
+@@ -203,7 +215,12 @@ static inline void fib_select_default(co
+ #define ip_fib_local_table (fib_tables[RT_TABLE_LOCAL])
+ #define ip_fib_main_table (fib_tables[RT_TABLE_MAIN])
+
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++#define fib_tables get_exec_env()->_fib_tables
++#else
+ extern struct fib_table * fib_tables[RT_TABLE_MAX+1];
++#endif
++
+ extern int fib_lookup(const struct flowi *flp, struct fib_result *res);
+ extern struct fib_table *__fib_new_table(int id);
+ extern void fib_rule_put(struct fib_rule *r);
+@@ -250,10 +267,19 @@ extern u32 __fib_res_prefsrc(struct fib
+
+ /* Exported by fib_hash.c */
+ extern struct fib_table *fib_hash_init(int id);
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++struct ve_struct;
++extern int init_ve_route(struct ve_struct *ve);
++extern void fini_ve_route(struct ve_struct *ve);
++#else
++#define init_ve_route(ve) (0)
++#define fini_ve_route(ve) do { } while (0)
++#endif
+
+ #ifdef CONFIG_IP_MULTIPLE_TABLES
+ /* Exported by fib_rules.c */
+-
++extern int fib_rules_create(void);
++extern void fib_rules_destroy(void);
+ extern int inet_rtm_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg);
+ extern int inet_rtm_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg);
+ extern int inet_dump_rules(struct sk_buff *skb, struct netlink_callback *cb);
+diff -upr linux-2.6.16.orig/include/net/ipv6.h linux-2.6.16-026test015/include/net/ipv6.h
+--- linux-2.6.16.orig/include/net/ipv6.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/net/ipv6.h 2006-07-04 14:41:39.000000000 +0400
+@@ -113,39 +113,48 @@ extern int sysctl_mld_max_msf;
+
+ /* MIBs */
+ DECLARE_SNMP_STAT(struct ipstats_mib, ipv6_statistics);
+-#define IP6_INC_STATS(field) SNMP_INC_STATS(ipv6_statistics, field)
+-#define IP6_INC_STATS_BH(field) SNMP_INC_STATS_BH(ipv6_statistics, field)
+-#define IP6_INC_STATS_USER(field) SNMP_INC_STATS_USER(ipv6_statistics, field)
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++#define ve_ipv6_statistics (get_exec_env()->_ipv6_statistics)
++#define ve_icmpv6_statistics (get_exec_env()->_icmpv6_statistics)
++#define ve_udp_stats_in6 (get_exec_env()->_udp_stats_in6)
++#else
++#define ve_ipv6_statistics ipv6_statistics
++#define ve_icmpv6_statistics icmpv6_statistics
++#define ve_udp_stats_in6 udp_stats_in6
++#endif
++#define IP6_INC_STATS(field) SNMP_INC_STATS(ve_ipv6_statistics, field)
++#define IP6_INC_STATS_BH(field) SNMP_INC_STATS_BH(ve_ipv6_statistics, field)
++#define IP6_INC_STATS_USER(field) SNMP_INC_STATS_USER(ve_ipv6_statistics, field)
+ DECLARE_SNMP_STAT(struct icmpv6_mib, icmpv6_statistics);
+ #define ICMP6_INC_STATS(idev, field) ({ \
+ struct inet6_dev *_idev = (idev); \
+ if (likely(_idev != NULL)) \
+ SNMP_INC_STATS(idev->stats.icmpv6, field); \
+- SNMP_INC_STATS(icmpv6_statistics, field); \
++ SNMP_INC_STATS(ve_icmpv6_statistics, field); \
+ })
+ #define ICMP6_INC_STATS_BH(idev, field) ({ \
+ struct inet6_dev *_idev = (idev); \
+ if (likely(_idev != NULL)) \
+ SNMP_INC_STATS_BH((_idev)->stats.icmpv6, field); \
+- SNMP_INC_STATS_BH(icmpv6_statistics, field); \
++ SNMP_INC_STATS_BH(ve_icmpv6_statistics, field); \
+ })
+ #define ICMP6_INC_STATS_USER(idev, field) ({ \
+ struct inet6_dev *_idev = (idev); \
+ if (likely(_idev != NULL)) \
+ SNMP_INC_STATS_USER(_idev->stats.icmpv6, field); \
+- SNMP_INC_STATS_USER(icmpv6_statistics, field); \
++ SNMP_INC_STATS_USER(ve_icmpv6_statistics, field); \
+ })
+ #define ICMP6_INC_STATS_OFFSET_BH(idev, field, offset) ({ \
+ struct inet6_dev *_idev = idev; \
+ __typeof__(offset) _offset = (offset); \
+ if (likely(_idev != NULL)) \
+ SNMP_INC_STATS_OFFSET_BH(_idev->stats.icmpv6, field, _offset); \
+- SNMP_INC_STATS_OFFSET_BH(icmpv6_statistics, field, _offset); \
++ SNMP_INC_STATS_OFFSET_BH(ve_icmpv6_statistics, field, _offset); \
+ })
+ DECLARE_SNMP_STAT(struct udp_mib, udp_stats_in6);
+-#define UDP6_INC_STATS(field) SNMP_INC_STATS(udp_stats_in6, field)
+-#define UDP6_INC_STATS_BH(field) SNMP_INC_STATS_BH(udp_stats_in6, field)
+-#define UDP6_INC_STATS_USER(field) SNMP_INC_STATS_USER(udp_stats_in6, field)
++#define UDP6_INC_STATS(field) SNMP_INC_STATS(ve_udp_stats_in6, field)
++#define UDP6_INC_STATS_BH(field) SNMP_INC_STATS_BH(ve_udp_stats_in6, field)
++#define UDP6_INC_STATS_USER(field) SNMP_INC_STATS_USER(ve_udp_stats_in6, field)
+
+ int snmp6_register_dev(struct inet6_dev *idev);
+ int snmp6_unregister_dev(struct inet6_dev *idev);
+@@ -154,6 +163,11 @@ int snmp6_free_dev(struct inet6_dev *ide
+ int snmp6_mib_init(void *ptr[2], size_t mibsize, size_t mibalign);
+ void snmp6_mib_free(void *ptr[2]);
+
++#ifdef CONFIG_VE
++int ve_snmp_proc_init(void);
++void ve_snmp_proc_fini(void);
++#endif
++
+ struct ip6_ra_chain
+ {
+ struct ip6_ra_chain *next;
+diff -upr linux-2.6.16.orig/include/net/ndisc.h linux-2.6.16-026test015/include/net/ndisc.h
+--- linux-2.6.16.orig/include/net/ndisc.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/net/ndisc.h 2006-07-04 14:41:39.000000000 +0400
+@@ -50,7 +50,14 @@ struct net_device;
+ struct net_proto_family;
+ struct sk_buff;
+
+-extern struct neigh_table nd_tbl;
++#ifdef CONFIG_VE
++#define nd_tbl (*(get_exec_env()->ve_nd_tbl))
++extern int ve_ndisc_init(struct ve_struct *ve);
++extern void ve_ndisc_fini(struct ve_struct *ve);
++#else
++extern struct neigh_table global_nd_tbl;
++#define nd_tbl global_nd_tbl
++#endif
+
+ struct nd_msg {
+ struct icmp6hdr icmph;
+@@ -128,6 +135,7 @@ extern int ndisc_ifinfo_sysctl_change
+ extern void inet6_ifinfo_notify(int event,
+ struct inet6_dev *idev);
+
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ static inline struct neighbour * ndisc_get_neigh(struct net_device *dev, struct in6_addr *addr)
+ {
+
+@@ -136,6 +144,7 @@ static inline struct neighbour * ndisc_g
+
+ return NULL;
+ }
++#endif
+
+
+ #endif /* __KERNEL__ */
+diff -upr linux-2.6.16.orig/include/net/neighbour.h linux-2.6.16-026test015/include/net/neighbour.h
+--- linux-2.6.16.orig/include/net/neighbour.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/net/neighbour.h 2006-07-04 14:41:39.000000000 +0400
+@@ -191,6 +191,8 @@ struct neigh_table
+ atomic_t entries;
+ rwlock_t lock;
+ unsigned long last_rand;
++ struct ve_struct *owner_env;
++ struct user_beancounter *owner_ub;
+ kmem_cache_t *kmem_cachep;
+ struct neigh_statistics *stats;
+ struct neighbour **hash_buckets;
+@@ -210,7 +212,7 @@ struct neigh_table
+ #define NEIGH_UPDATE_F_ISROUTER 0x40000000
+ #define NEIGH_UPDATE_F_ADMIN 0x80000000
+
+-extern void neigh_table_init(struct neigh_table *tbl);
++extern int neigh_table_init(struct neigh_table *tbl);
+ extern int neigh_table_clear(struct neigh_table *tbl);
+ extern struct neighbour * neigh_lookup(struct neigh_table *tbl,
+ const void *pkey,
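Since neigh_table_init() now returns an error code, per-VE initialisation can propagate failures instead of panicking. A heavily simplified sketch (the real ve_arp_init() also has to allocate and populate the per-VE table copy, which is omitted here; ve_arp_tbl is the per-VE pointer assumed by the arp.h hunk above):

static int init_ve_arp_table(struct ve_struct *ve)
{
	/* ve->ve_arp_tbl is assumed to be allocated and filled in already */
	return neigh_table_init(ve->ve_arp_tbl);
}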
+diff -upr linux-2.6.16.orig/include/net/netlink_sock.h linux-2.6.16-026test015/include/net/netlink_sock.h
+--- linux-2.6.16.orig/include/net/netlink_sock.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/include/net/netlink_sock.h 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,22 @@
++#ifndef __NET_NETLINK_SOCK_H
++#define __NET_NETLINK_SOCK_H
++
++struct netlink_sock {
++ /* struct sock has to be the first member of netlink_sock */
++ struct sock sk;
++ u32 pid;
++ u32 dst_pid;
++ u32 dst_group;
++ u32 flags;
++ u32 subscriptions;
++ u32 ngroups;
++ unsigned long *groups;
++ unsigned long state;
++ wait_queue_head_t wait;
++ struct netlink_callback *cb;
++ spinlock_t cb_lock;
++ void (*data_ready)(struct sock *sk, int bytes);
++ struct module *module;
++};
++
++#endif /* __NET_NETLINK_SOCK_H */
+diff -upr linux-2.6.16.orig/include/net/route.h linux-2.6.16-026test015/include/net/route.h
+--- linux-2.6.16.orig/include/net/route.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/net/route.h 2006-07-04 14:41:38.000000000 +0400
+@@ -201,4 +201,14 @@ static inline struct inet_peer *rt_get_p
+
+ extern ctl_table ipv4_route_table[];
+
++#ifdef CONFIG_SYSCTL
++extern int ipv4_flush_delay;
++extern int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
++ struct file *filp, void __user *buffer, size_t *lenp,
++ loff_t *ppos);
++extern int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
++ int __user *name, int nlen, void __user *oldval,
++ size_t __user *oldlenp, void __user *newval,
++ size_t newlen, void **context);
++#endif
+ #endif /* _ROUTE_H */
+diff -upr linux-2.6.16.orig/include/net/scm.h linux-2.6.16-026test015/include/net/scm.h
+--- linux-2.6.16.orig/include/net/scm.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/net/scm.h 2006-07-04 14:41:38.000000000 +0400
+@@ -40,7 +40,7 @@ static __inline__ int scm_send(struct so
+ memset(scm, 0, sizeof(*scm));
+ scm->creds.uid = current->uid;
+ scm->creds.gid = current->gid;
+- scm->creds.pid = current->tgid;
++ scm->creds.pid = virt_tgid(current);
+ if (msg->msg_controllen <= 0)
+ return 0;
+ return __scm_send(sock, msg, scm);
+diff -upr linux-2.6.16.orig/include/net/sctp/sctp.h linux-2.6.16-026test015/include/net/sctp/sctp.h
+--- linux-2.6.16.orig/include/net/sctp/sctp.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/net/sctp/sctp.h 2006-07-04 14:41:36.000000000 +0400
+@@ -461,12 +461,12 @@ static inline int sctp_frag_point(const
+ * there is room for a param header too.
+ */
+ #define sctp_walk_params(pos, chunk, member)\
+-_sctp_walk_params((pos), (chunk), WORD_ROUND(ntohs((chunk)->chunk_hdr.length)), member)
++_sctp_walk_params((pos), (chunk), ntohs((chunk)->chunk_hdr.length), member)
+
+ #define _sctp_walk_params(pos, chunk, end, member)\
+ for (pos.v = chunk->member;\
+ pos.v <= (void *)chunk + end - sizeof(sctp_paramhdr_t) &&\
+- pos.v <= (void *)chunk + end - WORD_ROUND(ntohs(pos.p->length)) &&\
++ pos.v <= (void *)chunk + end - ntohs(pos.p->length) &&\
+ ntohs(pos.p->length) >= sizeof(sctp_paramhdr_t);\
+ pos.v += WORD_ROUND(ntohs(pos.p->length)))
+
+@@ -477,7 +477,7 @@ _sctp_walk_errors((err), (chunk_hdr), nt
+ for (err = (sctp_errhdr_t *)((void *)chunk_hdr + \
+ sizeof(sctp_chunkhdr_t));\
+ (void *)err <= (void *)chunk_hdr + end - sizeof(sctp_errhdr_t) &&\
+- (void *)err <= (void *)chunk_hdr + end - WORD_ROUND(ntohs(err->length)) &&\
++ (void *)err <= (void *)chunk_hdr + end - ntohs(err->length) &&\
+ ntohs(err->length) >= sizeof(sctp_errhdr_t); \
+ err = (sctp_errhdr_t *)((void *)err + WORD_ROUND(ntohs(err->length))))
+
+diff -upr linux-2.6.16.orig/include/net/sctp/structs.h linux-2.6.16-026test015/include/net/sctp/structs.h
+--- linux-2.6.16.orig/include/net/sctp/structs.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/net/sctp/structs.h 2006-07-04 14:41:36.000000000 +0400
+@@ -702,6 +702,7 @@ struct sctp_chunk {
+ __u8 tsn_gap_acked; /* Is this chunk acked by a GAP ACK? */
+ __s8 fast_retransmit; /* Is this chunk fast retransmitted? */
+ __u8 tsn_missing_report; /* Data chunk missing counter. */
++ __u8 data_accepted; /* At least 1 chunk in this packet accepted */
+ };
+
+ void sctp_chunk_hold(struct sctp_chunk *);
+diff -upr linux-2.6.16.orig/include/net/sock.h linux-2.6.16-026test015/include/net/sock.h
+--- linux-2.6.16.orig/include/net/sock.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/net/sock.h 2006-07-04 14:41:38.000000000 +0400
+@@ -55,6 +55,8 @@
+ #include <net/dst.h>
+ #include <net/checksum.h>
+
++#include <ub/ub_net.h>
++
+ /*
+ * This structure really needs to be cleaned up.
+ * Most of it is for TCP, and not used by any of
+@@ -251,8 +253,12 @@ struct sock {
+ int (*sk_backlog_rcv)(struct sock *sk,
+ struct sk_buff *skb);
+ void (*sk_destruct)(struct sock *sk);
++ struct sock_beancounter sk_bc;
++ struct ve_struct *sk_owner_env;
+ };
+
++DCL_VE_OWNER_PROTO(SK, struct sock, sk_owner_env)
++
+ /*
+ * Hashed lists helper routines
+ */
+@@ -485,7 +491,8 @@ static inline void sk_add_backlog(struct
+ })
+
+ extern int sk_stream_wait_connect(struct sock *sk, long *timeo_p);
+-extern int sk_stream_wait_memory(struct sock *sk, long *timeo_p);
++extern int sk_stream_wait_memory(struct sock *sk, long *timeo_p,
++ unsigned long amount);
+ extern void sk_stream_wait_close(struct sock *sk, long timeo_p);
+ extern int sk_stream_error(struct sock *sk, int flags, int err);
+ extern void sk_stream_kill_queues(struct sock *sk);
+@@ -706,8 +713,11 @@ static inline void sk_stream_writequeue_
+
+ static inline int sk_stream_rmem_schedule(struct sock *sk, struct sk_buff *skb)
+ {
+- return (int)skb->truesize <= sk->sk_forward_alloc ||
+- sk_stream_mem_schedule(sk, skb->truesize, 1);
++ if ((int)skb->truesize > sk->sk_forward_alloc &&
++ !sk_stream_mem_schedule(sk, skb->truesize, 1))
++ /* The situation is bad according to mainstream. Den */
++ return 0;
++ return ub_tcprcvbuf_charge(sk, skb) == 0;
+ }
+
+ static inline int sk_stream_wmem_schedule(struct sock *sk, int size)
+@@ -765,6 +775,11 @@ extern struct sk_buff *sock_alloc_send
+ unsigned long size,
+ int noblock,
+ int *errcode);
++extern struct sk_buff *sock_alloc_send_skb2(struct sock *sk,
++ unsigned long size,
++ unsigned long size2,
++ int noblock,
++ int *errcode);
+ extern void *sock_kmalloc(struct sock *sk, int size,
+ gfp_t priority);
+ extern void sock_kfree_s(struct sock *sk, void *mem, int size);
+@@ -1062,12 +1077,16 @@ sk_dst_check(struct sock *sk, u32 cookie
+
+ static inline void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
+ {
++ extern int sysctl_tcp_use_sg;
++
+ __sk_dst_set(sk, dst);
+ sk->sk_route_caps = dst->dev->features;
+ if (sk->sk_route_caps & NETIF_F_TSO) {
+ if (sock_flag(sk, SOCK_NO_LARGESEND) || dst->header_len)
+ sk->sk_route_caps &= ~NETIF_F_TSO;
+ }
++ if (!sysctl_tcp_use_sg)
++ sk->sk_route_caps &= ~NETIF_F_SG;
+ }
+
+ static inline void sk_charge_skb(struct sock *sk, struct sk_buff *skb)
+@@ -1142,6 +1161,10 @@ static inline int sock_queue_rcv_skb(str
+ goto out;
+ }
+
++ err = ub_sockrcvbuf_charge(sk, skb);
++ if (err < 0)
++ goto out;
++
+ /* It would be deadlock, if sock_queue_rcv_skb is used
+ with socket lock! We assume that users of this
+ function are lock free.
+diff -upr linux-2.6.16.orig/include/net/tcp.h linux-2.6.16-026test015/include/net/tcp.h
+--- linux-2.6.16.orig/include/net/tcp.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/net/tcp.h 2006-07-04 14:41:39.000000000 +0400
+@@ -40,6 +40,7 @@
+ #include <net/tcp_states.h>
+
+ #include <linux/seq_file.h>
++#include <ub/ub_net.h>
+
+ extern struct inet_hashinfo tcp_hashinfo;
+
+@@ -219,6 +220,7 @@ extern int sysctl_tcp_nometrics_save;
+ extern int sysctl_tcp_moderate_rcvbuf;
+ extern int sysctl_tcp_tso_win_divisor;
+ extern int sysctl_tcp_abc;
++extern int sysctl_tcp_use_sg;
+
+ extern atomic_t tcp_memory_allocated;
+ extern atomic_t tcp_sockets_allocated;
+@@ -250,12 +252,17 @@ static inline int between(__u32 seq1, __
+ extern struct proto tcp_prot;
+
+ DECLARE_SNMP_STAT(struct tcp_mib, tcp_statistics);
+-#define TCP_INC_STATS(field) SNMP_INC_STATS(tcp_statistics, field)
+-#define TCP_INC_STATS_BH(field) SNMP_INC_STATS_BH(tcp_statistics, field)
+-#define TCP_INC_STATS_USER(field) SNMP_INC_STATS_USER(tcp_statistics, field)
+-#define TCP_DEC_STATS(field) SNMP_DEC_STATS(tcp_statistics, field)
+-#define TCP_ADD_STATS_BH(field, val) SNMP_ADD_STATS_BH(tcp_statistics, field, val)
+-#define TCP_ADD_STATS_USER(field, val) SNMP_ADD_STATS_USER(tcp_statistics, field, val)
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++#define ve_tcp_statistics (get_exec_env()->_tcp_statistics)
++#else
++#define ve_tcp_statistics tcp_statistics
++#endif
++#define TCP_INC_STATS(field) SNMP_INC_STATS(ve_tcp_statistics, field)
++#define TCP_INC_STATS_BH(field) SNMP_INC_STATS_BH(ve_tcp_statistics, field)
++#define TCP_INC_STATS_USER(field) SNMP_INC_STATS_USER(ve_tcp_statistics, field)
++#define TCP_DEC_STATS(field) SNMP_DEC_STATS(ve_tcp_statistics, field)
++#define TCP_ADD_STATS_BH(field, val) SNMP_ADD_STATS_BH(ve_tcp_statistics, field, val)
++#define TCP_ADD_STATS_USER(field, val) SNMP_ADD_STATS_USER(ve_tcp_statistics, field, val)
+
+ extern void tcp_v4_err(struct sk_buff *skb, u32);
+
+@@ -493,7 +500,7 @@ extern u32 __tcp_select_window(struct so
+ * to use only the low 32-bits of jiffies and hide the ugly
+ * casts with the following macro.
+ */
+-#define tcp_time_stamp ((__u32)(jiffies))
++#define tcp_time_stamp ((__u32)(jiffies + get_exec_env()->jiffies_fixup))
+
+ /* This is what the send packet queuing engine uses to pass
+ * TCP per-packet control information to the transmission
+diff -upr linux-2.6.16.orig/include/net/udp.h linux-2.6.16-026test015/include/net/udp.h
+--- linux-2.6.16.orig/include/net/udp.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/include/net/udp.h 2006-07-04 14:41:38.000000000 +0400
+@@ -39,13 +39,19 @@ extern rwlock_t udp_hash_lock;
+
+ extern int udp_port_rover;
+
+-static inline int udp_lport_inuse(u16 num)
++static inline int udp_hashfn(u16 num, unsigned veid)
++{
++ return ((num + (veid ^ (veid >> 16))) & (UDP_HTABLE_SIZE - 1));
++}
++
++static inline int udp_lport_inuse(u16 num, struct ve_struct *env)
+ {
+ struct sock *sk;
+ struct hlist_node *node;
+
+- sk_for_each(sk, node, &udp_hash[num & (UDP_HTABLE_SIZE - 1)])
+- if (inet_sk(sk)->num == num)
++ sk_for_each(sk, node, &udp_hash[udp_hashfn(num, VEID(env))])
++ if (inet_sk(sk)->num == num &&
++ ve_accessible_strict(sk->sk_owner_env, env))
+ return 1;
+ return 0;
+ }
+@@ -75,9 +81,14 @@ extern unsigned int udp_poll(struct file
+ poll_table *wait);
+
+ DECLARE_SNMP_STAT(struct udp_mib, udp_statistics);
+-#define UDP_INC_STATS(field) SNMP_INC_STATS(udp_statistics, field)
+-#define UDP_INC_STATS_BH(field) SNMP_INC_STATS_BH(udp_statistics, field)
+-#define UDP_INC_STATS_USER(field) SNMP_INC_STATS_USER(udp_statistics, field)
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++#define ve_udp_statistics (get_exec_env()->_udp_statistics)
++#else
++#define ve_udp_statistics udp_statistics
++#endif
++#define UDP_INC_STATS(field) SNMP_INC_STATS(ve_udp_statistics, field)
++#define UDP_INC_STATS_BH(field) SNMP_INC_STATS_BH(ve_udp_statistics, field)
++#define UDP_INC_STATS_USER(field) SNMP_INC_STATS_USER(ve_udp_statistics, field)
+
+ /* /proc */
+ struct udp_seq_afinfo {
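
The udp_hashfn() change above folds the VE id into the bucket index, so the same local port bound in different VEs usually lands in different hash chains, and udp_lport_inuse() additionally filters by owner VE. A small user-space sketch of the bucket arithmetic (UDP_HTABLE_SIZE assumed to be 128, as in a stock 2.6.16 tree; the VE ids are made up):

#include <stdio.h>

#define UDP_HTABLE_SIZE 128     /* assumption: value from stock 2.6.16 */

/* same arithmetic as the udp_hashfn() added above */
static int udp_hashfn(unsigned short num, unsigned veid)
{
        return (num + (veid ^ (veid >> 16))) & (UDP_HTABLE_SIZE - 1);
}

int main(void)
{
        /* port 53 as seen from the host (VE 0) and two hypothetical VEs */
        printf("%d %d %d\n",
               udp_hashfn(53, 0), udp_hashfn(53, 101), udp_hashfn(53, 102));
        return 0;
}
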
+diff -upr linux-2.6.16.orig/include/ub/beancounter.h linux-2.6.16-026test015/include/ub/beancounter.h
+--- linux-2.6.16.orig/include/ub/beancounter.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/include/ub/beancounter.h 2006-07-04 14:41:37.000000000 +0400
+@@ -0,0 +1,329 @@
++/*
++ * include/ub/beancounter.h
++ *
++ * Copyright (C) 1999-2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ * Andrey Savochkin saw@sw-soft.com
++ *
++ */
++
++#ifndef _LINUX_BEANCOUNTER_H
++#define _LINUX_BEANCOUNTER_H
++
++#include <linux/config.h>
++
++/*
++ * Generic ratelimiting stuff.
++ */
++
++struct ub_rate_info {
++ int burst;
++ int interval; /* jiffy_t per event */
++ int bucket; /* kind of leaky bucket */
++ unsigned long last; /* last event */
++};
++
++/* Return true if rate limit permits. */
++int ub_ratelimit(struct ub_rate_info *);
++
++
++/*
++ * This magic is used to distinguish a user beancounter from a page
++ * beancounter in struct page. page_ub and page_bc are placed in a union,
++ * and the MAGIC ensures we don't use a pbc as a ubc in ub_page_uncharge().
++ */
++#define UB_MAGIC 0x62756275
++
++/*
++ * Resource list.
++ */
++
++#define UB_KMEMSIZE 0 /* Unswappable kernel memory size including
++ * struct task, page directories, etc.
++ */
++#define UB_LOCKEDPAGES 1 /* Mlock()ed pages. */
++#define UB_PRIVVMPAGES 2 /* Total number of pages, counting potentially
++ * private pages as private and used.
++ */
++#define UB_SHMPAGES 3 /* IPC SHM segment size. */
++#define UB_ZSHMPAGES 4 /* Anonymous shared memory. */
++#define UB_NUMPROC 5 /* Number of processes. */
++#define UB_PHYSPAGES 6 /* All resident pages, for swapout guarantee. */
++#define UB_VMGUARPAGES 7 /* Guarantee for memory allocation,
++ * checked against PRIVVMPAGES.
++ */
++#define UB_OOMGUARPAGES 8 /* Guarantees against OOM kill.
++ * Only limit is used, no accounting.
++ */
++#define UB_NUMTCPSOCK 9 /* Number of TCP sockets. */
++#define UB_NUMFLOCK 10 /* Number of file locks. */
++#define UB_NUMPTY 11 /* Number of PTYs. */
++#define UB_NUMSIGINFO 12 /* Number of siginfos. */
++#define UB_TCPSNDBUF 13 /* Total size of tcp send buffers. */
++#define UB_TCPRCVBUF 14 /* Total size of tcp receive buffers. */
++#define UB_OTHERSOCKBUF 15 /* Total size of other socket
++ * send buffers (all buffers for PF_UNIX).
++ */
++#define UB_DGRAMRCVBUF 16 /* Total size of other socket
++ * receive buffers.
++ */
++#define UB_NUMOTHERSOCK 17 /* Number of other sockets. */
++#define UB_DCACHESIZE 18 /* Size of busy dentry/inode cache. */
++#define UB_NUMFILE 19 /* Number of open files. */
++
++#define UB_RESOURCES 24
++
++#define UB_UNUSEDPRIVVM (UB_RESOURCES + 0)
++#define UB_TMPFSPAGES (UB_RESOURCES + 1)
++#define UB_SWAPPAGES (UB_RESOURCES + 2)
++#define UB_HELDPAGES (UB_RESOURCES + 3)
++
++struct ubparm {
++ /*
++ * A barrier over which resource allocations are failed gracefully.
++	 * If the amount of consumed memory is over the barrier, further sbrk()
++	 * or mmap() calls fail; the existing processes are not killed.
++ */
++ unsigned long barrier;
++ /* hard resource limit */
++ unsigned long limit;
++ /* consumed resources */
++ unsigned long held;
++ /* maximum amount of consumed resources through the last period */
++ unsigned long maxheld;
++ /* minimum amount of consumed resources through the last period */
++ unsigned long minheld;
++ /* count of failed charges */
++ unsigned long failcnt;
++};
++
++/*
++ * Kernel internal part.
++ */
++
++#ifdef __KERNEL__
++
++#include <ub/ub_debug.h>
++#include <linux/interrupt.h>
++#include <asm/atomic.h>
++#include <linux/spinlock.h>
++#include <linux/cache.h>
++#include <linux/threads.h>
++
++/*
++ * UB_MAXVALUE is essentially LONG_MAX declared in a cross-compiling safe form.
++ */
++#define UB_MAXVALUE ( (1UL << (sizeof(unsigned long)*8-1)) - 1)
++
++
++/*
++ * Resource management structures
++ * Serialization issues:
++ * beancounter list management is protected via ub_hash_lock
++ * task pointers are set only for current task and only once
++ * refcount is managed atomically
++ * value and limit comparison and change are protected by per-ub spinlock
++ */
++
++struct page_beancounter;
++struct task_beancounter;
++struct sock_beancounter;
++
++struct page_private {
++ unsigned long ubp_unused_privvmpages;
++ unsigned long ubp_tmpfs_respages;
++ unsigned long ubp_swap_pages;
++ unsigned long long ubp_held_pages;
++};
++
++struct sock_private {
++ unsigned long ubp_rmem_thres;
++ unsigned long ubp_wmem_pressure;
++ unsigned long ubp_maxadvmss;
++ unsigned long ubp_rmem_pressure;
++#define UB_RMEM_EXPAND 0
++#define UB_RMEM_KEEP 1
++#define UB_RMEM_SHRINK 2
++ struct list_head ubp_other_socks;
++ struct list_head ubp_tcp_socks;
++ atomic_t ubp_orphan_count;
++};
++
++struct ub_perfstat {
++ unsigned long unmap;
++ unsigned long swapin;
++
++#ifdef CONFIG_UBC_DEBUG_KMEM
++ long pages_charged;
++ long vmalloc_charged;
++ long pbcs;
++#endif
++} ____cacheline_aligned_in_smp;
++
++struct user_beancounter
++{
++ unsigned long ub_magic;
++ atomic_t ub_refcount;
++ struct user_beancounter *ub_next;
++ spinlock_t ub_lock;
++ uid_t ub_uid;
++
++ struct ub_rate_info ub_limit_rl;
++ int ub_oom_noproc;
++
++ struct page_private ppriv;
++#define ub_unused_privvmpages ppriv.ubp_unused_privvmpages
++#define ub_tmpfs_respages ppriv.ubp_tmpfs_respages
++#define ub_swap_pages ppriv.ubp_swap_pages
++#define ub_held_pages ppriv.ubp_held_pages
++ struct sock_private spriv;
++#define ub_rmem_thres spriv.ubp_rmem_thres
++#define ub_maxadvmss spriv.ubp_maxadvmss
++#define ub_rmem_pressure spriv.ubp_rmem_pressure
++#define ub_wmem_pressure spriv.ubp_wmem_pressure
++#define ub_tcp_sk_list spriv.ubp_tcp_socks
++#define ub_other_sk_list spriv.ubp_other_socks
++#define ub_orphan_count spriv.ubp_orphan_count
++
++ struct user_beancounter *parent;
++ void *private_data;
++
++ /* resources statistic and settings */
++ struct ubparm ub_parms[UB_RESOURCES];
++ /* resources statistic for last interval */
++ struct ubparm ub_store[UB_RESOURCES];
++
++ struct ub_perfstat ub_stat[NR_CPUS];
++
++#ifdef CONFIG_UBC_DEBUG_KMEM
++ struct list_head ub_cclist;
++#endif
++};
++
++enum severity { UB_HARD, UB_SOFT, UB_FORCE };
++
++static inline int ub_barrier_hit(struct user_beancounter *ub, int resource)
++{
++ return ub->ub_parms[resource].held > ub->ub_parms[resource].barrier;
++}
++
++static inline int ub_hfbarrier_hit(struct user_beancounter *ub, int resource)
++{
++ return (ub->ub_parms[resource].held >
++ ((ub->ub_parms[resource].barrier) >> 1));
++}
++
++#ifndef CONFIG_USER_RESOURCE
++
++extern inline struct user_beancounter *get_beancounter_byuid
++ (uid_t uid, int create) { return NULL; }
++extern inline struct user_beancounter *get_beancounter
++ (struct user_beancounter *ub) { return NULL; }
++extern inline void put_beancounter(struct user_beancounter *ub) {;}
++
++static inline void ub_init_cache(unsigned long mempages) { };
++static inline void ub_init_ub0(void) { };
++
++#define get_ub0() NULL
++
++#else /* CONFIG_USER_RESOURCE */
++
++/*
++ * Charge/uncharge operations
++ */
++
++extern int __charge_beancounter_locked(struct user_beancounter *ub,
++ int resource, unsigned long val, enum severity strict);
++
++extern void __uncharge_beancounter_locked(struct user_beancounter *ub,
++ int resource, unsigned long val);
++
++extern void __put_beancounter(struct user_beancounter *ub);
++
++extern void uncharge_warn(struct user_beancounter *ub, int resource,
++ unsigned long val, unsigned long held);
++
++extern const char *ub_rnames[];
++/*
++ * Put a beancounter reference
++ */
++
++static inline void put_beancounter(struct user_beancounter *ub)
++{
++ if (unlikely(ub == NULL))
++ return;
++
++ __put_beancounter(ub);
++}
++
++/*
++ * Create a new beancounter reference
++ */
++extern struct user_beancounter *get_beancounter_byuid(uid_t uid, int create);
++
++static inline
++struct user_beancounter *get_beancounter(struct user_beancounter *ub)
++{
++ if (unlikely(ub == NULL))
++ return NULL;
++
++ atomic_inc(&ub->ub_refcount);
++ return ub;
++}
++
++extern struct user_beancounter *get_subbeancounter_byid(
++ struct user_beancounter *,
++ int id, int create);
++extern struct user_beancounter *subbeancounter_findcreate(
++ struct user_beancounter *p, int id);
++
++extern struct user_beancounter ub0;
++
++extern void ub_init_cache(unsigned long);
++extern void ub_init_ub0(void);
++#define get_ub0() (&ub0)
++
++extern void print_ub_uid(struct user_beancounter *ub, char *buf, int size);
++
++/*
++ * Resource charging
++ * Change user's account and compare against limits
++ */
++
++static inline void ub_adjust_maxheld(struct user_beancounter *ub, int resource)
++{
++ if (ub->ub_parms[resource].maxheld < ub->ub_parms[resource].held)
++ ub->ub_parms[resource].maxheld = ub->ub_parms[resource].held;
++ if (ub->ub_parms[resource].minheld > ub->ub_parms[resource].held)
++ ub->ub_parms[resource].minheld = ub->ub_parms[resource].held;
++}
++
++#endif /* CONFIG_USER_RESOURCE */
++
++#include <ub/ub_decl.h>
++UB_DECLARE_FUNC(int, charge_beancounter(struct user_beancounter *ub,
++ int resource, unsigned long val, enum severity strict));
++UB_DECLARE_VOID_FUNC(uncharge_beancounter(struct user_beancounter *ub,
++ int resource, unsigned long val));
++
++UB_DECLARE_VOID_FUNC(charge_beancounter_notop(struct user_beancounter *ub,
++ int resource, unsigned long val));
++UB_DECLARE_VOID_FUNC(uncharge_beancounter_notop(struct user_beancounter *ub,
++ int resource, unsigned long val));
++
++#ifndef CONFIG_USER_RESOURCE_PROC
++static inline void ub_init_proc(void) { };
++#else
++extern void ub_init_proc(void);
++#endif
++
++#ifdef CONFIG_USER_RSS_ACCOUNTING
++extern void ub_init_pbc(void);
++#else
++static inline void ub_init_pbc(void) { }
++#endif
++#endif /* __KERNEL__ */
++#endif /* _LINUX_BEANCOUNTER_H */
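
For readers new to user beancounters, here is a minimal user-space model of the barrier/limit/held/failcnt fields documented in struct ubparm above. The mapping of UB_HARD onto the barrier and UB_SOFT onto the limit is an assumption drawn from the field comments, not something this header states; the real __charge_beancounter_locked() also takes the per-ub lock and updates maxheld/minheld.

#include <stdio.h>

struct ubparm_model {
        unsigned long barrier, limit, held, failcnt;
};

enum severity_model { M_HARD, M_SOFT, M_FORCE };

static int charge(struct ubparm_model *p, unsigned long val,
                  enum severity_model strict)
{
        unsigned long held = p->held + val;

        if ((strict == M_HARD && held > p->barrier) ||
            (strict == M_SOFT && held > p->limit)) {
                p->failcnt++;                   /* fail gracefully */
                return -1;
        }
        p->held = held;                         /* M_FORCE never fails */
        return 0;
}

int main(void)
{
        struct ubparm_model numproc = { .barrier = 2, .limit = 3 };

        printf("%d\n", charge(&numproc, 1, M_HARD));    /* 0: held 1, under barrier */
        printf("%d\n", charge(&numproc, 1, M_HARD));    /* 0: held 2, at barrier */
        printf("%d\n", charge(&numproc, 1, M_HARD));    /* -1: would exceed barrier */
        printf("%d\n", charge(&numproc, 1, M_SOFT));    /* 0: held 3, at limit */
        return 0;
}
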
+diff -upr linux-2.6.16.orig/include/ub/ub_dcache.h linux-2.6.16-026test015/include/ub/ub_dcache.h
+--- linux-2.6.16.orig/include/ub/ub_dcache.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/include/ub/ub_dcache.h 2006-07-04 14:41:37.000000000 +0400
+@@ -0,0 +1,57 @@
++/*
++ * include/ub/ub_dcache.h
++ *
++ * Copyright (C) 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __UB_DCACHE_H_
++#define __UB_DCACHE_H_
++
++#include <ub/ub_decl.h>
++
++/*
++ * UB_DCACHESIZE accounting
++ */
++
++struct dentry_beancounter
++{
++ /*
++ * d_inuse =
++ * <number of external refs> +
++	 *   <number of 'used' children>
++ *
++ * d_inuse == -1 means that dentry is unused
++ * state change -1 => 0 causes charge
++ * state change 0 => -1 causes uncharge
++ */
++ atomic_t d_inuse;
++ /* charged size, including name length if name is not inline */
++ unsigned long d_ubsize;
++ struct user_beancounter *d_ub;
++};
++
++struct dentry;
++
++UB_DECLARE_FUNC(int, ub_dentry_alloc(struct dentry *d))
++UB_DECLARE_VOID_FUNC(ub_dentry_charge_nofail(struct dentry *d))
++UB_DECLARE_VOID_FUNC(ub_dentry_uncharge(struct dentry *d))
++
++#ifdef CONFIG_USER_RESOURCE
++UB_DECLARE_FUNC(int, ub_dentry_charge(struct dentry *d))
++#define ub_dget_testone(d) (atomic_inc_and_test(&(d)->dentry_bc.d_inuse))
++#define ub_dput_testzero(d) (atomic_add_negative(-1, &(d)->dentry_bc.d_inuse))
++#define INUSE_INIT 0
++#else
++#define ub_dentry_charge(d) ({ \
++ spin_unlock(&d->d_lock); \
++ rcu_read_unlock(); \
++ 0; \
++ })
++#define ub_dget_testone(d) (0)
++#define ub_dput_testzero(d) (0)
++#endif
++#endif
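
A tiny user-space model of the d_inuse protocol described in ub_dcache.h above: the counter starts at -1 (unused), the -1 -> 0 transition is the point where the dentry gets charged, and 0 -> -1 where it is uncharged; ub_dget_testone()/ub_dput_testzero() report exactly those transitions via atomic_inc_and_test() and atomic_add_negative().

#include <assert.h>

int main(void)
{
        int d_inuse = -1;               /* -1 means the dentry is unused */

        /* dget path: ++d_inuse reaching 0 is what atomic_inc_and_test()
         * detects -- time to charge UB_DCACHESIZE */
        assert(++d_inuse == 0);

        /* dput path: --d_inuse going negative is what
         * atomic_add_negative(-1, ...) detects -- time to uncharge */
        assert(--d_inuse < 0);
        return 0;
}
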
+diff -upr linux-2.6.16.orig/include/ub/ub_debug.h linux-2.6.16-026test015/include/ub/ub_debug.h
+--- linux-2.6.16.orig/include/ub/ub_debug.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/include/ub/ub_debug.h 2006-07-04 14:41:37.000000000 +0400
+@@ -0,0 +1,95 @@
++/*
++ * include/ub/ub_debug.h
++ *
++ * Copyright (C) 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __UB_DEBUG_H_
++#define __UB_DEBUG_H_
++
++/*
++ * general debugging
++ */
++
++#define UBD_ALLOC 0x1
++#define UBD_CHARGE 0x2
++#define UBD_LIMIT 0x4
++#define UBD_TRACE 0x8
++
++/*
++ * ub_net debugging
++ */
++
++#define UBD_NET_SOCKET 0x10
++#define UBD_NET_SLEEP 0x20
++#define UBD_NET_SEND 0x40
++#define UBD_NET_RECV 0x80
++
++/*
++ * Main routines
++ */
++
++#define UB_DEBUG (0)
++#define DEBUG_RESOURCE (0ULL)
++
++#define ub_dbg_cond(__cond, __str, args...) \
++ do { \
++ if ((__cond) != 0) \
++ printk(__str, ##args); \
++ } while(0)
++
++#define ub_debug(__section, __str, args...) \
++ ub_dbg_cond(UB_DEBUG & (__section), __str, ##args)
++
++#define ub_debug_resource(__resource, __str, args...) \
++ ub_dbg_cond((UB_DEBUG & UBD_CHARGE) && \
++ (DEBUG_RESOURCE & (1 << (__resource))), \
++ __str, ##args)
++
++#if UB_DEBUG & UBD_TRACE
++#define ub_debug_trace(__cond, __b, __r) \
++ do { \
++ static struct ub_rate_info ri = { __b, __r }; \
++ if ((__cond) != 0 && ub_ratelimit(&ri)) \
++ dump_stack(); \
++ } while(0)
++#else
++#define ub_debug_trace(__cond, __burst, __rate)
++#endif
++
++#include <linux/config.h>
++
++#ifdef CONFIG_UBC_DEBUG_KMEM
++#include <linux/list.h>
++#include <linux/kmem_cache.h>
++
++struct user_beancounter;
++struct ub_cache_counter {
++ struct list_head ulist;
++ struct ub_cache_counter *next;
++ struct user_beancounter *ub;
++ kmem_cache_t *cachep;
++ unsigned long counter;
++};
++
++extern spinlock_t cc_lock;
++extern void init_cache_counters(void);
++extern void ub_free_counters(struct user_beancounter *);
++extern void ub_kmemcache_free(kmem_cache_t *cachep);
++
++struct vm_struct;
++extern void inc_vmalloc_charged(struct vm_struct *, int);
++extern void dec_vmalloc_charged(struct vm_struct *);
++#else
++#define init_cache_counters() do { } while (0)
++#define inc_vmalloc_charged(vm, f) do { } while (0)
++#define dec_vmalloc_charged(vm) do { } while (0)
++#define ub_free_counters(ub) do { } while (0)
++#define ub_kmemcache_free(cachep) do { } while (0)
++#endif
++
++#endif
+diff -upr linux-2.6.16.orig/include/ub/ub_decl.h linux-2.6.16-026test015/include/ub/ub_decl.h
+--- linux-2.6.16.orig/include/ub/ub_decl.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/include/ub/ub_decl.h 2006-07-04 14:41:37.000000000 +0400
+@@ -0,0 +1,40 @@
++/*
++ * include/ub/ub_decl.h
++ *
++ * Copyright (C) 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __UB_DECL_H_
++#define __UB_DECL_H_
++
++#include <linux/config.h>
++
++/*
++ * Naming convention:
++ * ub_<section|object>_<operation>
++ */
++
++#ifdef CONFIG_USER_RESOURCE
++
++#define UB_DECLARE_FUNC(ret_type, decl) extern ret_type decl;
++#define UB_DECLARE_VOID_FUNC(decl) extern void decl;
++
++#else /* CONFIG_USER_RESOURCE */
++
++#define UB_DECLARE_FUNC(ret_type, decl) \
++ static inline ret_type decl \
++ { \
++ return (ret_type)0; \
++ }
++#define UB_DECLARE_VOID_FUNC(decl) \
++ static inline void decl \
++ { \
++ }
++
++#endif /* CONFIG_USER_RESOURCE */
++
++#endif
+diff -upr linux-2.6.16.orig/include/ub/ub_hash.h linux-2.6.16-026test015/include/ub/ub_hash.h
+--- linux-2.6.16.orig/include/ub/ub_hash.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/include/ub/ub_hash.h 2006-07-04 14:41:37.000000000 +0400
+@@ -0,0 +1,41 @@
++/*
++ * include/ub/ub_hash.h
++ *
++ * Copyright (C) 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef _LINUX_UBHASH_H
++#define _LINUX_UBHASH_H
++
++#ifdef __KERNEL__
++
++#define UB_HASH_SIZE 256
++
++struct ub_hash_slot {
++ struct user_beancounter *ubh_beans;
++};
++
++extern struct ub_hash_slot ub_hash[];
++extern spinlock_t ub_hash_lock;
++
++#ifdef CONFIG_USER_RESOURCE
++
++/*
++ * Iterate over beancounters
++ * @__slot - hash slot
++ * @__ubp - beancounter ptr
++ * Can use break :)
++ */
++#define for_each_beancounter(__slot, __ubp) \
++ for (__slot = 0, __ubp = NULL; \
++ __slot < UB_HASH_SIZE && __ubp == NULL; __slot++) \
++ for (__ubp = ub_hash[__slot].ubh_beans; __ubp; \
++ __ubp = __ubp->ub_next)
++
++#endif /* CONFIG_USER_RESOURCE */
++#endif /* __KERNEL__ */
++#endif /* _LINUX_UBHASH_H */
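
The for_each_beancounter() macro above relies on a small trick: the outer loop only continues while the cursor is still NULL, so a break in the body (which leaves the cursor non-NULL) terminates the whole two-level walk, as the "Can use break :)" comment hints. A user-space model with made-up types shows the control flow:

#include <stdio.h>

#define HASH_SIZE 4

struct bean { int id; struct bean *next; };

static struct bean b2 = { 2, NULL }, b1 = { 1, &b2 }, b5 = { 5, NULL };
static struct bean *hash[HASH_SIZE] = { &b1, NULL, &b5, NULL };

/* same loop structure as for_each_beancounter() above */
#define for_each_bean(slot, bp) \
        for (slot = 0, bp = NULL; slot < HASH_SIZE && bp == NULL; slot++) \
                for (bp = hash[slot]; bp; bp = bp->next)

int main(void)
{
        struct bean *bp;
        int slot;

        for_each_bean(slot, bp) {
                printf("bean %d in slot %d\n", bp->id, slot);
                if (bp->id == 5)
                        break;          /* terminates the whole walk */
        }
        return 0;
}
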
+diff -upr linux-2.6.16.orig/include/ub/ub_mem.h linux-2.6.16-026test015/include/ub/ub_mem.h
+--- linux-2.6.16.orig/include/ub/ub_mem.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/include/ub/ub_mem.h 2006-07-04 14:41:37.000000000 +0400
+@@ -0,0 +1,76 @@
++/*
++ * include/ub/ub_mem.h
++ *
++ * Copyright (C) 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __UB_SLAB_H_
++#define __UB_SLAB_H_
++
++#include <linux/config.h>
++#include <linux/kmem_slab.h>
++#include <ub/beancounter.h>
++#include <ub/ub_decl.h>
++
++/*
++ * UB_KMEMSIZE accounting
++ */
++
++#ifdef CONFIG_UBC_DEBUG_ITEMS
++#define CHARGE_ORDER(__o) (1 << __o)
++#define CHARGE_SIZE(__s) 1
++#else
++#define CHARGE_ORDER(__o) (PAGE_SIZE << (__o))
++#define CHARGE_SIZE(__s) (__s)
++#endif
++
++#define page_ub(__page) ((__page)->bc.page_ub)
++
++struct mm_struct;
++struct page;
++
++UB_DECLARE_FUNC(struct user_beancounter *, slab_ub(void *obj))
++UB_DECLARE_FUNC(struct user_beancounter *, vmalloc_ub(void *obj))
++UB_DECLARE_FUNC(struct user_beancounter *, mem_ub(void *obj))
++
++UB_DECLARE_FUNC(int, ub_page_charge(struct page *page, int order, int mask))
++UB_DECLARE_VOID_FUNC(ub_page_uncharge(struct page *page, int order))
++UB_DECLARE_FUNC(int, ub_slab_charge(void *objp, int flags))
++UB_DECLARE_VOID_FUNC(ub_slab_uncharge(void *obj))
++
++#define slab_ubcs(cachep, slabp) ((struct user_beancounter **)\
++ (ALIGN((unsigned long)(slab_bufctl(slabp) + (cachep)->num),\
++ sizeof(void *))))
++
++#ifdef CONFIG_USER_RESOURCE
++extern struct user_beancounter *ub_select_worst(long *);
++
++/* mm/slab.c needed stuff */
++#define UB_ALIGN(flags) (flags & SLAB_UBC ? sizeof(void *) : 1)
++#define UB_EXTRA(flags) (flags & SLAB_UBC ? sizeof(void *) : 0)
++#define set_cache_objuse(cachep) do { \
++ (cachep)->objuse = ((PAGE_SIZE << (cachep)->gfporder) + \
++ (cachep)->num - 1) / (cachep)->num; \
++ if (!OFF_SLAB(cachep)) \
++ break; \
++ (cachep)->objuse += ((cachep)->slabp_cache->objuse + \
++ (cachep)->num - 1) / (cachep)->num; \
++ } while (0)
++#define init_slab_ubps(cachep, slabp) do { \
++ if (!((cachep)->flags & SLAB_UBC)) \
++ break; \
++ memset(slab_ubcs(cachep, slabp), 0, \
++ (cachep)->num * sizeof(void *)); \
++ } while (0)
++#define kmem_obj_memusage(o) (virt_to_cache(o)->objuse)
++#else
++#define UB_ALIGN(flags) 1
++#define UB_EXTRA(flags) 0
++#define set_cache_objuse(c) do { } while (0)
++#define init_slab_ubps(c, s) do { } while (0)
++#endif
++#endif /* __UB_SLAB_H_ */
+diff -upr linux-2.6.16.orig/include/ub/ub_misc.h linux-2.6.16-026test015/include/ub/ub_misc.h
+--- linux-2.6.16.orig/include/ub/ub_misc.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/include/ub/ub_misc.h 2006-07-04 14:41:37.000000000 +0400
+@@ -0,0 +1,54 @@
++/*
++ * include/ub/ub_misc.h
++ *
++ * Copyright (C) 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __UB_MISC_H_
++#define __UB_MISC_H_
++
++#include <ub/ub_decl.h>
++
++struct tty_struct;
++struct file;
++struct file_lock;
++struct sigqueue;
++
++UB_DECLARE_FUNC(int, ub_file_charge(struct file *f))
++UB_DECLARE_VOID_FUNC(ub_file_uncharge(struct file *f))
++UB_DECLARE_FUNC(int, ub_flock_charge(struct file_lock *fl, int hard))
++UB_DECLARE_VOID_FUNC(ub_flock_uncharge(struct file_lock *fl))
++UB_DECLARE_FUNC(int, ub_siginfo_charge(struct sigqueue *q,
++ struct user_beancounter *ub))
++UB_DECLARE_VOID_FUNC(ub_siginfo_uncharge(struct sigqueue *q))
++UB_DECLARE_FUNC(int, ub_task_charge(struct task_struct *parent,
++ struct task_struct *task))
++UB_DECLARE_VOID_FUNC(ub_task_uncharge(struct task_struct *task))
++UB_DECLARE_FUNC(int, ub_pty_charge(struct tty_struct *tty))
++UB_DECLARE_VOID_FUNC(ub_pty_uncharge(struct tty_struct *tty))
++
++#ifdef CONFIG_USER_RESOURCE
++#define set_flock_charged(fl) do { (fl)->fl_charged = 1; } while (0)
++#define unset_flock_charged(fl) do { \
++ WARN_ON((fl)->fl_charged == 0); \
++ (fl)->fl_charged = 0; \
++ } while (0)
++#define set_mm_ub(mm, tsk) do { \
++ (mm)->mm_ub = get_beancounter(tsk ? \
++ tsk->task_bc.task_ub : get_exec_ub()); \
++ } while (0)
++#define put_mm_ub(mm) do { \
++ put_beancounter((mm)->mm_ub); \
++ (mm)->mm_ub = NULL; \
++ } while (0)
++#else
++#define set_flock_charged(fl) do { } while (0)
++#define unset_flock_charged(fl)	do { } while (0)
++#define set_mm_ub(mm, tsk) do { } while (0)
++#define put_mm_ub(mm) do { } while (0)
++#endif
++#endif
+diff -upr linux-2.6.16.orig/include/ub/ub_net.h linux-2.6.16-026test015/include/ub/ub_net.h
+--- linux-2.6.16.orig/include/ub/ub_net.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/include/ub/ub_net.h 2006-07-04 14:41:37.000000000 +0400
+@@ -0,0 +1,141 @@
++/*
++ * include/ub/ub_net.h
++ *
++ * Copyright (C) 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __UB_NET_H_
++#define __UB_NET_H_
++
++/*
++ * UB_NUMXXXSOCK, UB_XXXBUF accounting
++ */
++
++#include <ub/ub_decl.h>
++#include <ub/ub_sk.h>
++
++#define bid2sid(__bufid) \
++ ((__bufid) == UB_TCPSNDBUF ? UB_NUMTCPSOCK : UB_NUMOTHERSOCK)
++
++#define SOCK_MIN_UBCSPACE ((int)((2048 - sizeof(struct skb_shared_info)) & \
++ ~(SMP_CACHE_BYTES-1)))
++#define SOCK_MIN_UBCSPACE_CH skb_charge_size(SOCK_MIN_UBCSPACE)
++
++
++#define IS_TCP_SOCK(__family, __type) \
++ (((__family) == PF_INET || (__family) == PF_INET6) && (__type) == SOCK_STREAM)
++
++UB_DECLARE_FUNC(int, ub_sock_charge(struct sock *sk, int family, int type))
++UB_DECLARE_FUNC(int, ub_tcp_sock_charge(struct sock *sk))
++UB_DECLARE_FUNC(int, ub_other_sock_charge(struct sock *sk))
++UB_DECLARE_VOID_FUNC(ub_sock_uncharge(struct sock *sk))
++UB_DECLARE_VOID_FUNC(ub_skb_uncharge(struct sk_buff *skb))
++UB_DECLARE_FUNC(int, ub_skb_alloc_bc(struct sk_buff *skb, int gfp_mask))
++UB_DECLARE_VOID_FUNC(ub_skb_free_bc(struct sk_buff *skb))
++UB_DECLARE_FUNC(int, ub_nlrcvbuf_charge(struct sk_buff *skb, struct sock *sk))
++UB_DECLARE_FUNC(int, ub_sockrcvbuf_charge(struct sock *sk, struct sk_buff *skb))
++UB_DECLARE_VOID_FUNC(ub_sock_snd_queue_add(struct sock *sk, int resource,
++ unsigned long size))
++UB_DECLARE_FUNC(long, ub_sock_wait_for_space(struct sock *sk, long timeo,
++ unsigned long size))
++
++UB_DECLARE_FUNC(int, ub_tcprcvbuf_charge(struct sock *sk, struct sk_buff *skb))
++UB_DECLARE_FUNC(int, ub_tcprcvbuf_charge_forced(struct sock *sk,
++ struct sk_buff *skb))
++UB_DECLARE_FUNC(int, ub_tcpsndbuf_charge(struct sock *sk, struct sk_buff *skb))
++UB_DECLARE_FUNC(int, ub_tcpsndbuf_charge_forced(struct sock *sk,
++ struct sk_buff *skb))
++
++/* Charge size */
++static inline unsigned long skb_charge_datalen(unsigned long chargesize)
++{
++#ifdef CONFIG_USER_RESOURCE
++ unsigned long slabsize;
++
++ chargesize -= sizeof(struct sk_buff);
++ slabsize = 64;
++ do {
++ slabsize <<= 1;
++ } while (slabsize <= chargesize);
++
++ slabsize >>= 1;
++ return (slabsize - sizeof(struct skb_shared_info)) &
++ ~(SMP_CACHE_BYTES-1);
++#else
++ return 0;
++#endif
++}
++
++static inline unsigned long skb_charge_size_gen(unsigned long size)
++{
++#ifdef CONFIG_USER_RESOURCE
++ unsigned int slabsize;
++
++ size = SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info);
++ slabsize = 32; /* min size is 64 because of skb_shared_info */
++ do {
++ slabsize <<= 1;
++ } while (slabsize < size);
++
++ return slabsize + sizeof(struct sk_buff);
++#else
++ return 0;
++#endif
++
++}
++
++static inline unsigned long skb_charge_size_const(unsigned long size)
++{
++#ifdef CONFIG_USER_RESOURCE
++ unsigned int ret;
++ if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 64)
++ ret = 64 + sizeof(struct sk_buff);
++ else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 128)
++ ret = 128 + sizeof(struct sk_buff);
++ else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 256)
++ ret = 256 + sizeof(struct sk_buff);
++ else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 512)
++ ret = 512 + sizeof(struct sk_buff);
++ else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 1024)
++ ret = 1024 + sizeof(struct sk_buff);
++ else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 2048)
++ ret = 2048 + sizeof(struct sk_buff);
++ else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 4096)
++ ret = 4096 + sizeof(struct sk_buff);
++ else
++ ret = skb_charge_size_gen(size);
++ return ret;
++#else
++ return 0;
++#endif
++}
++
++
++#define skb_charge_size(__size) \
++ (__builtin_constant_p(__size) ? \
++ skb_charge_size_const(__size) : \
++ skb_charge_size_gen(__size))
++
++UB_DECLARE_FUNC(int, skb_charge_fullsize(struct sk_buff *skb))
++UB_DECLARE_VOID_FUNC(ub_skb_set_charge(struct sk_buff *skb,
++ struct sock *sk, unsigned long size, int res))
++
++/* Poll reservation (poll_reserv) helpers */
++UB_DECLARE_FUNC(int, ub_sock_makewres_other(struct sock *sk, unsigned long sz))
++UB_DECLARE_FUNC(int, ub_sock_makewres_tcp(struct sock *sk, unsigned long size))
++UB_DECLARE_FUNC(int, ub_sock_getwres_other(struct sock *sk, unsigned long size))
++UB_DECLARE_FUNC(int, ub_sock_getwres_tcp(struct sock *sk, unsigned long size))
++UB_DECLARE_VOID_FUNC(ub_sock_retwres_other(struct sock *sk, unsigned long size,
++ unsigned long ressize))
++UB_DECLARE_VOID_FUNC(ub_sock_retwres_tcp(struct sock *sk, unsigned long size,
++ unsigned long ressize))
++UB_DECLARE_VOID_FUNC(ub_sock_sndqueueadd_other(struct sock *sk,
++ unsigned long size))
++UB_DECLARE_VOID_FUNC(ub_sock_sndqueueadd_tcp(struct sock *sk, unsigned long sz))
++UB_DECLARE_VOID_FUNC(ub_sock_sndqueuedel(struct sock *sk))
++
++#endif
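
skb_charge_size_gen()/skb_charge_size_const() above charge a socket buffer for the full slab object its data will occupy, plus the sk_buff header itself. A user-space illustration of the rounding, with placeholder struct sizes (the real sizeof(struct skb_shared_info) and sizeof(struct sk_buff) depend on the kernel config) and SMP_CACHE_BYTES assumed to be 32:

#include <stdio.h>

#define SMP_CACHE_BYTES         32      /* assumption */
#define SKB_DATA_ALIGN(x)       (((x) + (SMP_CACHE_BYTES - 1)) & \
                                 ~(SMP_CACHE_BYTES - 1))
#define SIZEOF_SHARED_INFO      128     /* placeholder for sizeof(struct skb_shared_info) */
#define SIZEOF_SK_BUFF          160     /* placeholder for sizeof(struct sk_buff) */

/* same rounding as skb_charge_size_gen() above */
static unsigned long charge_size(unsigned long size)
{
        unsigned long slabsize = 32;

        size = SKB_DATA_ALIGN(size) + SIZEOF_SHARED_INFO;
        do {
                slabsize <<= 1;         /* 64, 128, 256, ... kmalloc sizes */
        } while (slabsize < size);

        return slabsize + SIZEOF_SK_BUFF;
}

int main(void)
{
        /* an MTU-sized payload is charged as a 2048-byte slab object
         * plus the sk_buff header */
        printf("%lu\n", charge_size(1500));
        return 0;
}
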
+diff -upr linux-2.6.16.orig/include/ub/ub_orphan.h linux-2.6.16-026test015/include/ub/ub_orphan.h
+--- linux-2.6.16.orig/include/ub/ub_orphan.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/include/ub/ub_orphan.h 2006-07-04 14:41:37.000000000 +0400
+@@ -0,0 +1,56 @@
++/*
++ * include/ub/ub_orphan.h
++ *
++ * Copyright (C) 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __UB_ORPHAN_H_
++#define __UB_ORPHAN_H_
++
++#include <net/tcp.h>
++
++#include "ub/beancounter.h"
++#include "ub/ub_net.h"
++
++
++static inline atomic_t *__ub_get_orphan_count_ptr(struct sock *sk)
++{
++#ifdef CONFIG_USER_RESOURCE
++ if (sock_has_ubc(sk))
++ return &sock_bc(sk)->ub->ub_orphan_count;
++#endif
++ return sk->sk_prot->orphan_count;
++}
++
++static inline void ub_inc_orphan_count(struct sock *sk)
++{
++ atomic_inc(__ub_get_orphan_count_ptr(sk));
++}
++
++static inline void ub_dec_orphan_count(struct sock *sk)
++{
++ atomic_dec(__ub_get_orphan_count_ptr(sk));
++}
++
++static inline int ub_get_orphan_count(struct sock *sk)
++{
++ return atomic_read(__ub_get_orphan_count_ptr(sk));
++}
++
++extern int __ub_too_many_orphans(struct sock *sk, int count);
++static inline int ub_too_many_orphans(struct sock *sk, int count)
++{
++#ifdef CONFIG_USER_RESOURCE
++ if (__ub_too_many_orphans(sk, count))
++ return 1;
++#endif
++ return (ub_get_orphan_count(sk) > sysctl_tcp_max_orphans ||
++ (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
++ atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2]));
++}
++
++#endif
+diff -upr linux-2.6.16.orig/include/ub/ub_page.h linux-2.6.16-026test015/include/ub/ub_page.h
+--- linux-2.6.16.orig/include/ub/ub_page.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/include/ub/ub_page.h 2006-07-04 14:41:37.000000000 +0400
+@@ -0,0 +1,48 @@
++/*
++ * include/ub/ub_page.h
++ *
++ * Copyright (C) 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __UB_PAGE_H_
++#define __UB_PAGE_H_
++
++#include <linux/config.h>
++
++/*
++ * Page_beancounters
++ */
++
++struct page;
++struct user_beancounter;
++
++#define PB_MAGIC 0x62700001UL
++
++struct page_beancounter {
++ unsigned long pb_magic;
++ struct page *page;
++ struct user_beancounter *ub;
++ struct page_beancounter *next_hash;
++ unsigned refcount;
++ struct list_head page_list;
++};
++
++#define PB_REFCOUNT_BITS 24
++#define PB_SHIFT_GET(c) ((c) >> PB_REFCOUNT_BITS)
++#define PB_SHIFT_INC(c) ((c) += (1 << PB_REFCOUNT_BITS))
++#define PB_SHIFT_DEC(c) ((c) -= (1 << PB_REFCOUNT_BITS))
++#define PB_COUNT_GET(c) ((c) & ((1 << PB_REFCOUNT_BITS) - 1))
++#define PB_COUNT_INC(c) ((c)++)
++#define PB_COUNT_DEC(c) ((c)--)
++#define PB_REFCOUNT_MAKE(s, c) (((s) << PB_REFCOUNT_BITS) + (c))
++
++#define page_pbc(__page) ((__page)->bc.page_pb)
++
++struct address_space;
++extern int is_shmem_mapping(struct address_space *);
++
++#endif
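
The PB_* macros above pack two values into the single 'refcount' word of struct page_beancounter: a use count in the low 24 bits and a second small field (the shift) in the bits above. A quick self-check of the packing:

#include <assert.h>

#define PB_REFCOUNT_BITS 24
#define PB_SHIFT_GET(c)         ((c) >> PB_REFCOUNT_BITS)
#define PB_COUNT_GET(c)         ((c) & ((1 << PB_REFCOUNT_BITS) - 1))
#define PB_REFCOUNT_MAKE(s, c)  (((s) << PB_REFCOUNT_BITS) + (c))

int main(void)
{
        unsigned refcount = PB_REFCOUNT_MAKE(3, 17);

        assert(PB_SHIFT_GET(refcount) == 3);    /* upper field intact */
        assert(PB_COUNT_GET(refcount) == 17);   /* low 24 bits intact */
        return 0;
}
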
+diff -upr linux-2.6.16.orig/include/ub/ub_sk.h linux-2.6.16-026test015/include/ub/ub_sk.h
+--- linux-2.6.16.orig/include/ub/ub_sk.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/include/ub/ub_sk.h 2006-07-04 14:41:37.000000000 +0400
+@@ -0,0 +1,43 @@
++/*
++ * include/ub/ub_sk.h
++ *
++ * Copyright (C) 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __UB_SK_H_
++#define __UB_SK_H_
++
++#include <linux/config.h>
++#include <ub/ub_task.h>
++
++struct sock;
++struct sk_buff;
++
++struct skb_beancounter {
++ struct user_beancounter *ub;
++ unsigned long charged:27, resource:5;
++};
++
++struct sock_beancounter {
++ /*
++ * already charged for future sends, to make poll work;
++ * changes are protected by bc spinlock, read is under socket
++ * semaphore for sends and unprotected in poll
++ */
++ unsigned long poll_reserv;
++ unsigned long ub_waitspc; /* space waiting for */
++	unsigned long		ub_waitspc;	/* space being waited for */
++ struct list_head ub_sock_list;
++ struct user_beancounter *ub;
++};
++
++#define sock_bc(__sk) (&(__sk)->sk_bc)
++#define skb_bc(__skb) (&(__skb)->skb_bc)
++#define skbc_sock(__skbc) (container_of(__skbc, struct sock, sk_bc))
++#define sock_has_ubc(__sk) (sock_bc(__sk)->ub != NULL)
++
++#endif
+diff -upr linux-2.6.16.orig/include/ub/ub_stat.h linux-2.6.16-026test015/include/ub/ub_stat.h
+--- linux-2.6.16.orig/include/ub/ub_stat.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/include/ub/ub_stat.h 2006-07-04 14:41:37.000000000 +0400
+@@ -0,0 +1,70 @@
++/*
++ * include/ub/ub_stat.h
++ *
++ * Copyright (C) 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __UB_STAT_H_
++#define __UB_STAT_H_
++
++/* sys_ubstat commands list */
++#define UBSTAT_READ_ONE 0x010000
++#define UBSTAT_READ_ALL 0x020000
++#define UBSTAT_READ_FULL 0x030000
++#define UBSTAT_UBLIST 0x040000
++#define UBSTAT_UBPARMNUM 0x050000
++#define UBSTAT_GETTIME 0x060000
++
++#define UBSTAT_CMD(func) ((func) & 0xF0000)
++#define UBSTAT_PARMID(func) ((func) & 0x0FFFF)
++
++#define TIME_MAX_SEC (LONG_MAX / HZ)
++#define TIME_MAX_JIF (TIME_MAX_SEC * HZ)
++
++typedef unsigned long ubstattime_t;
++
++typedef struct {
++ ubstattime_t start_time;
++ ubstattime_t end_time;
++ ubstattime_t cur_time;
++} ubgettime_t;
++
++typedef struct {
++ long maxinterval;
++ int signum;
++} ubnotifrq_t;
++
++typedef struct {
++ unsigned long maxheld;
++ unsigned long failcnt;
++} ubstatparm_t;
++
++typedef struct {
++ unsigned long barrier;
++ unsigned long limit;
++ unsigned long held;
++ unsigned long maxheld;
++ unsigned long minheld;
++ unsigned long failcnt;
++ unsigned long __unused1;
++ unsigned long __unused2;
++} ubstatparmf_t;
++
++typedef struct {
++ ubstattime_t start_time;
++ ubstattime_t end_time;
++ ubstatparmf_t param[0];
++} ubstatfull_t;
++
++#ifdef __KERNEL__
++struct ub_stat_notify {
++ struct list_head list;
++ struct task_struct *task;
++ int signum;
++};
++#endif
++#endif
+diff -upr linux-2.6.16.orig/include/ub/ub_task.h linux-2.6.16-026test015/include/ub/ub_task.h
+--- linux-2.6.16.orig/include/ub/ub_task.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/include/ub/ub_task.h 2006-07-04 14:41:37.000000000 +0400
+@@ -0,0 +1,49 @@
++/*
++ * include/ub/ub_task.h
++ *
++ * Copyright (C) 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __UB_TASK_H_
++#define __UB_TASK_H_
++
++#include <linux/config.h>
++
++struct user_beancounter;
++
++
++#ifdef CONFIG_USER_RESOURCE
++
++struct task_beancounter {
++ struct user_beancounter *exec_ub;
++ struct user_beancounter *task_ub;
++ struct user_beancounter *fork_sub;
++ void *task_fnode, *task_freserv;
++ unsigned long oom_generation;
++ unsigned long task_data[4];
++};
++
++#define get_exec_ub() (current->task_bc.exec_ub)
++#define get_task_ub(__task) ((__task)->task_bc.task_ub)
++#define set_exec_ub(__newub) \
++({ \
++ struct user_beancounter *old; \
++ struct task_beancounter *tbc; \
++ tbc = &current->task_bc; \
++ old = tbc->exec_ub; \
++ tbc->exec_ub = __newub; \
++ old; \
++})
++
++#else /* CONFIG_USER_RESOURCE */
++
++#define get_exec_ub() (NULL)
++#define get_task_ub(task) (NULL)
++#define set_exec_ub(__ub) (NULL)
++
++#endif /* CONFIG_USER_RESOURCE */
++#endif /* __UB_TASK_H_ */
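
get_exec_ub()/set_exec_ub() above implement a save-switch-restore pattern: kernel code that needs to account an operation to some other beancounter swaps exec_ub for the duration and then puts the old value back. A user-space model of that pattern (the names and the printf-based "charging" are purely illustrative):

#include <stdio.h>

struct ub { const char *name; };

static struct ub ub0 = { "ub0" }, ub_guest = { "guest" };
static struct ub *exec_ub = &ub0;       /* stands in for current->task_bc.exec_ub */

static struct ub *set_exec_ub_model(struct ub *new)
{
        struct ub *old = exec_ub;

        exec_ub = new;
        return old;                     /* caller restores it later */
}

int main(void)
{
        struct ub *old = set_exec_ub_model(&ub_guest);

        printf("charging to %s\n", exec_ub->name);      /* guest */
        set_exec_ub_model(old);
        printf("back to %s\n", exec_ub->name);          /* ub0 */
        return 0;
}
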
+diff -upr linux-2.6.16.orig/include/ub/ub_tcp.h linux-2.6.16-026test015/include/ub/ub_tcp.h
+--- linux-2.6.16.orig/include/ub/ub_tcp.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/include/ub/ub_tcp.h 2006-07-04 14:41:37.000000000 +0400
+@@ -0,0 +1,79 @@
++/*
++ * include/ub/ub_tcp.h
++ *
++ * Copyright (C) 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __UB_TCP_H_
++#define __UB_TCP_H_
++
++/*
++ * UB_NUMXXXSOCK, UB_XXXBUF accounting
++ */
++
++#include <ub/ub_sk.h>
++#include <ub/beancounter.h>
++
++static inline void ub_tcp_update_maxadvmss(struct sock *sk)
++{
++#ifdef CONFIG_USER_RESOURCE
++ if (!sock_has_ubc(sk))
++ return;
++ if (sock_bc(sk)->ub->ub_maxadvmss >= tcp_sk(sk)->advmss)
++ return;
++
++ sock_bc(sk)->ub->ub_maxadvmss =
++ skb_charge_size(MAX_HEADER + sizeof(struct iphdr)
++ + sizeof(struct tcphdr) + tcp_sk(sk)->advmss);
++#endif
++}
++
++static inline int ub_tcp_rmem_allows_expand(struct sock *sk)
++{
++ if (tcp_memory_pressure)
++ return 0;
++#ifdef CONFIG_USER_RESOURCE
++ if (sock_has_ubc(sk)) {
++ struct user_beancounter *ub;
++
++ ub = sock_bc(sk)->ub;
++ if (ub->ub_rmem_pressure == UB_RMEM_EXPAND)
++ return 1;
++ if (ub->ub_rmem_pressure == UB_RMEM_SHRINK)
++ return 0;
++ return sk->sk_rcvbuf <= ub->ub_rmem_thres;
++ }
++#endif
++ return 1;
++}
++
++static inline int ub_tcp_memory_pressure(struct sock *sk)
++{
++ if (tcp_memory_pressure)
++ return 1;
++#ifdef CONFIG_USER_RESOURCE
++ if (sock_has_ubc(sk))
++ return sock_bc(sk)->ub->ub_rmem_pressure != UB_RMEM_EXPAND;
++#endif
++ return 0;
++}
++
++static inline int ub_tcp_shrink_rcvbuf(struct sock *sk)
++{
++ if (tcp_memory_pressure)
++ return 1;
++#ifdef CONFIG_USER_RESOURCE
++ if (sock_has_ubc(sk))
++ return sock_bc(sk)->ub->ub_rmem_pressure == UB_RMEM_SHRINK;
++#endif
++ return 0;
++}
++
++UB_DECLARE_FUNC(int, ub_sock_tcp_chargepage(struct sock *sk))
++UB_DECLARE_VOID_FUNC(ub_sock_tcp_detachpage(struct sock *sk))
++
++#endif
+diff -upr linux-2.6.16.orig/include/ub/ub_vmpages.h linux-2.6.16-026test015/include/ub/ub_vmpages.h
+--- linux-2.6.16.orig/include/ub/ub_vmpages.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/include/ub/ub_vmpages.h 2006-07-04 14:41:37.000000000 +0400
+@@ -0,0 +1,167 @@
++/*
++ * include/ub/ub_vmpages.h
++ *
++ * Copyright (C) 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#ifndef __UB_PAGES_H_
++#define __UB_PAGES_H_
++
++#include <linux/linkage.h>
++#include <linux/config.h>
++#include <ub/beancounter.h>
++#include <ub/ub_decl.h>
++
++/*
++ * Check whether vma has private or copy-on-write mapping.
++ * Should match checks in ub_protected_charge().
++ */
++#define VM_UB_PRIVATE(__flags, __file) \
++ ( ((__flags) & VM_WRITE) ? \
++ (__file) == NULL || !((__flags) & VM_SHARED) : \
++ 0 \
++ )
++
++/* Mprotect charging result */
++#define PRIVVM_ERROR -1
++#define PRIVVM_NO_CHARGE 0 /* UB_DECLARE_FUNC retval with ubc off */
++#define PRIVVM_TO_PRIVATE 1
++#define PRIVVM_TO_SHARED 2
++
++UB_DECLARE_FUNC(int, ub_protected_charge(struct mm_struct *mm,
++ unsigned long size,
++ unsigned long newflags,
++ struct vm_area_struct *vma))
++
++UB_DECLARE_VOID_FUNC(ub_unused_privvm_add(struct mm_struct *mm,
++ struct vm_area_struct *vma,
++ unsigned long num))
++#define ub_unused_privvm_inc(mm, vma) ub_unused_privvm_add(mm, vma, 1)
++UB_DECLARE_VOID_FUNC(ub_unused_privvm_sub(struct mm_struct *mm,
++ struct vm_area_struct *vma,
++ unsigned long num))
++#define ub_unused_privvm_dec(mm, vma) ub_unused_privvm_sub(mm, vma, 1)
++
++UB_DECLARE_VOID_FUNC(__ub_unused_privvm_dec(struct mm_struct *mm,
++ long sz))
++
++UB_DECLARE_FUNC(int, ub_memory_charge(struct mm_struct *mm,
++ unsigned long size,
++ unsigned vm_flags,
++ struct file *vm_file,
++ int strict))
++UB_DECLARE_VOID_FUNC(ub_memory_uncharge(struct mm_struct *mm,
++ unsigned long size,
++ unsigned vm_flags,
++ struct file *vm_file))
++
++struct shmem_inode_info;
++UB_DECLARE_FUNC(int, ub_shmpages_charge(struct shmem_inode_info *i,
++ unsigned long sz))
++UB_DECLARE_VOID_FUNC(ub_shmpages_uncharge(struct shmem_inode_info *i,
++ unsigned long sz))
++UB_DECLARE_VOID_FUNC(ub_tmpfs_respages_inc(struct shmem_inode_info *shi))
++UB_DECLARE_VOID_FUNC(ub_tmpfs_respages_sub(struct shmem_inode_info *shi,
++ unsigned long size))
++#define ub_tmpfs_respages_dec(shi) ub_tmpfs_respages_sub(shi, 1)
++
++#ifdef CONFIG_USER_RESOURCE
++#define shmi_ub_set(shi, ub) do { \
++ (shi)->shmi_ub = get_beancounter(ub); \
++ } while (0)
++#define shmi_ub_put(shi) do { \
++ put_beancounter((shi)->shmi_ub); \
++ (shi)->shmi_ub = NULL; \
++ } while (0)
++#else
++#define shmi_ub_set(shi, ub) do { } while (0)
++#define shmi_ub_put(shi) do { } while (0)
++#endif
++
++UB_DECLARE_FUNC(int, ub_locked_charge(struct mm_struct *mm,
++ unsigned long size))
++UB_DECLARE_VOID_FUNC(ub_locked_uncharge(struct mm_struct *mm,
++ unsigned long size))
++UB_DECLARE_FUNC(int, ub_lockedshm_charge(struct shmem_inode_info *shi,
++ unsigned long size))
++UB_DECLARE_VOID_FUNC(ub_lockedshm_uncharge(struct shmem_inode_info *shi,
++ unsigned long size))
++
++UB_DECLARE_FUNC(unsigned long, pages_in_vma_range(struct vm_area_struct *vma,
++ unsigned long addr, unsigned long end))
++UB_DECLARE_VOID_FUNC(warn_bad_rss(struct vm_area_struct *vma,
++ unsigned long freed))
++#define pages_in_vma(vma) (pages_in_vma_range(vma, \
++ vma->vm_start, vma->vm_end))
++
++#define UB_PAGE_WEIGHT_SHIFT 24
++#define UB_PAGE_WEIGHT (1 << UB_PAGE_WEIGHT_SHIFT)
++
++struct page_beancounter;
++#define PBC_COPY_SAME ((struct page_beancounter *) 1)
++
++/* Mprotect charging result */
++#define PRIVVM_ERROR -1
++#define PRIVVM_NO_CHARGE 0
++#define PRIVVM_TO_PRIVATE 1
++#define PRIVVM_TO_SHARED 2
++
++extern void fastcall __ub_update_physpages(struct user_beancounter *ub);
++extern void fastcall __ub_update_oomguarpages(struct user_beancounter *ub);
++extern void fastcall __ub_update_privvm(struct user_beancounter *ub);
++
++#ifdef CONFIG_USER_RSS_ACCOUNTING
++#define PB_DECLARE_FUNC(ret, decl) UB_DECLARE_FUNC(ret, decl)
++#define PB_DECLARE_VOID_FUNC(decl) UB_DECLARE_VOID_FUNC(decl)
++#else
++#define PB_DECLARE_FUNC(ret, decl) static inline ret decl {return (ret)0;}
++#define PB_DECLARE_VOID_FUNC(decl) static inline void decl { }
++#endif
++
++PB_DECLARE_FUNC(int, pb_alloc(struct page_beancounter **pbc))
++PB_DECLARE_FUNC(int, pb_alloc_list(struct page_beancounter **pbc, int num))
++PB_DECLARE_FUNC(int, pb_alloc_all(struct page_beancounter **pbc))
++PB_DECLARE_VOID_FUNC(pb_add_ref(struct page *page,
++ struct mm_struct *mm,
++ struct page_beancounter **pbc))
++PB_DECLARE_VOID_FUNC(pb_dup_ref(struct page *page,
++ struct mm_struct *mm,
++ struct page_beancounter **pbc))
++PB_DECLARE_VOID_FUNC(pb_free_list(struct page_beancounter **pb))
++PB_DECLARE_VOID_FUNC(pb_free(struct page_beancounter **pb))
++PB_DECLARE_VOID_FUNC(pb_remove_ref(struct page *page,
++ struct mm_struct *mm))
++
++PB_DECLARE_FUNC(struct user_beancounter *, pb_grab_page_ub(struct page *page))
++#endif
++
++#ifdef CONFIG_USER_SWAP_ACCOUNTING
++#define SWP_DECLARE_FUNC(ret, decl) UB_DECLARE_FUNC(ret, decl)
++#define SWP_DECLARE_VOID_FUNC(decl) UB_DECLARE_VOID_FUNC(decl)
++#else
++#define SWP_DECLARE_FUNC(ret, decl) static inline ret decl {return (ret)0;}
++#define SWP_DECLARE_VOID_FUNC(decl) static inline void decl { }
++#endif
++
++struct swap_info_struct;
++SWP_DECLARE_FUNC(int, ub_swap_init(struct swap_info_struct *si, pgoff_t n))
++SWP_DECLARE_VOID_FUNC(ub_swap_fini(struct swap_info_struct *si))
++SWP_DECLARE_VOID_FUNC(ub_swapentry_inc(struct swap_info_struct *si, pgoff_t n,
++ struct user_beancounter *ub))
++SWP_DECLARE_VOID_FUNC(ub_swapentry_dec(struct swap_info_struct *si, pgoff_t n))
++
++#ifdef CONFIG_USER_RESOURCE
++#define ub_unmap_inc(mm) do { \
++ (mm)->mm_ub->ub_stat[smp_processor_id()].unmap++; \
++ } while (0)
++#define ub_swapin_inc(mm) do { \
++ (mm)->mm_ub->ub_stat[smp_processor_id()].swapin++; \
++ } while (0)
++#else
++#define ub_unmap_inc(mm) do { } while (0)
++#define ub_swapin_inc(mm) do { } while (0)
++#endif
+diff -upr linux-2.6.16.orig/init/calibrate.c linux-2.6.16-026test015/init/calibrate.c
+--- linux-2.6.16.orig/init/calibrate.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/init/calibrate.c 2006-07-04 14:41:38.000000000 +0400
+@@ -7,6 +7,7 @@
+ #include <linux/sched.h>
+ #include <linux/delay.h>
+ #include <linux/init.h>
++#include <linux/module.h>
+
+ #include <asm/timex.h>
+
+@@ -105,6 +106,60 @@ static unsigned long __devinit calibrate
+ static unsigned long __devinit calibrate_delay_direct(void) {return 0;}
+ #endif
+
++unsigned long cycles_per_jiffy, cycles_per_clock;
++
++static __devinit void calibrate_cycles(void)
++{
++ unsigned long ticks;
++ cycles_t time;
++
++ ticks = jiffies;
++ while (ticks == jiffies)
++ /* nothing */;
++ time = get_cycles();
++ ticks = jiffies;
++ while (ticks == jiffies)
++ /* nothing */;
++
++ time = get_cycles() - time;
++ cycles_per_jiffy = time;
++ if ((time >> 32) != 0) {
++ printk("CPU too fast! timings are incorrect\n");
++ cycles_per_jiffy = -1;
++ }
++}
++
++EXPORT_SYMBOL(cycles_per_jiffy);
++EXPORT_SYMBOL(cycles_per_clock);
++
++static __devinit void calc_cycles_per_jiffy(void)
++{
++#if defined(__i386__)
++ extern unsigned long fast_gettimeoffset_quotient;
++ unsigned long low, high;
++
++ if (fast_gettimeoffset_quotient != 0) {
++ __asm__("divl %2"
++ :"=a" (low), "=d" (high)
++ :"r" (fast_gettimeoffset_quotient),
++ "0" (0), "1" (1000000/HZ));
++
++ cycles_per_jiffy = low;
++ }
++#endif
++ if (cycles_per_jiffy == 0)
++ calibrate_cycles();
++
++ if (cycles_per_jiffy == 0) {
++ printk(KERN_WARNING "Cycles are stuck! "
++ "Some VPS statistics will not be available.");
++ /* to prevent division by zero in cycles_to_(clocks|jiffies) */
++ cycles_per_jiffy = 1;
++ cycles_per_clock = 1;
++ } else
++ cycles_per_clock = cycles_per_jiffy * (HZ / CLOCKS_PER_SEC);
++}
++
+ /*
+ * This is the number of bits of precision for the loops_per_jiffy. Each
+ * bit takes on average 1.5/HZ seconds. This (like the original) is a little
+@@ -170,4 +225,5 @@ void __devinit calibrate_delay(void)
+ loops_per_jiffy);
+ }
+
++ calc_cycles_per_jiffy();
+ }
+diff -upr linux-2.6.16.orig/init/main.c linux-2.6.16-026test015/init/main.c
+--- linux-2.6.16.orig/init/main.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/init/main.c 2006-07-04 14:41:39.000000000 +0400
+@@ -48,6 +48,8 @@
+ #include <linux/mempolicy.h>
+ #include <linux/key.h>
+
++#include <ub/beancounter.h>
++
+ #include <asm/io.h>
+ #include <asm/bugs.h>
+ #include <asm/setup.h>
+@@ -80,6 +82,7 @@ extern void sbus_init(void);
+ extern void sysctl_init(void);
+ extern void signals_init(void);
+ extern void buffer_init(void);
++extern void fairsched_init_late(void);
+ extern void pidhash_init(void);
+ extern void pidmap_init(void);
+ extern void prio_tree_init(void);
+@@ -104,6 +107,24 @@ extern void tc_init(void);
+ enum system_states system_state;
+ EXPORT_SYMBOL(system_state);
+
++#ifdef CONFIG_VE
++extern void init_ve_system(void);
++extern void prepare_ve0_process(struct task_struct *tsk);
++extern void prepare_ve0_proc_root(void);
++extern void prepare_ve0_sysctl(void);
++#else
++#define init_ve_system() do { } while (0)
++#define prepare_ve0_process(tsk) do { } while (0)
++#define prepare_ve0_proc_root() do { } while (0)
++#define prepare_ve0_sysctl() do { } while (0)
++#endif
++
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++extern void prepare_ve0_loopback(void);
++#else
++#define prepare_ve0_loopback() do { } while (0)
++#endif
++
+ /*
+ * Boot command-line arguments
+ */
+@@ -447,6 +468,10 @@ asmlinkage void __init start_kernel(void
+ * enable them
+ */
+ lock_kernel();
++ /*
++ * Prepare ub0 to account early allocations if any
++ */
++ ub_init_ub0();
+ page_address_init();
+ printk(KERN_NOTICE);
+ printk(linux_banner);
+@@ -459,6 +484,8 @@ asmlinkage void __init start_kernel(void
+ */
+ smp_prepare_boot_cpu();
+
++ prepare_ve0_process(&init_task);
++
+ /*
+ * Set up the scheduler prior starting any interrupts (such as the
+ * timer interrupt). Full topology setup happens at smp_init()
+@@ -524,6 +551,7 @@ asmlinkage void __init start_kernel(void
+ #endif
+ fork_init(num_physpages);
+ proc_caches_init();
++ ub_init_cache(num_physpages);
+ buffer_init();
+ unnamed_dev_init();
+ key_init();
+@@ -534,7 +562,10 @@ asmlinkage void __init start_kernel(void
+ /* rootfs populating might need page-writeback */
+ page_writeback_init();
+ #ifdef CONFIG_PROC_FS
++ prepare_ve0_proc_root();
++ prepare_ve0_sysctl();
+ proc_root_init();
++ ub_init_proc();
+ #endif
+ cpuset_init();
+
+@@ -542,6 +573,10 @@ asmlinkage void __init start_kernel(void
+
+ acpi_early_init(); /* before LAPIC and SMP init */
+
++#ifdef CONFIG_USER_RSS_ACCOUNTING
++ ub_init_pbc();
++#endif
++
+ /* Do the rest non-__init'ed, we're now alive */
+ rest_init();
+ }
+@@ -603,6 +638,9 @@ static void __init do_initcalls(void)
+ */
+ static void __init do_basic_setup(void)
+ {
++ prepare_ve0_loopback();
++ init_ve_system();
++
+ /* drivers will send hotplug events */
+ init_workqueues();
+ usermodehelper_init();
+@@ -618,7 +656,7 @@ static void __init do_basic_setup(void)
+ static void do_pre_smp_initcalls(void)
+ {
+ extern int spawn_ksoftirqd(void);
+-#ifdef CONFIG_SMP
++#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_VCPU)
+ extern int migration_init(void);
+
+ migration_init();
+@@ -674,6 +712,12 @@ static int init(void * unused)
+
+ fixup_cpu_present_map();
+ smp_init();
++
++ /*
++ * This should be done after all cpus are known to
++ * be online. smp_init gives us confidence in it.
++ */
++ fairsched_init_late();
+ sched_init_smp();
+
+ cpuset_init_smp();
+diff -upr linux-2.6.16.orig/init/version.c linux-2.6.16-026test015/init/version.c
+--- linux-2.6.16.orig/init/version.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/init/version.c 2006-07-04 14:41:38.000000000 +0400
+@@ -28,6 +28,12 @@ struct new_utsname system_utsname = {
+
+ EXPORT_SYMBOL(system_utsname);
+
++struct new_utsname virt_utsname = {
++ /* we need only this field */
++ .release = UTS_RELEASE,
++};
++EXPORT_SYMBOL(virt_utsname);
++
+ const char linux_banner[] =
+ "Linux version " UTS_RELEASE " (" LINUX_COMPILE_BY "@"
+ LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION "\n";
+diff -upr linux-2.6.16.orig/ipc/mqueue.c linux-2.6.16-026test015/ipc/mqueue.c
+--- linux-2.6.16.orig/ipc/mqueue.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/ipc/mqueue.c 2006-07-04 14:41:37.000000000 +0400
+@@ -639,7 +639,8 @@ static int oflag2acc[O_ACCMODE] = { MAY_
+ return ERR_PTR(-EINVAL);
+ }
+
+- if (permission(dentry->d_inode, oflag2acc[oflag & O_ACCMODE], NULL)) {
++ if (permission(dentry->d_inode, oflag2acc[oflag & O_ACCMODE],
++ NULL, NULL)) {
+ dput(dentry);
+ mntput(mqueue_mnt);
+ return ERR_PTR(-EACCES);
+diff -upr linux-2.6.16.orig/ipc/msg.c linux-2.6.16-026test015/ipc/msg.c
+--- linux-2.6.16.orig/ipc/msg.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/ipc/msg.c 2006-07-04 14:41:39.000000000 +0400
+@@ -88,6 +88,45 @@ void __init msg_init (void)
+ sysvipc_msg_proc_show);
+ }
+
++#ifdef CONFIG_VE
++void __init prepare_msg(void)
++{
++ get_ve0()->_msg_ids = &msg_ids;
++ get_ve0()->_msg_ctlmax = msg_ctlmax;
++ get_ve0()->_msg_ctlmnb = msg_ctlmnb;
++ get_ve0()->_msg_ctlmni = msg_ctlmni;
++}
++
++#define msg_ids (*(get_exec_env()->_msg_ids))
++#define msg_ctlmax (get_exec_env()->_msg_ctlmax)
++#define msg_ctlmnb (get_exec_env()->_msg_ctlmnb)
++#define msg_ctlmni (get_exec_env()->_msg_ctlmni)
++
++void init_ve_ipc_msg(void)
++{
++ msg_ctlmax = MSGMAX;
++ msg_ctlmnb = MSGMNB;
++ msg_ctlmni = MSGMNI;
++ ipc_init_ids(&msg_ids, MSGMNI);
++}
++
++void cleanup_ve_ipc_msg(void)
++{
++ int i;
++ struct msg_queue *msq;
++
++ down(&msg_ids.sem);
++ for (i = 0; i <= msg_ids.max_id; i++) {
++ msq = msg_lock(i);
++ if (msq == NULL)
++ continue;
++
++ freeque(msq, i);
++ }
++ up(&msg_ids.sem);
++}
++#endif
++
+ static int newque (key_t key, int msgflg)
+ {
+ int id;
+@@ -108,7 +147,7 @@ static int newque (key_t key, int msgflg
+ return retval;
+ }
+
+- id = ipc_addid(&msg_ids, &msq->q_perm, msg_ctlmni);
++ id = ipc_addid(&msg_ids, &msq->q_perm, msg_ctlmni, -1);
+ if(id == -1) {
+ security_msg_queue_free(msq);
+ ipc_rcu_putref(msq);
+@@ -450,7 +489,7 @@ asmlinkage long sys_msgctl (int msqid, i
+ ipcp = &msq->q_perm;
+ err = -EPERM;
+ if (current->euid != ipcp->cuid &&
+- current->euid != ipcp->uid && !capable(CAP_SYS_ADMIN))
++ current->euid != ipcp->uid && !capable(CAP_VE_SYS_ADMIN))
+ /* We _could_ check for CAP_CHOWN above, but we don't */
+ goto out_unlock_up;
+
+@@ -540,7 +579,7 @@ static inline int pipelined_send(struct
+ msr->r_msg = ERR_PTR(-E2BIG);
+ } else {
+ msr->r_msg = NULL;
+- msq->q_lrpid = msr->r_tsk->pid;
++ msq->q_lrpid = virt_pid(msr->r_tsk);
+ msq->q_rtime = get_seconds();
+ wake_up_process(msr->r_tsk);
+ smp_mb();
+@@ -622,7 +661,7 @@ asmlinkage long sys_msgsnd (int msqid, s
+ }
+ }
+
+- msq->q_lspid = current->tgid;
++ msq->q_lspid = virt_tgid(current);
+ msq->q_stime = get_seconds();
+
+ if(!pipelined_send(msq,msg)) {
+@@ -718,7 +757,7 @@ asmlinkage long sys_msgrcv (int msqid, s
+ list_del(&msg->m_list);
+ msq->q_qnum--;
+ msq->q_rtime = get_seconds();
+- msq->q_lrpid = current->tgid;
++ msq->q_lrpid = virt_tgid(current);
+ msq->q_cbytes -= msg->m_ts;
+ atomic_sub(msg->m_ts,&msg_bytes);
+ atomic_dec(&msg_hdrs);
+@@ -833,3 +872,27 @@ static int sysvipc_msg_proc_show(struct
+ msq->q_ctime);
+ }
+ #endif
++
++#if defined(CONFIG_VZ_CHECKPOINT) || defined(CONFIG_VZ_CHECKPOINT_MODULE)
++#include <linux/module.h>
++
++int sysvipc_walk_msg(int (*func)(int i, struct msg_queue*, void *), void *arg)
++{
++ int i;
++ int err = 0;
++ struct msg_queue * msq;
++
++ down(&msg_ids.sem);
++ for(i = 0; i <= msg_ids.max_id; i++) {
++ if ((msq = msg_lock(i)) == NULL)
++ continue;
++ err = func(msg_buildid(i,msq->q_perm.seq), msq, arg);
++ msg_unlock(msq);
++ if (err)
++ break;
++ }
++ up(&msg_ids.sem);
++ return err;
++}
++EXPORT_SYMBOL_GPL(sysvipc_walk_msg);
++#endif
+diff -upr linux-2.6.16.orig/ipc/msgutil.c linux-2.6.16-026test015/ipc/msgutil.c
+--- linux-2.6.16.orig/ipc/msgutil.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/ipc/msgutil.c 2006-07-04 14:41:37.000000000 +0400
+@@ -17,6 +17,8 @@
+
+ #include "util.h"
+
++#include <ub/ub_mem.h>
++
+ struct msg_msgseg {
+ struct msg_msgseg* next;
+ /* the next part of the message follows immediately */
+@@ -36,7 +38,7 @@ struct msg_msg *load_msg(const void __us
+ if (alen > DATALEN_MSG)
+ alen = DATALEN_MSG;
+
+- msg = (struct msg_msg *)kmalloc(sizeof(*msg) + alen, GFP_KERNEL);
++ msg = (struct msg_msg *)ub_kmalloc(sizeof(*msg) + alen, GFP_KERNEL);
+ if (msg == NULL)
+ return ERR_PTR(-ENOMEM);
+
+@@ -56,7 +58,7 @@ struct msg_msg *load_msg(const void __us
+ alen = len;
+ if (alen > DATALEN_SEG)
+ alen = DATALEN_SEG;
+- seg = (struct msg_msgseg *)kmalloc(sizeof(*seg) + alen,
++ seg = (struct msg_msgseg *)ub_kmalloc(sizeof(*seg) + alen,
+ GFP_KERNEL);
+ if (seg == NULL) {
+ err = -ENOMEM;
+diff -upr linux-2.6.16.orig/ipc/sem.c linux-2.6.16-026test015/ipc/sem.c
+--- linux-2.6.16.orig/ipc/sem.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/ipc/sem.c 2006-07-04 14:41:39.000000000 +0400
+@@ -78,6 +78,7 @@
+ #include <asm/uaccess.h>
+ #include "util.h"
+
++#include <ub/ub_mem.h>
+
+ #define sem_lock(id) ((struct sem_array*)ipc_lock(&sem_ids,id))
+ #define sem_unlock(sma) ipc_unlock(&(sma)->sem_perm)
+@@ -88,7 +89,7 @@
+ ipc_buildid(&sem_ids, id, seq)
+ static struct ipc_ids sem_ids;
+
+-static int newary (key_t, int, int);
++static int newary (key_t, int, int, int);
+ static void freeary (struct sem_array *sma, int id);
+ #ifdef CONFIG_PROC_FS
+ static int sysvipc_sem_proc_show(struct seq_file *s, void *it);
+@@ -124,6 +125,48 @@ void __init sem_init (void)
+ sysvipc_sem_proc_show);
+ }
+
++#ifdef CONFIG_VE
++void __init prepare_sem(void)
++{
++ get_ve0()->_sem_ids = &sem_ids;
++ get_ve0()->_used_sems = used_sems;
++ get_ve0()->_sem_ctls[0] = sem_ctls[0];
++ get_ve0()->_sem_ctls[1] = sem_ctls[1];
++ get_ve0()->_sem_ctls[2] = sem_ctls[2];
++ get_ve0()->_sem_ctls[3] = sem_ctls[3];
++}
++
++#define sem_ids (*(get_exec_env()->_sem_ids))
++#define used_sems (get_exec_env()->_used_sems)
++#define sem_ctls (get_exec_env()->_sem_ctls)
++
++void init_ve_ipc_sem(void)
++{
++ used_sems = 0;
++ sem_ctls[0] = SEMMSL;
++ sem_ctls[1] = SEMMNS;
++ sem_ctls[2] = SEMOPM;
++ sem_ctls[3] = SEMMNI;
++ ipc_init_ids(&sem_ids, SEMMNI);
++}
++
++void cleanup_ve_ipc_sem(void)
++{
++ int i;
++ struct sem_array *sma;
++
++ down(&sem_ids.sem);
++ for (i = 0; i <= sem_ids.max_id; i++) {
++ sma = sem_lock(i);
++ if (sma == NULL)
++ continue;
++
++ freeary(sma, i);
++ }
++ up(&sem_ids.sem);
++}
++#endif
++
+ /*
+ * Lockless wakeup algorithm:
+ * Without the check/retry algorithm a lockless wakeup is possible:
+@@ -158,7 +201,7 @@ void __init sem_init (void)
+ */
+ #define IN_WAKEUP 1
+
+-static int newary (key_t key, int nsems, int semflg)
++static int newary (key_t key, int semid, int nsems, int semflg)
+ {
+ int id;
+ int retval;
+@@ -187,7 +230,7 @@ static int newary (key_t key, int nsems,
+ return retval;
+ }
+
+- id = ipc_addid(&sem_ids, &sma->sem_perm, sc_semmni);
++ id = ipc_addid(&sem_ids, &sma->sem_perm, sc_semmni, semid);
+ if(id == -1) {
+ security_sem_free(sma);
+ ipc_rcu_putref(sma);
+@@ -217,12 +260,12 @@ asmlinkage long sys_semget (key_t key, i
+ down(&sem_ids.sem);
+
+ if (key == IPC_PRIVATE) {
+- err = newary(key, nsems, semflg);
++ err = newary(key, -1, nsems, semflg);
+ } else if ((id = ipc_findkey(&sem_ids, key)) == -1) { /* key not used */
+ if (!(semflg & IPC_CREAT))
+ err = -ENOENT;
+ else
+- err = newary(key, nsems, semflg);
++ err = newary(key, -1, nsems, semflg);
+ } else if (semflg & IPC_CREAT && semflg & IPC_EXCL) {
+ err = -EEXIST;
+ } else {
+@@ -743,7 +786,7 @@ static int semctl_main(int semid, int se
+ for (un = sma->undo; un; un = un->id_next)
+ un->semadj[semnum] = 0;
+ curr->semval = val;
+- curr->sempid = current->tgid;
++ curr->sempid = virt_tgid(current);
+ sma->sem_ctime = get_seconds();
+ /* maybe some queued-up processes were waiting for this */
+ update_queue(sma);
+@@ -823,7 +866,7 @@ static int semctl_down(int semid, int se
+ ipcp = &sma->sem_perm;
+
+ if (current->euid != ipcp->cuid &&
+- current->euid != ipcp->uid && !capable(CAP_SYS_ADMIN)) {
++ current->euid != ipcp->uid && !capable(CAP_VE_SYS_ADMIN)) {
+ err=-EPERM;
+ goto out_unlock;
+ }
+@@ -944,7 +987,8 @@ static inline int get_undo_list(struct s
+ undo_list = current->sysvsem.undo_list;
+ if (!undo_list) {
+ size = sizeof(struct sem_undo_list);
+- undo_list = (struct sem_undo_list *) kmalloc(size, GFP_KERNEL);
++ undo_list = (struct sem_undo_list *) ub_kmalloc(size,
++ GFP_KERNEL);
+ if (undo_list == NULL)
+ return -ENOMEM;
+ memset(undo_list, 0, size);
+@@ -1008,7 +1052,8 @@ static struct sem_undo *find_undo(int se
+ ipc_rcu_getref(sma);
+ sem_unlock(sma);
+
+- new = (struct sem_undo *) kmalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, GFP_KERNEL);
++ new = (struct sem_undo *) ub_kmalloc(sizeof(struct sem_undo) +
++ sizeof(short)*nsems, GFP_KERNEL);
+ if (!new) {
+ ipc_lock_by_ptr(&sma->sem_perm);
+ ipc_rcu_putref(sma);
+@@ -1066,7 +1111,7 @@ asmlinkage long sys_semtimedop(int semid
+ if (nsops > sc_semopm)
+ return -E2BIG;
+ if(nsops > SEMOPM_FAST) {
+- sops = kmalloc(sizeof(*sops)*nsops,GFP_KERNEL);
++ sops = ub_kmalloc(sizeof(*sops)*nsops,GFP_KERNEL);
+ if(sops==NULL)
+ return -ENOMEM;
+ }
+@@ -1150,7 +1195,7 @@ retry_undos:
+ queue.sops = sops;
+ queue.nsops = nsops;
+ queue.undo = un;
+- queue.pid = current->tgid;
++ queue.pid = virt_tgid(current);
+ queue.id = semid;
+ queue.alter = alter;
+ if (alter)
+@@ -1320,7 +1365,7 @@ found:
+ sem->semval = 0;
+ if (sem->semval > SEMVMX)
+ sem->semval = SEMVMX;
+- sem->sempid = current->tgid;
++ sem->sempid = virt_tgid(current);
+ }
+ }
+ sma->sem_otime = get_seconds();
+@@ -1351,3 +1396,48 @@ static int sysvipc_sem_proc_show(struct
+ sma->sem_ctime);
+ }
+ #endif
++
++#if defined(CONFIG_VZ_CHECKPOINT) || defined(CONFIG_VZ_CHECKPOINT_MODULE)
++#include <linux/module.h>
++
++int sysvipc_setup_sem(key_t key, int semid, size_t size, int semflg)
++{
++ int err = 0;
++ struct sem_array *sma;
++
++ down(&sem_ids.sem);
++ sma = sem_lock(semid);
++ if (!sma) {
++ err = newary(key, semid, size, semflg);
++ if (err >= 0)
++ sma = sem_lock(semid);
++ }
++ if (sma)
++ sem_unlock(sma);
++ up(&sem_ids.sem);
++
++ return err > 0 ? 0 : err;
++}
++EXPORT_SYMBOL_GPL(sysvipc_setup_sem);
++
++int sysvipc_walk_sem(int (*func)(int i, struct sem_array*, void *), void *arg)
++{
++ int i;
++ int err = 0;
++ struct sem_array *sma;
++
++ down(&sem_ids.sem);
++ for (i = 0; i <= sem_ids.max_id; i++) {
++ if ((sma = sem_lock(i)) == NULL)
++ continue;
++ err = func(sem_buildid(i,sma->sem_perm.seq), sma, arg);
++ sem_unlock(sma);
++ if (err)
++ break;
++ }
++ up(&sem_ids.sem);
++ return err;
++}
++EXPORT_SYMBOL_GPL(sysvipc_walk_sem);
++EXPORT_SYMBOL_GPL(exit_sem);
++#endif
+diff -upr linux-2.6.16.orig/ipc/shm.c linux-2.6.16-026test015/ipc/shm.c
+--- linux-2.6.16.orig/ipc/shm.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/ipc/shm.c 2006-07-04 14:41:39.000000000 +0400
+@@ -30,9 +30,13 @@
+ #include <linux/capability.h>
+ #include <linux/ptrace.h>
+ #include <linux/seq_file.h>
++#include <linux/shmem_fs.h>
+
+ #include <asm/uaccess.h>
+
++#include <ub/beancounter.h>
++#include <ub/ub_vmpages.h>
++
+ #include "util.h"
+
+ static struct file_operations shm_file_operations;
+@@ -46,9 +50,11 @@ static struct ipc_ids shm_ids;
+ #define shm_buildid(id, seq) \
+ ipc_buildid(&shm_ids, id, seq)
+
+-static int newseg (key_t key, int shmflg, size_t size);
++static int newseg (key_t key, int shmid, int shmflg, size_t size);
+ static void shm_open (struct vm_area_struct *shmd);
+ static void shm_close (struct vm_area_struct *shmd);
++static void shm_destroy (struct shmid_kernel *shmd);
++static void do_shm_rmid(struct shmid_kernel *shp);
+ #ifdef CONFIG_PROC_FS
+ static int sysvipc_shm_proc_show(struct seq_file *s, void *it);
+ #endif
+@@ -68,6 +74,68 @@ void __init shm_init (void)
+ sysvipc_shm_proc_show);
+ }
+
++#ifdef CONFIG_VE
++void __init prepare_shm(void)
++{
++ get_ve0()->_shm_ids = &shm_ids;
++ get_ve0()->_shm_ctlmax = shm_ctlmax;
++ get_ve0()->_shm_ctlall = shm_ctlall;
++ get_ve0()->_shm_ctlmni = shm_ctlmni;
++ get_ve0()->_shm_tot = shm_tot;
++}
++
++#define shm_ids (*(get_exec_env()->_shm_ids))
++#define shm_ctlmax (get_exec_env()->_shm_ctlmax)
++#define shm_ctlall (get_exec_env()->_shm_ctlall)
++#define shm_ctlmni (get_exec_env()->_shm_ctlmni)
++#define shm_total (get_exec_env()->_shm_tot)
++
++void init_ve_ipc_shm(void)
++{
++ shm_ctlmax = SHMMAX;
++ shm_ctlall = SHMALL;
++ shm_ctlmni = SHMMNI;
++ shm_total = 0;
++ ipc_init_ids(&shm_ids, 1);
++}
++
++void cleanup_ve_ipc_shm(void)
++{
++ int i;
++ struct shmid_kernel *shp;
++
++ down(&shm_ids.sem);
++ for (i = 0; i <= shm_ids.max_id; i++) {
++ shp = shm_lock(i);
++ if (shp == NULL)
++ continue;
++
++ do_shm_rmid(shp);
++ }
++ up(&shm_ids.sem);
++}
++#define sb_ve(sb) VE_OWNER_FSTYPE(sb->s_type)
++#define shm_total_sb(sb) (&sb_ve(sb)->_shm_tot)
++#define shm_lock_sb(id, sb) ((struct shmid_kernel *) \
++ ipc_lock(sb_ve(sb)->_shm_ids, id))
++#else
++/* renamed since there is a struct field named shm_tot */
++#define shm_total shm_tot
++#define shm_total_sb(sb) (&shm_tot)
++#define shm_lock_sb(id, sb) shm_lock(id)
++#endif
++
++static void do_shm_rmid(struct shmid_kernel *shp)
++{
++ if (shp->shm_nattch){
++ shp->shm_perm.mode |= SHM_DEST;
++ /* Do not find it any more */
++ shp->shm_perm.key = IPC_PRIVATE;
++ shm_unlock(shp);
++ } else
++ shm_destroy (shp);
++}
++
+ static inline int shm_checkid(struct shmid_kernel *s, int id)
+ {
+ if (ipc_checkid(&shm_ids,&s->shm_perm,id))
+@@ -75,25 +143,25 @@ static inline int shm_checkid(struct shm
+ return 0;
+ }
+
+-static inline struct shmid_kernel *shm_rmid(int id)
++static inline struct shmid_kernel *shm_rmid(struct ipc_ids *ids, int id)
+ {
+- return (struct shmid_kernel *)ipc_rmid(&shm_ids,id);
++ return (struct shmid_kernel *)ipc_rmid(ids,id);
+ }
+
+-static inline int shm_addid(struct shmid_kernel *shp)
++static inline int shm_addid(struct shmid_kernel *shp, int reqid)
+ {
+- return ipc_addid(&shm_ids, &shp->shm_perm, shm_ctlmni);
++ return ipc_addid(&shm_ids, &shp->shm_perm, shm_ctlmni, reqid);
+ }
+
+
+
+-static inline void shm_inc (int id) {
++static inline void shm_inc(int id, struct super_block *sb) {
+ struct shmid_kernel *shp;
+
+- if(!(shp = shm_lock(id)))
++ if(!(shp = shm_lock_sb(id, sb)))
+ BUG();
+ shp->shm_atim = get_seconds();
+- shp->shm_lprid = current->tgid;
++ shp->shm_lprid = virt_tgid(current);
+ shp->shm_nattch++;
+ shm_unlock(shp);
+ }
+@@ -101,7 +169,50 @@ static inline void shm_inc (int id) {
+ /* This is called by fork, once for every shm attach. */
+ static void shm_open (struct vm_area_struct *shmd)
+ {
+- shm_inc (shmd->vm_file->f_dentry->d_inode->i_ino);
++ shm_inc(shmd->vm_file->f_dentry->d_inode->i_ino,
++ shmd->vm_file->f_dentry->d_inode->i_sb);
++}
++
++static int shmem_lock(struct shmid_kernel *shp, int lock,
++ struct user_struct *user)
++{
++ struct file *file = shp->shm_file;
++ struct inode *inode = file->f_dentry->d_inode;
++ struct shmem_inode_info *info = SHMEM_I(inode);
++ unsigned long size;
++
++ size = shp->shm_segsz + PAGE_SIZE - 1;
++
++#ifdef CONFIG_SHMEM
++ spin_lock(&info->lock);
++ if (lock && !(info->flags & VM_LOCKED)) {
++ if (ub_lockedshm_charge(info, size) < 0)
++ goto out_ch;
++
++ if (!user_shm_lock(inode->i_size, user))
++ goto out_user;
++ info->flags |= VM_LOCKED;
++ }
++ if (!lock && (info->flags & VM_LOCKED) && user) {
++ ub_lockedshm_uncharge(info, size);
++ user_shm_unlock(inode->i_size, user);
++ info->flags &= ~VM_LOCKED;
++ }
++ spin_unlock(&info->lock);
++ return 0;
++
++out_user:
++ ub_lockedshm_uncharge(info, size);
++out_ch:
++ spin_unlock(&info->lock);
++ return -ENOMEM;
++#else
++ if (lock && ub_lockedshm_charge(info, size))
++ return -ENOMEM;
++ if (!lock)
++ ub_lockedshm_uncharge(info, size);
++ return 0;
++#endif
+ }
+
+ /*
+@@ -114,15 +225,24 @@ static void shm_open (struct vm_area_str
+ */
+ static void shm_destroy (struct shmid_kernel *shp)
+ {
+- shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT;
+- shm_rmid (shp->id);
++ int numpages, *shm_totalp;
++ struct file *f;
++ struct super_block *sb;
++
++ f = shp->shm_file;
++ sb = f->f_dentry->d_inode->i_sb;
++ numpages = (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT;
++ shm_totalp = shm_total_sb(sb);
++ *shm_totalp -= numpages;
++
++ shm_rmid (shp->_shm_ids, shp->id);
+ shm_unlock(shp);
+ if (!is_file_hugepages(shp->shm_file))
+- shmem_lock(shp->shm_file, 0, shp->mlock_user);
++ shmem_lock(shp, 0, shp->mlock_user);
+ else
+ user_shm_unlock(shp->shm_file->f_dentry->d_inode->i_size,
+ shp->mlock_user);
+- fput (shp->shm_file);
++ fput(f);
+ security_shm_free(shp);
+ ipc_rcu_putref(shp);
+ }
+@@ -138,12 +258,24 @@ static void shm_close (struct vm_area_st
+ struct file * file = shmd->vm_file;
+ int id = file->f_dentry->d_inode->i_ino;
+ struct shmid_kernel *shp;
++ struct super_block *sb;
++ struct ipc_ids *ids;
++#ifdef CONFIG_VE
++ struct ve_struct *ve;
++
++ sb = file->f_dentry->d_inode->i_sb;
++ ve = get_ve(sb_ve(sb));
++ ids = ve->_shm_ids;
++#else
++ sb = file->f_dentry->d_inode->i_sb;
++ ids = &shm_ids;
++#endif
+
+- down (&shm_ids.sem);
++ down (&ids->sem);
+ /* remove from the list of attaches of the shm segment */
+- if(!(shp = shm_lock(id)))
++ if(!(shp = shm_lock_sb(id, sb)))
+ BUG();
+- shp->shm_lprid = current->tgid;
++ shp->shm_lprid = virt_tgid(current);
+ shp->shm_dtim = get_seconds();
+ shp->shm_nattch--;
+ if(shp->shm_nattch == 0 &&
+@@ -151,7 +283,10 @@ static void shm_close (struct vm_area_st
+ shm_destroy (shp);
+ else
+ shm_unlock(shp);
+- up (&shm_ids.sem);
++ up(&ids->sem);
++#ifdef CONFIG_VE
++ put_ve(ve);
++#endif
+ }
+
+ static int shm_mmap(struct file * file, struct vm_area_struct * vma)
+@@ -161,7 +296,10 @@ static int shm_mmap(struct file * file,
+ ret = shmem_mmap(file, vma);
+ if (ret == 0) {
+ vma->vm_ops = &shm_vm_ops;
+- shm_inc(file->f_dentry->d_inode->i_ino);
++ if (!(vma->vm_flags & VM_WRITE))
++ vma->vm_flags &= ~VM_MAYWRITE;
++ shm_inc(file->f_dentry->d_inode->i_ino,
++ file->f_dentry->d_inode->i_sb);
+ }
+
+ return ret;
+@@ -184,19 +322,19 @@ static struct vm_operations_struct shm_v
+ #endif
+ };
+
+-static int newseg (key_t key, int shmflg, size_t size)
++static int newseg (key_t key, int shmid, int shmflg, size_t size)
+ {
+ int error;
+ struct shmid_kernel *shp;
+ int numpages = (size + PAGE_SIZE -1) >> PAGE_SHIFT;
+ struct file * file;
+- char name[13];
++ char name[26];
+ int id;
+
+ if (size < SHMMIN || size > shm_ctlmax)
+ return -EINVAL;
+
+- if (shm_tot + numpages >= shm_ctlall)
++ if (shm_total + numpages >= shm_ctlall)
+ return -ENOSPC;
+
+ shp = ipc_rcu_alloc(sizeof(*shp));
+@@ -227,7 +365,11 @@ static int newseg (key_t key, int shmflg
+ if ((shmflg & SHM_NORESERVE) &&
+ sysctl_overcommit_memory != OVERCOMMIT_NEVER)
+ acctflag = 0;
++#ifdef CONFIG_VE
++ sprintf (name, "VE%d.SYSV%08x", get_exec_env()->veid, key);
++#else
+ sprintf (name, "SYSV%08x", key);
++#endif
+ file = shmem_file_setup(name, size, acctflag);
+ }
+ error = PTR_ERR(file);
+@@ -235,17 +377,18 @@ static int newseg (key_t key, int shmflg
+ goto no_file;
+
+ error = -ENOSPC;
+- id = shm_addid(shp);
++ id = shm_addid(shp, shmid);
+ if(id == -1)
+ goto no_id;
+
+- shp->shm_cprid = current->tgid;
++ shp->shm_cprid = virt_tgid(current);
+ shp->shm_lprid = 0;
+ shp->shm_atim = shp->shm_dtim = 0;
+ shp->shm_ctim = get_seconds();
+ shp->shm_segsz = size;
+ shp->shm_nattch = 0;
+ shp->id = shm_buildid(id,shp->shm_perm.seq);
++ shp->_shm_ids = &shm_ids;
+ shp->shm_file = file;
+ file->f_dentry->d_inode->i_ino = shp->id;
+
+@@ -253,7 +396,7 @@ static int newseg (key_t key, int shmflg
+ if (!(shmflg & SHM_HUGETLB))
+ file->f_op = &shm_file_operations;
+
+- shm_tot += numpages;
++ shm_total += numpages;
+ shm_unlock(shp);
+ return shp->id;
+
+@@ -272,12 +415,12 @@ asmlinkage long sys_shmget (key_t key, s
+
+ down(&shm_ids.sem);
+ if (key == IPC_PRIVATE) {
+- err = newseg(key, shmflg, size);
++ err = newseg(key, -1, shmflg, size);
+ } else if ((id = ipc_findkey(&shm_ids, key)) == -1) {
+ if (!(shmflg & IPC_CREAT))
+ err = -ENOENT;
+ else
+- err = newseg(key, shmflg, size);
++ err = newseg(key, -1, shmflg, size);
+ } else if ((shmflg & IPC_CREAT) && (shmflg & IPC_EXCL)) {
+ err = -EEXIST;
+ } else {
+@@ -470,7 +613,7 @@ asmlinkage long sys_shmctl (int shmid, i
+ down(&shm_ids.sem);
+ shm_info.used_ids = shm_ids.in_use;
+ shm_get_stat (&shm_info.shm_rss, &shm_info.shm_swp);
+- shm_info.shm_tot = shm_tot;
++ shm_info.shm_tot = shm_total;
+ shm_info.swap_attempts = 0;
+ shm_info.swap_successes = 0;
+ err = shm_ids.max_id;
+@@ -557,14 +700,14 @@ asmlinkage long sys_shmctl (int shmid, i
+ if(cmd==SHM_LOCK) {
+ struct user_struct * user = current->user;
+ if (!is_file_hugepages(shp->shm_file)) {
+- err = shmem_lock(shp->shm_file, 1, user);
++ err = shmem_lock(shp, 1, user);
+ if (!err) {
+ shp->shm_perm.mode |= SHM_LOCKED;
+ shp->mlock_user = user;
+ }
+ }
+ } else if (!is_file_hugepages(shp->shm_file)) {
+- shmem_lock(shp->shm_file, 0, shp->mlock_user);
++ shmem_lock(shp, 0, shp->mlock_user);
+ shp->shm_perm.mode &= ~SHM_LOCKED;
+ shp->mlock_user = NULL;
+ }
+@@ -594,7 +737,7 @@ asmlinkage long sys_shmctl (int shmid, i
+
+ if (current->euid != shp->shm_perm.uid &&
+ current->euid != shp->shm_perm.cuid &&
+- !capable(CAP_SYS_ADMIN)) {
++ !capable(CAP_VE_SYS_ADMIN)) {
+ err=-EPERM;
+ goto out_unlock_up;
+ }
+@@ -603,13 +746,7 @@ asmlinkage long sys_shmctl (int shmid, i
+ if (err)
+ goto out_unlock_up;
+
+- if (shp->shm_nattch){
+- shp->shm_perm.mode |= SHM_DEST;
+- /* Do not find it any more */
+- shp->shm_perm.key = IPC_PRIVATE;
+- shm_unlock(shp);
+- } else
+- shm_destroy (shp);
++ do_shm_rmid(shp);
+ up(&shm_ids.sem);
+ goto out;
+ }
+@@ -633,7 +770,7 @@ asmlinkage long sys_shmctl (int shmid, i
+ err=-EPERM;
+ if (current->euid != shp->shm_perm.uid &&
+ current->euid != shp->shm_perm.cuid &&
+- !capable(CAP_SYS_ADMIN)) {
++ !capable(CAP_VE_SYS_ADMIN)) {
+ goto out_unlock_up;
+ }
+
+@@ -916,3 +1053,55 @@ static int sysvipc_shm_proc_show(struct
+ shp->shm_ctim);
+ }
+ #endif
++
++#if defined(CONFIG_VZ_CHECKPOINT) || defined(CONFIG_VZ_CHECKPOINT_MODULE)
++#include <linux/module.h>
++
++struct file * sysvipc_setup_shm(key_t key, int shmid, size_t size, int shmflg)
++{
++ struct shmid_kernel *shp;
++ struct file *file;
++
++ down(&shm_ids.sem);
++ shp = shm_lock(shmid);
++ if (!shp) {
++ int err;
++
++ err = newseg(key, shmid, shmflg, size);
++ file = ERR_PTR(err);
++ if (err < 0)
++ goto out;
++ shp = shm_lock(shmid);
++ }
++ file = ERR_PTR(-EINVAL);
++ if (shp) {
++ file = shp->shm_file;
++ get_file(file);
++ shm_unlock(shp);
++ }
++out:
++ up(&shm_ids.sem);
++ return file;
++}
++EXPORT_SYMBOL_GPL(sysvipc_setup_shm);
++
++int sysvipc_walk_shm(int (*func)(struct shmid_kernel*, void *), void *arg)
++{
++ int i;
++ int err = 0;
++ struct shmid_kernel* shp;
++
++ down(&shm_ids.sem);
++ for(i = 0; i <= shm_ids.max_id; i++) {
++ if ((shp = shm_lock(i)) == NULL)
++ continue;
++ err = func(shp, arg);
++ shm_unlock(shp);
++ if (err)
++ break;
++ }
++ up(&shm_ids.sem);
++ return err;
++}
++EXPORT_SYMBOL_GPL(sysvipc_walk_shm);
++#endif
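sysvipc_setup_shm() above either finds the segment with the requested id or creates it via newseg(key, shmid, ...), and returns its backing file with an extra reference taken by get_file(). A rough sketch of a restore-side caller, assuming kernel context; restore_shm_data() and the elided page-filling step are hypothetical:

	/* Hypothetical restore-side caller of sysvipc_setup_shm(). */
	static int restore_shm_data(key_t key, int shmid, size_t size, int shmflg)
	{
		struct file *file;

		file = sysvipc_setup_shm(key, shmid, size, shmflg);
		if (IS_ERR(file))
			return PTR_ERR(file);

		/* ... write the saved segment contents through 'file' ... */

		fput(file);	/* drop the reference taken by get_file() */
		return 0;
	}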
+diff -upr linux-2.6.16.orig/ipc/util.c linux-2.6.16-026test015/ipc/util.c
+--- linux-2.6.16.orig/ipc/util.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/ipc/util.c 2006-07-04 14:41:39.000000000 +0400
+@@ -13,6 +13,7 @@
+ */
+
+ #include <linux/config.h>
++#include <linux/module.h>
+ #include <linux/mm.h>
+ #include <linux/shm.h>
+ #include <linux/init.h>
+@@ -30,6 +31,8 @@
+
+ #include <asm/unistd.h>
+
++#include <ub/ub_mem.h>
++
+ #include "util.h"
+
+ struct ipc_proc_iface {
+@@ -65,7 +68,7 @@ __initcall(ipc_init);
+ * array itself.
+ */
+
+-void __init ipc_init_ids(struct ipc_ids* ids, int size)
++void __ve_init ipc_init_ids(struct ipc_ids* ids, int size)
+ {
+ int i;
+ sema_init(&ids->sem,1);
+@@ -94,7 +97,21 @@ void __init ipc_init_ids(struct ipc_ids*
+ ids->entries->size = size;
+ for(i=0;i<size;i++)
+ ids->entries->p[i] = NULL;
++
++ ids->owner_env = get_exec_env();
++}
++
++#ifdef CONFIG_VE
++static inline void ipc_free_ids(struct ipc_ids *ids)
++{
++ if (ids == NULL)
++ return;
++
++ if (ids->entries != &ids->nullentry)
++ ipc_rcu_putref(ids->entries);
++ kfree(ids);
+ }
++#endif
+
+ #ifdef CONFIG_PROC_FS
+ static struct file_operations sysvipc_proc_fops;
+@@ -182,8 +199,7 @@ static int grow_ary(struct ipc_ids* ids,
+ if(new == NULL)
+ return size;
+ new->size = newsize;
+- memcpy(new->p, ids->entries->p, sizeof(struct kern_ipc_perm *)*size +
+- sizeof(struct ipc_id_ary));
++ memcpy(new->p, ids->entries->p, sizeof(struct kern_ipc_perm *)*size);
+ for(i=size;i<newsize;i++) {
+ new->p[i] = NULL;
+ }
+@@ -213,10 +229,20 @@ static int grow_ary(struct ipc_ids* ids,
+ * Called with ipc_ids.sem held.
+ */
+
+-int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size)
++int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size, int reqid)
+ {
+ int id;
+
++ if (reqid >= 0) {
++ id = reqid%SEQ_MULTIPLIER;
++ size = grow_ary(ids,id+1);
++ if (id >= size)
++ return -1;
++ if (ids->entries->p[id] == NULL)
++ goto found;
++ return -1;
++ }
++
+ size = grow_ary(ids,size);
+
+ /*
+@@ -229,16 +255,21 @@ int ipc_addid(struct ipc_ids* ids, struc
+ }
+ return -1;
+ found:
+- ids->in_use++;
++ if (ids->in_use++ == 0)
++ (void)get_ve(ids->owner_env);
+ if (id > ids->max_id)
+ ids->max_id = id;
+
+ new->cuid = new->uid = current->euid;
+ new->gid = new->cgid = current->egid;
+
+- new->seq = ids->seq++;
+- if(ids->seq > ids->seq_max)
+- ids->seq = 0;
++ if (reqid >= 0) {
++ new->seq = reqid/SEQ_MULTIPLIER;
++ } else {
++ new->seq = ids->seq++;
++ if(ids->seq > ids->seq_max)
++ ids->seq = 0;
++ }
+
+ spin_lock_init(&new->lock);
+ new->deleted = 0;
+@@ -276,7 +307,8 @@ struct kern_ipc_perm* ipc_rmid(struct ip
+ ids->entries->p[lid] = NULL;
+ if(p==NULL)
+ BUG();
+- ids->in_use--;
++ if (--ids->in_use == 0)
++ put_ve(ids->owner_env);
+
+ if (lid == ids->max_id) {
+ do {
+@@ -302,9 +334,9 @@ void* ipc_alloc(int size)
+ {
+ void* out;
+ if(size > PAGE_SIZE)
+- out = vmalloc(size);
++ out = ub_vmalloc(size);
+ else
+- out = kmalloc(size, GFP_KERNEL);
++ out = ub_kmalloc(size, GFP_KERNEL);
+ return out;
+ }
+
+@@ -387,14 +419,14 @@ void* ipc_rcu_alloc(int size)
+ * workqueue if necessary (for vmalloc).
+ */
+ if (rcu_use_vmalloc(size)) {
+- out = vmalloc(HDRLEN_VMALLOC + size);
++ out = ub_vmalloc(HDRLEN_VMALLOC + size);
+ if (out) {
+ out += HDRLEN_VMALLOC;
+ container_of(out, struct ipc_rcu_hdr, data)->is_vmalloc = 1;
+ container_of(out, struct ipc_rcu_hdr, data)->refcount = 1;
+ }
+ } else {
+- out = kmalloc(HDRLEN_KMALLOC + size, GFP_KERNEL);
++ out = ub_kmalloc(HDRLEN_KMALLOC + size, GFP_KERNEL);
+ if (out) {
+ out += HDRLEN_KMALLOC;
+ container_of(out, struct ipc_rcu_hdr, data)->is_vmalloc = 0;
+@@ -603,6 +635,71 @@ int ipc_checkid(struct ipc_ids* ids, str
+ return 0;
+ }
+
++#ifdef CONFIG_VE
++void __init prepare_ipc(void)
++{
++ prepare_msg();
++ prepare_sem();
++ prepare_shm();
++}
++
++int init_ve_ipc(struct ve_struct * envid)
++{
++ envid->_msg_ids = kmalloc(sizeof(struct ipc_ids) + sizeof(void *),
++ GFP_KERNEL);
++ if (envid->_msg_ids == NULL)
++ goto out_nomem;
++ envid->_sem_ids = kmalloc(sizeof(struct ipc_ids) + sizeof(void *),
++ GFP_KERNEL);
++ if (envid->_sem_ids == NULL)
++ goto out_free_msg;
++ envid->_shm_ids = kmalloc(sizeof(struct ipc_ids) + sizeof(void *),
++ GFP_KERNEL);
++ if (envid->_shm_ids == NULL)
++ goto out_free_sem;
++
++ init_ve_ipc_msg();
++ init_ve_ipc_sem();
++ init_ve_ipc_shm();
++ return 0;
++
++out_free_sem:
++ kfree(envid->_sem_ids);
++out_free_msg:
++ kfree(envid->_msg_ids);
++out_nomem:
++ return -ENOMEM;
++}
++
++void ve_ipc_cleanup(void)
++{
++ cleanup_ve_ipc_msg();
++ cleanup_ve_ipc_sem();
++ cleanup_ve_ipc_shm();
++}
++
++void ve_ipc_free(struct ve_struct *env)
++{
++ ipc_free_ids(env->_msg_ids);
++ ipc_free_ids(env->_sem_ids);
++ ipc_free_ids(env->_shm_ids);
++ env->_msg_ids = NULL;
++ env->_sem_ids = NULL;
++ env->_shm_ids = NULL;
++}
++
++void fini_ve_ipc(struct ve_struct *ptr)
++{
++ ve_ipc_cleanup();
++ ve_ipc_free(ptr);
++}
++
++EXPORT_SYMBOL(init_ve_ipc);
++EXPORT_SYMBOL(ve_ipc_cleanup);
++EXPORT_SYMBOL(ve_ipc_free);
++EXPORT_SYMBOL(fini_ve_ipc);
++#endif /* CONFIG_VE */
++
+ #ifdef __ARCH_WANT_IPC_PARSE_VERSION
+
+
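The reqid argument added to ipc_addid() above lets restore code recreate an object under the identifier it had at checkpoint time: the slot index is reqid % SEQ_MULTIPLIER and the sequence number is reqid / SEQ_MULTIPLIER, the inverse of ipc_buildid(). A standalone illustration of that arithmetic; SEQ_MULTIPLIER is hard-coded to 32768 (the stock IPCMNI value) purely for the example:

	/* Illustration only: mapping a saved IPC id back to (slot, seq). */
	#include <stdio.h>

	#define SEQ_MULTIPLIER 32768	/* IPCMNI in the stock kernel */

	int main(void)
	{
		int saved_id = 98305;			/* example id from a dump */
		int slot = saved_id % SEQ_MULTIPLIER;	/* 1: index into entries->p[] */
		int seq  = saved_id / SEQ_MULTIPLIER;	/* 3: kern_ipc_perm.seq */

		printf("id %d -> slot %d, seq %d\n", saved_id, slot, seq);
		return 0;
	}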
+diff -upr linux-2.6.16.orig/ipc/util.h linux-2.6.16-026test015/ipc/util.h
+--- linux-2.6.16.orig/ipc/util.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/ipc/util.h 2006-07-04 14:41:39.000000000 +0400
+@@ -15,6 +15,22 @@ void sem_init (void);
+ void msg_init (void);
+ void shm_init (void);
+
++#ifdef CONFIG_VE
++void prepare_msg(void);
++void prepare_sem(void);
++void prepare_shm(void);
++void init_ve_ipc_msg(void);
++void init_ve_ipc_sem(void);
++void init_ve_ipc_shm(void);
++void cleanup_ve_ipc_msg(void);
++void cleanup_ve_ipc_sem(void);
++void cleanup_ve_ipc_shm(void);
++
++#define __ve_init
++#else
++#define __ve_init __init
++#endif
++
+ struct ipc_id_ary {
+ int size;
+ struct kern_ipc_perm *p[0];
+@@ -28,10 +44,11 @@ struct ipc_ids {
+ struct semaphore sem;
+ struct ipc_id_ary nullentry;
+ struct ipc_id_ary* entries;
++ struct ve_struct *owner_env;
+ };
+
+ struct seq_file;
+-void __init ipc_init_ids(struct ipc_ids* ids, int size);
++void __ve_init ipc_init_ids(struct ipc_ids *ids, int size);
+ #ifdef CONFIG_PROC_FS
+ void __init ipc_init_proc_interface(const char *path, const char *header,
+ struct ipc_ids *ids,
+@@ -42,7 +59,7 @@ void __init ipc_init_proc_interface(cons
+
+ /* must be called with ids->sem acquired.*/
+ int ipc_findkey(struct ipc_ids* ids, key_t key);
+-int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size);
++int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size, int reqid);
+
+ /* must be called with both locks acquired. */
+ struct kern_ipc_perm* ipc_rmid(struct ipc_ids* ids, int id);
+diff -upr linux-2.6.16.orig/kernel/Kconfig.fairsched linux-2.6.16-026test015/kernel/Kconfig.fairsched
+--- linux-2.6.16.orig/kernel/Kconfig.fairsched 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/Kconfig.fairsched 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,22 @@
++config SCHED_VCPU
++ bool "VCPU scheduler support"
++ default n
++ help
++	  VCPU scheduler support adds an additional layer of abstraction
++	  which virtualizes the notion of a CPU and separates physical CPUs
++	  from virtual CPUs. This makes it possible to use the fair CPU
++	  scheduler, to dynamically add/remove CPUs to/from a VPS, and so on.
++
++config FAIRSCHED
++ bool "Fair CPU scheduler (EXPERIMENTAL)"
++ depends on SCHED_VCPU
++ default SCHED_VCPU
++ help
++	  Config option for the fair CPU scheduler (fairsched).
++	  This option allows processes to be grouped into scheduling nodes
++	  which receive CPU time proportional to their weight.
++	  This is an important feature for process group isolation and
++	  QoS management.
++
++ If unsure, say N.
++
+diff -upr linux-2.6.16.orig/kernel/Kconfig.openvz linux-2.6.16-026test015/kernel/Kconfig.openvz
+--- linux-2.6.16.orig/kernel/Kconfig.openvz 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/Kconfig.openvz 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,61 @@
++# Copyright (C) 2005 SWsoft
++# All rights reserved.
++# Licensing governed by "linux/COPYING.SWsoft" file.
++
++menu "OpenVZ"
++
++config VE
++ bool "Virtual Environment support"
++ default y
++ help
++	  This option adds support for virtual Linux environments running on
++	  the original box, with a fully supported virtual network driver,
++	  tty subsystem and configurable access to hardware and other
++	  resources.
++
++config VE_CALLS
++ tristate "VE calls interface"
++ depends on VE
++ default m
++ help
++ This option controls how to build vzmon code containing VE calls.
++	  By default it is built as the module vzmon.o.
++
++config VE_NETDEV
++ tristate "VE networking"
++ depends on VE_CALLS
++ default m
++ help
++ This option controls whether to build VE networking code.
++
++config VE_ETHDEV
++ tristate "Virtual ethernet device"
++ depends on VE_CALLS
++ default m
++ help
++	  This option controls whether to build the virtual ethernet device.
++
++config VE_IPTABLES
++ bool "VE netfiltering"
++ depends on VE && VE_NETDEV && INET && NETFILTER
++ default y
++ help
++ This option controls whether to build VE netfiltering code.
++
++config VZ_WDOG
++ tristate "VE watchdog module"
++ depends on VE_CALLS
++ default m
++ help
++	  This option controls building of the vzwdog module, which
++	  periodically dumps a lot of useful system info to the console.
++
++config VZ_CHECKPOINT
++ tristate "Checkpointing & restoring Virtual Environments"
++ depends on SOFTWARE_SUSPEND && VE_CALLS
++ default m
++ help
++	  This option adds two modules, "cpt" and "rst", which allow
++	  a running Virtual Environment to be saved and then restored
++	  on another host (live migration) or on the same host (checkpointing).
++
++endmenu
+diff -upr linux-2.6.16.orig/kernel/Makefile linux-2.6.16-026test015/kernel/Makefile
+--- linux-2.6.16.orig/kernel/Makefile 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/kernel/Makefile 2006-07-04 14:41:39.000000000 +0400
+@@ -2,7 +2,8 @@
+ # Makefile for the linux kernel.
+ #
+
+-obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
++obj-y = sched.o fairsched.o \
++ fork.o exec_domain.o panic.o printk.o profile.o \
+ exit.o itimer.o time.o softirq.o resource.o \
+ sysctl.o capability.o ptrace.o timer.o user.o \
+ signal.o sys.o kmod.o workqueue.o pid.o \
+@@ -10,6 +11,18 @@ obj-y = sched.o fork.o exec_domain.o
+ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
+ hrtimer.o
+
++obj-y += ub/
++
++obj-$(CONFIG_VE) += ve.o
++obj-$(CONFIG_VE) += veowner.o
++obj-$(CONFIG_VE_CALLS) += vzdev.o
++obj-$(CONFIG_VZ_WDOG) += vzwdog.o
++obj-$(CONFIG_VE_CALLS) += vzmon.o
++
++vzmon-objs = vecalls.o
++
++obj-$(CONFIG_VZ_CHECKPOINT) += cpt/
++
+ obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
+ obj-$(CONFIG_FUTEX) += futex.o
+ obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
+diff -upr linux-2.6.16.orig/kernel/audit.c linux-2.6.16-026test015/kernel/audit.c
+--- linux-2.6.16.orig/kernel/audit.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/kernel/audit.c 2006-07-04 14:41:38.000000000 +0400
+@@ -372,6 +372,9 @@ static int audit_receive_msg(struct sk_b
+ uid_t loginuid; /* loginuid of sender */
+ struct audit_sig_info sig_data;
+
++ if (!ve_is_super(VE_OWNER_SKB(skb)))
++ return -ECONNREFUSED;
++
+ err = audit_netlink_ok(NETLINK_CB(skb).eff_cap, msg_type);
+ if (err)
+ return err;
+diff -upr linux-2.6.16.orig/kernel/auditsc.c linux-2.6.16-026test015/kernel/auditsc.c
+--- linux-2.6.16.orig/kernel/auditsc.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/kernel/auditsc.c 2006-07-04 14:41:36.000000000 +0400
+@@ -966,11 +966,6 @@ void audit_syscall_entry(struct task_str
+ if (context->in_syscall) {
+ struct audit_context *newctx;
+
+-#if defined(__NR_vm86) && defined(__NR_vm86old)
+- /* vm86 mode should only be entered once */
+- if (major == __NR_vm86 || major == __NR_vm86old)
+- return;
+-#endif
+ #if AUDIT_DEBUG
+ printk(KERN_ERR
+ "audit(:%d) pid=%d in syscall=%d;"
+diff -upr linux-2.6.16.orig/kernel/capability.c linux-2.6.16-026test015/kernel/capability.c
+--- linux-2.6.16.orig/kernel/capability.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/kernel/capability.c 2006-07-04 14:41:39.000000000 +0400
+@@ -24,7 +24,8 @@ EXPORT_SYMBOL(cap_bset);
+ * This lock protects task->cap_* for all tasks including current.
+ * Locking rule: acquire this prior to tasklist_lock.
+ */
+-static DEFINE_SPINLOCK(task_capability_lock);
++DEFINE_SPINLOCK(task_capability_lock);
++EXPORT_SYMBOL(task_capability_lock);
+
+ /*
+ * For sys_getproccap() and sys_setproccap(), any of the three
+@@ -67,8 +68,8 @@ asmlinkage long sys_capget(cap_user_head
+ spin_lock(&task_capability_lock);
+ read_lock(&tasklist_lock);
+
+- if (pid && pid != current->pid) {
+- target = find_task_by_pid(pid);
++ if (pid && pid != virt_pid(current)) {
++ target = find_task_by_pid_ve(pid);
+ if (!target) {
+ ret = -ESRCH;
+ goto out;
+@@ -100,9 +101,13 @@ static inline int cap_set_pg(int pgrp, k
+ int ret = -EPERM;
+ int found = 0;
+
+- do_each_task_pid(pgrp, PIDTYPE_PGID, g) {
++ pgrp = vpid_to_pid(pgrp);
++ if (pgrp < 0)
++ return ret;
++
++ do_each_task_pid_ve(pgrp, PIDTYPE_PGID, g) {
+ target = g;
+- while_each_thread(g, target) {
++ while_each_thread_ve(g, target) {
+ if (!security_capset_check(target, effective,
+ inheritable,
+ permitted)) {
+@@ -113,7 +118,7 @@ static inline int cap_set_pg(int pgrp, k
+ }
+ found = 1;
+ }
+- } while_each_task_pid(pgrp, PIDTYPE_PGID, g);
++ } while_each_task_pid_ve(pgrp, PIDTYPE_PGID, g);
+
+ if (!found)
+ ret = 0;
+@@ -132,7 +137,7 @@ static inline int cap_set_all(kernel_cap
+ int ret = -EPERM;
+ int found = 0;
+
+- do_each_thread(g, target) {
++ do_each_thread_ve(g, target) {
+ if (target == current || target->pid == 1)
+ continue;
+ found = 1;
+@@ -141,7 +146,7 @@ static inline int cap_set_all(kernel_cap
+ continue;
+ ret = 0;
+ security_capset_set(target, effective, inheritable, permitted);
+- } while_each_thread(g, target);
++ } while_each_thread_ve(g, target);
+
+ if (!found)
+ ret = 0;
+@@ -188,7 +193,7 @@ asmlinkage long sys_capset(cap_user_head
+ if (get_user(pid, &header->pid))
+ return -EFAULT;
+
+- if (pid && pid != current->pid && !capable(CAP_SETPCAP))
++ if (pid && pid != virt_pid(current) && !capable(CAP_SETPCAP))
+ return -EPERM;
+
+ if (copy_from_user(&effective, &data->effective, sizeof(effective)) ||
+@@ -199,8 +204,8 @@ asmlinkage long sys_capset(cap_user_head
+ spin_lock(&task_capability_lock);
+ read_lock(&tasklist_lock);
+
+- if (pid > 0 && pid != current->pid) {
+- target = find_task_by_pid(pid);
++ if (pid > 0 && pid != virt_pid(current)) {
++ target = find_task_by_pid_ve(pid);
+ if (!target) {
+ ret = -ESRCH;
+ goto out;
+diff -upr linux-2.6.16.orig/kernel/compat.c linux-2.6.16-026test015/kernel/compat.c
+--- linux-2.6.16.orig/kernel/compat.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/kernel/compat.c 2006-07-04 14:41:39.000000000 +0400
+@@ -21,6 +21,8 @@
+ #include <linux/syscalls.h>
+ #include <linux/unistd.h>
+ #include <linux/security.h>
++#include <linux/hrtimer.h>
++#include <linux/module.h>
+
+ #include <asm/uaccess.h>
+
+@@ -38,61 +40,73 @@ int put_compat_timespec(const struct tim
+ __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
+ }
+
+-static long compat_nanosleep_restart(struct restart_block *restart)
++long compat_nanosleep_restart(struct restart_block *restart)
+ {
+- unsigned long expire = restart->arg0, now = jiffies;
+ struct compat_timespec __user *rmtp;
++ struct timespec tu;
++ void *rfn_save = restart->fn;
++ struct hrtimer timer;
++ ktime_t rem;
+
+- /* Did it expire while we handled signals? */
+- if (!time_after(expire, now))
+- return 0;
++ restart->fn = do_no_restart_syscall;
++
++ hrtimer_init(&timer, (clockid_t) restart->arg3, HRTIMER_ABS);
++
++ timer.expires.tv64 = ((u64)restart->arg1 << 32) | (u64) restart->arg0;
+
+- expire = schedule_timeout_interruptible(expire - now);
+- if (expire == 0)
++ set_current_state(TASK_INTERRUPTIBLE);
++ rem = schedule_hrtimer(&timer, HRTIMER_ABS);
++
++ if (rem.tv64 <= 0)
+ return 0;
+
+- rmtp = (struct compat_timespec __user *)restart->arg1;
+- if (rmtp) {
+- struct compat_timespec ct;
+- struct timespec t;
+-
+- jiffies_to_timespec(expire, &t);
+- ct.tv_sec = t.tv_sec;
+- ct.tv_nsec = t.tv_nsec;
+- if (copy_to_user(rmtp, &ct, sizeof(ct)))
+- return -EFAULT;
+- }
+- /* The 'restart' block is already filled in */
++ rmtp = (struct compat_timespec __user *) restart->arg2;
++ tu = ktime_to_timespec(rem);
++ if (rmtp && put_compat_timespec(&tu, rmtp))
++ return -EFAULT;
++
++ restart->fn = rfn_save;
++
++ /* The other values in restart are already filled in */
+ return -ERESTART_RESTARTBLOCK;
+ }
++EXPORT_SYMBOL_GPL(compat_nanosleep_restart);
+
+ asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp,
+ struct compat_timespec __user *rmtp)
+ {
+ struct timespec t;
+ struct restart_block *restart;
+- unsigned long expire;
++ struct hrtimer timer;
++ ktime_t rem;
+
+ if (get_compat_timespec(&t, rqtp))
+ return -EFAULT;
+
+- if ((t.tv_nsec >= 1000000000L) || (t.tv_nsec < 0) || (t.tv_sec < 0))
++ if (!timespec_valid(&t))
+ return -EINVAL;
+
+- expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);
+- expire = schedule_timeout_interruptible(expire);
+- if (expire == 0)
++ hrtimer_init(&timer, CLOCK_MONOTONIC, HRTIMER_REL);
++
++ timer.expires = timespec_to_ktime(t);
++
++ set_current_state(TASK_INTERRUPTIBLE);
++ rem = schedule_hrtimer(&timer, HRTIMER_REL);
++ if (rem.tv64 <= 0)
+ return 0;
+
+- if (rmtp) {
+- jiffies_to_timespec(expire, &t);
+- if (put_compat_timespec(&t, rmtp))
+- return -EFAULT;
+- }
++ t = ktime_to_timespec(rem);
++
++ if (rmtp && put_compat_timespec(&t, rmtp))
++ return -EFAULT;
++
+ restart = &current_thread_info()->restart_block;
+ restart->fn = compat_nanosleep_restart;
+- restart->arg0 = jiffies + expire;
+- restart->arg1 = (unsigned long) rmtp;
++ restart->arg0 = timer.expires.tv64 & 0xFFFFFFFF;
++ restart->arg1 = timer.expires.tv64 >> 32;
++ restart->arg2 = (unsigned long) rmtp;
++ restart->arg3 = (unsigned long) timer.base->index;
++
+ return -ERESTART_RESTARTBLOCK;
+ }
+
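compat_sys_nanosleep() above stores the 64-bit absolute expiry in two unsigned long restart-block slots (arg0 holds the low 32 bits, arg1 the high 32 bits) and compat_nanosleep_restart() reassembles it. A tiny standalone check of that split-and-rejoin arithmetic:

	/* Illustration only: the 64-bit split used for restart->arg0/arg1. */
	#include <stdint.h>
	#include <assert.h>

	int main(void)
	{
		uint64_t expires = 0x123456789abcdef0ULL;
		uint32_t arg0 = expires & 0xFFFFFFFF;		/* low half */
		uint32_t arg1 = expires >> 32;			/* high half */
		uint64_t rebuilt = ((uint64_t)arg1 << 32) | (uint64_t)arg0;

		assert(rebuilt == expires);
		return 0;
	}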
+diff -upr linux-2.6.16.orig/kernel/configs.c linux-2.6.16-026test015/kernel/configs.c
+--- linux-2.6.16.orig/kernel/configs.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/kernel/configs.c 2006-07-04 14:41:38.000000000 +0400
+@@ -89,8 +89,7 @@ static int __init ikconfig_init(void)
+ struct proc_dir_entry *entry;
+
+ /* create the current config file */
+- entry = create_proc_entry("config.gz", S_IFREG | S_IRUGO,
+- &proc_root);
++ entry = create_proc_entry("config.gz", S_IFREG | S_IRUGO, NULL);
+ if (!entry)
+ return -ENOMEM;
+
+diff -upr linux-2.6.16.orig/kernel/cpt/Makefile linux-2.6.16-026test015/kernel/cpt/Makefile
+--- linux-2.6.16.orig/kernel/cpt/Makefile 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/cpt/Makefile 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,41 @@
++#
++#
++# kernel/cpt/Makefile
++#
++# Copyright (C) 2000-2005 SWsoft
++# All rights reserved.
++#
++# Licensing governed by "linux/COPYING.SWsoft" file.
++
++obj-$(CONFIG_VZ_CHECKPOINT) += vzcpt.o vzrst.o
++
++vzcpt-objs := cpt_proc.o cpt_dump.o cpt_obj.o cpt_context.o cpt_process.o \
++ cpt_mm.o cpt_files.o cpt_kernel.o \
++ cpt_socket.o cpt_socket_in.o cpt_tty.o cpt_sysvipc.o cpt_net.o \
++ cpt_conntrack.o cpt_ubc.o cpt_epoll.o
++
++vzrst-objs := rst_proc.o rst_undump.o rst_context.o rst_process.o \
++ rst_mm.o rst_files.o \
++ rst_socket.o rst_socket_in.o rst_tty.o rst_sysvipc.o rst_net.o \
++ rst_conntrack.o rst_ubc.o rst_epoll.o
++
++ifeq ($(CONFIG_VZ_CHECKPOINT), m)
++vzrst-objs += cpt_obj.o cpt_kernel.o
++endif
++
++ifeq ($(CONFIG_VZ_CHECKPOINT_LAZY), y)
++vzcpt-objs += cpt_pagein.o
++vzrst-objs += rst_pagein.o
++endif
++
++ifeq ($(CONFIG_X86_64), y)
++vzcpt-objs += cpt_x8664.o
++vzrst-objs += rst_x8664.o
++ifeq ($(CONFIG_VZ_CHECKPOINT), m)
++vzrst-objs += cpt_x8664.o
++endif
++endif
++
++ifeq ($(CONFIG_X86_32), y)
++vzrst-objs += rst_i386.o
++endif
+diff -upr linux-2.6.16.orig/kernel/cpt/cpt_conntrack.c linux-2.6.16-026test015/kernel/cpt/cpt_conntrack.c
+--- linux-2.6.16.orig/kernel/cpt/cpt_conntrack.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/cpt/cpt_conntrack.c 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,370 @@
++/*
++ *
++ * kernel/cpt/cpt_conntrack.c
++ *
++ * Copyright (C) 2000-2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/config.h>
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/errno.h>
++#include <linux/fs.h>
++#include <linux/socket.h>
++#include <linux/netdevice.h>
++#include <linux/inetdevice.h>
++#include <linux/rtnetlink.h>
++#include <linux/unistd.h>
++#include <linux/ve.h>
++#include <linux/vzcalluser.h>
++#include <linux/cpt_image.h>
++#include <linux/icmp.h>
++#include <linux/ip.h>
++
++#if defined(CONFIG_VE_IPTABLES) && \
++ (defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE))
++
++#include <linux/netfilter.h>
++#include <linux/netfilter_ipv4/ip_conntrack.h>
++#include <linux/netfilter_ipv4/ip_nat.h>
++#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
++#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
++#include <linux/netfilter_ipv4/ip_conntrack_core.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++
++
++/* How does it work?
++ *
++ * Network is disabled, so new conntrack entries will not appear.
++ * However, some of them can disappear because of timeouts.
++ *
++ * So, we take the read_lock, collect all required information atomically,
++ * essentially creating parallel "refcount" structures holding pointers.
++ * We delete the conntrack timers as well, so the entries cannot disappear
++ * after the lock is released. Then, with the lock released, we can dump
++ * everything safely, and on exit we restore the timers to their original values.
++ *
++ * Note, this approach is not going to work in VE0.
++ */
++
++struct ct_holder
++{
++ struct ct_holder *next;
++ struct ip_conntrack_tuple_hash *cth;
++ int index;
++};
++
++static void encode_tuple(struct cpt_ipct_tuple *v, struct ip_conntrack_tuple *tuple)
++{
++ v->cpt_dst = tuple->dst.ip;
++ v->cpt_dstport = tuple->dst.u.all;
++ v->cpt_protonum = tuple->dst.protonum;
++ v->cpt_dir = tuple->dst.dir;
++
++ v->cpt_src = tuple->src.ip;
++ v->cpt_srcport = tuple->src.u.all;
++}
++
++static int dump_one_expect(struct cpt_ip_connexpect_image *v,
++ struct ip_conntrack_expect *exp,
++ int sibling, cpt_context_t *ctx)
++{
++ int err = 0;
++
++ v->cpt_next = sizeof(*v);
++ v->cpt_object = CPT_OBJ_NET_CONNTRACK_EXPECT;
++ v->cpt_hdrlen = sizeof(*v);
++ v->cpt_content = CPT_CONTENT_VOID;
++
++ encode_tuple(&v->cpt_tuple, &exp->tuple);
++ encode_tuple(&v->cpt_mask, &exp->mask);
++ v->cpt_sibling_conntrack = sibling;
++ v->cpt_flags = exp->flags;
++ v->cpt_seq = exp->id;
++ v->cpt_dir = 0;
++ v->cpt_manip_proto = 0;
++#ifdef CONFIG_IP_NF_NAT_NEEDED
++ v->cpt_manip_proto = exp->saved_proto.all;
++ v->cpt_dir = exp->dir;
++#endif
++ v->cpt_timeout = 0;
++ if (exp->master->helper->timeout)
++ v->cpt_timeout = exp->timeout.expires - jiffies;
++ return err;
++}
++
++/* NOTE. We use one page to dump the list of expectations. In theory this may
++ * not be enough. In practice there is only one expectation per conntrack
++ * record. Moreover, since _ALL_ expectations are kept in one global list,
++ * which is looked up for each incoming/outgoing packet, the system would
++ * already be in serious trouble if even a single conntrack had that many
++ * expectations. In short, I am not going to repair this.
++ */
++
++static int dump_expect_list(struct ip_conntrack *ct, struct ct_holder *list,
++ cpt_context_t *ctx)
++{
++ int err = 0;
++ unsigned long pg;
++ struct cpt_ip_connexpect_image *v;
++ struct ip_conntrack_expect *exp;
++
++ if (ct->expecting == 0)
++ return err;
++ if (ct->expecting*sizeof(struct cpt_ip_connexpect_image) > PAGE_SIZE)
++ return -ENOBUFS;
++
++ pg = __get_free_page(GFP_KERNEL);
++ if (!pg)
++ return -ENOMEM;
++ v = (struct cpt_ip_connexpect_image *)pg;
++
++ read_lock_bh(&ip_conntrack_lock);
++ list_for_each_entry(exp, &ve_ip_conntrack_expect_list, list) {
++ int sibling;
++
++ if (exp->master != ct)
++ continue;
++
++ if (ct->helper == NULL) {
++ eprintk_ctx("conntrack: no helper and non-trivial expectation\n");
++ err = -EINVAL;
++ break;
++ }
++
++ sibling = 0;
++#if 0
++ /* That's all? No need to calculate sibling? */
++ if (exp->sibling) {
++ struct ct_holder *c;
++ for (c = list; c; c = c->next) {
++ if (tuplehash_to_ctrack(c->cth) == exp->sibling) {
++ sibling = c->index;
++ break;
++ }
++ }
++			/* NOTE: exp->sibling may not be "confirmed" yet and,
++			 * hence, not in the hash table. We should just ignore
++			 * such a sibling; the connection is going to be
++			 * retried, as the packet apparently was lost somewhere.
++ */
++ if (sibling == 0)
++ dprintk_ctx("sibling conntrack is not found\n");
++ }
++#endif
++
++ /* If the expectation still does not have exp->sibling
++ * and timer is not running, it is about to die on another
++ * cpu. Skip it. */
++ if (!sibling &&
++ ct->helper->timeout &&
++ !timer_pending(&exp->timeout)) {
++ dprintk_ctx("conntrack: expectation: no timer\n");
++ continue;
++ }
++
++ err = dump_one_expect(v, exp, sibling, ctx);
++ if (err)
++ break;
++
++ v++;
++ }
++ read_unlock_bh(&ip_conntrack_lock);
++
++ if (err == 0 && (unsigned long)v != pg)
++ ctx->write((void*)pg, (unsigned long)v - pg, ctx);
++
++ free_page(pg);
++ return err;
++}
++
++static int dump_one_ct(struct ct_holder *c, struct ct_holder *list,
++ cpt_context_t *ctx)
++{
++ struct ip_conntrack_tuple_hash *h = c->cth;
++ struct ip_conntrack *ct = tuplehash_to_ctrack(h);
++ struct cpt_ip_conntrack_image v;
++ int err = 0;
++
++ if (sizeof(v.cpt_proto_data) != sizeof(ct->proto)) {
++ eprintk_ctx("conntrack module ct->proto version mismatch\n");
++ return -EINVAL;
++ }
++ if (sizeof(v.cpt_help_data) != sizeof(ct->help)) {
++ eprintk_ctx("conntrack module ct->help version mismatch\n");
++ return -EINVAL;
++ }
++
++ cpt_open_object(NULL, ctx);
++
++ v.cpt_next = CPT_NULL;
++ v.cpt_object = CPT_OBJ_NET_CONNTRACK;
++ v.cpt_hdrlen = sizeof(v);
++ v.cpt_content = CPT_CONTENT_ARRAY;
++
++ read_lock_bh(&ip_conntrack_lock);
++ v.cpt_status = ct->status;
++ v.cpt_timeout = ct->timeout.expires - jiffies;
++ v.cpt_ct_helper = (ct->helper != NULL);
++ v.cpt_index = c->index;
++ v.cpt_id = ct->id;
++ v.cpt_mark = 0;
++#if defined(CONFIG_IP_NF_CONNTRACK_MARK)
++ v.cpt_mark = ct->mark;
++#endif
++ encode_tuple(&v.cpt_tuple[0], &ct->tuplehash[0].tuple);
++ encode_tuple(&v.cpt_tuple[1], &ct->tuplehash[1].tuple);
++ memcpy(&v.cpt_proto_data, &ct->proto, sizeof(v.cpt_proto_data));
++ memcpy(&v.cpt_help_data, &ct->help, sizeof(v.cpt_help_data));
++
++ v.cpt_masq_index = 0;
++ v.cpt_initialized = 0;
++ v.cpt_num_manips = 0;
++ v.cpt_nat_helper = 0;
++#ifdef CONFIG_IP_NF_NAT_NEEDED
++#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
++ defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
++ v.cpt_masq_index = ct->nat.masq_index;
++#endif
++ /* "help" data is used by pptp, difficult to support */
++ v.cpt_nat_seq[0].cpt_correction_pos = ct->nat.info.seq[0].correction_pos;
++ v.cpt_nat_seq[0].cpt_offset_before = ct->nat.info.seq[0].offset_before;
++ v.cpt_nat_seq[0].cpt_offset_after = ct->nat.info.seq[0].offset_after;
++ v.cpt_nat_seq[1].cpt_correction_pos = ct->nat.info.seq[1].correction_pos;
++ v.cpt_nat_seq[1].cpt_offset_before = ct->nat.info.seq[1].offset_before;
++ v.cpt_nat_seq[1].cpt_offset_after = ct->nat.info.seq[1].offset_after;
++#endif
++ read_unlock_bh(&ip_conntrack_lock);
++
++ ctx->write(&v, sizeof(v), ctx);
++
++ err = dump_expect_list(ct, list, ctx);
++
++ cpt_close_object(ctx);
++ return err;
++}
++
++int cpt_dump_ip_conntrack(cpt_context_t * ctx)
++{
++ struct ct_holder *ct_list = NULL;
++ struct ct_holder *c, **cp;
++ int err = 0;
++ int index = 0;
++ int idx;
++
++ if (get_exec_env()->_ip_conntrack == NULL)
++ return 0;
++
++ for (idx = atomic_read(&(get_exec_env()->_ip_conntrack->_ip_conntrack_count)); idx >= 0; idx--) {
++ c = kmalloc(sizeof(struct ct_holder), GFP_KERNEL);
++ if (c == NULL) {
++ err = -ENOMEM;
++ goto done;
++ }
++ memset(c, 0, sizeof(struct ct_holder));
++ c->next = ct_list;
++ ct_list = c;
++ }
++
++ c = ct_list;
++
++ read_lock_bh(&ip_conntrack_lock);
++ for (idx = 0; idx < ip_conntrack_htable_size; idx++) {
++ struct ip_conntrack_tuple_hash *h;
++ list_for_each_entry(h, &ve_ip_conntrack_hash[idx], list) {
++ /* Skip reply tuples, they are covered by original
++ * direction. */
++ if (DIRECTION(h))
++ continue;
++
++			/* Oops, we do not have enough holders...
++			 * That should be impossible. */
++ if (unlikely(c == NULL)) {
++ read_unlock_bh(&ip_conntrack_lock);
++ eprintk_ctx("unexpected conntrack appeared\n");
++ err = -ENOMEM;
++ goto done;
++ }
++
++ /* If timer is not running, it means that it
++ * has just been scheduled on another cpu.
++ * We should skip this conntrack, it is about to be
++ * destroyed. */
++ if (!del_timer(&tuplehash_to_ctrack(h)->timeout)) {
++ dprintk_ctx("conntrack: no timer\n");
++ continue;
++ }
++
++ /* Timer is deleted. refcnt is _not_ decreased.
++ * We are going to restore the timer on exit
++ * from this function. */
++ c->cth = h;
++ c->index = ++index;
++ c = c->next;
++ }
++ }
++ read_unlock_bh(&ip_conntrack_lock);
++
++ /* No conntracks? Good. */
++ if (index == 0)
++ goto done;
++
++ /* Comb the list a little. */
++ cp = &ct_list;
++ while ((c = *cp) != NULL) {
++		/* Discard unused entries; they can appear if some
++		 * entries timed out since we preallocated the list.
++ */
++ if (c->cth == NULL) {
++ *cp = c->next;
++ kfree(c);
++ continue;
++ }
++
++ /* Move conntracks attached to expectations to the beginning
++ * of the list. */
++ if (tuplehash_to_ctrack(c->cth)->master && c != ct_list) {
++ *cp = c->next;
++ c->next = ct_list;
++ ct_list = c;
++ dprintk_ctx("conntrack: %d moved in list\n", c->index);
++ continue;
++ }
++ cp = &c->next;
++ }
++
++ cpt_open_section(ctx, CPT_SECT_NET_CONNTRACK);
++
++ for (c = ct_list; c; c = c->next) {
++ err = dump_one_ct(c, ct_list, ctx);
++ if (err)
++ goto done;
++ }
++
++ cpt_close_section(ctx);
++
++done:
++ while ((c = ct_list) != NULL) {
++ ct_list = c->next;
++ if (c->cth) {
++ /* Restore timer. refcnt is preserved. */
++ add_timer(&tuplehash_to_ctrack(c->cth)->timeout);
++ }
++ kfree(c);
++ }
++ return err;
++}
++
++#endif
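cpt_dump_ip_conntrack() above freezes the table by deleting each entry's timer (skipping any entry whose timer it loses the race for), dumps the frozen entries, and re-arms the timers on exit; the reference counts are never touched. A condensed sketch of that freeze/dump/thaw pattern with a made-up entry type, not the actual cpt code:

	/* Sketch of the freeze/dump/thaw pattern; struct my_entry is
	 * hypothetical and assumed to start with skip == 0. */
	#include <linux/timer.h>

	struct my_entry {
		struct my_entry *next;
		struct timer_list timeout;
		int skip;
	};

	static int dump_all(struct my_entry *list,
			    int (*dump_one)(struct my_entry *))
	{
		struct my_entry *e;
		int err = 0;

		for (e = list; e; e = e->next)		/* freeze: stop expiry */
			if (!del_timer(&e->timeout))
				e->skip = 1;		/* already dying elsewhere */

		for (e = list; e && !err; e = e->next)	/* dump while frozen */
			if (!e->skip)
				err = dump_one(e);

		for (e = list; e; e = e->next)		/* thaw: refcounts untouched */
			if (!e->skip)
				add_timer(&e->timeout);
		return err;
	}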
+diff -upr linux-2.6.16.orig/kernel/cpt/cpt_context.c linux-2.6.16-026test015/kernel/cpt/cpt_context.c
+--- linux-2.6.16.orig/kernel/cpt/cpt_context.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/cpt/cpt_context.c 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,255 @@
++/*
++ *
++ * kernel/cpt/cpt_context.c
++ *
++ * Copyright (C) 2000-2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/config.h>
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/errno.h>
++#include <linux/pagemap.h>
++
++#include <linux/cpt_image.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++
++
++static void file_write(const void *addr, size_t count, struct cpt_context *ctx)
++{
++ mm_segment_t oldfs;
++ ssize_t err = -EBADF;
++ struct file *file = ctx->file;
++
++ oldfs = get_fs(); set_fs(KERNEL_DS);
++ if (file)
++ err = file->f_op->write(file, addr, count, &file->f_pos);
++ set_fs(oldfs);
++ if (err != count && !ctx->write_error)
++ ctx->write_error = err < 0 ? err : -EIO;
++}
++
++static void file_pwrite(void *addr, size_t count, struct cpt_context *ctx, loff_t pos)
++{
++ mm_segment_t oldfs;
++ ssize_t err = -EBADF;
++ struct file *file = ctx->file;
++
++ oldfs = get_fs(); set_fs(KERNEL_DS);
++ if (file)
++ err = file->f_op->write(file, addr, count, &pos);
++ set_fs(oldfs);
++ if (err != count && !ctx->write_error)
++ ctx->write_error = err < 0 ? err : -EIO;
++}
++
++static void file_align(struct cpt_context *ctx)
++{
++ struct file *file = ctx->file;
++
++ if (file)
++ file->f_pos = CPT_ALIGN(file->f_pos);
++}
++
++void cpt_context_init(struct cpt_context *ctx)
++{
++ int i;
++
++ memset(ctx, 0, sizeof(*ctx));
++
++ init_MUTEX(&ctx->main_sem);
++ ctx->refcount = 1;
++
++ ctx->current_section = -1;
++ ctx->current_object = -1;
++ ctx->pagesize = PAGE_SIZE;
++ ctx->write = file_write;
++ ctx->pwrite = file_pwrite;
++ ctx->align = file_align;
++ for (i=0; i < CPT_SECT_MAX; i++)
++ ctx->sections[i] = CPT_NULL;
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++ init_completion(&ctx->pgin_notify);
++#endif
++ cpt_object_init(ctx);
++}
++
++int cpt_open_dumpfile(struct cpt_context *ctx)
++{
++ ctx->tmpbuf = (char*)__get_free_page(GFP_KERNEL);
++ if (ctx->tmpbuf == NULL)
++ return -ENOMEM;
++ __cpt_release_buf(ctx);
++ return 0;
++}
++
++int cpt_close_dumpfile(struct cpt_context *ctx)
++{
++ if (ctx->file) {
++ fput(ctx->file);
++ ctx->file = NULL;
++ }
++ if (ctx->tmpbuf) {
++ free_page((unsigned long)ctx->tmpbuf);
++ ctx->tmpbuf = NULL;
++ }
++ if (ctx->write_error)
++ eprintk_ctx("error while writing dump file: %d\n", ctx->write_error);
++ return ctx->write_error;
++}
++
++int cpt_major_hdr_out(struct cpt_context *ctx)
++{
++ struct cpt_major_hdr hdr;
++
++ if (ctx->file == NULL)
++ return 0;
++
++ memset(&hdr, 0, sizeof(hdr));
++ hdr.cpt_signature[0] = CPT_SIGNATURE0;
++ hdr.cpt_signature[1] = CPT_SIGNATURE1;
++ hdr.cpt_signature[2] = CPT_SIGNATURE2;
++ hdr.cpt_signature[3] = CPT_SIGNATURE3;
++ hdr.cpt_hdrlen = sizeof(hdr);
++ hdr.cpt_image_version = 1;
++#ifdef CONFIG_X86_32
++ hdr.cpt_os_arch = CPT_OS_ARCH_I386;
++#endif
++#ifdef CONFIG_X86_64
++ hdr.cpt_os_arch = CPT_OS_ARCH_EMT64;
++#endif
++ hdr.cpt_os_version = 0;
++ hdr.cpt_os_features = 0;
++ hdr.cpt_pagesize = PAGE_SIZE;
++ hdr.cpt_hz = HZ;
++ hdr.cpt_start_jiffies64 = ctx->virt_jiffies64;
++ hdr.cpt_start_sec = ctx->start_time.tv_sec;
++ hdr.cpt_start_nsec = ctx->start_time.tv_nsec;
++ hdr.cpt_cpu_caps[0] = ctx->src_cpu_flags;
++ hdr.cpt_kernel_config[0] = ctx->kernel_config_flags;
++ hdr.cpt_iptables_mask = ctx->iptables_mask;
++
++ ctx->write(&hdr, sizeof(hdr), ctx);
++ return 0;
++}
++
++int cpt_close_section(struct cpt_context *ctx)
++{
++ if (ctx->file && ctx->current_section >= 0) {
++ __u64 next = ctx->file->f_pos - ctx->current_section;
++ ctx->pwrite(&next, 8, ctx, ctx->current_section);
++ ctx->current_section = -1;
++ }
++ return 0;
++}
++EXPORT_SYMBOL(cpt_close_section);
++
++int cpt_open_section(struct cpt_context *ctx, __u32 type)
++{
++ struct cpt_section_hdr hdr;
++
++ if (ctx->file == NULL)
++ return 0;
++
++ cpt_close_section(ctx);
++
++ ctx->current_section = ctx->file->f_pos;
++ ctx->sections[type] = ctx->current_section;
++
++ hdr.cpt_next = 0;
++ hdr.cpt_section = type;
++ hdr.cpt_hdrlen = sizeof(hdr);
++ hdr.cpt_align = 0;
++ ctx->write(&hdr, sizeof(hdr), ctx);
++
++ return 0;
++}
++EXPORT_SYMBOL(cpt_open_section);
++
++
++int cpt_close_object(struct cpt_context *ctx)
++{
++ if (ctx->file && ctx->current_object >= 0) {
++ __u64 next = ctx->file->f_pos - ctx->current_object;
++ ctx->pwrite(&next, 8, ctx, ctx->current_object);
++ ctx->current_object = -1;
++ }
++ return 0;
++}
++EXPORT_SYMBOL(cpt_close_object);
++
++int cpt_open_object(cpt_object_t *obj, struct cpt_context *ctx)
++{
++ if (ctx->file == NULL)
++ return 0;
++
++ cpt_close_object(ctx);
++
++ ctx->current_object = ctx->file->f_pos;
++ if (obj)
++ cpt_obj_setpos(obj, ctx->current_object, ctx);
++
++ return 0;
++}
++EXPORT_SYMBOL(cpt_open_object);
++
++int cpt_push_object(loff_t *saved, struct cpt_context *ctx)
++{
++ if (ctx->file) {
++ *saved = ctx->current_object;
++ ctx->current_object = ctx->file->f_pos;
++ }
++ return 0;
++}
++EXPORT_SYMBOL(cpt_push_object);
++
++int cpt_pop_object(loff_t *saved, struct cpt_context *ctx)
++{
++ ctx->current_object = *saved;
++ return 0;
++}
++EXPORT_SYMBOL(cpt_pop_object);
++
++int cpt_dump_tail(struct cpt_context *ctx)
++{
++ struct cpt_major_tail hdr;
++ int i;
++
++ if (ctx->file == NULL)
++ return 0;
++
++ cpt_open_section(ctx, CPT_SECT_TRAILER);
++ memset(&hdr, 0, sizeof(hdr));
++ hdr.cpt_next = sizeof(hdr);
++ hdr.cpt_object = CPT_OBJ_TRAILER;
++ hdr.cpt_hdrlen = sizeof(hdr);
++ hdr.cpt_content = CPT_CONTENT_VOID;
++ hdr.cpt_lazypages = 0;
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++ hdr.cpt_lazypages = ctx->lazypages;
++#endif
++ hdr.cpt_64bit = ctx->tasks64;
++ hdr.cpt_signature[0] = CPT_SIGNATURE0;
++ hdr.cpt_signature[1] = CPT_SIGNATURE1;
++ hdr.cpt_signature[2] = CPT_SIGNATURE2;
++ hdr.cpt_signature[3] = CPT_SIGNATURE3;
++ hdr.cpt_nsect = CPT_SECT_MAX_INDEX;
++ for (i = 0; i < CPT_SECT_MAX_INDEX; i++)
++ hdr.cpt_sections[i] = ctx->sections[i];
++
++ ctx->write(&hdr, sizeof(hdr), ctx);
++ cpt_close_section(ctx);
++ return 0;
++}
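cpt_open_section() and cpt_open_object() above remember the current file offset and emit a header whose size field is still unknown; the matching close routine computes the final length and patches it back with ctx->pwrite() at the saved offset. A user-space sketch of the same reserve-then-back-patch framing, with hypothetical helper names:

	/* Sketch of reserve-then-back-patch record framing (hypothetical helpers). */
	#include <stdint.h>
	#include <sys/types.h>
	#include <unistd.h>

	static off_t begin_record(int fd)
	{
		uint64_t next = 0;			/* length placeholder */
		off_t start = lseek(fd, 0, SEEK_CUR);

		write(fd, &next, sizeof(next));
		return start;
	}

	static void end_record(int fd, off_t start)
	{
		uint64_t next = lseek(fd, 0, SEEK_CUR) - start;

		pwrite(fd, &next, sizeof(next), start);	/* back-patch the length */
	}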
+diff -upr linux-2.6.16.orig/kernel/cpt/cpt_context.h linux-2.6.16-026test015/kernel/cpt/cpt_context.h
+--- linux-2.6.16.orig/kernel/cpt/cpt_context.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/cpt/cpt_context.h 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,196 @@
++#include <linux/fs.h>
++#include <asm/uaccess.h>
++
++#define CPT_CTX_ERROR -1
++#define CPT_CTX_IDLE 0
++#define CPT_CTX_SUSPENDING 1
++#define CPT_CTX_SUSPENDED 2
++#define CPT_CTX_DUMPING 3
++#define CPT_CTX_UNDUMPING 4
++#define CPT_CTX_UNDUMPED 5
++
++#define CPT_TID(tsk) (tsk)->pid, virt_pid(tsk), (tsk)->comm
++#define CPT_FID "%d,%d(%s)"
++
++
++typedef struct cpt_context
++{
++ struct list_head ctx_list;
++ int refcount;
++ int ctx_state;
++ int objcount;
++ int sticky;
++ struct semaphore main_sem;
++
++ struct file *errorfile;
++ struct file *statusfile;
++ struct file *lockfile;
++
++ int errno;
++ char *error_msg;
++ loff_t err_offset;
++
++ struct file *file;
++ char *tmpbuf;
++ int pagesize;
++
++ loff_t current_section;
++ loff_t current_object;
++
++ loff_t sections[CPT_SECT_MAX];
++
++ __u32 errormask;
++ __u32 write_error;
++
++ struct list_head object_array[CPT_OBJ_MAX];
++
++ void (*write)(const void *addr, size_t count, struct cpt_context *ctx);
++ void (*pwrite)(void *addr, size_t count, struct cpt_context *ctx, loff_t pos);
++ ssize_t (*read)(void *addr, size_t count, struct cpt_context *ctx);
++ ssize_t (*pread)(void *addr, size_t count, struct cpt_context *ctx, loff_t pos);
++ void (*align)(struct cpt_context *ctx);
++ int ve_id;
++ int contextid;
++ __u64 cpt_jiffies64; /* Host jiffies64 at the moment of cpt/rst,
++					 * corresponding to start_time */
++ __u64 virt_jiffies64; /* Virtual jiffies64. It is == cpt_jiffies64 when
++ * VE did not migrate. */
++ struct timespec start_time;
++ struct timespec delta_time;
++ int image_version;
++ int lo_index;
++ int lo_index_old;
++ int venet_index;
++ int venet_index_old;
++ __u64 iptables_mask;
++
++#define CPT_ANONVMA_HBITS (sizeof(void*) == 4 ? 10 : 9)
++#define CPT_ANONVMA_HSIZE (1<<CPT_ANONVMA_HBITS)
++ struct hlist_head *anonvmas;
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++ struct file *pagein_file_in;
++ struct file *pagein_file_out;
++ int lazy_vm;
++ int lazypages;
++ int lazytype;
++ task_t *pgin_task;
++ unsigned long last_pagein;
++ struct pagein_desc **pgin_dir;
++ struct pgin_device *pagein_dev;
++ struct completion pgin_notify;
++ struct completion *pgind_completion;
++ struct swap_info_struct *pgin_swp;
++#endif
++ int tasks64;
++ __u32 src_cpu_flags;
++ __u32 dst_cpu_flags;
++ __u32 kernel_config_flags;
++
++ struct filejob *filejob_queue;
++} cpt_context_t;
++
++typedef struct {
++ int pid;
++ cpt_context_t *ctx;
++ struct completion done;
++} pagein_info_t;
++
++int pagein_info_printf(char *buf, cpt_context_t *ctx);
++
++int cpt_open_dumpfile(struct cpt_context *);
++int cpt_close_dumpfile(struct cpt_context *);
++int rst_open_dumpfile(struct cpt_context *);
++void rst_close_dumpfile(struct cpt_context *);
++void cpt_context_init(struct cpt_context *);
++void rst_context_init(struct cpt_context *);
++void cpt_context_destroy(struct cpt_context *);
++
++void rst_report_error(int err, cpt_context_t *ctx);
++
++
++int cpt_major_hdr_out(struct cpt_context *ctx);
++int cpt_dump_tail(struct cpt_context *ctx);
++int cpt_close_section(struct cpt_context *ctx);
++int cpt_open_section(struct cpt_context *ctx, __u32 type);
++int cpt_close_object(struct cpt_context *ctx);
++int cpt_open_object(cpt_object_t *obj, struct cpt_context *ctx);
++int cpt_push_object(loff_t *saved, struct cpt_context *ctx);
++int cpt_pop_object(loff_t *saved, struct cpt_context *ctx);
++
++int rst_get_section(int type, struct cpt_context * ctx, loff_t *, loff_t *);
++__u8 *__rst_get_name(loff_t *pos_p, struct cpt_context *ctx);
++__u8 *rst_get_name(loff_t pos, struct cpt_context *ctx);
++void rst_put_name(__u8 *name, struct cpt_context *ctx);
++int _rst_get_object(int type, loff_t pos, void *tmp, int size, struct cpt_context *ctx);
++void * __rst_get_object(int type, loff_t pos, struct cpt_context *ctx);
++
++#define rst_get_object(type, pos, tmp, ctx) \
++ _rst_get_object((type), (pos), (tmp), sizeof(*(tmp)), (ctx))
++
++extern int debug_level;
++
++#define cpt_printk(lvl, fmt, args...) do { \
++ if (lvl <= debug_level) \
++ printk(fmt, ##args); \
++ } while (0)
++
++#define dprintk(a...) cpt_printk(3, "CPT DBG: " a)
++#define dprintk_ctx(f, arg...) dprintk("%p,%u: " f, ctx, ctx->ve_id, ##arg)
++
++#define wprintk(a...) cpt_printk(2, "CPT WRN: " a)
++#define wprintk_ctx(f, arg...) wprintk("%p,%u: " f, ctx, ctx->ve_id, ##arg)
++
++#define eprintk(a...) cpt_printk(1, "CPT ERR: " a)
++#define eprintk_ctx(f, arg...) \
++do { \
++	eprintk("%p,%u: " f, ctx, ctx->ve_id, ##arg);			\
++ if (ctx->error_msg && ctx->err_offset < PAGE_SIZE) \
++ ctx->err_offset += snprintf((char*)(ctx->error_msg + \
++ ctx->err_offset), \
++ PAGE_SIZE - ctx->err_offset, f, ##arg); \
++} while(0)
++
++#define CPT_TMPBUF_FREE 0x789adf12
++#define CPT_TMPBUF_BUSY 0xabcd9876
++
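++/* ctx->tmpbuf is a single scratch page. Its last four bytes hold a guard
++ * word, so unbalanced or nested cpt_get_buf()/cpt_release_buf() calls are
++ * caught by the BUG_ON() checks below.
++ */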
++static inline void *cpt_get_buf(cpt_context_t *ctx)
++{
++ void *buf = ctx->tmpbuf;
++
++ BUG_ON(*(u32*)(buf + PAGE_SIZE - 4) != CPT_TMPBUF_FREE);
++ *(u32*)(buf + PAGE_SIZE - 4) = CPT_TMPBUF_BUSY;
++ return buf;
++}
++
++static inline void __cpt_release_buf(cpt_context_t *ctx)
++{
++ void *buf = ctx->tmpbuf;
++
++ *(u32*)(buf + PAGE_SIZE - 4) = CPT_TMPBUF_FREE;
++}
++
++static inline void cpt_release_buf(cpt_context_t *ctx)
++{
++ void *buf = ctx->tmpbuf;
++
++ BUG_ON(*(u32*)(buf + PAGE_SIZE - 4) != CPT_TMPBUF_BUSY);
++ *(u32*)(buf + PAGE_SIZE - 4) = CPT_TMPBUF_FREE;
++}
++
++static inline void cpt_flush_error(cpt_context_t *ctx)
++{
++ mm_segment_t oldfs;
++
++ if (ctx->errorfile && ctx->error_msg && ctx->err_offset) {
++ if (ctx->errorfile->f_op && ctx->errorfile->f_op->write) {
++ oldfs = get_fs();
++ set_fs(KERNEL_DS);
++ ctx->errorfile->f_op->write(ctx->errorfile,
++ ctx->error_msg, ctx->err_offset,
++ &ctx->errorfile->f_pos);
++ set_fs(oldfs);
++ }
++ ctx->error_msg[0] = 0;
++ ctx->err_offset = 0;
++ }
++}
+diff -upr linux-2.6.16.orig/kernel/cpt/cpt_dump.c linux-2.6.16-026test015/kernel/cpt/cpt_dump.c
+--- linux-2.6.16.orig/kernel/cpt/cpt_dump.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/cpt/cpt_dump.c 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,840 @@
++/*
++ *
++ * kernel/cpt/cpt_dump.c
++ *
++ * Copyright (C) 2000-2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/config.h>
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/errno.h>
++#include <linux/pagemap.h>
++#include <linux/ptrace.h>
++#include <linux/smp_lock.h>
++#include <linux/ve.h>
++#include <linux/ve_proto.h>
++#include <linux/virtinfo.h>
++#include <ub/ub_task.h>
++#include <linux/cpt_image.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_dump.h"
++#include "cpt_files.h"
++#include "cpt_mm.h"
++#include "cpt_process.h"
++#include "cpt_net.h"
++#include "cpt_socket.h"
++#include "cpt_ubc.h"
++#include "cpt_kernel.h"
++
++
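++/* Walk from task c up through group leaders and real parents, staying
++ * within c's VE, and return the number of steps needed to reach root;
++ * return -1 if root is never reached.
++ */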
++static int vps_child_level(task_t *root, task_t *c)
++{
++ int level = 0;
++ int veid = VE_TASK_INFO(c)->owner_env->veid;
++
++ while (VE_TASK_INFO(c)->owner_env->veid == veid) {
++ if (c->pid != c->tgid)
++ c = c->group_leader;
++ if (c == root)
++ return level;
++
++ c = c->real_parent;
++ level++;
++ }
++ return -1;
++}
++
++static inline int freezable(struct task_struct * p)
++{
++ if (p->exit_state)
++ return 0;
++
++ switch (p->state) {
++ case EXIT_ZOMBIE:
++ case EXIT_DEAD:
++ case TASK_STOPPED:
++#if TASK_TRACED != TASK_STOPPED
++ case TASK_TRACED:
++#endif
++ return 0;
++ default:
++ return 1;
++ }
++}
++
++/*
++ * A comment is necessary about PF_FREEZE, PF_FROZEN and TIF_FREEZE...
++ *
++ * SWSUSP uses the PF_FREEZE flag in tsk->flags, raising it in the context
++ * of another process. Apparently, this is unacceptable on SMP.
++ * Let's take freeze_processes() in kernel/power/process.c as an example.
++ * Unserialized modification of tsk->flags easily
++ * (believe it or not, it happens with a probability of almost 100% :-))
++ * creates a situation where freeze_processes(),
++ * which quickly spins raising PF_FREEZE on all the processes,
++ * _clears_ the PF_FROZEN flag just set in refrigerator(), so that suspend deadlocks.
++ *
++ * So, to keep things clean, we require that those flags be modified
++ * only under tsk->sighand->siglock, which is quite natural because PF_FREEZE
++ * is just a kind of signal.
++ *
++ * That is not enough, because we are still not allowed to change tsk->flags
++ * in the context of another process: we could corrupt other flags while the
++ * process running on another cpu modifies them. So we use TIF_FREEZE in the
++ * thread flags, which can be changed atomically.
++ *
++ * PF_FROZEN is also changed in the context of another process, but this
++ * happens only when the process is already in refrigerator(), which does not
++ * modify tsk->flags.
++ */
++
++static int vps_stop_tasks(struct cpt_context *ctx)
++{
++ unsigned long start_time = jiffies;
++ int err;
++ task_t *p, *g;
++ int todo;
++ int round = 0;
++
++ do_gettimespec(&ctx->start_time);
++ ctx->cpt_jiffies64 = get_jiffies_64();
++ ctx->virt_jiffies64 = ctx->cpt_jiffies64 + get_exec_env()->jiffies_fixup;
++
++ read_lock(&tasklist_lock);
++ for(;;) {
++ task_t *root;
++ todo = 0;
++
++ root = find_task_by_pid_ve(1);
++ if (!root) {
++ read_unlock(&tasklist_lock);
++ eprintk_ctx("cannot find ve init\n");
++ return -ESRCH;
++ }
++
++ do_each_thread_ve(g, p) {
++ if (vps_child_level(root, p) >= 0) {
++ if (!is_virtual_pid(virt_pid(p))) {
++ eprintk_ctx("external process %d/%d(%s) inside VPS (e.g. vzctl enter or vzctl exec).\n", virt_pid(p), p->pid, p->comm);
++ todo = -1;
++ goto out;
++ }
++ if (p->vfork_done) {
++				/* A task between vfork()...exec()
++				 * cannot be frozen, because the parent
++				 * waits in an uninterruptible state.
++ * So, we do nothing, waiting for
++ * exec(), unless:
++ */
++ if (p->state == TASK_STOPPED ||
++ p->state == TASK_TRACED) {
++ eprintk_ctx("task %d/%d(%s) is stopped while vfork(). Checkpointing is impossible.\n", virt_pid(p), p->pid, p->comm);
++ todo = -1;
++					/* It is fatal: the _user_ stopped
++					 * a vfork()ing task, so we
++					 * cannot suspend now.
++ */
++ } else {
++ todo = -3;
++ }
++ goto out;
++ }
++ if (p->state == TASK_TRACED
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,9)
++ && !p->stopped_state
++#endif
++ ) {
++ int ptrace_id = p->pn_state;
++ /* Debugger waits for signal. */
++ switch (ptrace_id) {
++ case PN_STOP_TF:
++ case PN_STOP_TF_RT:
++ case PN_STOP_ENTRY:
++ case PN_STOP_FORK:
++ case PN_STOP_VFORK:
++ case PN_STOP_SIGNAL:
++ case PN_STOP_EXIT:
++ case PN_STOP_LEAVE:
++ break;
++ default:
++ eprintk_ctx("task %d/%d(%s) is stopped by debugger while %d.\n", virt_pid(p), p->pid, p->comm, ptrace_id);
++ todo = -1;
++ goto out;
++ }
++ }
++ if (p->flags & PF_NOFREEZE)
++ goto out;
++ if (p->flags & PF_FROZEN)
++ continue;
++ if (!freezable(p))
++ continue;
++
++ spin_lock_irq(&p->sighand->siglock);
++ set_tsk_thread_flag(p, TIF_FREEZE);
++ signal_wake_up(p, 0);
++ spin_unlock_irq(&p->sighand->siglock);
++
++ if (round == 10)
++ wprintk_ctx("%d/%d(%s) is running\n", virt_pid(p), p->pid, p->comm);
++
++ todo++;
++ } else {
++ if (p != current) {
++ eprintk_ctx("foreign process %d/%d(%s) inside VPS (e.g. vzctl enter or vzctl exec).\n", virt_pid(p), p->pid, p->comm);
++ todo = -1;
++ goto out;
++ }
++ }
++ } while_each_thread_ve(g, p);
++
++out:
++ if (todo &&
++ (time_after(jiffies, start_time + 10*HZ) ||
++ signal_pending(current) || todo < 0)) {
++ do_each_thread_ve(g, p) {
++ if (vps_child_level(root, p) >= 0) {
++ spin_lock_irq(&p->sighand->siglock);
++ clear_tsk_thread_flag(p, TIF_FREEZE);
++ if (p->flags & PF_FROZEN) {
++ p->flags &= ~PF_FROZEN;
++ wake_up_process(p);
++ }
++ spin_unlock_irq(&p->sighand->siglock);
++ }
++ } while_each_thread_ve(g, p);
++ if (todo > 0)
++ todo = -2;
++			/* This is a sign of failure of printk(), which is not
++ * ours. So, no prefixes. */
++ printk(">\n");
++ }
++
++ read_unlock(&tasklist_lock);
++
++ if (!todo)
++ return 0;
++
++ if (todo == -1) {
++ eprintk_ctx("suspend is impossible now.\n");
++ return -EAGAIN;
++ }
++
++ if (todo == -2) {
++ eprintk_ctx("interrupted or timed out.\n");
++ return -EINTR;
++ }
++
++ if (time_after(jiffies, start_time + 10*HZ) ||
++ signal_pending(current)) {
++ if (todo == -3) {
++ eprintk_ctx("vfork() is active, suspend is impossible now.\n");
++ } else {
++ eprintk_ctx("suspend is impossible, reason %d\n", todo);
++ }
++ return -EAGAIN;
++ }
++
++ if (todo < 0 || round > 0) {
++ current->state = TASK_INTERRUPTIBLE;
++ schedule_timeout(HZ/50);
++ } else {
++ yield();
++ }
++
++ read_lock(&tasklist_lock);
++ round++;
++ }
++
++ read_unlock(&tasklist_lock);
++ return err;
++}
++
++static int cpt_unlock_ve(struct cpt_context *ctx)
++{
++ struct ve_struct *env;
++
++ env = get_ve_by_id(ctx->ve_id);
++ if (!env)
++ return -ESRCH;
++ down_write(&env->op_sem);
++ env->is_locked = 0;
++ up_write(&env->op_sem);
++ put_ve(env);
++ return 0;
++}
++
++int cpt_resume(struct cpt_context *ctx)
++{
++ cpt_object_t *obj;
++
++ cpt_unlock_sockets(ctx);
++
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++ if (ctx->pgin_task) {
++ wait_for_completion(&ctx->pgin_notify);
++ put_task_struct(ctx->pgin_task);
++ ctx->pgin_task = NULL;
++ }
++#endif
++
++ for_each_object(obj, CPT_OBJ_TASK) {
++ task_t *tsk = obj->o_obj;
++
++ spin_lock_irq(&tsk->sighand->siglock);
++ clear_tsk_thread_flag(tsk, TIF_FREEZE);
++ if (tsk->flags & PF_FROZEN) {
++ tsk->flags &= ~PF_FROZEN;
++ wake_up_process(tsk);
++ } else if (freezable(tsk)) {
++ eprintk_ctx("strange, %s not frozen\n", tsk->comm );
++ }
++ spin_unlock_irq(&tsk->sighand->siglock);
++ put_task_struct(tsk);
++ }
++
++ cpt_resume_network(ctx);
++
++ cpt_unlock_ve(ctx);
++
++ cpt_finish_ubc(ctx);
++ cpt_object_destroy(ctx);
++ return 0;
++}
++
++int cpt_kill(struct cpt_context *ctx)
++{
++ int err = 0;
++ struct ve_struct *env;
++ cpt_object_t *obj;
++ task_t *root_task = NULL;
++ long delay;
++
++ if (!ctx->ve_id)
++ return -EINVAL;
++
++ env = get_ve_by_id(ctx->ve_id);
++ if (!env)
++ return -ESRCH;
++
++ /* from here cpt_kill succeeds */
++ if (VE_TASK_INFO(current)->owner_env == env) {
++ wprintk_ctx("attempt to kill ve from inside, escaping...\n");
++
++ write_lock_irq(&tasklist_lock);
++ VE_TASK_INFO(current)->owner_env = get_ve0();
++ REMOVE_VE_LINKS(current);
++ SET_VE_LINKS(current);
++
++ atomic_inc(&get_ve0()->pcounter);
++ atomic_dec(&env->pcounter);
++ write_unlock_irq(&tasklist_lock);
++ set_exec_env(get_ve0());
++ }
++
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++ if (ctx->pgin_task) {
++ wait_for_completion(&ctx->pgin_notify);
++ put_task_struct(ctx->pgin_task);
++ ctx->pgin_task = NULL;
++ }
++#endif
++
++ cpt_kill_sockets(ctx);
++
++ for_each_object(obj, CPT_OBJ_TASK) {
++ task_t *tsk = obj->o_obj;
++
++ if (tsk->exit_state) {
++ put_task_struct(tsk);
++ continue;
++ }
++
++ if (virt_pid(tsk) == 1) {
++ root_task = tsk;
++ continue;
++ }
++
++ if (tsk->ptrace) {
++ write_lock_irq(&tasklist_lock);
++ tsk->ptrace = 0;
++ if (!list_empty(&tsk->ptrace_list)) {
++ list_del_init(&tsk->ptrace_list);
++ REMOVE_LINKS(tsk);
++ tsk->parent = tsk->real_parent;
++ SET_LINKS(tsk);
++ }
++ write_unlock_irq(&tasklist_lock);
++ }
++
++ send_sig(SIGKILL, tsk, 1);
++
++ spin_lock_irq(&tsk->sighand->siglock);
++ sigfillset(&tsk->blocked);
++ sigdelsetmask(&tsk->blocked, sigmask(SIGKILL));
++ set_tsk_thread_flag(tsk, TIF_SIGPENDING);
++ clear_tsk_thread_flag(tsk, TIF_FREEZE);
++ if (tsk->flags & PF_FROZEN)
++ tsk->flags &= ~PF_FROZEN;
++ spin_unlock_irq(&tsk->sighand->siglock);
++
++ wake_up_process(tsk);
++ put_task_struct(tsk);
++ }
++
++ yield();
++
++ if (root_task != NULL) {
++ send_sig(SIGKILL, root_task, 1);
++
++ spin_lock_irq(&root_task->sighand->siglock);
++ sigfillset(&root_task->blocked);
++ sigdelsetmask(&root_task->blocked, sigmask(SIGKILL));
++ set_tsk_thread_flag(root_task, TIF_SIGPENDING);
++ clear_tsk_thread_flag(root_task, TIF_FREEZE);
++ if (root_task->flags & PF_FROZEN)
++ root_task->flags &= ~PF_FROZEN;
++ spin_unlock_irq(&root_task->sighand->siglock);
++
++ wake_up_process(root_task);
++ put_task_struct(root_task);
++ }
++
++ cpt_finish_ubc(ctx);
++ cpt_object_destroy(ctx);
++
++ delay = 1;
++ while (atomic_read(&env->counter) != 1) {
++ if (signal_pending(current))
++ break;
++ current->state = TASK_INTERRUPTIBLE;
++ delay = (delay < HZ) ? (delay << 1) : HZ;
++ schedule_timeout(delay);
++ }
++ put_ve(env);
++
++ return err;
++}
++
++static void collect_task_ubc(task_t *t, struct cpt_context *ctx)
++{
++ struct task_beancounter *tbc;
++
++ tbc = &(t->task_bc);
++ cpt_add_ubc(tbc->exec_ub, ctx);
++ cpt_add_ubc(tbc->task_ub, ctx);
++ cpt_add_ubc(tbc->fork_sub, ctx);
++}
++
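++/* Register one frozen child task in the CPT_OBJ_TASK list after "head".
++ * The task reference taken by the caller is kept for the new object;
++ * on failure the reference is dropped and NULL is returned.
++ */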
++static cpt_object_t * remember_task(task_t * child, cpt_object_t * head,
++ cpt_context_t * ctx)
++{
++ cpt_object_t *cobj;
++
++ if (freezable(child) && !(child->flags&PF_FROZEN)) {
++ eprintk_ctx("process " CPT_FID " is not frozen\n", CPT_TID(child));
++ put_task_struct(child);
++ return NULL;
++ }
++
++ if (lookup_cpt_object(CPT_OBJ_TASK, child, ctx)) BUG();
++ if ((cobj = alloc_cpt_object(GFP_KERNEL, ctx)) == NULL) {
++ put_task_struct(child);
++ return NULL;
++ }
++ cobj->o_count = 1;
++ cpt_obj_setobj(cobj, child, ctx);
++ insert_cpt_object(CPT_OBJ_TASK, cobj, head, ctx);
++ collect_task_ubc(child, ctx);
++ return cobj;
++}
++
++static int vps_collect_tasks(struct cpt_context *ctx)
++{
++ int err = -ESRCH;
++ cpt_object_t *obj;
++ task_t *root;
++
++ read_lock(&tasklist_lock);
++ root = find_task_by_pid_ve(1);
++ if (root)
++ get_task_struct(root);
++ read_unlock(&tasklist_lock);
++
++ if (!root) {
++ err = -ESRCH;
++ eprintk_ctx("vps_collect_tasks: cannot find root\n");
++ goto out;
++ }
++
++ if ((obj = alloc_cpt_object(GFP_KERNEL, ctx)) == NULL) {
++ put_task_struct(root);
++ return -ENOMEM;
++ }
++ obj->o_count = 1;
++ cpt_obj_setobj(obj, root, ctx);
++ intern_cpt_object(CPT_OBJ_TASK, obj, ctx);
++ collect_task_ubc(root, ctx);
++
++ /* Collect process subtree recursively */
++ for_each_object(obj, CPT_OBJ_TASK) {
++ cpt_object_t *head = obj;
++ task_t *tsk = obj->o_obj;
++ task_t *child;
++
++ if (freezable(tsk) && !(tsk->flags&PF_FROZEN)) {
++ eprintk_ctx("process " CPT_FID " is not frozen\n", CPT_TID(tsk));
++ err = -EINVAL;
++ goto out;
++ }
++
++ wait_task_inactive(tsk);
++
++ if (tsk->pid == tsk->tgid) {
++ child = tsk;
++ for (;;) {
++ read_lock(&tasklist_lock);
++ child = next_thread(child);
++ if (child != tsk)
++ get_task_struct(child);
++ read_unlock(&tasklist_lock);
++
++ if (child == tsk)
++ break;
++
++ if (child->real_parent != tsk->real_parent) {
++ put_task_struct(child);
++ eprintk_ctx("illegal thread structure, kernel bug\n");
++ return -EINVAL;
++ }
++
++ if ((head = remember_task(child, head, ctx)) == NULL)
++ return -ENOMEM;
++ }
++ }
++
++		/* About locking: the VE is frozen, but the lists of children
++		 * may still change, at least for init, when an entered task
++		 * reparents to init and when a reparented task exits. If we
++		 * take care of this case, we can still unlock while scanning
++		 * the task lists.
++ */
++ read_lock(&tasklist_lock);
++ list_for_each_entry(child, &tsk->children, sibling) {
++ if (child->real_parent != tsk)
++ continue;
++ if (child->pid != child->tgid)
++ continue;
++ get_task_struct(child);
++ read_unlock(&tasklist_lock);
++
++ if ((head = remember_task(child, head, ctx)) == NULL)
++ return -ENOMEM;
++
++ read_lock(&tasklist_lock);
++ }
++
++ list_for_each_entry(child, &tsk->ptrace_children, ptrace_list) {
++ if (child->real_parent != tsk)
++ continue;
++ if (child->pid != child->tgid)
++ continue;
++ get_task_struct(child);
++ read_unlock(&tasklist_lock);
++
++ if ((head = remember_task(child, head, ctx)) == NULL)
++ return -ENOMEM;
++
++ read_lock(&tasklist_lock);
++ }
++ read_unlock(&tasklist_lock);
++ }
++
++ return 0;
++
++out:
++ return err;
++}
++
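++/* Collect the remaining resources (mm, SysV IPC, files, fs, namespaces
++ * and signals) referenced by the already collected task tree.
++ */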
++static int cpt_collect(struct cpt_context *ctx)
++{
++ int err;
++
++ if ((err = cpt_collect_mm(ctx)) != 0)
++ return err;
++
++ if ((err = cpt_collect_sysv(ctx)) != 0)
++ return err;
++
++ if ((err = cpt_collect_files(ctx)) != 0)
++ return err;
++
++ if ((err = cpt_collect_fs(ctx)) != 0)
++ return err;
++
++ if ((err = cpt_collect_namespace(ctx)) != 0)
++ return err;
++
++ if ((err = cpt_collect_signals(ctx)) != 0)
++ return err;
++
++ return 0;
++}
++
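++/* Dump the per-VE SysV IPC limits and the time deltas accumulated since
++ * the VE was started into the CPT_SECT_VEINFO section.
++ */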
++static int cpt_dump_veinfo(cpt_context_t *ctx)
++{
++ struct cpt_veinfo_image i;
++ struct ve_struct *ve;
++ struct timespec delta;
++
++ cpt_open_section(ctx, CPT_SECT_VEINFO);
++ cpt_open_object(NULL, ctx);
++
++ i.cpt_next = CPT_NULL;
++ i.cpt_object = CPT_OBJ_VEINFO;
++ i.cpt_hdrlen = sizeof(i);
++ i.cpt_content = CPT_CONTENT_VOID;
++
++ ve = get_exec_env();
++ i.shm_ctl_all = ve->_shm_ctlall;
++ i.shm_ctl_max = ve->_shm_ctlmax;
++ i.shm_ctl_mni = ve->_shm_ctlmni;
++
++ i.msg_ctl_max = ve->_msg_ctlmax;
++ i.msg_ctl_mni = ve->_msg_ctlmni;
++ i.msg_ctl_mnb = ve->_msg_ctlmnb;
++
++ BUG_ON(sizeof(ve->_sem_ctls) != sizeof(i.sem_ctl_arr));
++ i.sem_ctl_arr[0] = ve->_sem_ctls[0];
++ i.sem_ctl_arr[1] = ve->_sem_ctls[1];
++ i.sem_ctl_arr[2] = ve->_sem_ctls[2];
++ i.sem_ctl_arr[3] = ve->_sem_ctls[3];
++
++ do_posix_clock_monotonic_gettime(&delta);
++ _set_normalized_timespec(&delta,
++ delta.tv_sec - ve->start_timespec.tv_sec,
++ delta.tv_nsec - ve->start_timespec.tv_nsec);
++ i.start_timespec_delta = cpt_timespec_export(&delta);
++ i.start_jiffies_delta = get_jiffies_64() - ve->start_jiffies;
++
++ ctx->write(&i, sizeof(i), ctx);
++ cpt_close_object(ctx);
++ cpt_close_section(ctx);
++ return 0;
++}
++
++static int cpt_dump_utsname(cpt_context_t *ctx)
++{
++ int len;
++ struct cpt_object_hdr o;
++
++ cpt_open_section(ctx, CPT_SECT_UTSNAME);
++
++ len = strlen(ve_utsname.nodename);
++ o.cpt_next = sizeof(o) + CPT_ALIGN(len + 1);
++ o.cpt_object = CPT_OBJ_NAME;
++ o.cpt_hdrlen = sizeof(o);
++ o.cpt_content = CPT_CONTENT_NAME;
++
++ ctx->write(&o, sizeof(o), ctx);
++ ctx->write(ve_utsname.nodename, len+1, ctx);
++ ctx->align(ctx);
++
++ len = strlen(ve_utsname.domainname);
++ o.cpt_next = sizeof(o) + CPT_ALIGN(len + 1);
++ o.cpt_object = CPT_OBJ_NAME;
++ o.cpt_hdrlen = sizeof(o);
++ o.cpt_content = CPT_CONTENT_NAME;
++
++ ctx->write(&o, sizeof(o), ctx);
++ ctx->write(ve_utsname.domainname, len+1, ctx);
++ ctx->align(ctx);
++
++ cpt_close_section(ctx);
++ return 0;
++}
++
++int cpt_dump(struct cpt_context *ctx)
++{
++ struct ve_struct *oldenv, *env;
++ int err, err2 = 0;
++
++ if (!ctx->ve_id)
++ return -EINVAL;
++
++ env = get_ve_by_id(ctx->ve_id);
++ if (!env)
++ return -ESRCH;
++
++ down_read(&env->op_sem);
++ err = -ESRCH;
++ if (!env->is_running)
++ goto out_noenv;
++ if (!env->is_locked)
++ goto out_noenv;
++
++ oldenv = set_exec_env(env);
++
++ /* Phase 2: real checkpointing */
++ err = cpt_open_dumpfile(ctx);
++ if (err)
++ goto out;
++
++ cpt_major_hdr_out(ctx);
++
++ if (!err)
++ err = cpt_dump_veinfo(ctx);
++ if (!err)
++ err = cpt_dump_ubc(ctx);
++ if (!err)
++ err = cpt_dump_ifinfo(ctx);
++ if (!err)
++ err = cpt_dump_files(ctx);
++ if (!err)
++ err = cpt_dump_files_struct(ctx);
++ if (!err)
++ err = cpt_dump_fs_struct(ctx);
++ if (!err)
++ err = cpt_dump_namespace(ctx);
++ if (!err)
++ err = cpt_dump_sighand(ctx);
++ if (!err)
++ err = cpt_dump_vm(ctx);
++ if (!err)
++ err = cpt_dump_sysvsem(ctx);
++ if (!err)
++ err = cpt_dump_tasks(ctx);
++ if (!err)
++ err = cpt_dump_orphaned_sockets(ctx);
++#if defined(CONFIG_VE_IPTABLES) && \
++ (defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE))
++ if (!err)
++ err = cpt_dump_ip_conntrack(ctx);
++#endif
++ if (!err)
++ err = cpt_dump_utsname(ctx);
++
++ if (!err)
++ err = cpt_dump_tail(ctx);
++
++ err2 = cpt_close_dumpfile(ctx);
++
++out:
++ set_exec_env(oldenv);
++out_noenv:
++ up_read(&env->op_sem);
++ put_ve(env);
++ return err ? : err2;
++}
++
++int cpt_vps_suspend(struct cpt_context *ctx)
++{
++ struct ve_struct *oldenv, *env;
++ int err = 0;
++
++ ctx->kernel_config_flags = test_kernel_config();
++ cpt_object_init(ctx);
++
++ if (!ctx->ve_id) {
++ env = get_exec_env();
++ if (env == get_ve0())
++ return -EINVAL;
++ wprintk("undefined ve_id\n");
++ ctx->ve_id = env->veid;
++ get_ve(env);
++ } else {
++ env = get_ve_by_id(ctx->ve_id);
++ if (!env)
++ return -ESRCH;
++ }
++
++#ifdef CONFIG_VE_IPTABLES
++ ctx->iptables_mask = env->_iptables_modules;
++#endif
++
++ down_write(&env->op_sem);
++ err = -ESRCH;
++ if (!env->is_running)
++ goto out_noenv;
++
++ err = -EBUSY;
++ if (env->is_locked)
++ goto out_noenv;
++ env->is_locked = 1;
++ downgrade_write(&env->op_sem);
++
++ oldenv = set_exec_env(env);
++
++ /* Phase 0: find and stop all the tasks */
++ if ((err = vps_stop_tasks(ctx)) != 0)
++ goto out;
++
++ if ((err = cpt_suspend_network(ctx)) != 0)
++ goto out;
++
++ /* At the moment all the state is frozen. We do not need to lock
++ * the state, which can be changed only if the tasks are running.
++ */
++
++ /* Phase 1: collect task tree */
++ if ((err = vps_collect_tasks(ctx)) != 0)
++ goto out;
++
++ /* Phase 1': collect all the resources */
++ if ((err = cpt_collect(ctx)) != 0)
++ goto out;
++
++out:
++ set_exec_env(oldenv);
++ up_read(&env->op_sem);
++ put_ve(env);
++ return err;
++
++out_noenv:
++ up_write(&env->op_sem);
++ put_ve(env);
++ return err;
++}
++
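++/* Report which source CPU features the destination node must provide:
++ * CMOV if present on the source, the remaining feature flags for tasks
++ * that used the FPU, and EMT64 for 64-bit tasks.
++ */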
++int cpt_vps_caps(struct cpt_context *ctx, __u32 *caps)
++{
++ task_t *p;
++ struct ve_struct *env;
++ unsigned int flags = test_cpu_caps();
++
++ if (!ctx->ve_id)
++ return -EINVAL;
++
++ env = get_ve_by_id(ctx->ve_id);
++ if (env == NULL)
++ return -ESRCH;
++
++ *caps = flags & (1<<CPT_CPU_X86_CMOV);
++ flags &= ~((1<<CPT_CPU_X86_EMT64)|(1<<CPT_CPU_X86_IA64));
++
++ read_lock(&tasklist_lock);
++ for (p = __first_task_ve(env); p != NULL ; p = __next_task_ve(env, p)) {
++ if (tsk_used_math(p))
++ *caps |= flags;
++#ifdef CONFIG_X86_64
++ if (!(p->thread_info->flags & _TIF_IA32))
++ *caps |= (1<<CPT_CPU_X86_EMT64);
++#endif
++ }
++ read_unlock(&tasklist_lock);
++ put_ve(env);
++
++ return 0;
++}
+diff -upr linux-2.6.16.orig/kernel/cpt/cpt_dump.h linux-2.6.16-026test015/kernel/cpt/cpt_dump.h
+--- linux-2.6.16.orig/kernel/cpt/cpt_dump.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/cpt/cpt_dump.h 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,14 @@
++int cpt_dump(struct cpt_context *cpt);
++int rst_undump(struct cpt_context *cpt);
++int cpt_suspend(struct cpt_context *cpt);
++int cpt_resume(struct cpt_context *cpt);
++int cpt_kill(struct cpt_context *cpt);
++int rst_clean(struct cpt_context *cpt);
++int rst_resume(struct cpt_context *cpt);
++int rst_kill(struct cpt_context *cpt);
++
++int cpt_freeze_one(pid_t pid, int freeze);
++int cpt_vps_suspend(struct cpt_context *ctx);
++int vps_rst_undump(struct cpt_context *ctx);
++
++int cpt_vps_caps(struct cpt_context *ctx, __u32 *caps);
+diff -upr linux-2.6.16.orig/kernel/cpt/cpt_epoll.c linux-2.6.16-026test015/kernel/cpt/cpt_epoll.c
+--- linux-2.6.16.orig/kernel/cpt/cpt_epoll.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/cpt/cpt_epoll.c 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,116 @@
++/*
++ *
++ * kernel/cpt/cpt_epoll.c
++ *
++ * Copyright (C) 2000-2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/config.h>
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/errno.h>
++#include <linux/major.h>
++#include <linux/pipe_fs_i.h>
++#include <linux/mman.h>
++#include <linux/namespace.h>
++#include <linux/mount.h>
++#include <linux/namei.h>
++#include <linux/smp_lock.h>
++#include <asm/uaccess.h>
++#include <linux/vzcalluser.h>
++#include <linux/eventpoll.h>
++#include <linux/cpt_image.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_mm.h"
++#include "cpt_files.h"
++#include "cpt_kernel.h"
++#include "cpt_fsmagic.h"
++#include "cpt_syscalls.h"
++
++extern struct file_operations eventpoll_fops;
++
++int cpt_dump_epolldev(cpt_object_t *obj, cpt_context_t *ctx)
++{
++ int err = 0;
++ struct file *file = obj->o_obj;
++ struct eventpoll *ep;
++ struct rb_node *rbp;
++ struct cpt_epoll_image ei;
++
++ if (file->f_op != &eventpoll_fops) {
++ eprintk_ctx("bad epoll file\n");
++ return -EINVAL;
++ }
++
++ ep = file->private_data;
++
++	/* eventpoll.c does not protect against opens via /proc/N/fd, silly.
++	 * The opener will get an invalid file with uninitialized private_data.
++ */
++ if (unlikely(ep == NULL)) {
++ eprintk_ctx("bad epoll device\n");
++ return -EINVAL;
++ }
++
++ cpt_open_object(NULL, ctx);
++
++ ei.cpt_next = CPT_NULL;
++ ei.cpt_object = CPT_OBJ_EPOLL;
++ ei.cpt_hdrlen = sizeof(ei);
++ ei.cpt_content = CPT_CONTENT_ARRAY;
++ ei.cpt_file = obj->o_pos;
++
++ ctx->write(&ei, sizeof(ei), ctx);
++
++ down(&epsem);
++ for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
++ loff_t saved_obj;
++ cpt_object_t *tobj;
++ struct cpt_epoll_file_image efi;
++ struct epitem *epi;
++ epi = rb_entry(rbp, struct epitem, rbn);
++ tobj = lookup_cpt_object(CPT_OBJ_FILE, epi->ffd.file, ctx);
++ if (tobj == NULL) {
++ eprintk_ctx("epoll device refers to an external file\n");
++ err = -EBUSY;
++ break;
++ }
++ cpt_push_object(&saved_obj, ctx);
++ cpt_open_object(NULL, ctx);
++
++ efi.cpt_next = CPT_NULL;
++ efi.cpt_object = CPT_OBJ_EPOLL_FILE;
++ efi.cpt_hdrlen = sizeof(efi);
++ efi.cpt_content = CPT_CONTENT_VOID;
++ efi.cpt_file = tobj->o_pos;
++ efi.cpt_fd = epi->ffd.fd;
++ efi.cpt_events = epi->event.events;
++ efi.cpt_data = epi->event.data;
++ efi.cpt_revents = epi->revents;
++ efi.cpt_ready = 0;
++ if (!list_empty(&epi->rdllink))
++ efi.cpt_ready = 1;
++
++ ctx->write(&efi, sizeof(efi), ctx);
++ cpt_close_object(ctx);
++ cpt_pop_object(&saved_obj, ctx);
++ }
++ up(&epsem);
++
++ cpt_close_object(ctx);
++
++ return err;
++}
++
+diff -upr linux-2.6.16.orig/kernel/cpt/cpt_files.c linux-2.6.16-026test015/kernel/cpt/cpt_files.c
+--- linux-2.6.16.orig/kernel/cpt/cpt_files.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/cpt/cpt_files.c 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,1343 @@
++/*
++ *
++ * kernel/cpt/cpt_files.c
++ *
++ * Copyright (C) 2000-2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/config.h>
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/errno.h>
++#include <linux/major.h>
++#include <linux/pipe_fs_i.h>
++#include <linux/mman.h>
++#include <linux/namespace.h>
++#include <linux/mount.h>
++#include <linux/namei.h>
++#include <linux/smp_lock.h>
++#include <linux/pagemap.h>
++#include <asm/uaccess.h>
++#include <linux/vzcalluser.h>
++#include <linux/ve_proto.h>
++#include <linux/cpt_image.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_mm.h"
++#include "cpt_files.h"
++#include "cpt_socket.h"
++#include "cpt_kernel.h"
++#include "cpt_fsmagic.h"
++#include "cpt_syscalls.h"
++
++void cpt_printk_dentry(struct dentry *d, struct vfsmount *mnt)
++{
++ char *path;
++ unsigned long pg = __get_free_page(GFP_KERNEL);
++
++ if (!pg)
++ return;
++
++ path = d_path(d, mnt, (char *)pg, PAGE_SIZE);
++
++ if (!IS_ERR(path))
++ printk("<%s>", path);
++ free_page(pg);
++}
++
++int cpt_verify_overmount(char *path, struct dentry *d, struct vfsmount *mnt,
++ cpt_context_t *ctx)
++{
++ if (path[0] == '/' && !IS_ROOT(d) && !d_unhashed(d)) {
++ struct nameidata nd;
++ if (path_lookup(path, 0, &nd)) {
++ eprintk_ctx("d_path cannot be looked up %s\n", path);
++ return -EINVAL;
++ }
++ if (nd.dentry != d || nd.mnt != mnt) {
++ eprintk_ctx("d_path is invisible %s\n", path);
++ path_release(&nd);
++ return -EINVAL;
++ }
++ path_release(&nd);
++ }
++ return 0;
++}
++
++int cpt_dump_dentry(struct dentry *d, struct vfsmount *mnt, struct cpt_context *ctx)
++{
++ int len;
++ char *path;
++ char *pg = cpt_get_buf(ctx);
++
++ path = d_path(d, mnt, pg, PAGE_SIZE);
++ len = PTR_ERR(path);
++
++ if (IS_ERR(path)) {
++ struct cpt_object_hdr o;
++ char tmp[1];
++		/* VZ changes d_path() to return EINVAL when the path
++		 * is not supposed to be visible inside the VE. */
++ if (len != -EINVAL)
++ eprintk_ctx("d_path err=%d\n", len);
++ else
++ len = 0;
++
++ o.cpt_next = sizeof(o) + CPT_ALIGN(1);
++ o.cpt_object = CPT_OBJ_NAME;
++ o.cpt_hdrlen = sizeof(o);
++ o.cpt_content = CPT_CONTENT_NAME;
++ tmp[0] = 0;
++
++ ctx->write(&o, sizeof(o), ctx);
++ ctx->write(tmp, 1, ctx);
++ ctx->align(ctx);
++
++ __cpt_release_buf(ctx);
++ return len;
++ } else {
++ struct cpt_object_hdr o;
++
++ len = pg + PAGE_SIZE - 1 - path;
++ o.cpt_next = sizeof(o) + CPT_ALIGN(len + 1);
++ o.cpt_object = CPT_OBJ_NAME;
++ o.cpt_hdrlen = sizeof(o);
++ o.cpt_content = CPT_CONTENT_NAME;
++ path[len] = 0;
++
++ if (cpt_verify_overmount(path, d, mnt, ctx)) {
++ __cpt_release_buf(ctx);
++ return -EINVAL;
++ }
++
++ ctx->write(&o, sizeof(o), ctx);
++ ctx->write(path, len+1, ctx);
++ ctx->align(ctx);
++ __cpt_release_buf(ctx);
++ }
++ return 0;
++}
++
++int cpt_dump_string(const char *s, struct cpt_context *ctx)
++{
++ int len;
++ struct cpt_object_hdr o;
++
++ len = strlen(s);
++ o.cpt_next = sizeof(o) + CPT_ALIGN(len + 1);
++ o.cpt_object = CPT_OBJ_NAME;
++ o.cpt_hdrlen = sizeof(o);
++ o.cpt_content = CPT_CONTENT_NAME;
++
++ ctx->write(&o, sizeof(o), ctx);
++ ctx->write(s, len+1, ctx);
++ ctx->align(ctx);
++ return 0;
++}
++
++int cpt_dump_filename(struct file *file, struct cpt_context *ctx)
++{
++ return cpt_dump_dentry(file->f_dentry, file->f_vfsmnt, ctx);
++}
++
++int cpt_dump_inode(struct dentry *d, struct vfsmount *mnt, struct cpt_context *ctx)
++{
++ int err;
++ struct cpt_inode_image *v = cpt_get_buf(ctx);
++ struct kstat sbuf;
++
++ v->cpt_next = sizeof(*v);
++ v->cpt_object = CPT_OBJ_INODE;
++ v->cpt_hdrlen = sizeof(*v);
++ v->cpt_content = CPT_CONTENT_ARRAY;
++
++ if ((err = vfs_getattr(mnt, d, &sbuf)) != 0) {
++ cpt_release_buf(ctx);
++ return err;
++ }
++
++ v->cpt_dev = d->d_inode->i_sb->s_dev;
++ v->cpt_ino = d->d_inode->i_ino;
++ v->cpt_mode = sbuf.mode;
++ v->cpt_nlink = sbuf.nlink;
++ v->cpt_uid = sbuf.uid;
++ v->cpt_gid = sbuf.gid;
++ v->cpt_rdev = d->d_inode->i_rdev;
++ v->cpt_size = sbuf.size;
++ v->cpt_atime = cpt_timespec_export(&sbuf.atime);
++ v->cpt_mtime = cpt_timespec_export(&sbuf.mtime);
++ v->cpt_ctime = cpt_timespec_export(&sbuf.ctime);
++ v->cpt_blksize = sbuf.blksize;
++ v->cpt_blocks = sbuf.blocks;
++ v->cpt_sb = d->d_inode->i_sb->s_magic;
++
++ ctx->write(v, sizeof(*v), ctx);
++ cpt_release_buf(ctx);
++ return 0;
++}
++
++int cpt_collect_files(cpt_context_t * ctx)
++{
++ int err;
++ cpt_object_t *obj;
++ int index = 0;
++
++ /* Collect process fd sets */
++ for_each_object(obj, CPT_OBJ_TASK) {
++ task_t *tsk = obj->o_obj;
++ if (tsk->files && cpt_object_add(CPT_OBJ_FILES, tsk->files, ctx) == NULL)
++ return -ENOMEM;
++ }
++
++ /* Collect files from fd sets */
++ for_each_object(obj, CPT_OBJ_FILES) {
++ int fd;
++ struct files_struct *f = obj->o_obj;
++
++ cpt_obj_setindex(obj, index++, ctx);
++
++ if (obj->o_count != atomic_read(&f->count)) {
++ eprintk_ctx("files_struct is referenced outside %d %d\n", obj->o_count, atomic_read(&f->count));
++ return -EBUSY;
++ }
++
++ for (fd = 0; fd < f->fdt->max_fds; fd++) {
++ struct file *file = fcheck_files(f, fd);
++ if (file && cpt_object_add(CPT_OBJ_FILE, file, ctx) == NULL)
++ return -ENOMEM;
++ }
++ }
++
++ /* Collect files queued by AF_UNIX sockets. */
++ if ((err = cpt_collect_passedfds(ctx)) < 0)
++ return err;
++
++ /* OK. At this point we should count all the references. */
++ for_each_object(obj, CPT_OBJ_FILE) {
++ struct file *file = obj->o_obj;
++ struct file *parent;
++ cpt_object_t *ino_obj;
++
++ if (obj->o_count != atomic_read(&file->f_count)) {
++ eprintk_ctx("file struct is referenced outside %d %d\n", obj->o_count, atomic_read(&file->f_count));
++ cpt_printk_dentry(file->f_dentry, file->f_vfsmnt);
++ return -EBUSY;
++ }
++
++ switch (file->f_dentry->d_inode->i_sb->s_magic) {
++ case FSMAGIC_FUTEX:
++ case FSMAGIC_MQUEUE:
++ case FSMAGIC_BDEV:
++ eprintk_ctx("file on unsupported FS: magic %08lx\n", file->f_dentry->d_inode->i_sb->s_magic);
++ return -EBUSY;
++ }
++
++ /* Collect inode. It is necessary mostly to resolve deleted
++ * hard links. */
++ ino_obj = cpt_object_add(CPT_OBJ_INODE, file->f_dentry->d_inode, ctx);
++ if (ino_obj == NULL)
++ return -ENOMEM;
++
++ parent = ino_obj->o_parent;
++ if (!parent || (!IS_ROOT(parent->f_dentry) && d_unhashed(parent->f_dentry)))
++ ino_obj->o_parent = file;
++
++ if (S_ISCHR(file->f_dentry->d_inode->i_mode)) {
++ int maj = imajor(file->f_dentry->d_inode);
++ if (maj == PTY_MASTER_MAJOR ||
++ (maj >= UNIX98_PTY_MASTER_MAJOR &&
++ maj < UNIX98_PTY_MASTER_MAJOR+UNIX98_PTY_MAJOR_COUNT) ||
++ maj == PTY_SLAVE_MAJOR ||
++ maj == UNIX98_PTY_SLAVE_MAJOR ||
++ maj == TTYAUX_MAJOR) {
++ err = cpt_collect_tty(file, ctx);
++ if (err)
++ return err;
++ }
++ }
++
++ if (S_ISSOCK(file->f_dentry->d_inode->i_mode)) {
++ err = cpt_collect_socket(file, ctx);
++ if (err)
++ return err;
++ }
++ }
++
++ err = cpt_index_sockets(ctx);
++
++ return err;
++}
++
++/* /dev/ptmx is special: all the files share one inode, but the real tty
++ * backend is attached via file->private_data.
++ */
++
++static inline int is_cloning_inode(struct inode *ino)
++{
++ return S_ISCHR(ino->i_mode) &&
++ ino->i_rdev == MKDEV(TTYAUX_MAJOR,2);
++}
++
++static int dump_one_flock(struct file_lock *fl, int owner, struct cpt_context *ctx)
++{
++ pid_t pid;
++ struct cpt_flock_image *v = cpt_get_buf(ctx);
++
++ v->cpt_next = sizeof(*v);
++ v->cpt_object = CPT_OBJ_FLOCK;
++ v->cpt_hdrlen = sizeof(*v);
++ v->cpt_content = CPT_CONTENT_VOID;
++
++ v->cpt_owner = owner;
++
++ pid = fl->fl_pid;
++ if (pid && !is_virtual_pid(fl->fl_pid)) {
++ pid = _pid_type_to_vpid(PIDTYPE_TGID, fl->fl_pid);
++ if (pid == -1) {
++ if (!(fl->fl_flags&FL_FLOCK)) {
++ eprintk_ctx("posix lock from another VE?\n");
++ cpt_release_buf(ctx);
++ return -EBUSY;
++ }
++ pid = 0;
++ }
++ }
++
++ v->cpt_pid = pid;
++ v->cpt_start = fl->fl_start;
++ v->cpt_end = fl->fl_end;
++ v->cpt_flags = fl->fl_flags;
++ v->cpt_type = fl->fl_type;
++
++ ctx->write(v, sizeof(*v), ctx);
++ cpt_release_buf(ctx);
++ return 0;
++}
++
++
++int cpt_dump_flock(struct file *file, struct cpt_context *ctx)
++{
++ int err = 0;
++ struct file_lock *fl;
++
++ lock_kernel();
++ for (fl = file->f_dentry->d_inode->i_flock;
++ fl; fl = fl->fl_next) {
++ if (file != fl->fl_file)
++ continue;
++ if (fl->fl_flags & FL_LEASE) {
++ eprintk_ctx("lease lock is not supported\n");
++ err = -EINVAL;
++ break;
++ }
++ if (fl->fl_flags & FL_POSIX) {
++ cpt_object_t *obj;
++ obj = lookup_cpt_object(CPT_OBJ_FILES, fl->fl_owner, ctx);
++ if (obj) {
++ dump_one_flock(fl, obj->o_index, ctx);
++ continue;
++ } else {
++ eprintk_ctx("unknown lock owner %p\n", fl->fl_owner);
++ err = -EINVAL;
++ }
++ }
++ if (fl->fl_flags & FL_FLOCK) {
++ dump_one_flock(fl, -1, ctx);
++ continue;
++ }
++ }
++ unlock_kernel();
++ return err;
++}
++
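++/* Translate a pid (positive) or a negated pgid (negative) into its
++ * virtual counterpart; return 0 if the id no longer exists.
++ */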
++static int __comb_pid_to_vpid(int pid)
++{
++ int vpid = pid;
++
++ if (pid > 0) {
++ vpid = _pid_type_to_vpid(PIDTYPE_PID, pid);
++ if (unlikely(vpid < 0)) {
++			dprintk("pid %d does not exist anymore.\n", pid);
++ return 0;
++ }
++ } else if (pid < 0) {
++ vpid = _pid_type_to_vpid(PIDTYPE_PGID, -pid);
++ if (unlikely(vpid < 0)) {
++			dprintk("pgid %d does not exist anymore.\n", -pid);
++ return 0;
++ }
++ vpid = -vpid;
++ }
++ return vpid;
++}
++
++static int dump_one_file(cpt_object_t *obj, struct file *file, cpt_context_t *ctx)
++{
++ int err = 0;
++ cpt_object_t *iobj;
++ struct cpt_file_image *v = cpt_get_buf(ctx);
++ struct kstat sbuf;
++
++ cpt_open_object(obj, ctx);
++
++ v->cpt_next = CPT_NULL;
++ v->cpt_object = CPT_OBJ_FILE;
++ v->cpt_hdrlen = sizeof(*v);
++ v->cpt_content = CPT_CONTENT_ARRAY;
++
++ v->cpt_flags = file->f_flags;
++ v->cpt_mode = file->f_mode;
++ v->cpt_pos = file->f_pos;
++ v->cpt_uid = file->f_uid;
++ v->cpt_gid = file->f_gid;
++
++ vfs_getattr(file->f_vfsmnt, file->f_dentry, &sbuf);
++
++ v->cpt_i_mode = sbuf.mode;
++ v->cpt_lflags = 0;
++ if (IS_ROOT(file->f_dentry))
++ v->cpt_lflags |= CPT_DENTRY_ROOT;
++ else if (d_unhashed(file->f_dentry))
++ v->cpt_lflags |= CPT_DENTRY_DELETED;
++ if (is_cloning_inode(file->f_dentry->d_inode))
++ v->cpt_lflags |= CPT_DENTRY_CLONING;
++ if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_PROC)
++ v->cpt_lflags |= CPT_DENTRY_PROC;
++ v->cpt_inode = CPT_NULL;
++ iobj = lookup_cpt_object(CPT_OBJ_INODE, file->f_dentry->d_inode, ctx);
++ if (iobj)
++ v->cpt_inode = iobj->o_pos;
++ v->cpt_priv = CPT_NULL;
++ v->cpt_fown_fd = -1;
++ if (S_ISCHR(v->cpt_i_mode)) {
++ iobj = lookup_cpt_object(CPT_OBJ_TTY, file->private_data, ctx);
++ if (iobj) {
++ v->cpt_priv = iobj->o_pos;
++ if (file->f_flags&FASYNC)
++ v->cpt_fown_fd = cpt_tty_fasync(file, ctx);
++ }
++ }
++ if (S_ISSOCK(v->cpt_i_mode)) {
++ if (obj->o_index < 0) {
++ eprintk_ctx("BUG: no socket index\n");
++ cpt_release_buf(ctx);
++ return -EINVAL;
++ }
++ v->cpt_priv = obj->o_index;
++ if (file->f_flags&FASYNC)
++ v->cpt_fown_fd = cpt_socket_fasync(file, ctx);
++ }
++ if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_EPOLL) {
++ v->cpt_priv = file->f_dentry->d_inode->i_ino;
++ v->cpt_lflags |= CPT_DENTRY_EPOLL;
++ }
++
++ v->cpt_fown_pid = __comb_pid_to_vpid((int)file->f_owner.pid);
++ v->cpt_fown_uid = file->f_owner.uid;
++ v->cpt_fown_euid = file->f_owner.euid;
++ v->cpt_fown_signo = file->f_owner.signum;
++
++ ctx->write(v, sizeof(*v), ctx);
++ cpt_release_buf(ctx);
++
++ if (!S_ISSOCK(v->cpt_i_mode)) {
++ err = cpt_dump_filename(file, ctx);
++ if (err)
++ return err;
++ }
++
++ if (file->f_dentry->d_inode->i_flock)
++ err = cpt_dump_flock(file, ctx);
++
++ cpt_close_object(ctx);
++
++ return err;
++}
++
++/* About this weird function... The crappy code dealing with SYSV shared
++ * memory defines a TMPFS inode and file whose f_op does only mmap. So...
++ * Maybe this is wrong and leaks something. Clearly, access to SYSV shmem
++ * via mmap is quite unusual and impossible from user space.
++ */
++static int dump_content_shm(struct file *file, struct cpt_context *ctx)
++{
++ struct cpt_obj_bits *v;
++ loff_t saved_pos;
++ unsigned long addr;
++
++ addr = do_mmap_pgoff(file, 0, file->f_dentry->d_inode->i_size,
++ PROT_READ, MAP_SHARED, 0);
++ if (IS_ERR((void*)addr))
++ return PTR_ERR((void*)addr);
++
++ cpt_push_object(&saved_pos, ctx);
++ cpt_open_object(NULL, ctx);
++ v = cpt_get_buf(ctx);
++ v->cpt_next = CPT_NULL;
++ v->cpt_object = CPT_OBJ_BITS;
++ v->cpt_hdrlen = sizeof(*v);
++ v->cpt_content = CPT_CONTENT_DATA;
++ v->cpt_size = file->f_dentry->d_inode->i_size;
++ ctx->write(v, sizeof(*v), ctx);
++ cpt_release_buf(ctx);
++ ctx->write((void*)addr, file->f_dentry->d_inode->i_size, ctx);
++ ctx->align(ctx);
++ do_munmap(current->mm, addr, file->f_dentry->d_inode->i_size);
++
++ cpt_close_object(ctx);
++ cpt_pop_object(&saved_pos, ctx);
++ return 0;
++}
++
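++/* Return 1 if the len bytes at addr are all zero, 0 otherwise. */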
++static int data_is_zero(char *addr, int len)
++{
++ int i;
++ unsigned long zerolong = 0;
++
++ for (i=0; i<len/sizeof(unsigned long); i++) {
++ if (((unsigned long*)(addr))[i] != 0)
++ return 0;
++ }
++ i = len % sizeof(unsigned long);
++ if (!i)
++ return 1;
++ return memcmp(addr + len - i, &zerolong, i) == 0;
++}
++
++
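++/* Dump the content of a regular file as CPT_OBJ_PAGES blocks, one per
++ * contiguous run of non-zero data; all-zero pages are not written out.
++ */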
++static int dump_content_regular(struct file *file, struct cpt_context *ctx)
++{
++ loff_t saved_pos;
++ loff_t pos = 0;
++ loff_t obj_opened = CPT_NULL;
++ struct cpt_page_block pgb;
++ ssize_t (*do_read)(struct file *, char __user *, size_t, loff_t *);
++
++ if (file->f_op == NULL)
++ return -EINVAL;
++
++ if ((do_read = file->f_op->read) == NULL) {
++ if (file->f_op->mmap == NULL)
++ return -EINVAL;
++ if (file->f_dentry->d_inode->i_sb->s_magic != FSMAGIC_TMPFS) {
++ eprintk_ctx("unreadable, but not SYSV SHM file\n");
++ return -EINVAL;
++ }
++
++ do_read = file->f_dentry->d_inode->i_fop->read;
++ cpt_dump_content_sysvshm(file, ctx);
++ if (!do_read) {
++ wprintk_ctx("TMPFS is not configured?\n");
++ return dump_content_shm(file, ctx);
++ }
++ }
++
++ if (!(file->f_mode & FMODE_READ) ||
++ (file->f_flags & O_DIRECT)) {
++ file = dentry_open(dget(file->f_dentry),
++ mntget(file->f_vfsmnt), O_RDONLY);
++ } else {
++ atomic_inc(&file->f_count);
++ }
++
++ for (;;) {
++ mm_segment_t oldfs;
++ int err;
++
++ (void)cpt_get_buf(ctx);
++
++ oldfs = get_fs(); set_fs(KERNEL_DS);
++ err = do_read(file, ctx->tmpbuf, PAGE_SIZE, &pos);
++ set_fs(oldfs);
++ if (err < 0) {
++ eprintk_ctx("dump_content_regular: do_read: %d", err);
++ fput(file);
++ __cpt_release_buf(ctx);
++ return err;
++ }
++ if (err == 0) {
++ __cpt_release_buf(ctx);
++ break;
++ }
++ if (data_is_zero(ctx->tmpbuf, err)) {
++ if (obj_opened != CPT_NULL) {
++ ctx->pwrite(&pgb.cpt_end, 8, ctx, obj_opened + offsetof(struct cpt_page_block, cpt_end));
++ ctx->align(ctx);
++ cpt_close_object(ctx);
++ cpt_pop_object(&saved_pos, ctx);
++ obj_opened = CPT_NULL;
++ }
++ } else {
++ if (obj_opened == CPT_NULL) {
++ cpt_push_object(&saved_pos, ctx);
++ cpt_open_object(NULL, ctx);
++ obj_opened = ctx->file->f_pos;
++ pgb.cpt_next = CPT_NULL;
++ pgb.cpt_object = CPT_OBJ_PAGES;
++ pgb.cpt_hdrlen = sizeof(pgb);
++ pgb.cpt_content = CPT_CONTENT_DATA;
++ pgb.cpt_start = pos - err;
++ pgb.cpt_end = pgb.cpt_start;
++ ctx->write(&pgb, sizeof(pgb), ctx);
++ }
++ ctx->write(ctx->tmpbuf, err, ctx);
++ pgb.cpt_end += err;
++ }
++ __cpt_release_buf(ctx);
++ }
++
++ fput(file);
++
++ if (obj_opened != CPT_NULL) {
++ ctx->pwrite(&pgb.cpt_end, 8, ctx, obj_opened + offsetof(struct cpt_page_block, cpt_end));
++ ctx->align(ctx);
++ cpt_close_object(ctx);
++ cpt_pop_object(&saved_pos, ctx);
++ obj_opened = CPT_NULL;
++ }
++ return 0;
++}
++
++
++static int dump_content_chrdev(struct file *file, struct cpt_context *ctx)
++{
++ struct inode *ino = file->f_dentry->d_inode;
++ int maj;
++
++ maj = imajor(ino);
++ if (maj == MEM_MAJOR) {
++ /* Well, OK. */
++ return 0;
++ }
++ if (maj == PTY_MASTER_MAJOR ||
++ (maj >= UNIX98_PTY_MASTER_MAJOR &&
++ maj < UNIX98_PTY_MASTER_MAJOR+UNIX98_PTY_MAJOR_COUNT) ||
++ maj == PTY_SLAVE_MAJOR ||
++ maj == UNIX98_PTY_SLAVE_MAJOR ||
++ maj == TTYAUX_MAJOR) {
++ return cpt_dump_content_tty(file, ctx);
++ }
++ eprintk_ctx("unsupported chrdev %d/%d\n", maj, iminor(ino));
++ return -EINVAL;
++}
++
++static int dump_content_blkdev(struct file *file, struct cpt_context *ctx)
++{
++ struct inode *ino = file->f_dentry->d_inode;
++
++ /* We are not going to transfer them. */
++ eprintk_ctx("unsupported blkdev %d/%d\n", imajor(ino), iminor(ino));
++ return -EINVAL;
++}
++
++static int dump_content_fifo(struct file *file, struct cpt_context *ctx)
++{
++ struct inode *ino = file->f_dentry->d_inode;
++ cpt_object_t *obj;
++ loff_t saved_pos;
++ int readers;
++ int writers;
++ int anon = 0;
++
++ mutex_lock(PIPE_MUTEX(*ino));
++ readers = PIPE_READERS(*ino);
++ writers = PIPE_WRITERS(*ino);
++ for_each_object(obj, CPT_OBJ_FILE) {
++ struct file *file1 = obj->o_obj;
++ if (file1->f_dentry->d_inode == ino) {
++ if (file1->f_mode & FMODE_READ)
++ readers--;
++ if (file1->f_mode & FMODE_WRITE)
++ writers--;
++ }
++ }
++ mutex_unlock(PIPE_MUTEX(*ino));
++ if (readers || writers) {
++ struct dentry *dr = file->f_dentry->d_sb->s_root;
++ if (dr->d_name.len == 7 && memcmp(dr->d_name.name,"pipefs:",7) == 0)
++ anon = 1;
++
++ if (anon) {
++ eprintk_ctx("pipe has %d/%d external readers/writers\n", readers, writers);
++ return -EBUSY;
++ }
++		/* If the fifo has external readers/writers, we are in trouble.
++		 * If the buffer is not empty, we must move its content.
++		 * But if the fifo is owned by a service, we cannot do
++		 * this. See?
++		 *
++		 * For now we assume that if the fifo is opened by another
++		 * process, we do not own it and, hence, migrate without
++		 * its data.
++ */
++ return 0;
++ }
++
++ /* OK, we must save fifo state. No semaphores required. */
++
++ if (ino->i_pipe->nrbufs) {
++ struct cpt_obj_bits *v = cpt_get_buf(ctx);
++ struct pipe_inode_info *info;
++ int count, buf, nrbufs;
++
++ mutex_lock(PIPE_MUTEX(*ino));
++ info = ino->i_pipe;
++ count = 0;
++ buf = info->curbuf;
++ nrbufs = info->nrbufs;
++ while (--nrbufs >= 0) {
++ if (!info->bufs[buf].ops->can_merge) {
++ mutex_unlock(PIPE_MUTEX(*ino));
++ eprintk_ctx("unknown format of pipe buffer\n");
++ return -EINVAL;
++ }
++ count += info->bufs[buf].len;
++ buf = (buf+1) & (PIPE_BUFFERS-1);
++ }
++
++ if (!count) {
++ mutex_unlock(PIPE_MUTEX(*ino));
++ return 0;
++ }
++
++ cpt_push_object(&saved_pos, ctx);
++ cpt_open_object(NULL, ctx);
++ v->cpt_next = CPT_NULL;
++ v->cpt_object = CPT_OBJ_BITS;
++ v->cpt_hdrlen = sizeof(*v);
++ v->cpt_content = CPT_CONTENT_DATA;
++ v->cpt_size = count;
++ ctx->write(v, sizeof(*v), ctx);
++ cpt_release_buf(ctx);
++
++ count = 0;
++ buf = info->curbuf;
++ nrbufs = info->nrbufs;
++ while (--nrbufs >= 0) {
++ struct pipe_buffer *b = info->bufs + buf;
++ void * addr = b->ops->map(file, info, b);
++ ctx->write(addr + b->offset, b->len, ctx);
++ b->ops->unmap(info, b);
++ buf = (buf+1) & (PIPE_BUFFERS-1);
++ }
++
++ mutex_unlock(PIPE_MUTEX(*ino));
++
++ ctx->align(ctx);
++ cpt_close_object(ctx);
++ cpt_pop_object(&saved_pos, ctx);
++ }
++
++ return 0;
++}
++
++static int dump_content_socket(struct file *file, struct cpt_context *ctx)
++{
++ return 0;
++}
++
++static int dump_one_inode(struct file *file, struct dentry *d,
++ struct vfsmount *mnt, struct cpt_context *ctx)
++{
++ int err = 0;
++ struct inode *ino = d->d_inode;
++ cpt_object_t *iobj;
++ int dump_it = 0;
++
++ iobj = lookup_cpt_object(CPT_OBJ_INODE, ino, ctx);
++ if (!iobj)
++ return -EINVAL;
++
++ if (iobj->o_pos >= 0)
++ return 0;
++
++ if (!IS_ROOT(d) && d_unhashed(d))
++ dump_it = 1;
++ if (!S_ISREG(ino->i_mode) && !S_ISDIR(ino->i_mode)) {
++ /* One more bug in epoll: invalid inode mode.
++ * What a load of crap...
++ */
++ if (ino->i_sb->s_magic == FSMAGIC_EPOLL &&
++ (ino->i_mode & S_IFMT) == 0)
++ return 0;
++ dump_it = 1;
++ }
++
++ if (!dump_it)
++ return 0;
++
++ cpt_open_object(iobj, ctx);
++ cpt_dump_inode(d, mnt, ctx);
++
++ if (!IS_ROOT(d) && d_unhashed(d)) {
++ struct file *parent;
++ parent = iobj->o_parent;
++ if (!parent ||
++ (!IS_ROOT(parent->f_dentry) && d_unhashed(parent->f_dentry))) {
++			/* The inode is not deleted, but it has no
++			 * references from inside the checkpointed
++			 * process group. We have two options:
++			 * A. Fail, abort checkpointing.
++			 * B. Proceed; the file will be cloned.
++			 * A is correct, B is more complicated. */
++ /* Just as a hint where to create deleted file */
++ if (ino->i_nlink != 0) {
++ eprintk_ctx("deleted reference to existing inode, checkpointing is impossible\n");
++ return -EBUSY;
++ }
++ } else {
++ /* Refer to _another_ file name. */
++ err = cpt_dump_filename(parent, ctx);
++ if (err)
++ return err;
++ if (S_ISREG(ino->i_mode) || S_ISDIR(ino->i_mode))
++ dump_it = 0;
++ }
++ }
++ if (dump_it) {
++ if (S_ISREG(ino->i_mode)) {
++ if ((err = dump_content_regular(file, ctx)) != 0) {
++ eprintk_ctx("dump_content_regular ");
++ cpt_printk_dentry(d, mnt);
++ }
++ } else if (S_ISDIR(ino->i_mode)) {
++ /* We cannot do anything. The directory should be
++ * empty, so it is not a big deal.
++ */
++ } else if (S_ISCHR(ino->i_mode)) {
++ err = dump_content_chrdev(file, ctx);
++ } else if (S_ISBLK(ino->i_mode)) {
++ err = dump_content_blkdev(file, ctx);
++ } else if (S_ISFIFO(ino->i_mode)) {
++ err = dump_content_fifo(file, ctx);
++ } else if (S_ISSOCK(ino->i_mode)) {
++ err = dump_content_socket(file, ctx);
++ } else {
++ eprintk_ctx("unknown inode mode %o\n", ino->i_mode & S_IFMT);
++ err = -EINVAL;
++ }
++ }
++ cpt_close_object(ctx);
++
++ return err;
++}
++
++int cpt_dump_files(struct cpt_context *ctx)
++{
++ int epoll_nr;
++ cpt_object_t *obj;
++
++ cpt_open_section(ctx, CPT_SECT_TTY);
++ for_each_object(obj, CPT_OBJ_TTY) {
++ int err;
++
++ if ((err = cpt_dump_tty(obj, ctx)) != 0)
++ return err;
++ }
++ cpt_close_section(ctx);
++
++ cpt_open_section(ctx, CPT_SECT_INODE);
++ for_each_object(obj, CPT_OBJ_FILE) {
++ struct file *file = obj->o_obj;
++ int err;
++
++ if ((err = dump_one_inode(file, file->f_dentry,
++ file->f_vfsmnt, ctx)) != 0)
++ return err;
++ }
++ for_each_object(obj, CPT_OBJ_FS) {
++ struct fs_struct *fs = obj->o_obj;
++ int err;
++
++ if (fs->root &&
++ (err = dump_one_inode(NULL, fs->root, fs->rootmnt, ctx)) != 0)
++ return err;
++ if (fs->pwd &&
++ (err = dump_one_inode(NULL, fs->pwd, fs->pwdmnt, ctx)) != 0)
++ return err;
++ if (fs->altroot &&
++ (err = dump_one_inode(NULL, fs->altroot, fs->altrootmnt, ctx)) != 0)
++ return err;
++ }
++ cpt_close_section(ctx);
++
++ epoll_nr = 0;
++ cpt_open_section(ctx, CPT_SECT_FILES);
++ for_each_object(obj, CPT_OBJ_FILE) {
++ struct file *file = obj->o_obj;
++ int err;
++
++ if ((err = dump_one_file(obj, file, ctx)) != 0)
++ return err;
++ if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_EPOLL)
++ epoll_nr++;
++ }
++ cpt_close_section(ctx);
++
++ if (epoll_nr) {
++ cpt_open_section(ctx, CPT_SECT_EPOLL);
++ for_each_object(obj, CPT_OBJ_FILE) {
++ struct file *file = obj->o_obj;
++ if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_EPOLL) {
++ int err;
++ if ((err = cpt_dump_epolldev(obj, ctx)) != 0)
++ return err;
++ }
++ }
++ cpt_close_section(ctx);
++ }
++
++ cpt_open_section(ctx, CPT_SECT_SOCKET);
++ for_each_object(obj, CPT_OBJ_SOCKET) {
++ int err;
++
++ if ((err = cpt_dump_socket(obj, obj->o_obj, obj->o_index, -1, ctx)) != 0)
++ return err;
++ }
++ cpt_close_section(ctx);
++
++ return 0;
++}
++
++static int dump_filedesc(int fd, struct file *file,
++ struct files_struct *f, struct cpt_context *ctx)
++{
++ struct cpt_fd_image *v = cpt_get_buf(ctx);
++ cpt_object_t *obj;
++
++ cpt_open_object(NULL, ctx);
++
++ v->cpt_next = CPT_NULL;
++ v->cpt_object = CPT_OBJ_FILEDESC;
++ v->cpt_hdrlen = sizeof(*v);
++ v->cpt_content = CPT_CONTENT_VOID;
++
++ v->cpt_fd = fd;
++ obj = lookup_cpt_object(CPT_OBJ_FILE, file, ctx);
++ if (!obj) BUG();
++ v->cpt_file = obj->o_pos;
++ v->cpt_flags = 0;
++ if (FD_ISSET(fd, f->fdt->close_on_exec))
++ v->cpt_flags = CPT_FD_FLAG_CLOSEEXEC;
++
++ ctx->write(v, sizeof(*v), ctx);
++ cpt_release_buf(ctx);
++ cpt_close_object(ctx);
++
++ return 0;
++}
++
++static int dump_one_file_struct(cpt_object_t *obj, struct cpt_context *ctx)
++{
++ struct files_struct *f = obj->o_obj;
++ struct cpt_files_struct_image *v = cpt_get_buf(ctx);
++ int fd;
++ loff_t saved_obj;
++
++ cpt_open_object(obj, ctx);
++
++ v->cpt_next = CPT_NULL;
++ v->cpt_object = CPT_OBJ_FILES;
++ v->cpt_hdrlen = sizeof(*v);
++ v->cpt_content = CPT_CONTENT_ARRAY;
++
++ v->cpt_index = obj->o_index;
++ v->cpt_max_fds = f->fdt->max_fds;
++ v->cpt_next_fd = f->fdt->next_fd;
++
++ ctx->write(v, sizeof(*v), ctx);
++ cpt_release_buf(ctx);
++
++ cpt_push_object(&saved_obj, ctx);
++ for (fd = 0; fd < f->fdt->max_fds; fd++) {
++ struct file *file = fcheck_files(f, fd);
++ if (file)
++ dump_filedesc(fd, file, f, ctx);
++ }
++ cpt_pop_object(&saved_obj, ctx);
++
++ cpt_close_object(ctx);
++
++ return 0;
++}
++
++int cpt_dump_files_struct(struct cpt_context *ctx)
++{
++ cpt_object_t *obj;
++
++ cpt_open_section(ctx, CPT_SECT_FILES_STRUCT);
++
++ for_each_object(obj, CPT_OBJ_FILES) {
++ int err;
++
++ if ((err = dump_one_file_struct(obj, ctx)) != 0)
++ return err;
++ }
++
++ cpt_close_section(ctx);
++ return 0;
++}
++
++int cpt_collect_fs(cpt_context_t * ctx)
++{
++ cpt_object_t *obj;
++
++ for_each_object(obj, CPT_OBJ_TASK) {
++ task_t *tsk = obj->o_obj;
++ if (tsk->fs) {
++ if (cpt_object_add(CPT_OBJ_FS, tsk->fs, ctx) == NULL)
++ return -ENOMEM;
++ if (tsk->fs->pwd &&
++ cpt_object_add(CPT_OBJ_INODE, tsk->fs->pwd->d_inode, ctx) == NULL)
++ return -ENOMEM;
++ if (tsk->fs->root &&
++ cpt_object_add(CPT_OBJ_INODE, tsk->fs->root->d_inode, ctx) == NULL)
++ return -ENOMEM;
++ if (tsk->fs->altroot &&
++ cpt_object_add(CPT_OBJ_INODE, tsk->fs->altroot->d_inode, ctx) == NULL)
++ return -ENOMEM;
++ }
++ }
++ return 0;
++}
++
++static int cpt_dump_dir(struct dentry *d, struct vfsmount *mnt, struct cpt_context *ctx)
++{
++ struct file file;
++
++ memset(&file, 0, sizeof(file));
++
++ file.f_dentry = d;
++ file.f_vfsmnt = mnt;
++ file.f_mode = FMODE_READ|FMODE_PREAD|FMODE_LSEEK;
++ return dump_one_file(NULL, &file, ctx);
++}
++
++static int dump_one_fs(cpt_object_t *obj, struct cpt_context *ctx)
++{
++ struct fs_struct *fs = obj->o_obj;
++ struct cpt_fs_struct_image *v = cpt_get_buf(ctx);
++ loff_t saved_obj;
++ int err;
++
++ cpt_open_object(obj, ctx);
++
++ v->cpt_next = CPT_NULL;
++ v->cpt_object = CPT_OBJ_FS;
++ v->cpt_hdrlen = sizeof(*v);
++ v->cpt_content = CPT_CONTENT_ARRAY;
++
++ v->cpt_umask = fs->umask;
++
++ ctx->write(v, sizeof(*v), ctx);
++ cpt_release_buf(ctx);
++
++ cpt_push_object(&saved_obj, ctx);
++ err = cpt_dump_dir(fs->root, fs->rootmnt, ctx);
++ if (!err)
++ err = cpt_dump_dir(fs->pwd, fs->pwdmnt, ctx);
++ if (!err && fs->altroot)
++ err = cpt_dump_dir(fs->altroot, fs->altrootmnt, ctx);
++
++ cpt_pop_object(&saved_obj, ctx);
++
++ cpt_close_object(ctx);
++
++ return err;
++}
++
++int cpt_dump_fs_struct(struct cpt_context *ctx)
++{
++ cpt_object_t *obj;
++
++ cpt_open_section(ctx, CPT_SECT_FS);
++
++ for_each_object(obj, CPT_OBJ_FS) {
++ int err;
++
++ if ((err = dump_one_fs(obj, ctx)) != 0)
++ return err;
++ }
++
++ cpt_close_section(ctx);
++ return 0;
++}
++
++static int check_one_namespace(cpt_object_t *obj, struct cpt_context *ctx)
++{
++ int err = 0;
++ struct namespace *n = obj->o_obj;
++ struct list_head *p;
++ char *path_buf, *path;
++
++ path_buf = (char *) __get_free_page(GFP_KERNEL);
++ if (!path_buf)
++ return -ENOMEM;
++
++ down_read(&namespace_sem);
++ list_for_each(p, &n->list) {
++ struct vfsmount *mnt = list_entry(p, struct vfsmount, mnt_list);
++
++ path = d_path(mnt->mnt_root, mnt, path_buf, PAGE_SIZE);
++ if (IS_ERR(path))
++ continue;
++
++ if (
++ strcmp(mnt->mnt_sb->s_type->name, "rootfs") != 0 &&
++ strcmp(mnt->mnt_sb->s_type->name, "ext3") != 0 &&
++ strcmp(mnt->mnt_sb->s_type->name, "simfs") != 0 &&
++ strcmp(mnt->mnt_sb->s_type->name, "tmpfs") != 0 &&
++ strcmp(mnt->mnt_sb->s_type->name, "devpts") != 0 &&
++ strcmp(mnt->mnt_sb->s_type->name, "proc") != 0 &&
++ strcmp(mnt->mnt_sb->s_type->name, "sysfs") != 0) {
++ eprintk_ctx("unsupported fs type %s\n", mnt->mnt_sb->s_type->name);
++ err = -EINVAL;
++ break;
++ }
++ }
++ up_read(&namespace_sem);
++
++ free_page((unsigned long) path_buf);
++
++ return err;
++}
++
++int cpt_collect_namespace(cpt_context_t * ctx)
++{
++ cpt_object_t *obj;
++
++ for_each_object(obj, CPT_OBJ_TASK) {
++ task_t *tsk = obj->o_obj;
++ if (tsk->namespace && cpt_object_add(CPT_OBJ_NAMESPACE, tsk->namespace, ctx) == NULL)
++ return -ENOMEM;
++ }
++
++ for_each_object(obj, CPT_OBJ_NAMESPACE) {
++ int err;
++ if ((err = check_one_namespace(obj, ctx)) != 0)
++ return err;
++ }
++
++ return 0;
++}
++
++struct args_t
++{
++ int* pfd;
++ char* path;
++};
++
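++/* Kernel thread helper: enter the VE, make the pipe's write end stdout,
++ * close every other descriptor and exec /bin/tar to stream the tmpfs
++ * mount given in args->path.
++ */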
++static int dumptmpfs(void *arg)
++{
++ int i;
++ struct args_t *args = arg;
++ int *pfd = args->pfd;
++ char *path = args->path;
++ char *argv[] = { "tar", "-c", "-S", "--numeric-owner", path, NULL };
++
++ i = real_env_create(VEID(get_exec_env()), VE_ENTER|VE_SKIPLOCK, 2, NULL, 0);
++ if (i < 0) {
++ eprintk("cannot enter ve to dump tmpfs\n");
++ module_put(THIS_MODULE);
++ return 1;
++ }
++
++ if (pfd[1] != 1)
++ sc_dup2(pfd[1], 1);
++
++ for (i=0; i<current->files->fdt->max_fds; i++) {
++ if (i != 1)
++ sc_close(i);
++ }
++
++ module_put(THIS_MODULE);
++
++ set_fs(KERNEL_DS);
++ i = sc_execve("/bin/tar", argv, NULL);
++ eprintk("failed to exec /bin/tar: %d\n", i);
++ return -1;
++}
++
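++/* Dump a tmpfs mount: spawn dumptmpfs() via local_kernel_thread(), read the
++ * tar stream from the pipe and store it in the image as a CPT_OBJ_NAME
++ * object, then reap the helper with sc_waitx(). */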
++static int cpt_dump_tmpfs(char *path, struct cpt_context *ctx)
++{
++ int err;
++ int pid;
++ int pfd[2];
++ struct file *f;
++ struct cpt_object_hdr v;
++ char buf[16];
++ int n;
++ loff_t saved_obj;
++ struct args_t args;
++
++ err = sc_pipe(pfd);
++ if (err < 0)
++ return err;
++ args.pfd = pfd;
++ args.path = path;
++ err = pid = local_kernel_thread(dumptmpfs, (void*)&args, SIGCHLD, 0);
++ if (err < 0)
++ goto out;
++ f = fget(pfd[0]);
++ sc_close(pfd[1]);
++ sc_close(pfd[0]);
++
++ cpt_push_object(&saved_obj, ctx);
++ cpt_open_object(NULL, ctx);
++ v.cpt_next = CPT_NULL;
++ v.cpt_object = CPT_OBJ_NAME;
++ v.cpt_hdrlen = sizeof(v);
++ v.cpt_content = CPT_CONTENT_NAME;
++
++ ctx->write(&v, sizeof(v), ctx);
++
++ do {
++ mm_segment_t oldfs;
++
++ oldfs = get_fs(); set_fs(KERNEL_DS);
++ n = f->f_op->read(f, buf, sizeof(buf), &f->f_pos);
++ set_fs(oldfs);
++ if (n > 0)
++ ctx->write(buf, n, ctx);
++ } while (n > 0);
++
++ fput(f);
++
++ if ((err = sc_waitx(pid, 0)) < 0)
++ eprintk_ctx("wait4: %d\n", err);
++
++ buf[0] = 0;
++ ctx->write(buf, 1, ctx);
++ ctx->align(ctx);
++ cpt_close_object(ctx);
++ cpt_pop_object(&saved_obj, ctx);
++ return n;
++
++out:
++ if (pfd[1] >= 0)
++ sc_close(pfd[1]);
++ if (pfd[0] >= 0)
++ sc_close(pfd[0]);
++ return err;
++}
++
++static int dump_vfsmount(struct vfsmount *mnt, struct cpt_context *ctx)
++{
++ int err = 0;
++ struct cpt_vfsmount_image v;
++ loff_t saved_obj;
++ char *path_buf, *path;
++
++ path_buf = (char *) __get_free_page(GFP_KERNEL);
++ if (!path_buf)
++ return -ENOMEM;
++
++ path = d_path(mnt->mnt_root, mnt, path_buf, PAGE_SIZE);
++ if (IS_ERR(path)) {
++ free_page((unsigned long) path_buf);
++ return PTR_ERR(path) == -EINVAL ? 0 : PTR_ERR(path);
++ }
++
++ cpt_open_object(NULL, ctx);
++
++ v.cpt_next = -1;
++ v.cpt_object = CPT_OBJ_VFSMOUNT;
++ v.cpt_hdrlen = sizeof(v);
++ v.cpt_content = CPT_CONTENT_ARRAY;
++
++ v.cpt_mntflags = mnt->mnt_flags;
++ v.cpt_flags = mnt->mnt_sb->s_flags;
++
++ ctx->write(&v, sizeof(v), ctx);
++
++ cpt_push_object(&saved_obj, ctx);
++ cpt_dump_string(mnt->mnt_devname ? : "none", ctx);
++ cpt_dump_string(path, ctx);
++ cpt_dump_string(mnt->mnt_sb->s_type->name, ctx);
++#if 0
++ /* This is evident crap. Ask Savochkin, he might know this.
++ * The goal is to get some path to mount --bind to.
++ */
++ cpt_dump_dentry(mnt->mnt_root, mnt->mnt_parent, ctx);
++#else
++ /* For now we just bail when some FS is not mounted at its root. */
++ if (mnt->mnt_root != mnt->mnt_sb->s_root) {
++ eprintk_ctx("mount --bind prevents checkpointing\n");
++ err = -EINVAL;
++ }
++#endif
++
++ if (strcmp(mnt->mnt_sb->s_type->name, "tmpfs") == 0) {
++ cpt_dump_tmpfs(path, ctx);
++ }
++
++ cpt_pop_object(&saved_obj, ctx);
++
++ cpt_close_object(ctx);
++
++ free_page((unsigned long) path_buf);
++
++ return err;
++}
++
++static int dump_one_namespace(cpt_object_t *obj, struct cpt_context *ctx)
++{
++ struct namespace *n = obj->o_obj;
++ struct cpt_object_hdr v;
++ struct list_head *p;
++ loff_t saved_obj;
++ int err = 0;
++
++ cpt_open_object(obj, ctx);
++
++ v.cpt_next = -1;
++ v.cpt_object = CPT_OBJ_NAMESPACE;
++ v.cpt_hdrlen = sizeof(v);
++ v.cpt_content = CPT_CONTENT_ARRAY;
++
++ ctx->write(&v, sizeof(v), ctx);
++
++ cpt_push_object(&saved_obj, ctx);
++
++ down_read(&namespace_sem);
++ list_for_each(p, &n->list) {
++ err = dump_vfsmount(list_entry(p, struct vfsmount, mnt_list), ctx);
++ if (err)
++ break;
++ }
++ up_read(&namespace_sem);
++
++ cpt_pop_object(&saved_obj, ctx);
++
++ cpt_close_object(ctx);
++
++ return err;
++}
++
++int cpt_dump_namespace(struct cpt_context *ctx)
++{
++ cpt_object_t *obj;
++
++ cpt_open_section(ctx, CPT_SECT_NAMESPACE);
++
++ for_each_object(obj, CPT_OBJ_NAMESPACE) {
++ int err;
++
++ if ((err = dump_one_namespace(obj, ctx)) != 0)
++ return err;
++ }
++
++ cpt_close_section(ctx);
++ return 0;
++}
+diff -upr linux-2.6.16.orig/kernel/cpt/cpt_files.h linux-2.6.16-026test015/kernel/cpt/cpt_files.h
+--- linux-2.6.16.orig/kernel/cpt/cpt_files.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/cpt/cpt_files.h 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,46 @@
++int cpt_collect_files(cpt_context_t *);
++int cpt_collect_fs(cpt_context_t *);
++int cpt_collect_namespace(cpt_context_t *);
++int cpt_collect_sysvsem_undo(cpt_context_t *);
++int cpt_collect_tty(struct file *, cpt_context_t *);
++int cpt_dump_files(struct cpt_context *ctx);
++int cpt_dump_files_struct(struct cpt_context *ctx);
++int cpt_dump_fs_struct(struct cpt_context *ctx);
++int cpt_dump_content_sysvshm(struct file *file, struct cpt_context *ctx);
++int cpt_dump_content_tty(struct file *file, struct cpt_context *ctx);
++int cpt_dump_tty(cpt_object_t *, struct cpt_context *ctx);
++struct file * rst_sysv_shm(loff_t pos, struct cpt_context *ctx);
++struct file * rst_open_tty(struct cpt_file_image *fi, struct cpt_inode_image *ii, unsigned flags, struct cpt_context *ctx);
++__u32 cpt_tty_fasync(struct file *file, struct cpt_context *ctx);
++
++int rst_posix_locks(struct cpt_context *ctx);
++
++struct file *rst_file(loff_t pos, int fd, struct cpt_context *ctx);
++int rst_files_complete(struct cpt_task_image *ti, struct cpt_context *ctx);
++__u32 rst_files_flag(struct cpt_task_image *ti, struct cpt_context *ctx);
++int rst_fs_complete(struct cpt_task_image *ti, struct cpt_context *ctx);
++int rst_restore_fs(struct cpt_context *ctx);
++
++int cpt_collect_sysv(cpt_context_t *);
++int cpt_dump_sysvsem(struct cpt_context *ctx);
++int rst_sysv_ipc(struct cpt_context *ctx);
++int rst_semundo_complete(struct cpt_task_image *ti, struct cpt_context *ctx);
++__u32 rst_semundo_flag(struct cpt_task_image *ti, struct cpt_context *ctx);
++
++int cpt_dump_namespace(struct cpt_context *ctx);
++int rst_root_namespace(struct cpt_context *ctx);
++
++int rst_stray_files(struct cpt_context *ctx);
++int rst_tty_jobcontrol(struct cpt_context *ctx);
++
++void rst_flush_filejobs(struct cpt_context *);
++int rst_do_filejobs(struct cpt_context *);
++
++int rst_eventpoll(struct cpt_context *);
++struct file *cpt_open_epolldev(struct cpt_file_image *fi,
++ unsigned flags,
++ struct cpt_context *ctx);
++int cpt_dump_epolldev(cpt_object_t *obj, struct cpt_context *);
++
++int cpt_verify_overmount(char *path, struct dentry *d, struct vfsmount *mnt,
++ cpt_context_t *ctx);
+diff -upr linux-2.6.16.orig/kernel/cpt/cpt_fsmagic.h linux-2.6.16-026test015/kernel/cpt/cpt_fsmagic.h
+--- linux-2.6.16.orig/kernel/cpt/cpt_fsmagic.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/cpt/cpt_fsmagic.h 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,15 @@
++/* Collected from kernel sources. */
++
++#define FSMAGIC_TMPFS 0x01021994
++#define FSMAGIC_PIPEFS 0x50495045
++#define FSMAGIC_SOCKFS 0x534F434B
++#define FSMAGIC_PFMFS 0xa0b4d889
++#define FSMAGIC_BDEV 0x62646576
++#define FSMAGIC_EPOLL 0x03111965
++#define FSMAGIC_FUTEX 0x0BAD1DEA
++#define FSMAGIC_MQUEUE 0x19800202
++#define FSMAGIC_PROC 0x9fa0
++#define FSMAGIC_DEVPTS 0x1CD1
++#define FSMAGIC_AUTOFS 0x0187
++#define FSMAGIC_EXT2 0xEF53
++#define FSMAGIC_REISER 0x52654973
+diff -upr linux-2.6.16.orig/kernel/cpt/cpt_kernel.c linux-2.6.16-026test015/kernel/cpt/cpt_kernel.c
+--- linux-2.6.16.orig/kernel/cpt/cpt_kernel.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/cpt/cpt_kernel.c 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,134 @@
++/*
++ *
++ * kernel/cpt/cpt_kernel.c
++ *
++ * Copyright (C) 2000-2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#define __KERNEL_SYSCALLS__ 1
++
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/sched.h>
++#include <linux/mm.h>
++#include <linux/kernel.h>
++#include <asm/cpufeature.h>
++#include <linux/cpt_image.h>
++
++#include "cpt_kernel.h"
++#include "cpt_syscalls.h"
++
++int debug_level = 1;
++
++#ifndef CONFIG_X86_64
++
++extern void local_kernel_thread_helper(void);
++__asm__(".section .text\n"
++ ".align 4\n"
++ "local_kernel_thread_helper:\n\t"
++ "movl %edx,%eax\n\t"
++ "pushl %edx\n\t"
++ "call *%ebx\n\t"
++ "pushl %eax\n\t"
++ "pushl $0\n\t"
++ "call complete_and_exit\n"
++ ".previous");
++
++/*
++ * Create a kernel thread
++ */
++int asm_kernel_thread(int (*fn)(void *), void * arg, unsigned long flags, pid_t pid)
++{
++ struct pt_regs regs;
++
++ memset(&regs, 0, sizeof(regs));
++
++ regs.ebx = (unsigned long) fn;
++ regs.edx = (unsigned long) arg;
++
++ regs.xds = __USER_DS;
++ regs.xes = __USER_DS;
++ regs.orig_eax = -1;
++ regs.eip = (unsigned long) local_kernel_thread_helper;
++ regs.xcs = __KERNEL_CS;
++ regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
++
++ /* Ok, create the new process.. */
++ return do_fork_pid(flags | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL, pid);
++}
++#endif
++
++int local_kernel_thread(int (*fn)(void *), void * arg, unsigned long flags, pid_t pid)
++{
++ pid_t ret;
++
++ if (!try_module_get(THIS_MODULE))
++ return -EBUSY;
++ ret = asm_kernel_thread(fn, arg, flags, pid);
++ if (ret < 0)
++ module_put(THIS_MODULE);
++ return ret;
++}
++
++#ifdef __i386__
++int __execve(const char *file, char **argv, char **envp)
++{
++ long res;
++ __asm__ volatile ("int $0x80"
++ : "=a" (res)
++ : "0" (__NR_execve),"b" ((long)(file)),"c" ((long)(argv)),
++ "d" ((long)(envp)) : "memory");
++ return (int)res;
++}
++#endif
++
++int sc_execve(char *cmd, char **argv, char **env)
++{
++ int ret;
++#ifndef __i386__
++ ret = execve(cmd, argv, env);
++#else
++ ret = __execve(cmd, argv, env);
++#endif
++ return ret;
++}
++
++unsigned int test_cpu_caps(void)
++{
++ unsigned int flags = 0;
++ if (boot_cpu_has(X86_FEATURE_CMOV))
++ flags |= 1 << CPT_CPU_X86_CMOV;
++ if (cpu_has_fxsr)
++ flags |= 1 << CPT_CPU_X86_FXSR;
++ if (cpu_has_xmm)
++ flags |= 1 << CPT_CPU_X86_SSE;
++#ifndef CONFIG_X86_64
++ if (cpu_has_xmm2)
++#endif
++ flags |= 1 << CPT_CPU_X86_SSE2;
++ if (cpu_has_mmx)
++ flags |= 1 << CPT_CPU_X86_MMX;
++ if (boot_cpu_has(X86_FEATURE_3DNOW))
++ flags |= 1 << CPT_CPU_X86_3DNOW;
++ if (boot_cpu_has(X86_FEATURE_3DNOWEXT))
++ flags |= 1 << CPT_CPU_X86_3DNOW2;
++ if (boot_cpu_has(X86_FEATURE_SEP))
++ flags |= 1 << CPT_CPU_X86_SEP;
++#ifdef CONFIG_X86_64
++ flags |= 1 << CPT_CPU_X86_EMT64;
++#endif
++ return flags;
++}
++
++unsigned int test_kernel_config(void)
++{
++ unsigned int flags = 0;
++#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
++ flags |= 1 << CPT_KERNEL_CONFIG_PAE;
++#endif
++ return flags;
++}
+diff -upr linux-2.6.16.orig/kernel/cpt/cpt_kernel.h linux-2.6.16-026test015/kernel/cpt/cpt_kernel.h
+--- linux-2.6.16.orig/kernel/cpt/cpt_kernel.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/cpt/cpt_kernel.h 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,74 @@
++/* Interface to kernel vars which we had to _add_. */
++
++asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
++
++#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
++
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,9)
++#define TASK_TRACED TASK_STOPPED
++#define unix_peer(sk) ((sk)->sk_pair)
++#define page_mapcount(pg) ((pg)->mapcount)
++#else
++#define unix_peer(sk) (unix_sk(sk)->peer)
++#endif
++
++#ifdef CONFIG_X86_64
++#define cpu_has_fxsr 1
++#endif
++
++static inline void do_gettimespec(struct timespec *ts)
++{
++ struct timeval tv;
++ do_gettimeofday(&tv);
++ ts->tv_sec = tv.tv_sec;
++ ts->tv_nsec = tv.tv_usec*1000;
++}
++
++int local_kernel_thread(int (*fn)(void *),
++ void * arg,
++ unsigned long flags,
++ pid_t pid);
++int asm_kernel_thread(int (*fn)(void *),
++ void * arg,
++ unsigned long flags,
++ pid_t pid);
++
++unsigned int test_cpu_caps(void);
++unsigned int test_kernel_config(void);
++
++#define test_one_flag(src, dst, flag, message, ret) \
++if (src & (1 << flag)) \
++ if (!(dst & (1 << flag))) { \
++ wprintk("Destination cpu does not have " message "\n"); \
++ ret = 1; \
++ }
++
++static inline void
++_set_normalized_timespec(struct timespec *ts, time_t sec, long nsec)
++{
++ while (nsec >= NSEC_PER_SEC) {
++ nsec -= NSEC_PER_SEC;
++ ++sec;
++ }
++ while (nsec < 0) {
++ nsec += NSEC_PER_SEC;
++ --sec;
++ }
++ ts->tv_sec = sec;
++ ts->tv_nsec = nsec;
++}
++
++static inline struct timespec
++_ns_to_timespec(const nsec_t nsec)
++{
++ struct timespec ts;
++
++ if (!nsec)
++ return (struct timespec) {0, 0};
++
++ ts.tv_sec = div_long_long_rem_signed(nsec, NSEC_PER_SEC, &ts.tv_nsec);
++ if (unlikely(nsec < 0))
++ _set_normalized_timespec(&ts, ts.tv_sec, ts.tv_nsec);
++
++ return ts;
++}
+diff -upr linux-2.6.16.orig/kernel/cpt/cpt_mm.c linux-2.6.16-026test015/kernel/cpt/cpt_mm.c
+--- linux-2.6.16.orig/kernel/cpt/cpt_mm.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/cpt/cpt_mm.c 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,823 @@
++/*
++ *
++ * kernel/cpt/cpt_mm.c
++ *
++ * Copyright (C) 2000-2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/config.h>
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/hugetlb.h>
++#include <linux/errno.h>
++#include <linux/ve.h>
++#include <linux/pagemap.h>
++#include <linux/rmap.h>
++#include <asm/ldt.h>
++#include <asm/mmu.h>
++#include <linux/cpt_image.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_mm.h"
++#include "cpt_kernel.h"
++#include "cpt_fsmagic.h"
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++#include "cpt_pagein.h"
++#endif
++#include "cpt_ubc.h"
++
++static int collect_one_aio_ctx(struct mm_struct *mm, struct kioctx *aio_ctx,
++ cpt_context_t *ctx)
++{
++ if (!list_empty(&aio_ctx->run_list)) {
++ /* This is impossible at least with kernel 2.6.8.1 or 2.6.16 */
++ eprintk_ctx("run list is not empty, cannot suspend AIO\n");
++ return -EBUSY;
++ }
++
++ /* Wait for pending IOCBs. Linux AIO is mostly _fake_.
++ * It is actually synchronous, except for direct IO and
++ * some funny raw USB things, which cannot happen inside a VE.
++ * However, we do this for the future.
++ *
++ * Later note: in 2.6.16 we may allow O_DIRECT, so this
++ * code is not meaningless.
++ */
++ wait_for_all_aios(aio_ctx);
++
++ if (!list_empty(&aio_ctx->run_list) ||
++ !list_empty(&aio_ctx->active_reqs) ||
++ aio_ctx->reqs_active) {
++ eprintk_ctx("were not able to suspend AIO\n");
++ return -EBUSY;
++ }
++
++ return 0;
++}
++
++static int collect_one_mm(struct mm_struct *mm, cpt_context_t * ctx)
++{
++ struct vm_area_struct *vma;
++
++ for (vma = mm->mmap; vma; vma = vma->vm_next) {
++ if (vma->vm_file) {
++ if (cpt_object_add(CPT_OBJ_FILE, vma->vm_file, ctx) == NULL)
++ return -ENOMEM;
++ }
++ }
++ if (cpt_add_ubc(mm->mm_ub, ctx) == NULL)
++ return -ENOMEM;
++
++ if (mm->ioctx_list) {
++ struct kioctx *aio_ctx;
++ int err;
++
++ for (aio_ctx = mm->ioctx_list; aio_ctx; aio_ctx = aio_ctx->next)
++ if ((err = collect_one_aio_ctx(mm, aio_ctx, ctx)) != 0)
++ return err;
++ }
++
++ return 0;
++}
++
++int cpt_collect_mm(cpt_context_t * ctx)
++{
++ cpt_object_t *obj;
++ int err;
++ int index;
++
++ for_each_object(obj, CPT_OBJ_TASK) {
++ task_t *tsk = obj->o_obj;
++ if (tsk->mm && cpt_object_add(CPT_OBJ_MM, tsk->mm, ctx) == NULL)
++ return -ENOMEM;
++ }
++
++ index = 1;
++ for_each_object(obj, CPT_OBJ_MM) {
++ struct mm_struct *mm = obj->o_obj;
++ if (obj->o_count != atomic_read(&mm->mm_users)) {
++ eprintk_ctx("mm_struct is referenced outside %d %d\n", obj->o_count, atomic_read(&mm->mm_users));
++ return -EBUSY;
++ }
++ cpt_obj_setindex(obj, index++, ctx);
++
++ if ((err = collect_one_mm(mm, ctx)) != 0)
++ return err;
++ }
++
++ return 0;
++}
++
++static int zcnt, scnt, scnt0, ucnt;
++
++/* Function where_is_anon_page() returns the address of an anonymous page in
++ * the mm of an already dumped process. This happens e.g. after fork(). We do
++ * not use this right now, just keep statistics; it is difficult to restore
++ * such state, but the most direct use is to save space in the dumped image. */
++
++
++static inline unsigned long
++vma_address0(struct page *page, struct vm_area_struct *vma)
++{
++ pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
++ unsigned long address;
++
++ address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
++ if (unlikely(address < vma->vm_start || address >= vma->vm_end))
++ address |= 1;
++ return address;
++}
++
++static int really_this_one(struct vm_area_struct *vma, unsigned long address,
++ struct page *page)
++{
++ struct mm_struct *mm = vma->vm_mm;
++ pgd_t *pgd;
++ pud_t *pud;
++ pmd_t *pmd;
++ pte_t *pte;
++ spinlock_t *ptl;
++ int result;
++
++ pgd = pgd_offset(mm, address);
++ if (unlikely(!pgd_present(*pgd)))
++ return 0;
++
++ pud = pud_offset(pgd, address);
++ if (!pud_present(*pud))
++ return 0;
++
++ pmd = pmd_offset(pud, address);
++ if (unlikely(!pmd_present(*pmd)))
++ return 0;
++
++ result = 0;
++ pte = pte_offset_map(pmd, address);
++ if (!pte_present(*pte)) {
++ pte_unmap(pte);
++ return 0;
++ }
++
++ ptl = pte_lockptr(mm, pmd);
++ if (!spin_trylock(ptl)) {
++ pte_unmap(pte);
++ return 0;
++ }
++ if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte))
++ result = 1;
++ pte_unmap_unlock(pte, ptl);
++ return result;
++}
++
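++/* For an anonymous page, walk its anon_vma list looking for another,
++ * already dumped mm that maps the same page at the same address. If one is
++ * found, return the position of that mm image so the page can be recorded
++ * as a clone instead of being copied again; otherwise return CPT_NULL. */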
++static loff_t where_is_anon_page(cpt_object_t *mmobj, unsigned long mapaddr,
++ struct page *page, cpt_context_t * ctx)
++{
++ loff_t mmptr = CPT_NULL;
++ struct anon_vma *anon_vma;
++ struct vm_area_struct *vma;
++ int idx = mmobj->o_index;
++
++ if (!PageAnon(page))
++ return CPT_NULL;
++
++ anon_vma = page_lock_anon_vma(page);
++ if (!anon_vma)
++ return CPT_NULL;
++
++ list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
++ unsigned long addr = vma_address0(page, vma);
++ cpt_object_t *obj;
++
++ /* We do not try to support mremapped regions (addr != mapaddr),
++ * only mmaps directly inherited via fork().
++ * With this limitation we may check self-consistency of
++ * vmas (vm_start, vm_pgoff, anon_vma) before
++ * doing __copy_page_range() in rst_mm.
++ */
++ if (mmobj->o_obj != vma->vm_mm && addr == mapaddr) {
++ obj = lookup_cpt_object(CPT_OBJ_MM, vma->vm_mm, ctx);
++ if (obj && obj->o_pos != CPT_NULL && obj->o_index < idx) {
++ if (really_this_one(vma, addr, page)) {
++ mmptr = obj->o_pos;
++ idx = obj->o_index;
++ }
++ }
++ }
++ }
++ spin_unlock(&anon_vma->lock);
++
++ return mmptr;
++}
++
++struct page_area
++{
++ int type;
++ unsigned long start;
++ unsigned long end;
++ pgoff_t pgoff;
++ loff_t mm;
++};
++
++struct page_desc
++{
++ int type;
++ pgoff_t index;
++ loff_t mm;
++ int shared;
++};
++
++enum {
++ PD_ABSENT,
++ PD_COPY,
++ PD_ZERO,
++ PD_CLONE,
++ PD_FUNKEY,
++ PD_LAZY
++};
++
++/* 0: page can be obtained from the backing store, or is a still unmapped
++ anonymous page, or something else which does not require a copy.
++ 1: page requires a copy
++ 2: page requires a copy but its content is zero. Quite useless.
++ 3: wp page is shared after fork(). It is to be COWed when modified.
++ 4: page is something unsupported... We copy it right now.
++ */
++
++
++
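++/* Classify one page of a VMA: walk the page tables for addr and fill in a
++ * struct page_desc with its pgoff and one of the PD_* types listed above. */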
++static void page_get_desc(cpt_object_t *mmobj,
++ struct vm_area_struct *vma, unsigned long addr,
++ struct page_desc *pdesc, cpt_context_t * ctx)
++{
++ struct mm_struct *mm = vma->vm_mm;
++ pgd_t *pgd;
++ pud_t *pud;
++ pmd_t *pmd;
++ pte_t *ptep, pte;
++ spinlock_t *ptl;
++ struct page *pg;
++ pgoff_t linear_index = (addr - vma->vm_start)/PAGE_SIZE + vma->vm_pgoff;
++
++ pdesc->index = linear_index;
++ pdesc->shared = 0;
++
++ if (vma->vm_flags & VM_IO) {
++ pdesc->type = PD_ABSENT;
++ return;
++ }
++
++ pgd = pgd_offset(mm, addr);
++ if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
++ goto out_absent;
++ pud = pud_offset(pgd, addr);
++ if (pud_none(*pud) || unlikely(pud_bad(*pud)))
++ goto out_absent;
++ pmd = pmd_offset(pud, addr);
++ if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
++ goto out_absent;
++ if (pmd_huge(*pmd)) {
++ eprintk_ctx("page_huge\n");
++ goto out_unsupported;
++ }
++
++ ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
++ if (!ptep)
++ goto out_absent;
++
++ pte = *ptep;
++ if (pte_none(pte))
++ goto out_absent_unmap;
++
++ if (!pte_present(pte)) {
++ if (pte_file(pte)) {
++ pdesc->index = pte_to_pgoff(pte);
++ goto out_absent_unmap;
++ }
++ if (vma->vm_flags & VM_SHARED) {
++ /* It is impossible: shared mappings cannot be in swap */
++ eprintk_ctx("shared mapping is not present: %08lx@%Ld\n", addr, mmobj->o_pos);
++ goto out_unsupported_unmap;
++ }
++ /* Otherwise it is in swap. */
++ goto out_lazy_unmap;
++ } else if ((pg = vm_normal_page(vma, addr, pte)) != NULL) {
++
++ if (pg->mapping && !PageAnon(pg)) {
++ if (vma->vm_file == NULL) {
++ eprintk_ctx("pg->mapping!=NULL for fileless vma: %08lx\n", addr);
++ goto out_unsupported_unmap;
++ }
++ if (vma->vm_file->f_mapping != pg->mapping) {
++ eprintk_ctx("pg->mapping!=f_mapping: %08lx %p %p %Ld\n", addr, vma->vm_file->f_mapping, pg->mapping, mmobj->o_pos);
++ goto out_unsupported_unmap;
++ }
++ pdesc->index = (pg->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT));
++ /* Page is in the backing store. For us it is as if
++ * it were not present.
++ */
++ goto out_absent_unmap;
++ }
++
++ if (PageReserved(pg)) {
++ /* Special case: ZERO_PAGE is used when an
++ * anonymous page is accessed but not written. */
++ if (pg == ZERO_PAGE(addr)) {
++ if (pte_write(pte)) {
++ eprintk_ctx("not funny already, writable ZERO_PAGE\n");
++ goto out_unsupported_unmap;
++ }
++ zcnt++;
++ goto out_absent_unmap;
++ }
++ eprintk_ctx("reserved page %lu at %08lx@%Ld\n", pg->index, addr, mmobj->o_pos);
++ goto out_unsupported_unmap;
++ }
++
++ if (pg == ZERO_PAGE(addr)) {
++ wprintk_ctx("that's how it works now\n");
++ }
++
++ if (!pg->mapping) {
++ eprintk_ctx("page without mapping at %08lx@%Ld\n", addr, mmobj->o_pos);
++ goto out_unsupported_unmap;
++ }
++
++ if (pg->mapping && page_mapcount(pg) > 1) {
++ pdesc->shared = 1;
++ pdesc->mm = where_is_anon_page(mmobj, addr, pg, ctx);
++ if (pdesc->mm != CPT_NULL) {
++ scnt0++;
++ goto out_clone_unmap;
++ } else {
++ scnt++;
++ }
++ }
++
++ if (!pte_young(pte))
++ goto out_lazy_unmap;
++ }
++ pte_unmap_unlock(ptep, ptl);
++ pdesc->type = PD_COPY;
++ return;
++
++out_lazy_unmap:
++ pte_unmap_unlock(ptep, ptl);
++ pdesc->type = PD_LAZY;
++ return;
++
++out_absent_unmap:
++ pte_unmap_unlock(ptep, ptl);
++out_absent:
++ pdesc->type = PD_ABSENT;
++ return;
++
++out_clone_unmap:
++ pte_unmap_unlock(ptep, ptl);
++ pdesc->type = PD_CLONE;
++ return;
++
++out_unsupported_unmap:
++ pte_unmap_unlock(ptep, ptl);
++out_unsupported:
++ ucnt++;
++ pdesc->type = PD_FUNKEY;
++ return;
++}
++
++/* ATTN: We give "current" to get_user_pages(). This is wrong, but get_user_pages()
++ * does not really need this thing. It just stores some page fault stats there.
++ *
++ * BUG: some archs (e.g. sparc64, but not Intel*) require flushing cache pages
++ * before accessing the vma.
++ */
++void dump_pages(struct vm_area_struct *vma, unsigned long start,
++ unsigned long end, struct cpt_context *ctx)
++{
++#define MAX_PAGE_BATCH 16
++ struct page *pg[MAX_PAGE_BATCH];
++ int npages = (end - start)/PAGE_SIZE;
++ int count = 0;
++
++ while (count < npages) {
++ int copy = npages - count;
++ int n;
++
++ if (copy > MAX_PAGE_BATCH)
++ copy = MAX_PAGE_BATCH;
++ n = get_user_pages(current, vma->vm_mm, start, copy,
++ 0, 1, pg, NULL);
++ if (n == copy) {
++ int i;
++ for (i=0; i<n; i++) {
++ char *maddr = kmap(pg[i]);
++ ctx->write(maddr, PAGE_SIZE, ctx);
++ kunmap(pg[i]);
++ }
++ } else {
++ eprintk_ctx("get_user_pages fault");
++ for ( ; n > 0; n--)
++ page_cache_release(pg[n-1]);
++ return;
++ }
++ start += n*PAGE_SIZE;
++ count += n;
++ for ( ; n > 0; n--)
++ page_cache_release(pg[n-1]);
++ }
++ return;
++}
++
++int dump_page_block(struct vm_area_struct *vma, struct cpt_page_block *pgb,
++ int copy,
++ struct cpt_context *ctx)
++{
++ loff_t saved_object;
++
++ cpt_push_object(&saved_object, ctx);
++
++ pgb->cpt_object = (copy != PD_LAZY) ? CPT_OBJ_PAGES : CPT_OBJ_LAZYPAGES;
++ pgb->cpt_hdrlen = sizeof(*pgb);
++ pgb->cpt_content = (copy == PD_COPY || copy == PD_LAZY) ? CPT_CONTENT_DATA : CPT_CONTENT_VOID;
++
++ ctx->write(pgb, sizeof(*pgb), ctx);
++ if (copy == PD_COPY || copy == PD_LAZY)
++ dump_pages(vma, pgb->cpt_start, pgb->cpt_end, ctx);
++ cpt_close_object(ctx);
++ cpt_pop_object(&saved_object, ctx);
++ return 0;
++}
++
++int dump_remappage_block(struct vm_area_struct *vma, struct page_area *pa,
++ struct cpt_context *ctx)
++{
++ struct cpt_remappage_block pgb;
++ loff_t saved_object;
++
++ cpt_push_object(&saved_object, ctx);
++
++ pgb.cpt_object = CPT_OBJ_REMAPPAGES;
++ pgb.cpt_hdrlen = sizeof(pgb);
++ pgb.cpt_content = CPT_CONTENT_VOID;
++ pgb.cpt_start = pa->start;
++ pgb.cpt_end = pa->end;
++ pgb.cpt_pgoff = pa->pgoff - (pa->end-pa->start)/PAGE_SIZE + 1;
++
++ ctx->write(&pgb, sizeof(pgb), ctx);
++ cpt_close_object(ctx);
++ cpt_pop_object(&saved_object, ctx);
++ return 0;
++}
++
++int dump_copypage_block(struct vm_area_struct *vma, struct page_area *pa,
++ struct cpt_context *ctx)
++{
++ struct cpt_copypage_block pgb;
++ loff_t saved_object;
++
++ cpt_push_object(&saved_object, ctx);
++
++ pgb.cpt_object = CPT_OBJ_COPYPAGES;
++ pgb.cpt_hdrlen = sizeof(pgb);
++ pgb.cpt_content = CPT_CONTENT_VOID;
++ pgb.cpt_start = pa->start;
++ pgb.cpt_end = pa->end;
++ pgb.cpt_source = pa->mm;
++
++ ctx->write(&pgb, sizeof(pgb), ctx);
++ cpt_close_object(ctx);
++ cpt_pop_object(&saved_object, ctx);
++ return 0;
++}
++
++int dump_lazypage_block(struct vm_area_struct *vma, struct page_area *pa,
++ cpt_context_t *ctx)
++{
++ struct cpt_lazypage_block pgb;
++ loff_t saved_object;
++
++ cpt_push_object(&saved_object, ctx);
++
++ pgb.cpt_object = CPT_OBJ_LAZYPAGES;
++ pgb.cpt_hdrlen = sizeof(pgb);
++ pgb.cpt_content = CPT_CONTENT_VOID;
++ pgb.cpt_start = pa->start;
++ pgb.cpt_end = pa->end;
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++ pgb.cpt_index = cpt_alloc_pgin_index(vma, pa->start,
++ (pa->end-pa->start)/PAGE_SIZE, ctx);
++#endif
++ ctx->write(&pgb, sizeof(pgb), ctx);
++ cpt_close_object(ctx);
++ cpt_pop_object(&saved_object, ctx);
++ return 0;
++}
++
++static int can_expand(struct page_area *pa, struct page_desc *pd)
++{
++ if (pa->start == pa->end)
++ return 1;
++ if (pa->type != pd->type)
++ return 0;
++ if (pa->type == PD_ABSENT)
++ return pd->index == pa->pgoff + 1;
++ if (pa->type == PD_CLONE)
++ return pd->mm == pa->mm;
++ return 1;
++}
++
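++/* Dump a single VMA: write its cpt_vma_image header, then scan it page by
++ * page, merging runs of pages with the same descriptor type into page,
++ * copypage, lazypage or remappage blocks. If any cloned pages were found,
++ * the cpt_anonvma field of the already written header is patched in place
++ * via ctx->pwrite(). */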
++static int dump_one_vma(cpt_object_t *mmobj,
++ struct vm_area_struct *vma, struct cpt_context *ctx)
++{
++ struct cpt_vma_image *v = cpt_get_buf(ctx);
++ unsigned long addr;
++ loff_t saved_object;
++ struct cpt_page_block pgb;
++ struct page_area pa;
++ int cloned_pages = 0;
++
++ cpt_push_object(&saved_object, ctx);
++
++ v->cpt_object = CPT_OBJ_VMA;
++ v->cpt_hdrlen = sizeof(*v);
++ v->cpt_content = CPT_CONTENT_ARRAY;
++
++ v->cpt_start = vma->vm_start;
++ v->cpt_end = vma->vm_end;
++ v->cpt_flags = vma->vm_flags;
++ if (vma->vm_flags&VM_HUGETLB) {
++ eprintk_ctx("huge TLB VMAs are still not supported\n");
++ cpt_release_buf(ctx);
++ return -EINVAL;
++ }
++ v->cpt_pgprot = vma->vm_page_prot.pgprot;
++ v->cpt_pgoff = vma->vm_pgoff;
++ v->cpt_file = CPT_NULL;
++ v->cpt_type = CPT_VMA_TYPE_0;
++ v->cpt_anonvma = 0;
++
++ /* We have to remember which VMAs are bound to one anon_vma.
++ * So, we store an identifier of the group of VMAs. It is handy
++ * to use the absolute address of the anon_vma as this identifier. */
++ v->cpt_anonvmaid = (unsigned long)vma->anon_vma;
++
++ if (vma->vm_file) {
++ struct file *filp;
++ cpt_object_t *obj = lookup_cpt_object(CPT_OBJ_FILE, vma->vm_file, ctx);
++ if (obj == NULL) BUG();
++ filp = obj->o_obj;
++ if (filp->f_op &&
++ filp->f_op->read == NULL &&
++ filp->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_TMPFS)
++ v->cpt_type = CPT_VMA_TYPE_SHM;
++ v->cpt_file = obj->o_pos;
++ }
++
++ ctx->write(v, sizeof(*v), ctx);
++ cpt_release_buf(ctx);
++
++ pa.type = PD_ABSENT;
++ pa.pgoff = vma->vm_pgoff;
++ pa.mm = CPT_NULL;
++ pa.start = vma->vm_start;
++ pa.end = vma->vm_start;
++
++ for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
++ struct page_desc pd;
++
++ page_get_desc(mmobj, vma, addr, &pd, ctx);
++ cloned_pages += pd.shared;
++
++ if (pd.type == PD_FUNKEY) {
++ eprintk_ctx("dump_one_vma: funkey page\n");
++ return -EINVAL;
++ }
++
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++ if (pd.type == PD_LAZY &&
++ (ctx->lazy_vm == 0 || (vma->vm_flags&VM_LOCKED)))
++ pd.type = PD_COPY;
++#else
++ if (pd.type == PD_LAZY)
++ pd.type = PD_COPY;
++#endif
++
++ if (!can_expand(&pa, &pd)) {
++ if (pa.type == PD_COPY ||
++ pa.type == PD_ZERO) {
++ pgb.cpt_start = pa.start;
++ pgb.cpt_end = pa.end;
++ dump_page_block(vma, &pgb, pa.type, ctx);
++ } else if (pa.type == PD_CLONE) {
++ dump_copypage_block(vma, &pa, ctx);
++ cloned_pages++;
++ } else if (pa.type == PD_LAZY) {
++ dump_lazypage_block(vma, &pa, ctx);
++ } else if (pa.type == PD_ABSENT &&
++ pa.pgoff != (pa.end - vma->vm_start)/PAGE_SIZE + vma->vm_pgoff - 1) {
++ dump_remappage_block(vma, &pa, ctx);
++ }
++ pa.start = addr;
++ }
++ pa.type = pd.type;
++ pa.end = addr + PAGE_SIZE;
++ pa.pgoff = pd.index;
++ pa.mm = pd.mm;
++ }
++
++ if (pa.end > pa.start) {
++ if (pa.type == PD_COPY ||
++ pa.type == PD_ZERO) {
++ pgb.cpt_start = pa.start;
++ pgb.cpt_end = pa.end;
++ dump_page_block(vma, &pgb, pa.type, ctx);
++ } else if (pa.type == PD_CLONE) {
++ dump_copypage_block(vma, &pa, ctx);
++ cloned_pages++;
++ } else if (pa.type == PD_LAZY) {
++ dump_lazypage_block(vma, &pa, ctx);
++ } else if (pa.type == PD_ABSENT &&
++ pa.pgoff != (pa.end - vma->vm_start)/PAGE_SIZE + vma->vm_pgoff - 1) {
++ dump_remappage_block(vma, &pa, ctx);
++ }
++ }
++
++ if (cloned_pages) {
++ __u32 anonvma = 1;
++ loff_t anonpos = ctx->current_object + offsetof(struct cpt_vma_image, cpt_anonvma);
++ ctx->pwrite(&anonvma, 4, ctx, anonpos);
++ }
++
++ cpt_close_object(ctx);
++
++ cpt_pop_object(&saved_object, ctx);
++
++ return 0;
++}
++
++static int dump_one_aio_ctx(struct mm_struct *mm, struct kioctx *aio_ctx,
++ cpt_context_t *ctx)
++{
++ loff_t saved_object;
++ struct cpt_aio_ctx_image aimg;
++
++ if (!list_empty(&aio_ctx->run_list) ||
++ !list_empty(&aio_ctx->active_reqs) ||
++ aio_ctx->reqs_active) {
++ eprintk_ctx("AIO is active after suspend\n");
++ return -EBUSY;
++ }
++
++ cpt_push_object(&saved_object, ctx);
++
++ aimg.cpt_next = CPT_ALIGN(sizeof(aimg));
++ aimg.cpt_object = CPT_OBJ_AIO_CONTEXT;
++ aimg.cpt_hdrlen = sizeof(aimg);
++ aimg.cpt_content = CPT_CONTENT_ARRAY;
++
++ aimg.cpt_max_reqs = aio_ctx->max_reqs;
++ aimg.cpt_ring_pages = aio_ctx->ring_info.nr_pages;
++ aimg.cpt_nr = aio_ctx->ring_info.nr;
++ aimg.cpt_tail = aio_ctx->ring_info.tail;
++ aimg.cpt_mmap_base = aio_ctx->ring_info.mmap_base;
++
++ ctx->write(&aimg, sizeof(aimg), ctx);
++
++ cpt_pop_object(&saved_object, ctx);
++ return 0;
++}
++
++static int dump_one_mm(cpt_object_t *obj, struct cpt_context *ctx)
++{
++ struct mm_struct *mm = obj->o_obj;
++ struct vm_area_struct *vma;
++ struct cpt_mm_image *v = cpt_get_buf(ctx);
++
++ cpt_open_object(obj, ctx);
++
++ v->cpt_next = -1;
++ v->cpt_object = CPT_OBJ_MM;
++ v->cpt_hdrlen = sizeof(*v);
++ v->cpt_content = CPT_CONTENT_ARRAY;
++
++ v->cpt_start_code = mm->start_code;
++ v->cpt_end_code = mm->end_code;
++ v->cpt_start_data = mm->start_data;
++ v->cpt_end_data = mm->end_data;
++ v->cpt_start_brk = mm->start_brk;
++ v->cpt_brk = mm->brk;
++ v->cpt_start_stack = mm->start_stack;
++ v->cpt_start_arg = mm->arg_start;
++ v->cpt_end_arg = mm->arg_end;
++ v->cpt_start_env = mm->env_start;
++ v->cpt_end_env = mm->env_end;
++ v->cpt_def_flags = mm->def_flags;
++ v->cpt_mmub = cpt_lookup_ubc(mm->mm_ub, ctx);
++ v->cpt_dumpable = mm->dumpable;
++ v->cpt_vps_dumpable = mm->vps_dumpable;
++ v->cpt_used_hugetlb = 0; /* not used */
++
++ ctx->write(v, sizeof(*v), ctx);
++ cpt_release_buf(ctx);
++
++ if (mm->context.size) {
++ loff_t saved_object;
++ struct cpt_obj_bits b;
++ int size;
++
++ dprintk_ctx("nontrivial LDT\n");
++
++ cpt_push_object(&saved_object, ctx);
++
++ cpt_open_object(NULL, ctx);
++ b.cpt_next = CPT_NULL;
++ b.cpt_object = CPT_OBJ_BITS;
++ b.cpt_hdrlen = sizeof(b);
++ b.cpt_content = CPT_CONTENT_MM_CONTEXT;
++ b.cpt_size = mm->context.size*LDT_ENTRY_SIZE;
++
++ ctx->write(&b, sizeof(b), ctx);
++
++ size = mm->context.size*LDT_ENTRY_SIZE;
++
++#if defined(CONFIG_X86_64) || LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,15)
++ ctx->write(mm->context.ldt, size, ctx);
++#else
++ {
++ int i;
++
++ for (i = 0; i < size; i += PAGE_SIZE) {
++ int nr = i / PAGE_SIZE, bytes;
++ char *kaddr = kmap(mm->context.ldt_pages[nr]);
++
++ bytes = size - i;
++ if (bytes > PAGE_SIZE)
++ bytes = PAGE_SIZE;
++ ctx->write(kaddr, bytes, ctx);
++ kunmap(mm->context.ldt_pages[nr]);
++ }
++ }
++#endif
++
++ cpt_close_object(ctx);
++ cpt_pop_object(&saved_object, ctx);
++ }
++
++ for (vma = mm->mmap; vma; vma = vma->vm_next) {
++ int err;
++
++#ifdef CONFIG_X86_64
++ if (vma->vm_start == 0xFFFFE000 &&
++ vma->vm_end == 0xFFFFF000)
++ continue;
++#endif
++
++ if ((err = dump_one_vma(obj, vma, ctx)) != 0)
++ return err;
++ }
++
++ if (mm->ioctx_list) {
++ struct kioctx *aio_ctx;
++ int err;
++
++ for (aio_ctx = mm->ioctx_list; aio_ctx; aio_ctx = aio_ctx->next)
++ if ((err = dump_one_aio_ctx(mm, aio_ctx, ctx)) != 0)
++ return err;
++ }
++
++ cpt_close_object(ctx);
++
++ return 0;
++}
++
++int cpt_dump_vm(struct cpt_context *ctx)
++{
++ cpt_object_t *obj;
++
++ scnt = scnt0 = zcnt = 0;
++
++ cpt_open_section(ctx, CPT_SECT_MM);
++
++ for_each_object(obj, CPT_OBJ_MM) {
++ int err;
++
++ if ((err = dump_one_mm(obj, ctx)) != 0)
++ return err;
++ }
++
++ cpt_close_section(ctx);
++
++ if (scnt)
++ dprintk_ctx("cpt_dump_vm: %d shared private anon pages\n", scnt);
++ if (scnt0)
++ dprintk_ctx("cpt_dump_vm: %d anon pages are cloned\n", scnt0);
++ if (zcnt)
++ dprintk_ctx("cpt_dump_vm: %d silly pages canceled\n", zcnt);
++ return 0;
++}
+diff -upr linux-2.6.16.orig/kernel/cpt/cpt_mm.h linux-2.6.16-026test015/kernel/cpt/cpt_mm.h
+--- linux-2.6.16.orig/kernel/cpt/cpt_mm.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/cpt/cpt_mm.h 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,16 @@
++int cpt_collect_mm(cpt_context_t *);
++
++int cpt_dump_vm(struct cpt_context *ctx);
++
++__u32 rst_mm_flag(struct cpt_task_image *ti, struct cpt_context *ctx);
++int rst_mm_basic(cpt_object_t *obj, struct cpt_task_image *ti, struct cpt_context *ctx);
++int rst_mm_complete(struct cpt_task_image *ti, struct cpt_context *ctx);
++
++int cpt_mm_prepare(unsigned long veid);
++
++int cpt_free_pgin_dir(struct cpt_context *);
++int cpt_start_pagein(struct cpt_context *);
++int rst_setup_pagein(struct cpt_context *);
++int rst_complete_pagein(struct cpt_context *, int);
++int rst_pageind(struct cpt_context *);
++int rst_swapoff(struct cpt_context *);
+diff -upr linux-2.6.16.orig/kernel/cpt/cpt_net.c linux-2.6.16-026test015/kernel/cpt/cpt_net.c
+--- linux-2.6.16.orig/kernel/cpt/cpt_net.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/cpt/cpt_net.c 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,428 @@
++/*
++ *
++ * kernel/cpt/cpt_net.c
++ *
++ * Copyright (C) 2000-2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/config.h>
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/errno.h>
++#include <linux/fs.h>
++#include <linux/socket.h>
++#include <linux/netdevice.h>
++#include <linux/inetdevice.h>
++#include <net/addrconf.h>
++#include <linux/rtnetlink.h>
++#include <linux/ve.h>
++#include <linux/ve_proto.h>
++#include <linux/vzcalluser.h>
++#include <linux/cpt_image.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_kernel.h"
++#include "cpt_syscalls.h"
++
++int cpt_dump_link(struct cpt_context * ctx)
++{
++ struct net_device *dev;
++
++ cpt_open_section(ctx, CPT_SECT_NET_DEVICE);
++ for (dev = dev_base; dev; dev = dev->next) {
++ struct cpt_netdev_image v;
++
++ cpt_open_object(NULL, ctx);
++
++ v.cpt_next = CPT_NULL;
++ v.cpt_object = CPT_OBJ_NET_DEVICE;
++ v.cpt_hdrlen = sizeof(v);
++ v.cpt_content = CPT_CONTENT_VOID;
++
++ v.cpt_index = dev->ifindex;
++ v.cpt_flags = dev->flags;
++ memcpy(v.cpt_name, dev->name, IFNAMSIZ);
++ ctx->write(&v, sizeof(v), ctx);
++ cpt_close_object(ctx);
++
++ if (strcmp(dev->name, "lo") != 0 &&
++ strcmp(dev->name, "venet0") != 0) {
++ eprintk_ctx("unsupported netdevice %s\n", dev->name);
++ cpt_close_section(ctx);
++ return -EBUSY;
++ }
++ }
++ cpt_close_section(ctx);
++ return 0;
++}
++
++int cpt_suspend_network(struct cpt_context *ctx)
++{
++ get_exec_env()->disable_net = 1;
++ synchronize_net();
++ return 0;
++}
++
++int cpt_resume_network(struct cpt_context *ctx)
++{
++ struct ve_struct *env;
++ env = get_ve_by_id(ctx->ve_id);
++ if (!env)
++ return -ESRCH;
++ env->disable_net = 0;
++ put_ve(env);
++ return 0;
++}
++
++int cpt_dump_ifaddr(struct cpt_context * ctx)
++{
++ struct net_device *dev;
++
++ cpt_open_section(ctx, CPT_SECT_NET_IFADDR);
++ for (dev = dev_base; dev; dev = dev->next) {
++ struct in_device *idev = in_dev_get(dev);
++ struct in_ifaddr *ifa;
++
++ if (!idev)
++ continue;
++
++ for (ifa = idev->ifa_list; ifa; ifa = ifa->ifa_next) {
++ struct cpt_ifaddr_image v;
++ cpt_open_object(NULL, ctx);
++
++ v.cpt_next = CPT_NULL;
++ v.cpt_object = CPT_OBJ_NET_IFADDR;
++ v.cpt_hdrlen = sizeof(v);
++ v.cpt_content = CPT_CONTENT_VOID;
++
++ v.cpt_index = dev->ifindex;
++ v.cpt_family = AF_INET;
++ v.cpt_masklen = ifa->ifa_prefixlen;
++ v.cpt_flags = ifa->ifa_flags;
++ v.cpt_scope = ifa->ifa_scope;
++ memset(&v.cpt_address, 0, sizeof(v.cpt_address));
++ memset(&v.cpt_peer, 0, sizeof(v.cpt_peer));
++ memset(&v.cpt_broadcast, 0, sizeof(v.cpt_broadcast));
++ v.cpt_address[0] = ifa->ifa_local;
++ v.cpt_peer[0] = ifa->ifa_address;
++ v.cpt_broadcast[0] = ifa->ifa_broadcast;
++ memcpy(v.cpt_label, ifa->ifa_label, IFNAMSIZ);
++ ctx->write(&v, sizeof(v), ctx);
++ cpt_close_object(ctx);
++ }
++ in_dev_put(idev);
++ }
++#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
++ for (dev = dev_base; dev; dev = dev->next) {
++ struct inet6_dev *idev = in6_dev_get(dev);
++ struct inet6_ifaddr *ifa;
++
++ if (!idev)
++ continue;
++
++ for (ifa = idev->addr_list; ifa; ifa = ifa->if_next) {
++ struct cpt_ifaddr_image v;
++
++ if (dev == &loopback_dev &&
++ ifa->prefix_len == 128 &&
++ ifa->addr.s6_addr32[0] == 0 &&
++ ifa->addr.s6_addr32[1] == 0 &&
++ ifa->addr.s6_addr32[2] == 0 &&
++ ifa->addr.s6_addr32[3] == htonl(1))
++ continue;
++
++ cpt_open_object(NULL, ctx);
++
++ v.cpt_next = CPT_NULL;
++ v.cpt_object = CPT_OBJ_NET_IFADDR;
++ v.cpt_hdrlen = sizeof(v);
++ v.cpt_content = CPT_CONTENT_VOID;
++
++ v.cpt_index = dev->ifindex;
++ v.cpt_family = AF_INET6;
++ v.cpt_masklen = ifa->prefix_len;
++ v.cpt_flags = ifa->flags;
++ v.cpt_scope = ifa->scope;
++ memcpy(&v.cpt_address, &ifa->addr, 16);
++ memcpy(&v.cpt_peer, &ifa->addr, 16);
++ memset(&v.cpt_broadcast, 0, sizeof(v.cpt_broadcast));
++ memcpy(v.cpt_label, dev->name, IFNAMSIZ);
++ ctx->write(&v, sizeof(v), ctx);
++ cpt_close_object(ctx);
++ }
++ in6_dev_put(idev);
++ }
++#endif
++ cpt_close_section(ctx);
++ return 0;
++}
++
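++/* Dump the routing table: open an in-kernel NETLINK_ROUTE socket, send an
++ * RTM_GETROUTE dump request and copy the RTM_NEWROUTE replies verbatim into
++ * a CPT_CONTENT_NLMARRAY object; repeated for AF_INET6 when IPv6 is built. */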
++static int cpt_dump_route(struct cpt_context * ctx)
++{
++ int err;
++ struct socket *sock;
++ struct msghdr msg;
++ struct iovec iov;
++ struct {
++ struct nlmsghdr nlh;
++ struct rtgenmsg g;
++ } req;
++ struct sockaddr_nl nladdr;
++ struct cpt_object_hdr v;
++ mm_segment_t oldfs;
++ char *pg;
++
++ err = sock_create_kern(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE, &sock);
++ if (err)
++ return err;
++
++ memset(&nladdr, 0, sizeof(nladdr));
++ nladdr.nl_family = AF_NETLINK;
++
++ req.nlh.nlmsg_len = sizeof(req);
++ req.nlh.nlmsg_type = RTM_GETROUTE;
++ req.nlh.nlmsg_flags = NLM_F_ROOT|NLM_F_MATCH|NLM_F_REQUEST;
++ req.nlh.nlmsg_pid = 0;
++ req.g.rtgen_family = AF_INET;
++
++ iov.iov_base=&req;
++ iov.iov_len=sizeof(req);
++ msg.msg_name=&nladdr;
++ msg.msg_namelen=sizeof(nladdr);
++ msg.msg_iov=&iov;
++ msg.msg_iovlen=1;
++ msg.msg_control=NULL;
++ msg.msg_controllen=0;
++ msg.msg_flags=MSG_DONTWAIT;
++
++ oldfs = get_fs(); set_fs(KERNEL_DS);
++ err = sock_sendmsg(sock, &msg, sizeof(req));
++ set_fs(oldfs);
++
++ if (err < 0)
++ goto out_sock;
++
++ pg = (char*)__get_free_page(GFP_KERNEL);
++ if (pg == NULL) {
++ err = -ENOMEM;
++ goto out_sock;
++ }
++
++ cpt_open_section(ctx, CPT_SECT_NET_ROUTE);
++ cpt_open_object(NULL, ctx);
++ v.cpt_next = CPT_NULL;
++ v.cpt_object = CPT_OBJ_NET_ROUTE;
++ v.cpt_hdrlen = sizeof(v);
++ v.cpt_content = CPT_CONTENT_NLMARRAY;
++
++ ctx->write(&v, sizeof(v), ctx);
++
++#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
++restart:
++#endif
++ for (;;) {
++ struct nlmsghdr *h;
++
++ iov.iov_base = pg;
++ iov.iov_len = PAGE_SIZE;
++
++ oldfs = get_fs(); set_fs(KERNEL_DS);
++ err = sock_recvmsg(sock, &msg, PAGE_SIZE, MSG_DONTWAIT);
++ set_fs(oldfs);
++
++ if (err < 0)
++ goto out_sock_pg;
++ if (msg.msg_flags & MSG_TRUNC) {
++ err = -ENOBUFS;
++ goto out_sock_pg;
++ }
++
++ h = (struct nlmsghdr*)pg;
++ while (NLMSG_OK(h, err)) {
++ if (h->nlmsg_type == NLMSG_DONE) {
++ err = 0;
++ goto done;
++ }
++ if (h->nlmsg_type == NLMSG_ERROR) {
++ struct nlmsgerr *errm = (struct nlmsgerr*)NLMSG_DATA(h);
++ err = errm->error;
++ eprintk_ctx("NLMSG error: %d\n", errm->error);
++ goto done;
++ }
++ if (h->nlmsg_type != RTM_NEWROUTE) {
++ eprintk_ctx("NLMSG: %d\n", h->nlmsg_type);
++ err = -EINVAL;
++ goto done;
++ }
++ ctx->write(h, NLMSG_ALIGN(h->nlmsg_len), ctx);
++ h = NLMSG_NEXT(h, err);
++ }
++ if (err) {
++ eprintk_ctx("!!!Remnant of size %d %d %d\n", err, h->nlmsg_len, h->nlmsg_type);
++ err = -EINVAL;
++ break;
++ }
++ }
++done:
++#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
++ if (!err && req.g.rtgen_family == AF_INET) {
++ req.g.rtgen_family = AF_INET6;
++ iov.iov_base=&req;
++ iov.iov_len=sizeof(req);
++ msg.msg_name=&nladdr;
++ msg.msg_namelen=sizeof(nladdr);
++ msg.msg_iov=&iov;
++ msg.msg_iovlen=1;
++ msg.msg_control=NULL;
++ msg.msg_controllen=0;
++ msg.msg_flags=MSG_DONTWAIT;
++
++ oldfs = get_fs(); set_fs(KERNEL_DS);
++ err = sock_sendmsg(sock, &msg, sizeof(req));
++ set_fs(oldfs);
++
++ if (err > 0)
++ goto restart;
++ }
++#endif
++ cpt_close_object(ctx);
++ cpt_close_section(ctx);
++
++out_sock_pg:
++ free_page((unsigned long)pg);
++out_sock:
++ sock_release(sock);
++ return err;
++}
++
++static int dumpfn(void *arg)
++{
++ int i;
++ int *pfd = arg;
++ char *argv[] = { "iptables-save", "-c", NULL };
++
++ i = real_env_create(VEID(get_exec_env()), VE_ENTER|VE_SKIPLOCK, 2, NULL, 0);
++ if (i < 0) {
++ eprintk("cannot enter ve to dump iptables\n");
++ module_put(THIS_MODULE);
++ return 1;
++ }
++
++ if (pfd[1] != 1)
++ sc_dup2(pfd[1], 1);
++
++ for (i=0; i<current->files->fdt->max_fds; i++) {
++ if (i != 1)
++ sc_close(i);
++ }
++
++ module_put(THIS_MODULE);
++
++ set_fs(KERNEL_DS);
++ i = sc_execve("/sbin/iptables-save", argv, NULL);
++ eprintk("failed to exec /sbin/iptables-save: %d\n", i);
++ return -1;
++}
++
++
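++/* Dump the iptables rules with the same pipe + kernel thread + exec pattern
++ * as the tmpfs dump, running /sbin/iptables-save -c inside the VE. If the
++ * helper produced no output, the whole section is dropped by rewinding
++ * f_pos and clearing the section offset. */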
++static int cpt_dump_iptables(struct cpt_context * ctx)
++{
++ int err;
++ int pid;
++ int pfd[2];
++ struct file *f;
++ struct cpt_object_hdr v;
++ char buf[16];
++ loff_t pos;
++ int n;
++
++ err = sc_pipe(pfd);
++ if (err < 0) {
++ eprintk_ctx("sc_pipe: %d\n", err);
++ return err;
++ }
++ err = pid = local_kernel_thread(dumpfn, (void*)pfd, SIGCHLD, 0);
++ if (err < 0) {
++ eprintk_ctx("local_kernel_thread: %d\n", err);
++ goto out;
++ }
++ f = fget(pfd[0]);
++ sc_close(pfd[1]);
++ sc_close(pfd[0]);
++
++ cpt_open_section(ctx, CPT_SECT_NET_IPTABLES);
++
++ cpt_open_object(NULL, ctx);
++ v.cpt_next = CPT_NULL;
++ v.cpt_object = CPT_OBJ_NAME;
++ v.cpt_hdrlen = sizeof(v);
++ v.cpt_content = CPT_CONTENT_NAME;
++
++ ctx->write(&v, sizeof(v), ctx);
++
++ pos = ctx->file->f_pos;
++ do {
++ mm_segment_t oldfs;
++
++ oldfs = get_fs(); set_fs(KERNEL_DS);
++ n = f->f_op->read(f, buf, sizeof(buf), &f->f_pos);
++ set_fs(oldfs);
++ if (n > 0)
++ ctx->write(buf, n, ctx);
++ } while (n > 0);
++
++ if (n < 0)
++ eprintk_ctx("read: %d\n", n);
++
++ fput(f);
++
++ if ((err = sc_waitx(pid, 0)) < 0)
++ eprintk_ctx("wait4: %d\n", err);
++
++ if (ctx->file->f_pos != pos) {
++ buf[0] = 0;
++ ctx->write(buf, 1, ctx);
++ ctx->align(ctx);
++ cpt_close_object(ctx);
++ cpt_close_section(ctx);
++ } else {
++ pos = ctx->current_section;
++ cpt_close_object(ctx);
++ cpt_close_section(ctx);
++ ctx->sections[CPT_SECT_NET_IPTABLES] = CPT_NULL;
++ ctx->file->f_pos = pos;
++ }
++ return n;
++
++out:
++ if (pfd[1] >= 0)
++ sc_close(pfd[1]);
++ if (pfd[0] >= 0)
++ sc_close(pfd[0]);
++ return err;
++}
++
++int cpt_dump_ifinfo(struct cpt_context * ctx)
++{
++ int err;
++
++ err = cpt_dump_link(ctx);
++ if (!err)
++ err = cpt_dump_ifaddr(ctx);
++ if (!err)
++ err = cpt_dump_route(ctx);
++ if (!err)
++ err = cpt_dump_iptables(ctx);
++ return err;
++}
+diff -upr linux-2.6.16.orig/kernel/cpt/cpt_net.h linux-2.6.16-026test015/kernel/cpt/cpt_net.h
+--- linux-2.6.16.orig/kernel/cpt/cpt_net.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/cpt/cpt_net.h 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,7 @@
++int cpt_dump_ifinfo(struct cpt_context *ctx);
++int rst_restore_net(struct cpt_context *ctx);
++int cpt_suspend_network(struct cpt_context *ctx);
++int cpt_resume_network(struct cpt_context *ctx);
++int rst_resume_network(struct cpt_context *ctx);
++int cpt_dump_ip_conntrack(struct cpt_context *ctx);
++int rst_restore_ip_conntrack(struct cpt_context * ctx);
+diff -upr linux-2.6.16.orig/kernel/cpt/cpt_obj.c linux-2.6.16-026test015/kernel/cpt/cpt_obj.c
+--- linux-2.6.16.orig/kernel/cpt/cpt_obj.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/cpt/cpt_obj.c 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,172 @@
++/*
++ *
++ * kernel/cpt/cpt_obj.c
++ *
++ * Copyright (C) 2000-2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/config.h>
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/errno.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++
++cpt_object_t *alloc_cpt_object(int gfp, struct cpt_context *ctx)
++{
++ cpt_object_t *obj;
++
++ obj = kmalloc(sizeof(cpt_object_t), gfp);
++ if (obj) {
++ INIT_LIST_HEAD(&obj->o_list);
++ INIT_LIST_HEAD(&obj->o_hash);
++ INIT_LIST_HEAD(&obj->o_alist);
++ obj->o_count = 1;
++ obj->o_pos = CPT_NULL;
++ obj->o_lock = 0;
++ obj->o_parent = NULL;
++ obj->o_index = CPT_NOINDEX;
++ obj->o_obj = NULL;
++ obj->o_image = NULL;
++ ctx->objcount++;
++ }
++ return obj;
++}
++// //EXPORT_SYMBOL(alloc_cpt_object);
++
++void free_cpt_object(cpt_object_t *obj, cpt_context_t *ctx)
++{
++ list_del(&obj->o_alist);
++ kfree(obj);
++ ctx->objcount--;
++}
++
++void intern_cpt_object(enum _cpt_object_type type, cpt_object_t *obj, cpt_context_t *ctx)
++{
++ list_add_tail(&obj->o_list, &ctx->object_array[type]);
++}
++// //EXPORT_SYMBOL(intern_cpt_object);
++
++void insert_cpt_object(enum _cpt_object_type type, cpt_object_t *obj,
++ cpt_object_t *head, cpt_context_t *ctx)
++{
++ list_add(&obj->o_list, &head->o_list);
++}
++// //EXPORT_SYMBOL(insert_cpt_object);
++
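++/* Look up an object of the given type; if it is already known just bump its
++ * reference count, otherwise allocate a new cpt_object_t, bind the kernel
++ * pointer to it and intern it in the per-type list. */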
++cpt_object_t * __cpt_object_add(enum _cpt_object_type type, void *p,
++ unsigned gfp_mask, cpt_context_t *ctx)
++{
++ cpt_object_t *obj;
++
++ obj = lookup_cpt_object(type, p, ctx);
++
++ if (obj) {
++ obj->o_count++;
++ return obj;
++ }
++
++ if ((obj = alloc_cpt_object(gfp_mask, ctx)) != NULL) {
++ if (p)
++ cpt_obj_setobj(obj, p, ctx);
++ intern_cpt_object(type, obj, ctx);
++ return obj;
++ }
++ return NULL;
++}
++// //EXPORT_SYMBOL(__cpt_object_add);
++
++cpt_object_t * cpt_object_add(enum _cpt_object_type type, void *p, cpt_context_t *ctx)
++{
++ return __cpt_object_add(type, p, GFP_KERNEL, ctx);
++}
++// //EXPORT_SYMBOL(cpt_object_add);
++
++cpt_object_t * cpt_object_get(enum _cpt_object_type type, void *p, cpt_context_t *ctx)
++{
++ cpt_object_t *obj;
++
++ obj = lookup_cpt_object(type, p, ctx);
++
++ if (obj)
++ obj->o_count++;
++
++ return obj;
++}
++// //EXPORT_SYMBOL(cpt_object_get);
++
++int cpt_object_init(cpt_context_t *ctx)
++{
++ int i;
++
++ for (i=0; i<CPT_OBJ_MAX; i++) {
++ INIT_LIST_HEAD(&ctx->object_array[i]);
++ }
++ return 0;
++}
++
++int cpt_object_destroy(cpt_context_t *ctx)
++{
++ int i;
++
++ for (i=0; i<CPT_OBJ_MAX; i++) {
++ while (!list_empty(&ctx->object_array[i])) {
++ struct list_head *head = ctx->object_array[i].next;
++ cpt_object_t *obj = list_entry(head, cpt_object_t, o_list);
++ list_del(head);
++ if (obj->o_image)
++ kfree(obj->o_image);
++ free_cpt_object(obj, ctx);
++ }
++ }
++ if (ctx->objcount != 0)
++ eprintk_ctx("BUG: ctx->objcount=%d\n", ctx->objcount);
++ return 0;
++}
++
++cpt_object_t *lookup_cpt_object(enum _cpt_object_type type, void *p, struct cpt_context *ctx)
++{
++ cpt_object_t *obj;
++
++ for_each_object(obj, type) {
++ if (obj->o_obj == p)
++ return obj;
++ }
++ return NULL;
++}
++// //EXPORT_SYMBOL(lookup_cpt_object);
++
++cpt_object_t *lookup_cpt_obj_bypos(enum _cpt_object_type type, loff_t pos, struct cpt_context *ctx)
++{
++ cpt_object_t *obj;
++
++ for_each_object(obj, type) {
++ if (obj->o_pos == pos)
++ return obj;
++ }
++ return NULL;
++}
++// //EXPORT_SYMBOL(lookup_cpt_obj_bypos);
++
++cpt_object_t *lookup_cpt_obj_byindex(enum _cpt_object_type type, __u32 index, struct cpt_context *ctx)
++{
++ cpt_object_t *obj;
++
++ for_each_object(obj, type) {
++ if (obj->o_index == index)
++ return obj;
++ }
++ return NULL;
++}
++// //EXPORT_SYMBOL(lookup_cpt_obj_byindex);
+diff -upr linux-2.6.16.orig/kernel/cpt/cpt_obj.h linux-2.6.16-026test015/kernel/cpt/cpt_obj.h
+--- linux-2.6.16.orig/kernel/cpt/cpt_obj.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/cpt/cpt_obj.h 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,62 @@
++#ifndef __CPT_OBJ_H_
++#define __CPT_OBJ_H_ 1
++
++#include <linux/list.h>
++#include <linux/cpt_image.h>
++
++typedef struct _cpt_object
++{
++ struct list_head o_list;
++ struct list_head o_hash;
++ int o_count;
++ int o_index;
++ int o_lock;
++ loff_t o_pos;
++ loff_t o_ppos;
++ void *o_obj;
++ void *o_image;
++ void *o_parent;
++ struct list_head o_alist;
++} cpt_object_t;
++
++struct cpt_context;
++
++#define for_each_object(obj, type) list_for_each_entry(obj, &ctx->object_array[type], o_list)
++
++
++extern cpt_object_t *alloc_cpt_object(int gfp, struct cpt_context *ctx);
++extern void free_cpt_object(cpt_object_t *obj, struct cpt_context *ctx);
++
++cpt_object_t *lookup_cpt_object(enum _cpt_object_type type, void *p, struct cpt_context *ctx);
++cpt_object_t *lookup_cpt_obj_bypos(enum _cpt_object_type type, loff_t pos, struct cpt_context *ctx);
++cpt_object_t *lookup_cpt_obj_byindex(enum _cpt_object_type type, __u32 index, struct cpt_context *ctx);
++
++static inline void cpt_obj_setpos(cpt_object_t *cpt, loff_t pos, struct cpt_context *ctx)
++{
++ cpt->o_pos = pos;
++ /* Add to pos hash table */
++}
++
++static inline void cpt_obj_setobj(cpt_object_t *cpt, void *ptr, struct cpt_context *ctx)
++{
++ cpt->o_obj = ptr;
++ /* Add to hash table */
++}
++
++static inline void cpt_obj_setindex(cpt_object_t *cpt, __u32 index, struct cpt_context *ctx)
++{
++ cpt->o_index = index;
++ /* Add to index hash table */
++}
++
++
++extern void intern_cpt_object(enum _cpt_object_type type, cpt_object_t *obj, struct cpt_context *ctx);
++extern void insert_cpt_object(enum _cpt_object_type type, cpt_object_t *obj, cpt_object_t *head, struct cpt_context *ctx);
++extern cpt_object_t *cpt_object_add(enum _cpt_object_type type, void *p, struct cpt_context *ctx);
++extern cpt_object_t *__cpt_object_add(enum _cpt_object_type type, void *p, unsigned int gfp_mask, struct cpt_context *ctx);
++extern cpt_object_t *cpt_object_get(enum _cpt_object_type type, void *p, struct cpt_context *ctx);
++
++extern int cpt_object_init(struct cpt_context *ctx);
++extern int cpt_object_destroy(struct cpt_context *ctx);
++
++#endif /* __CPT_OBJ_H_ */
+diff -upr linux-2.6.16.orig/kernel/cpt/cpt_proc.c linux-2.6.16-026test015/kernel/cpt/cpt_proc.c
+--- linux-2.6.16.orig/kernel/cpt/cpt_proc.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/cpt/cpt_proc.c 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,575 @@
++/*
++ *
++ * kernel/cpt/cpt_proc.c
++ *
++ * Copyright (C) 2000-2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/config.h>
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/errno.h>
++#include <linux/mm.h>
++#include <linux/list.h>
++#include <linux/proc_fs.h>
++#include <linux/smp_lock.h>
++#include <asm/uaccess.h>
++#include <linux/cpt_ioctl.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_dump.h"
++#include "cpt_mm.h"
++#include "cpt_kernel.h"
++
++MODULE_AUTHOR("Alexey Kuznetsov <alexey@sw.ru>");
++MODULE_LICENSE("GPL");
++
++/* List of contexts and lock protecting the list */
++static struct list_head cpt_context_list;
++static spinlock_t cpt_context_lock;
++
++static int proc_read(char *buffer, char **start, off_t offset,
++ int length, int *eof, void *data)
++{
++ off_t pos = 0;
++ off_t begin = 0;
++ int len = 0;
++ cpt_context_t *ctx;
++
++ len += sprintf(buffer, "Ctx Id VE State\n");
++
++ spin_lock(&cpt_context_lock);
++
++ list_for_each_entry(ctx, &cpt_context_list, ctx_list) {
++ len += sprintf(buffer+len,"%p %08x %-8u %d",
++ ctx,
++ ctx->contextid,
++ ctx->ve_id,
++ ctx->ctx_state
++ );
++
++ buffer[len++] = '\n';
++
++ pos = begin+len;
++ if (pos < offset) {
++ len = 0;
++ begin = pos;
++ }
++ if (pos > offset+length)
++ goto done;
++ }
++ *eof = 1;
++
++done:
++ spin_unlock(&cpt_context_lock);
++ *start = buffer + (offset - begin);
++ len -= (offset - begin);
++ if(len > length)
++ len = length;
++ if(len < 0)
++ len = 0;
++ return len;
++}
++
++void cpt_context_release(cpt_context_t *ctx)
++{
++ list_del(&ctx->ctx_list);
++ spin_unlock(&cpt_context_lock);
++
++ if (ctx->ctx_state > 0)
++ cpt_resume(ctx);
++ ctx->ctx_state = CPT_CTX_ERROR;
++
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++ if (ctx->pgin_task)
++ put_task_struct(ctx->pgin_task);
++ if (ctx->pgin_dir)
++ cpt_free_pgin_dir(ctx);
++ if (ctx->pagein_file_out)
++ fput(ctx->pagein_file_out);
++ if (ctx->pagein_file_in)
++ fput(ctx->pagein_file_in);
++#endif
++ if (ctx->objcount)
++ eprintk_ctx("%d objects leaked\n", ctx->objcount);
++ if (ctx->file)
++ fput(ctx->file);
++ cpt_flush_error(ctx);
++ if (ctx->errorfile) {
++ fput(ctx->errorfile);
++ ctx->errorfile = NULL;
++ }
++ if (ctx->error_msg) {
++ free_page((unsigned long)ctx->error_msg);
++ ctx->error_msg = NULL;
++ }
++ if (ctx->statusfile)
++ fput(ctx->statusfile);
++ if (ctx->lockfile)
++ fput(ctx->lockfile);
++ kfree(ctx);
++
++ spin_lock(&cpt_context_lock);
++}
++
++static void __cpt_context_put(cpt_context_t *ctx)
++{
++ if (!--ctx->refcount)
++ cpt_context_release(ctx);
++}
++
++static void cpt_context_put(cpt_context_t *ctx)
++{
++ spin_lock(&cpt_context_lock);
++ __cpt_context_put(ctx);
++ spin_unlock(&cpt_context_lock);
++}
++
++cpt_context_t * cpt_context_open(void)
++{
++ cpt_context_t *ctx;
++
++ if ((ctx = kmalloc(sizeof(*ctx), GFP_KERNEL)) != NULL) {
++ cpt_context_init(ctx);
++ spin_lock(&cpt_context_lock);
++ list_add_tail(&ctx->ctx_list, &cpt_context_list);
++ spin_unlock(&cpt_context_lock);
++ ctx->error_msg = (char*)__get_free_page(GFP_KERNEL);
++ if (ctx->error_msg != NULL)
++ ctx->error_msg[0] = 0;
++ }
++ return ctx;
++}
++
++static cpt_context_t * cpt_context_lookup(unsigned int contextid)
++{
++ cpt_context_t *ctx;
++
++ spin_lock(&cpt_context_lock);
++ list_for_each_entry(ctx, &cpt_context_list, ctx_list) {
++ if (ctx->contextid == contextid) {
++ ctx->refcount++;
++ spin_unlock(&cpt_context_lock);
++ return ctx;
++ }
++ }
++ spin_unlock(&cpt_context_lock);
++ return NULL;
++}
++
++int cpt_context_lookup_veid(unsigned int veid)
++{
++ cpt_context_t *ctx;
++
++ spin_lock(&cpt_context_lock);
++ list_for_each_entry(ctx, &cpt_context_list, ctx_list) {
++ if (ctx->ve_id == veid && ctx->ctx_state > 0) {
++ spin_unlock(&cpt_context_lock);
++ return 1;
++ }
++ }
++ spin_unlock(&cpt_context_lock);
++ return 0;
++}
++
++static int cpt_ioctl(struct inode * inode, struct file * file, unsigned int cmd, unsigned long arg)
++{
++ int err = 0;
++ cpt_context_t *ctx;
++ struct file *dfile = NULL;
++
++ unlock_kernel();
++
++ if (cmd == CPT_VMPREP) {
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++ err = cpt_mm_prepare(arg);
++#else
++ err = -EINVAL;
++#endif
++ goto out_lock;
++ }
++
++ if (cmd == CPT_TEST_CAPS) {
++ unsigned int src_flags, dst_flags = arg;
++
++ err = 0;
++ src_flags = test_cpu_caps();
++ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_CMOV, "cmov", err);
++ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_FXSR, "fxsr", err);
++ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SSE, "sse", err);
++ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SSE2, "sse2", err);
++ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_MMX, "mmx", err);
++ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_3DNOW, "3dnow", err);
++ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_3DNOW2, "3dnowext", err);
++ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SEP, "sysenter", err);
++ goto out_lock;
++ }
++
++ if (cmd == CPT_JOIN_CONTEXT || cmd == CPT_PUT_CONTEXT) {
++ cpt_context_t *old_ctx;
++
++ ctx = NULL;
++ if (cmd == CPT_JOIN_CONTEXT) {
++ err = -ENOENT;
++ ctx = cpt_context_lookup(arg);
++ if (!ctx)
++ goto out_lock;
++ }
++
++ spin_lock(&cpt_context_lock);
++ old_ctx = (cpt_context_t*)file->private_data;
++ file->private_data = ctx;
++
++ if (old_ctx) {
++ if (cmd == CPT_PUT_CONTEXT && old_ctx->sticky) {
++ old_ctx->sticky = 0;
++ old_ctx->refcount--;
++ }
++ __cpt_context_put(old_ctx);
++ }
++ spin_unlock(&cpt_context_lock);
++ err = 0;
++ goto out_lock;
++ }
++
++ spin_lock(&cpt_context_lock);
++ ctx = (cpt_context_t*)file->private_data;
++ if (ctx)
++ ctx->refcount++;
++ spin_unlock(&cpt_context_lock);
++
++ if (!ctx) {
++ cpt_context_t *old_ctx;
++
++ err = -ENOMEM;
++ ctx = cpt_context_open();
++ if (!ctx)
++ goto out_lock;
++
++ spin_lock(&cpt_context_lock);
++ old_ctx = (cpt_context_t*)file->private_data;
++ if (!old_ctx) {
++ ctx->refcount++;
++ file->private_data = ctx;
++ } else {
++ old_ctx->refcount++;
++ }
++ if (old_ctx) {
++ __cpt_context_put(ctx);
++ ctx = old_ctx;
++ }
++ spin_unlock(&cpt_context_lock);
++ }
++
++ if (cmd == CPT_GET_CONTEXT) {
++ unsigned int contextid = (unsigned int)arg;
++
++ if (ctx->contextid && ctx->contextid != contextid) {
++ err = -EINVAL;
++ goto out_nosem;
++ }
++ if (!ctx->contextid) {
++ cpt_context_t *c1 = cpt_context_lookup(contextid);
++ if (c1) {
++ cpt_context_put(c1);
++ err = -EEXIST;
++ goto out_nosem;
++ }
++ ctx->contextid = contextid;
++ }
++ spin_lock(&cpt_context_lock);
++ if (!ctx->sticky) {
++ ctx->sticky = 1;
++ ctx->refcount++;
++ }
++ spin_unlock(&cpt_context_lock);
++ goto out_nosem;
++ }
++
++ down(&ctx->main_sem);
++
++ err = -EBUSY;
++ if (ctx->ctx_state < 0)
++ goto out;
++
++ err = 0;
++ switch (cmd) {
++ case CPT_SET_DUMPFD:
++ if (ctx->ctx_state == CPT_CTX_DUMPING) {
++ err = -EBUSY;
++ break;
++ }
++ if (arg >= 0) {
++ dfile = fget(arg);
++ if (IS_ERR(dfile)) {
++ err = PTR_ERR(dfile);
++ break;
++ }
++ if (dfile->f_op == NULL ||
++ dfile->f_op->write == NULL) {
++ fput(dfile);
++ err = -EBADF;
++ break;
++ }
++ }
++ if (ctx->file)
++ fput(ctx->file);
++ ctx->file = dfile;
++ break;
++ case CPT_SET_ERRORFD:
++ if (arg >= 0) {
++ dfile = fget(arg);
++ if (IS_ERR(dfile)) {
++ err = PTR_ERR(dfile);
++ break;
++ }
++ }
++ if (ctx->errorfile)
++ fput(ctx->errorfile);
++ ctx->errorfile = dfile;
++ break;
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++ case CPT_SET_PAGEINFDIN:
++ if (arg >= 0) {
++ dfile = fget(arg);
++ if (IS_ERR(dfile)) {
++ err = PTR_ERR(dfile);
++ break;
++ }
++ }
++ if (ctx->pagein_file_in)
++ fput(ctx->pagein_file_in);
++ ctx->pagein_file_in = dfile;
++ break;
++ case CPT_SET_PAGEINFDOUT:
++ if (arg >= 0) {
++ dfile = fget(arg);
++ if (IS_ERR(dfile)) {
++ err = PTR_ERR(dfile);
++ break;
++ }
++ }
++ if (ctx->pagein_file_out)
++ fput(ctx->pagein_file_out);
++ ctx->pagein_file_out = dfile;
++ break;
++ case CPT_SET_LAZY:
++ ctx->lazy_vm = arg;
++ break;
++ case CPT_PAGEIND:
++ err = cpt_start_pagein(ctx);
++ break;
++#endif
++ case CPT_SET_VEID:
++ if (ctx->ctx_state > 0) {
++ err = -EBUSY;
++ break;
++ }
++ ctx->ve_id = arg;
++ break;
++ case CPT_SET_CPU_FLAGS:
++ if (ctx->ctx_state > 0) {
++ err = -EBUSY;
++ break;
++ }
++ ctx->dst_cpu_flags = arg;
++ ctx->src_cpu_flags = test_cpu_caps();
++ break;
++ case CPT_SUSPEND:
++ if (cpt_context_lookup_veid(ctx->ve_id) ||
++ ctx->ctx_state > 0) {
++ err = -EBUSY;
++ break;
++ }
++ ctx->ctx_state = CPT_CTX_SUSPENDING;
++ err = cpt_vps_suspend(ctx);
++ if (err) {
++ if (cpt_resume(ctx) == 0)
++ ctx->ctx_state = CPT_CTX_IDLE;
++ } else {
++ ctx->ctx_state = CPT_CTX_SUSPENDED;
++ }
++ break;
++ case CPT_DUMP:
++ if (!ctx->ctx_state) {
++ err = -ENOENT;
++ break;
++ }
++ err = cpt_dump(ctx);
++ break;
++ case CPT_RESUME:
++ if (ctx->ctx_state == CPT_CTX_IDLE) {
++ err = -ENOENT;
++ break;
++ }
++ err = cpt_resume(ctx);
++ if (!err)
++ ctx->ctx_state = CPT_CTX_IDLE;
++ break;
++ case CPT_KILL:
++ if (ctx->ctx_state == CPT_CTX_IDLE) {
++ err = -ENOENT;
++ break;
++ }
++ err = cpt_kill(ctx);
++ if (!err)
++ ctx->ctx_state = CPT_CTX_IDLE;
++ break;
++ case CPT_TEST_VECAPS:
++ {
++ __u32 dst_flags = arg;
++ __u32 src_flags;
++
++ err = cpt_vps_caps(ctx, &src_flags);
++ if (err)
++ break;
++
++ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_CMOV, "cmov", err);
++ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_FXSR, "fxsr", err);
++ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SSE, "sse", err);
++ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SSE2, "sse2", err);
++ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_MMX, "mmx", err);
++ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_3DNOW, "3dnow", err);
++ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_3DNOW2, "3dnowext", err);
++ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SEP, "sysenter", err);
++ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_EMT64, "emt64", err);
++ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_IA64, "ia64", err);
++ break;
++ }
++ default:
++ err = -EINVAL;
++ break;
++ }
++
++out:
++ cpt_flush_error(ctx);
++ up(&ctx->main_sem);
++out_nosem:
++ cpt_context_put(ctx);
++out_lock:
++ lock_kernel();
++ return err;
++}
++
++static int cpt_open(struct inode *inode, struct file *file)
++{
++ if (!try_module_get(THIS_MODULE))
++ return -EBUSY;
++
++ return 0;
++}
++
++static int cpt_release(struct inode * inode, struct file * file)
++{
++ cpt_context_t *ctx;
++
++ spin_lock(&cpt_context_lock);
++ ctx = (cpt_context_t*)file->private_data;
++ file->private_data = NULL;
++
++ if (ctx)
++ __cpt_context_put(ctx);
++ spin_unlock(&cpt_context_lock);
++
++ module_put(THIS_MODULE);
++ return 0;
++}
++
++
++static struct file_operations cpt_fops = {
++ .owner = THIS_MODULE,
++ .open = cpt_open,
++ .release = cpt_release,
++ .ioctl = cpt_ioctl,
++};
++
++static struct proc_dir_entry *proc_ent;
++
++static struct ctl_table_header *ctl_header;
++
++static ctl_table debug_table[] = {
++ {
++ .ctl_name = 9475,
++ .procname = "cpt",
++ .data = &debug_level,
++ .maxlen = sizeof(debug_level),
++ .mode = 0644,
++ .proc_handler = &proc_dointvec,
++ },
++ { .ctl_name = 0 }
++};
++static ctl_table root_table[] = {
++ {
++ .ctl_name = CTL_DEBUG,
++ .procname = "debug",
++ .mode = 0555,
++ .child = debug_table,
++ },
++ { .ctl_name = 0 }
++};
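The two tables above hang a single integer knob off the existing "debug" sysctl directory, so once the module is loaded the checkpointer's verbosity can be changed at run time through /proc/sys/debug/cpt (proc_dointvec just parses a decimal integer). A small sketch of raising it, assuming only the path derived from the procnames above:

/* Hypothetical helper: raise debug_level via the sysctl registered below. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/debug/cpt", "w");	/* "debug" dir + "cpt" entry */

	if (!f) {
		perror("/proc/sys/debug/cpt");
		return 1;
	}
	fprintf(f, "%d\n", 3);	/* values above 2 enable the extra dprintk output used elsewhere in cpt */
	fclose(f);
	return 0;
}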
++
++static int __init init_cpt(void)
++{
++ int err;
++
++ err = -ENOMEM;
++ ctl_header = register_sysctl_table(root_table, 0);
++ if (!ctl_header)
++ goto err_mon;
++
++ spin_lock_init(&cpt_context_lock);
++ INIT_LIST_HEAD(&cpt_context_list);
++
++ err = -EINVAL;
++ proc_ent = create_proc_entry("cpt", 0600, NULL);
++ if (!proc_ent)
++ goto err_out;
++
++ cpt_fops.read = proc_ent->proc_fops->read;
++ cpt_fops.write = proc_ent->proc_fops->write;
++ cpt_fops.llseek = proc_ent->proc_fops->llseek;
++ proc_ent->proc_fops = &cpt_fops;
++
++ proc_ent->read_proc = proc_read;
++ proc_ent->data = NULL;
++ proc_ent->owner = THIS_MODULE;
++ return 0;
++
++err_out:
++ unregister_sysctl_table(ctl_header);
++err_mon:
++ return err;
++}
++module_init(init_cpt);
++
++static void __exit exit_cpt(void)
++{
++ remove_proc_entry("cpt", NULL);
++ unregister_sysctl_table(ctl_header);
++
++ spin_lock(&cpt_context_lock);
++ while (!list_empty(&cpt_context_list)) {
++ cpt_context_t *ctx;
++ ctx = list_entry(cpt_context_list.next, cpt_context_t, ctx_list);
++
++ if (!ctx->sticky)
++ ctx->refcount++;
++ ctx->sticky = 0;
++
++ BUG_ON(ctx->refcount != 1);
++
++ __cpt_context_put(ctx);
++ }
++ spin_unlock(&cpt_context_lock);
++}
++module_exit(exit_cpt);
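Taken together, cpt_ioctl() defines a small state machine that a user-space checkpointer drives through /proc/cpt: point the context at a VE and a dump file descriptor, suspend, dump, then either resume or kill the container. A rough sketch of that call sequence, assuming the CPT_* request codes come from the linux/cpt_ioctl.h header included above, with the veid and file name purely illustrative and most error handling trimmed:

/* Hypothetical driver for the checkpoint ioctl sequence implemented above. */
#include <fcntl.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/cpt_ioctl.h>	/* CPT_SET_VEID, CPT_SET_DUMPFD, CPT_SUSPEND, ... */

static int checkpoint_ve(unsigned int veid, const char *dumpfile)
{
	int fd = open("/proc/cpt", O_RDWR);
	int dumpfd = open(dumpfile, O_WRONLY | O_CREAT | O_TRUNC, 0600);
	int err;

	if (fd < 0 || dumpfd < 0)
		return -1;

	ioctl(fd, CPT_SET_VEID, veid);		/* which container to act on */
	ioctl(fd, CPT_SET_DUMPFD, dumpfd);	/* where the image is written */

	err = ioctl(fd, CPT_SUSPEND, 0);	/* freeze all tasks of the VE */
	if (!err)
		err = ioctl(fd, CPT_DUMP, 0);	/* write the image */

	/* On success a real tool would normally CPT_KILL the VE; on failure
	 * CPT_RESUME lets it continue running. */
	ioctl(fd, err ? CPT_RESUME : CPT_KILL, 0);

	close(dumpfd);
	close(fd);
	return err;
}

int main(void)
{
	return checkpoint_ve(101, "/tmp/ve101.dump") ? 1 : 0;	/* illustrative arguments */
}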
+diff -upr linux-2.6.16.orig/kernel/cpt/cpt_process.c linux-2.6.16-026test015/kernel/cpt/cpt_process.c
+--- linux-2.6.16.orig/kernel/cpt/cpt_process.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/cpt/cpt_process.c 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,986 @@
++/*
++ *
++ * kernel/cpt/cpt_process.c
++ *
++ * Copyright (C) 2000-2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/config.h>
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/errno.h>
++#include <linux/compat.h>
++#include <linux/cpt_image.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_ubc.h"
++#include "cpt_process.h"
++#include "cpt_kernel.h"
++
++#ifdef CONFIG_X86_32
++#undef task_pt_regs
++#define task_pt_regs(t) ((struct pt_regs *)((t)->thread.esp0) - 1)
++#endif
++
++static u32 encode_segment(u32 segreg)
++{
++ segreg &= 0xFFFF;
++
++ if (segreg == 0)
++ return CPT_SEG_ZERO;
++ if ((segreg & 3) != 3) {
++ wprintk("Invalid RPL of a segment reg %x\n", segreg);
++ return CPT_SEG_ZERO;
++ }
++
++	/* LDT descriptor: it is just an index into the LDT array */
++ if (segreg & 4)
++ return CPT_SEG_LDT + (segreg >> 3);
++
++ /* TLS descriptor. */
++ if ((segreg >> 3) >= GDT_ENTRY_TLS_MIN &&
++ (segreg >> 3) <= GDT_ENTRY_TLS_MAX)
++ return CPT_SEG_TLS1 + ((segreg>>3) - GDT_ENTRY_TLS_MIN);
++
++ /* One of standard desriptors */
++#ifdef CONFIG_X86_64
++ if (segreg == __USER32_DS)
++ return CPT_SEG_USER32_DS;
++ if (segreg == __USER32_CS)
++ return CPT_SEG_USER32_CS;
++ if (segreg == __USER_DS)
++ return CPT_SEG_USER64_DS;
++ if (segreg == __USER_CS)
++ return CPT_SEG_USER64_CS;
++#else
++ if (segreg == __USER_DS)
++ return CPT_SEG_USER32_DS;
++ if (segreg == __USER_CS)
++ return CPT_SEG_USER32_CS;
++#endif
++ wprintk("Invalid segment reg %x\n", segreg);
++ return CPT_SEG_ZERO;
++}
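encode_segment() above relies on the standard x86 selector layout: bits 0-1 hold the requested privilege level (user code always runs at RPL 3), bit 2 selects the LDT rather than the GDT, and the remaining bits are the descriptor-table index. A small, self-contained decoding sketch of that layout, with nothing assumed beyond the layout itself and an example selector value:

/* Illustration only: decode the fields encode_segment() inspects. */
#include <stdio.h>

int main(void)
{
	unsigned int sel = 0x7b;		/* e.g. the common 32-bit user data selector */
	unsigned int rpl   = sel & 3;		/* requested privilege level, 3 for user space */
	unsigned int table = (sel >> 2) & 1;	/* 0 = GDT, 1 = LDT */
	unsigned int index = sel >> 3;		/* slot in the chosen descriptor table */

	printf("selector %#x: index=%u table=%s rpl=%u\n",
	       sel, index, table ? "LDT" : "GDT", rpl);
	return 0;
}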
++
++#ifdef CONFIG_X86_64
++static void xlate_ptregs_64_to_32(struct cpt_x86_regs *d, struct pt_regs *s, task_t *tsk)
++{
++ d->cpt_ebp = s->rbp;
++ d->cpt_ebx = s->rbx;
++ d->cpt_eax = s->rax;
++ d->cpt_ecx = s->rcx;
++ d->cpt_edx = s->rdx;
++ d->cpt_esi = s->rsi;
++ d->cpt_edi = s->rdi;
++ d->cpt_orig_eax = s->orig_rax;
++ d->cpt_eip = s->rip;
++ d->cpt_xcs = encode_segment(s->cs);
++ d->cpt_eflags = s->eflags;
++ d->cpt_esp = s->rsp;
++ d->cpt_xss = encode_segment(s->ss);
++ d->cpt_xds = encode_segment(tsk->thread.ds);
++ d->cpt_xes = encode_segment(tsk->thread.es);
++}
++
++static int dump_registers(task_t *tsk, struct cpt_context *ctx)
++{
++ cpt_open_object(NULL, ctx);
++
++ if (tsk->thread_info->flags&_TIF_IA32) {
++ struct cpt_x86_regs ri;
++ ri.cpt_next = sizeof(ri);
++ ri.cpt_object = CPT_OBJ_X86_REGS;
++ ri.cpt_hdrlen = sizeof(ri);
++ ri.cpt_content = CPT_CONTENT_VOID;
++
++ ri.cpt_debugreg[0] = tsk->thread.debugreg0;
++ ri.cpt_debugreg[1] = tsk->thread.debugreg1;
++ ri.cpt_debugreg[2] = tsk->thread.debugreg2;
++ ri.cpt_debugreg[3] = tsk->thread.debugreg3;
++ ri.cpt_debugreg[4] = 0;
++ ri.cpt_debugreg[5] = 0;
++ ri.cpt_debugreg[6] = tsk->thread.debugreg6;
++ ri.cpt_debugreg[7] = tsk->thread.debugreg7;
++ ri.cpt_fs = encode_segment(tsk->thread.fsindex);
++ ri.cpt_gs = encode_segment(tsk->thread.gsindex);
++
++ xlate_ptregs_64_to_32(&ri, task_pt_regs(tsk), tsk);
++
++ ctx->write(&ri, sizeof(ri), ctx);
++ } else {
++ struct cpt_x86_64_regs ri;
++ ri.cpt_next = sizeof(ri);
++ ri.cpt_object = CPT_OBJ_X86_64_REGS;
++ ri.cpt_hdrlen = sizeof(ri);
++ ri.cpt_content = CPT_CONTENT_VOID;
++
++ ri.cpt_fsbase = tsk->thread.fs;
++ ri.cpt_gsbase = tsk->thread.gs;
++ ri.cpt_fsindex = encode_segment(tsk->thread.fsindex);
++ ri.cpt_gsindex = encode_segment(tsk->thread.gsindex);
++ ri.cpt_ds = encode_segment(tsk->thread.ds);
++ ri.cpt_es = encode_segment(tsk->thread.es);
++ ri.cpt_debugreg[0] = tsk->thread.debugreg0;
++ ri.cpt_debugreg[1] = tsk->thread.debugreg1;
++ ri.cpt_debugreg[2] = tsk->thread.debugreg2;
++ ri.cpt_debugreg[3] = tsk->thread.debugreg3;
++ ri.cpt_debugreg[4] = 0;
++ ri.cpt_debugreg[5] = 0;
++ ri.cpt_debugreg[6] = tsk->thread.debugreg6;
++ ri.cpt_debugreg[7] = tsk->thread.debugreg7;
++
++ memcpy(&ri.cpt_r15, task_pt_regs(tsk), sizeof(struct pt_regs));
++
++ ri.cpt_cs = encode_segment(task_pt_regs(tsk)->cs);
++ ri.cpt_ss = encode_segment(task_pt_regs(tsk)->ss);
++
++ ctx->write(&ri, sizeof(ri), ctx);
++
++#if 0
++ if (ri.cpt_rip >= VSYSCALL_START && ri.cpt_rip < VSYSCALL_END) {
++			eprintk_ctx(CPT_FID " cannot be checkpointed while in vsyscall, try later\n", CPT_TID(tsk));
++ return -EAGAIN;
++ }
++#endif
++ }
++ cpt_close_object(ctx);
++
++ return 0;
++}
++
++#else
++
++static int dump_registers(task_t *tsk, struct cpt_context *ctx)
++{
++ struct cpt_x86_regs ri;
++
++ cpt_open_object(NULL, ctx);
++
++ ri.cpt_next = sizeof(ri);
++ ri.cpt_object = CPT_OBJ_X86_REGS;
++ ri.cpt_hdrlen = sizeof(ri);
++ ri.cpt_content = CPT_CONTENT_VOID;
++
++ ri.cpt_debugreg[0] = tsk->thread.debugreg[0];
++ ri.cpt_debugreg[1] = tsk->thread.debugreg[1];
++ ri.cpt_debugreg[2] = tsk->thread.debugreg[2];
++ ri.cpt_debugreg[3] = tsk->thread.debugreg[3];
++ ri.cpt_debugreg[4] = tsk->thread.debugreg[4];
++ ri.cpt_debugreg[5] = tsk->thread.debugreg[5];
++ ri.cpt_debugreg[6] = tsk->thread.debugreg[6];
++ ri.cpt_debugreg[7] = tsk->thread.debugreg[7];
++ ri.cpt_fs = encode_segment(tsk->thread.fs);
++ ri.cpt_gs = encode_segment(tsk->thread.gs);
++
++ memcpy(&ri.cpt_ebx, task_pt_regs(tsk), sizeof(struct pt_regs));
++
++ ri.cpt_xcs = encode_segment(task_pt_regs(tsk)->xcs);
++ ri.cpt_xss = encode_segment(task_pt_regs(tsk)->xss);
++ ri.cpt_xds = encode_segment(task_pt_regs(tsk)->xds);
++ ri.cpt_xes = encode_segment(task_pt_regs(tsk)->xes);
++
++ ctx->write(&ri, sizeof(ri), ctx);
++ cpt_close_object(ctx);
++
++ return 0;
++}
++#endif
++
++static int dump_kstack(task_t *tsk, struct cpt_context *ctx)
++{
++ struct cpt_obj_bits hdr;
++ unsigned long size;
++ void *start;
++
++ cpt_open_object(NULL, ctx);
++
++#ifdef CONFIG_X86_64
++ size = tsk->thread.rsp0 - tsk->thread.rsp;
++ start = (void*)tsk->thread.rsp;
++#else
++ size = tsk->thread.esp0 - tsk->thread.esp;
++ start = (void*)tsk->thread.esp;
++#endif
++
++ hdr.cpt_next = sizeof(hdr) + CPT_ALIGN(size);
++ hdr.cpt_object = CPT_OBJ_BITS;
++ hdr.cpt_hdrlen = sizeof(hdr);
++ hdr.cpt_content = CPT_CONTENT_STACK;
++ hdr.cpt_size = size;
++
++ ctx->write(&hdr, sizeof(hdr), ctx);
++ ctx->write(start, size, ctx);
++ ctx->align(ctx);
++ cpt_close_object(ctx);
++ return 0;
++}
++
++/* Formats of i387_fxsave_struct are the same for x86_64
++ * and i386. Plain luck. */
++
++static int dump_fpustate(task_t *tsk, struct cpt_context *ctx)
++{
++ struct cpt_obj_bits hdr;
++ unsigned long size;
++ int type;
++
++ cpt_open_object(NULL, ctx);
++
++ type = CPT_CONTENT_X86_FPUSTATE;
++ size = sizeof(struct i387_fxsave_struct);
++#ifndef CONFIG_X86_64
++ if (!cpu_has_fxsr) {
++ size = sizeof(struct i387_fsave_struct);
++ type = CPT_CONTENT_X86_FPUSTATE_OLD;
++ }
++#endif
++
++ hdr.cpt_next = sizeof(hdr) + CPT_ALIGN(size);
++ hdr.cpt_object = CPT_OBJ_BITS;
++ hdr.cpt_hdrlen = sizeof(hdr);
++ hdr.cpt_content = type;
++ hdr.cpt_size = size;
++
++ ctx->write(&hdr, sizeof(hdr), ctx);
++ ctx->write(&tsk->thread.i387, size, ctx);
++ ctx->align(ctx);
++ cpt_close_object(ctx);
++ return 0;
++}
++
++static int encode_siginfo(struct cpt_siginfo_image *si, siginfo_t *info)
++{
++ si->cpt_signo = info->si_signo;
++ si->cpt_errno = info->si_errno;
++ si->cpt_code = info->si_code;
++
++ switch(si->cpt_code & __SI_MASK) {
++ case __SI_TIMER:
++ si->cpt_pid = info->si_tid;
++ si->cpt_uid = info->si_overrun;
++ si->cpt_sigval = cpt_ptr_export(info->_sifields._timer._sigval.sival_ptr);
++ si->cpt_utime = info->si_sys_private;
++ break;
++ case __SI_POLL:
++ si->cpt_pid = info->si_band;
++ si->cpt_uid = info->si_fd;
++ break;
++ case __SI_FAULT:
++ si->cpt_sigval = cpt_ptr_export(info->si_addr);
++#ifdef __ARCH_SI_TRAPNO
++ si->cpt_pid = info->si_trapno;
++#endif
++ break;
++ case __SI_CHLD:
++ si->cpt_pid = is_virtual_pid(info->si_pid) ? info->si_pid : pid_type_to_vpid(PIDTYPE_PID, info->si_pid);
++ si->cpt_uid = info->si_uid;
++ si->cpt_sigval = info->si_status;
++ si->cpt_stime = info->si_stime;
++ si->cpt_utime = info->si_utime;
++ break;
++ case __SI_KILL:
++ case __SI_RT:
++ case __SI_MESGQ:
++ default:
++ si->cpt_pid = is_virtual_pid(info->si_pid) ? info->si_pid : pid_type_to_vpid(PIDTYPE_TGID, info->si_pid);
++ si->cpt_uid = info->si_uid;
++ si->cpt_sigval = cpt_ptr_export(info->si_ptr);
++ break;
++ }
++ return 0;
++}
++
++static int dump_sigqueue(struct sigpending *list, struct cpt_context *ctx)
++{
++ struct sigqueue *q;
++ loff_t saved_obj;
++
++ if (list_empty(&list->list))
++ return 0;
++
++ cpt_push_object(&saved_obj, ctx);
++ list_for_each_entry(q, &list->list, list) {
++ struct cpt_siginfo_image si;
++
++ si.cpt_next = sizeof(si);
++ si.cpt_object = CPT_OBJ_SIGINFO;
++ si.cpt_hdrlen = sizeof(si);
++ si.cpt_content = CPT_CONTENT_VOID;
++
++ si.cpt_qflags = q->flags;
++ si.cpt_user = q->user->uid;
++
++ if (encode_siginfo(&si, &q->info))
++ return -EINVAL;
++
++ ctx->write(&si, sizeof(si), ctx);
++ }
++ cpt_pop_object(&saved_obj, ctx);
++ return 0;
++}
++
++
++
++static int dump_one_signal_struct(cpt_object_t *obj, struct cpt_context *ctx)
++{
++ struct signal_struct *sig = obj->o_obj;
++ struct cpt_signal_image *v = cpt_get_buf(ctx);
++ task_t *tsk;
++ int i;
++
++ cpt_open_object(obj, ctx);
++
++ v->cpt_next = CPT_NULL;
++ v->cpt_object = CPT_OBJ_SIGNAL_STRUCT;
++ v->cpt_hdrlen = sizeof(*v);
++ v->cpt_content = CPT_CONTENT_ARRAY;
++
++ if (sig->pgrp <= 0) {
++ eprintk_ctx("bad pgid\n");
++ cpt_release_buf(ctx);
++ return -EINVAL;
++ }
++ v->cpt_pgrp_type = CPT_PGRP_NORMAL;
++ read_lock(&tasklist_lock);
++ tsk = find_task_by_pid_type_ve(PIDTYPE_PID, sig->pgrp);
++ if (tsk == NULL)
++ v->cpt_pgrp_type = CPT_PGRP_ORPHAN;
++ read_unlock(&tasklist_lock);
++ v->cpt_pgrp = pid_type_to_vpid(PIDTYPE_PGID, sig->pgrp);
++
++ v->cpt_old_pgrp = 0;
++ if (sig->tty_old_pgrp < 0) {
++ eprintk_ctx("bad tty_old_pgrp\n");
++ cpt_release_buf(ctx);
++ return -EINVAL;
++ }
++ if (sig->tty_old_pgrp > 0) {
++ v->cpt_old_pgrp_type = CPT_PGRP_NORMAL;
++ read_lock(&tasklist_lock);
++ tsk = find_task_by_pid_type_ve(PIDTYPE_PID, sig->tty_old_pgrp);
++ if (tsk == NULL) {
++ v->cpt_old_pgrp_type = CPT_PGRP_ORPHAN;
++ tsk = find_task_by_pid_type_ve(PIDTYPE_PGID, sig->tty_old_pgrp);
++ }
++ read_unlock(&tasklist_lock);
++ if (tsk == NULL) {
++ eprintk_ctx("tty_old_pgrp does not exist anymore\n");
++ cpt_release_buf(ctx);
++ return -EINVAL;
++ }
++ v->cpt_old_pgrp = _pid_type_to_vpid(PIDTYPE_PGID, sig->tty_old_pgrp);
++ if ((int)v->cpt_old_pgrp < 0) {
++ dprintk_ctx("stray tty_old_pgrp %d\n", sig->tty_old_pgrp);
++ v->cpt_old_pgrp = -1;
++ v->cpt_old_pgrp_type = CPT_PGRP_STRAY;
++ }
++ }
++
++ if (sig->session <= 0) {
++ eprintk_ctx("bad session\n");
++ cpt_release_buf(ctx);
++ return -EINVAL;
++ }
++ v->cpt_session_type = CPT_PGRP_NORMAL;
++ read_lock(&tasklist_lock);
++ tsk = find_task_by_pid_type_ve(PIDTYPE_PID, sig->session);
++ if (tsk == NULL)
++ v->cpt_session_type = CPT_PGRP_ORPHAN;
++ read_unlock(&tasklist_lock);
++ v->cpt_session = pid_type_to_vpid(PIDTYPE_SID, sig->session);
++
++ v->cpt_leader = sig->leader;
++ v->cpt_ctty = CPT_NULL;
++ if (sig->tty) {
++ cpt_object_t *cobj = lookup_cpt_object(CPT_OBJ_TTY, sig->tty, ctx);
++ if (cobj)
++ v->cpt_ctty = cobj->o_pos;
++ else {
++ eprintk_ctx("controlling tty is not found\n");
++ cpt_release_buf(ctx);
++ return -EINVAL;
++ }
++ }
++ memcpy(&v->cpt_sigpending, &sig->shared_pending.signal, 8);
++
++ v->cpt_curr_target = 0;
++ if (sig->curr_target)
++ v->cpt_curr_target = virt_pid(sig->curr_target);
++ v->cpt_group_exit = ((sig->flags & SIGNAL_GROUP_EXIT) != 0);
++ v->cpt_group_exit_code = sig->group_exit_code;
++ v->cpt_group_exit_task = 0;
++ if (sig->group_exit_task)
++ v->cpt_group_exit_task = virt_pid(sig->group_exit_task);
++ v->cpt_notify_count = sig->notify_count;
++ v->cpt_group_stop_count = sig->group_stop_count;
++
++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,8)
++ v->cpt_utime = sig->utime;
++ v->cpt_stime = sig->stime;
++ v->cpt_cutime = sig->cutime;
++ v->cpt_cstime = sig->cstime;
++ v->cpt_nvcsw = sig->nvcsw;
++ v->cpt_nivcsw = sig->nivcsw;
++ v->cpt_cnvcsw = sig->cnvcsw;
++ v->cpt_cnivcsw = sig->cnivcsw;
++ v->cpt_min_flt = sig->min_flt;
++ v->cpt_maj_flt = sig->maj_flt;
++ v->cpt_cmin_flt = sig->cmin_flt;
++ v->cpt_cmaj_flt = sig->cmaj_flt;
++
++ if (RLIM_NLIMITS > CPT_RLIM_NLIMITS)
++ __asm__("undefined\n");
++
++ for (i=0; i<CPT_RLIM_NLIMITS; i++) {
++ if (i < RLIM_NLIMITS) {
++ v->cpt_rlim_cur[i] = sig->rlim[i].rlim_cur;
++ v->cpt_rlim_max[i] = sig->rlim[i].rlim_max;
++ } else {
++ v->cpt_rlim_cur[i] = CPT_NULL;
++ v->cpt_rlim_max[i] = CPT_NULL;
++ }
++ }
++#endif
++
++ ctx->write(v, sizeof(*v), ctx);
++ cpt_release_buf(ctx);
++
++ dump_sigqueue(&sig->shared_pending, ctx);
++
++ cpt_close_object(ctx);
++ return 0;
++}
++
++
++static int dump_one_process(cpt_object_t *obj, struct cpt_context *ctx)
++{
++ task_t *tsk = obj->o_obj;
++ int last_thread;
++ struct cpt_task_image *v = cpt_get_buf(ctx);
++ cpt_object_t *tobj;
++ cpt_object_t *tg_obj;
++ loff_t saved_obj;
++ int i;
++ int err;
++ struct timespec delta;
++
++ cpt_open_object(obj, ctx);
++
++ v->cpt_signal = CPT_NULL;
++ tg_obj = lookup_cpt_object(CPT_OBJ_SIGNAL_STRUCT, tsk->signal, ctx);
++ if (!tg_obj) BUG();
++
++ v->cpt_next = CPT_NULL;
++ v->cpt_object = CPT_OBJ_TASK;
++ v->cpt_hdrlen = sizeof(*v);
++ v->cpt_content = CPT_CONTENT_ARRAY;
++
++ v->cpt_state = tsk->state;
++ if (tsk->state == EXIT_ZOMBIE) {
++		eprintk_ctx("invalid zombie state on " CPT_FID "\n", CPT_TID(tsk));
++ cpt_release_buf(ctx);
++ return -EINVAL;
++ } else if (tsk->state == EXIT_DEAD) {
++ if (tsk->exit_state != EXIT_DEAD &&
++ tsk->exit_state != EXIT_ZOMBIE) {
++			eprintk_ctx("invalid exit_state %ld on " CPT_FID "\n", tsk->exit_state, CPT_TID(tsk));
++ cpt_release_buf(ctx);
++ return -EINVAL;
++ }
++ }
++ if (tsk->exit_state) {
++ v->cpt_state = tsk->exit_state;
++ if (tsk->state != EXIT_DEAD) {
++			eprintk_ctx("invalid tsk->state %ld/%ld on " CPT_FID "\n",
++ tsk->state, tsk->exit_state, CPT_TID(tsk));
++ cpt_release_buf(ctx);
++ return -EINVAL;
++ }
++ }
++ v->cpt_flags = tsk->flags&~PF_FROZEN;
++ v->cpt_ptrace = tsk->ptrace;
++ v->cpt_prio = tsk->prio;
++ v->cpt_exit_code = tsk->exit_code;
++ v->cpt_exit_signal = tsk->exit_signal;
++ v->cpt_pdeath_signal = tsk->pdeath_signal;
++ v->cpt_static_prio = tsk->static_prio;
++ v->cpt_rt_priority = tsk->rt_priority;
++ v->cpt_policy = tsk->policy;
++ if (v->cpt_policy != SCHED_NORMAL) {
++ eprintk_ctx("scheduler policy is not supported %d/%d(%s)\n", virt_pid(tsk), tsk->pid, tsk->comm);
++ cpt_release_buf(ctx);
++ return -EINVAL;
++ }
++
++ v->cpt_mm = CPT_NULL;
++ if (tsk->mm) {
++ tobj = lookup_cpt_object(CPT_OBJ_MM, tsk->mm, ctx);
++ if (!tobj) BUG();
++ v->cpt_mm = tobj->o_pos;
++ }
++ v->cpt_files = CPT_NULL;
++ if (tsk->files) {
++ tobj = lookup_cpt_object(CPT_OBJ_FILES, tsk->files, ctx);
++ if (!tobj) BUG();
++ v->cpt_files = tobj->o_pos;
++ }
++ v->cpt_fs = CPT_NULL;
++ if (tsk->fs) {
++ tobj = lookup_cpt_object(CPT_OBJ_FS, tsk->fs, ctx);
++ if (!tobj) BUG();
++ v->cpt_fs = tobj->o_pos;
++ }
++ v->cpt_namespace = CPT_NULL;
++ if (tsk->namespace) {
++ tobj = lookup_cpt_object(CPT_OBJ_NAMESPACE, tsk->namespace, ctx);
++ if (!tobj) BUG();
++ v->cpt_namespace = tobj->o_pos;
++
++ if (tsk->namespace != current->namespace)
++ eprintk_ctx("namespaces are not supported: process %d/%d(%s)\n", virt_pid(tsk), tsk->pid, tsk->comm);
++ }
++ v->cpt_sysvsem_undo = CPT_NULL;
++ if (tsk->sysvsem.undo_list && !tsk->exit_state) {
++ tobj = lookup_cpt_object(CPT_OBJ_SYSVSEM_UNDO, tsk->sysvsem.undo_list, ctx);
++ if (!tobj) BUG();
++ v->cpt_sysvsem_undo = tobj->o_pos;
++ }
++ v->cpt_sighand = CPT_NULL;
++ if (tsk->sighand) {
++ tobj = lookup_cpt_object(CPT_OBJ_SIGHAND_STRUCT, tsk->sighand, ctx);
++ if (!tobj) BUG();
++ v->cpt_sighand = tobj->o_pos;
++ }
++ v->cpt_sigblocked = cpt_sigset_export(&tsk->blocked);
++ v->cpt_sigrblocked = cpt_sigset_export(&tsk->real_blocked);
++ v->cpt_sigsuspend_blocked = cpt_sigset_export(&tsk->saved_sigmask);
++
++ v->cpt_pid = virt_pid(tsk);
++ v->cpt_tgid = virt_tgid(tsk);
++ v->cpt_ppid = 0;
++ if (tsk->parent) {
++ if (tsk->parent != tsk->real_parent &&
++ !lookup_cpt_object(CPT_OBJ_TASK, tsk->parent, ctx)) {
++ eprintk_ctx("task %d/%d(%s) is ptraced from ve0\n", tsk->pid, virt_pid(tsk), tsk->comm);
++ cpt_release_buf(ctx);
++ return -EBUSY;
++ }
++ v->cpt_ppid = virt_pid(tsk->parent);
++ }
++ v->cpt_rppid = tsk->real_parent ? virt_pid(tsk->real_parent) : 0;
++ v->cpt_pgrp = virt_pgid(tsk);
++ v->cpt_session = virt_sid(tsk);
++ v->cpt_old_pgrp = 0;
++ if (tsk->signal->tty_old_pgrp)
++ v->cpt_old_pgrp = _pid_type_to_vpid(PIDTYPE_PGID, tsk->signal->tty_old_pgrp);
++ v->cpt_leader = tsk->group_leader ? virt_pid(tsk->group_leader) : 0;
++ v->cpt_set_tid = (unsigned long)tsk->set_child_tid;
++ v->cpt_clear_tid = (unsigned long)tsk->clear_child_tid;
++ memcpy(v->cpt_comm, tsk->comm, 16);
++ v->cpt_user = tsk->user->uid;
++ v->cpt_uid = tsk->uid;
++ v->cpt_euid = tsk->euid;
++ v->cpt_suid = tsk->suid;
++ v->cpt_fsuid = tsk->fsuid;
++ v->cpt_gid = tsk->gid;
++ v->cpt_egid = tsk->egid;
++ v->cpt_sgid = tsk->sgid;
++ v->cpt_fsgid = tsk->fsgid;
++ v->cpt_ngids = 0;
++ if (tsk->group_info && tsk->group_info->ngroups != 0) {
++ int i = tsk->group_info->ngroups;
++ if (i > 32) {
++ /* Shame... I did a simplified version and _forgot_
++ * about this. Later, later. */
++			eprintk_ctx("too many groups " CPT_FID "\n", CPT_TID(tsk));
++ return -EINVAL;
++ }
++ v->cpt_ngids = i;
++ for (i--; i>=0; i--)
++ v->cpt_gids[i] = tsk->group_info->small_block[i];
++ }
++ memcpy(&v->cpt_ecap, &tsk->cap_effective, 8);
++ memcpy(&v->cpt_icap, &tsk->cap_inheritable, 8);
++ memcpy(&v->cpt_pcap, &tsk->cap_permitted, 8);
++ v->cpt_keepcap = tsk->keep_capabilities;
++
++ v->cpt_did_exec = tsk->did_exec;
++ v->cpt_exec_domain = -1;
++ v->cpt_thrflags = tsk->thread_info->flags & ~(1<<TIF_FREEZE);
++ v->cpt_64bit = 0;
++#ifdef CONFIG_X86_64
++ /* Clear x86_64 specific flags */
++ v->cpt_thrflags &= ~(_TIF_FORK|_TIF_ABI_PENDING|_TIF_IA32);
++ if (!(tsk->thread_info->flags & _TIF_IA32)) {
++ ctx->tasks64++;
++ v->cpt_64bit = 1;
++ }
++#endif
++ v->cpt_thrstatus = tsk->thread_info->status;
++ v->cpt_addr_limit = -1;
++
++ v->cpt_personality = tsk->personality;
++
++ for (i=0; i<GDT_ENTRY_TLS_ENTRIES; i++) {
++ if (i>=3) {
++ eprintk_ctx("too many tls descs\n");
++ cpt_release_buf(ctx);
++ return -EINVAL;
++ }
++#ifndef CONFIG_X86_64
++ v->cpt_tls[i] = (((u64)tsk->thread.tls_array[i].b)<<32) + tsk->thread.tls_array[i].a;
++#else
++ v->cpt_tls[i] = tsk->thread.tls_array[i];
++#endif
++ }
++
++ v->cpt_restart.fn = CPT_RBL_0;
++ if (tsk->thread_info->restart_block.fn != current->thread_info->restart_block.fn) {
++ if (tsk->thread_info->restart_block.fn != nanosleep_restart
++#ifdef CONFIG_X86_64
++ && tsk->thread_info->restart_block.fn != compat_nanosleep_restart
++#endif
++ ) {
++ eprintk_ctx("unknown restart block %p\n", tsk->thread_info->restart_block.fn);
++ cpt_release_buf(ctx);
++ return -EINVAL;
++ }
++ v->cpt_restart.fn = CPT_RBL_NANOSLEEP;
++#ifdef CONFIG_X86_64
++ if (tsk->thread_info->restart_block.fn == compat_nanosleep_restart)
++ v->cpt_restart.fn = CPT_RBL_COMPAT_NANOSLEEP;
++#endif
++ v->cpt_restart.arg0 = tsk->thread_info->restart_block.arg0;
++ v->cpt_restart.arg1 = tsk->thread_info->restart_block.arg1;
++ v->cpt_restart.arg2 = tsk->thread_info->restart_block.arg2;
++ v->cpt_restart.arg3 = tsk->thread_info->restart_block.arg3;
++ if (debug_level > 2) {
++ ktime_t e, e1;
++ struct timespec now;
++
++ do_posix_clock_monotonic_gettime(&now);
++ e = timespec_to_ktime(now);
++ e1.tv64 = ((u64)tsk->thread_info->restart_block.arg1 << 32) | (u64) tsk->thread_info->restart_block.arg0;
++ e = ktime_sub(e1, e);
++ dprintk("cpt " CPT_FID " RBL %ld/%ld %Ld\n", CPT_TID(tsk),
++ tsk->thread_info->restart_block.arg1,
++ tsk->thread_info->restart_block.arg0, e.tv64);
++ }
++ }
++
++ v->cpt_it_real_incr = 0;
++ v->cpt_it_prof_incr = 0;
++ v->cpt_it_virt_incr = 0;
++ v->cpt_it_real_value = 0;
++ v->cpt_it_prof_value = 0;
++ v->cpt_it_virt_value = 0;
++ if (thread_group_leader(tsk) && tsk->exit_state == 0) {
++ ktime_t rem;
++
++ v->cpt_it_real_incr = ktime_to_ns(tsk->signal->it_real_incr);
++ v->cpt_it_prof_incr = tsk->signal->it_prof_incr;
++ v->cpt_it_virt_incr = tsk->signal->it_virt_incr;
++
++ rem = hrtimer_get_remaining(&tsk->signal->real_timer);
++
++ if (hrtimer_active(&tsk->signal->real_timer)) {
++ if (rem.tv64 <= 0)
++ rem.tv64 = NSEC_PER_USEC;
++ v->cpt_it_real_value = ktime_to_ns(rem);
++ dprintk("cpt itimer " CPT_FID " %Lu\n", CPT_TID(tsk), v->cpt_it_real_value);
++ }
++ v->cpt_it_prof_value = tsk->signal->it_prof_expires;
++ v->cpt_it_virt_value = tsk->signal->it_virt_expires;
++ }
++ v->cpt_used_math = (tsk_used_math(tsk) != 0);
++
++ if (tsk->notifier) {
++ eprintk_ctx("task notifier is in use: process %d/%d(%s)\n", virt_pid(tsk), tsk->pid, tsk->comm);
++ cpt_release_buf(ctx);
++ return -EINVAL;
++ }
++
++ v->cpt_utime = tsk->utime;
++ v->cpt_stime = tsk->stime;
++ delta = tsk->start_time;
++ _set_normalized_timespec(&delta,
++ delta.tv_sec - get_exec_env()->init_entry->start_time.tv_sec,
++ delta.tv_nsec - get_exec_env()->init_entry->start_time.tv_nsec);
++ v->cpt_starttime = cpt_timespec_export(&delta);
++ v->cpt_nvcsw = tsk->nvcsw;
++ v->cpt_nivcsw = tsk->nivcsw;
++ v->cpt_min_flt = tsk->min_flt;
++ v->cpt_maj_flt = tsk->maj_flt;
++
++#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,8)
++ v->cpt_cutime = tsk->cutime;
++ v->cpt_cstime = tsk->cstime;
++ v->cpt_cnvcsw = tsk->cnvcsw;
++ v->cpt_cnivcsw = tsk->cnivcsw;
++ v->cpt_cmin_flt = tsk->cmin_flt;
++ v->cpt_cmaj_flt = tsk->cmaj_flt;
++
++ if (RLIM_NLIMITS > CPT_RLIM_NLIMITS)
++ __asm__("undefined\n");
++
++ for (i=0; i<CPT_RLIM_NLIMITS; i++) {
++ if (i < RLIM_NLIMITS) {
++ v->cpt_rlim_cur[i] = tsk->rlim[i].rlim_cur;
++ v->cpt_rlim_max[i] = tsk->rlim[i].rlim_max;
++ } else {
++ v->cpt_rlim_cur[i] = CPT_NULL;
++ v->cpt_rlim_max[i] = CPT_NULL;
++ }
++ }
++#else
++ v->cpt_cutime = tsk->signal->cutime;
++ v->cpt_cstime = tsk->signal->cstime;
++ v->cpt_cnvcsw = tsk->signal->cnvcsw;
++ v->cpt_cnivcsw = tsk->signal->cnivcsw;
++ v->cpt_cmin_flt = tsk->signal->cmin_flt;
++ v->cpt_cmaj_flt = tsk->signal->cmaj_flt;
++
++ if (RLIM_NLIMITS > CPT_RLIM_NLIMITS)
++ __asm__("undefined\n");
++
++ for (i=0; i<CPT_RLIM_NLIMITS; i++) {
++ if (i < RLIM_NLIMITS) {
++ v->cpt_rlim_cur[i] = tsk->signal->rlim[i].rlim_cur;
++ v->cpt_rlim_max[i] = tsk->signal->rlim[i].rlim_max;
++ } else {
++ v->cpt_rlim_cur[i] = CPT_NULL;
++ v->cpt_rlim_max[i] = CPT_NULL;
++ }
++ }
++#endif
++
++ if (tsk->mm)
++ v->cpt_mm_ub = cpt_lookup_ubc(tsk->mm->mm_ub, ctx);
++ else
++ v->cpt_mm_ub = CPT_NULL;
++ v->cpt_task_ub = cpt_lookup_ubc(tsk->task_bc.task_ub, ctx);
++ v->cpt_exec_ub = cpt_lookup_ubc(tsk->task_bc.exec_ub, ctx);
++ v->cpt_fork_sub = cpt_lookup_ubc(tsk->task_bc.fork_sub, ctx);
++
++ v->cpt_ptrace_message = tsk->ptrace_message;
++ v->cpt_pn_state = tsk->pn_state;
++ v->cpt_stopped_state = tsk->stopped_state;
++ v->cpt_sigsuspend_state = 0;
++
++#ifndef CONFIG_X86_64
++ if (tsk->thread.vm86_info) {
++ eprintk_ctx("vm86 task is running\n");
++ cpt_release_buf(ctx);
++ return -EBUSY;
++ }
++#endif
++
++ v->cpt_sigpending = cpt_sigset_export(&tsk->pending.signal);
++
++ ctx->write(v, sizeof(*v), ctx);
++ cpt_release_buf(ctx);
++
++ cpt_push_object(&saved_obj, ctx);
++ dump_kstack(tsk, ctx);
++ cpt_pop_object(&saved_obj, ctx);
++
++ cpt_push_object(&saved_obj, ctx);
++ err = dump_registers(tsk, ctx);
++ cpt_pop_object(&saved_obj, ctx);
++ if (err)
++ return err;
++
++ if (tsk_used_math(tsk)) {
++ cpt_push_object(&saved_obj, ctx);
++ dump_fpustate(tsk, ctx);
++ cpt_pop_object(&saved_obj, ctx);
++ }
++
++ if (tsk->last_siginfo) {
++ struct cpt_siginfo_image si;
++ cpt_push_object(&saved_obj, ctx);
++
++ si.cpt_next = sizeof(si);
++ si.cpt_object = CPT_OBJ_LASTSIGINFO;
++ si.cpt_hdrlen = sizeof(si);
++ si.cpt_content = CPT_CONTENT_VOID;
++
++ if (encode_siginfo(&si, tsk->last_siginfo))
++ return -EINVAL;
++
++ ctx->write(&si, sizeof(si), ctx);
++ cpt_pop_object(&saved_obj, ctx);
++ }
++
++ if (tsk->sas_ss_size) {
++ struct cpt_sigaltstack_image si;
++ cpt_push_object(&saved_obj, ctx);
++
++ si.cpt_next = sizeof(si);
++ si.cpt_object = CPT_OBJ_SIGALTSTACK;
++ si.cpt_hdrlen = sizeof(si);
++ si.cpt_content = CPT_CONTENT_VOID;
++
++ si.cpt_stack = tsk->sas_ss_sp;
++ si.cpt_stacksize = tsk->sas_ss_size;
++
++ ctx->write(&si, sizeof(si), ctx);
++ cpt_pop_object(&saved_obj, ctx);
++ }
++
++ dump_sigqueue(&tsk->pending, ctx);
++
++ last_thread = 1;
++ read_lock(&tasklist_lock);
++ do {
++ task_t * next = next_thread(tsk);
++ if (next != tsk && !thread_group_leader(next))
++ last_thread = 0;
++ } while (0);
++ read_unlock(&tasklist_lock);
++
++ if (last_thread) {
++ task_t *prev_tsk;
++ int err;
++ loff_t pos = ctx->file->f_pos;
++
++ cpt_push_object(&saved_obj, ctx);
++ err = dump_one_signal_struct(tg_obj, ctx);
++ cpt_pop_object(&saved_obj, ctx);
++ if (err)
++ return err;
++
++ prev_tsk = tsk;
++ for (;;) {
++ if (prev_tsk->tgid == tsk->tgid) {
++ loff_t tg_pos;
++
++ tg_pos = obj->o_pos + offsetof(struct cpt_task_image, cpt_signal);
++ ctx->pwrite(&pos, sizeof(pos), ctx, tg_pos);
++ if (thread_group_leader(prev_tsk))
++ break;
++ }
++
++ if (obj->o_list.prev == &ctx->object_array[CPT_OBJ_TASK]) {
++ eprintk_ctx("bug: thread group leader is lost\n");
++ return -EINVAL;
++ }
++
++ obj = list_entry(obj->o_list.prev, cpt_object_t, o_list);
++ prev_tsk = obj->o_obj;
++ }
++ }
++
++ cpt_close_object(ctx);
++ return 0;
++}
++
++int cpt_dump_tasks(struct cpt_context *ctx)
++{
++ cpt_object_t *obj;
++
++ cpt_open_section(ctx, CPT_SECT_TASKS);
++
++ for_each_object(obj, CPT_OBJ_TASK) {
++ int err;
++
++ if ((err = dump_one_process(obj, ctx)) != 0)
++ return err;
++ }
++
++ cpt_close_section(ctx);
++ return 0;
++}
++
++int cpt_collect_signals(cpt_context_t *ctx)
++{
++ cpt_object_t *obj;
++
++ /* Collect process fd sets */
++ for_each_object(obj, CPT_OBJ_TASK) {
++ task_t *tsk = obj->o_obj;
++ if (tsk->signal && !list_empty(&tsk->signal->posix_timers)) {
++ eprintk_ctx("task %d/%d(%s) uses posix timers\n", tsk->pid, virt_pid(tsk), tsk->comm);
++ return -EBUSY;
++ }
++ if (tsk->signal && cpt_object_add(CPT_OBJ_SIGNAL_STRUCT, tsk->signal, ctx) == NULL)
++ return -ENOMEM;
++ if (tsk->sighand && cpt_object_add(CPT_OBJ_SIGHAND_STRUCT, tsk->sighand, ctx) == NULL)
++ return -ENOMEM;
++ }
++ return 0;
++}
++
++
++static int dump_one_sighand_struct(cpt_object_t *obj, struct cpt_context *ctx)
++{
++ struct sighand_struct *sig = obj->o_obj;
++ struct cpt_sighand_image *v = cpt_get_buf(ctx);
++ int i;
++
++ cpt_open_object(obj, ctx);
++
++ v->cpt_next = CPT_NULL;
++ v->cpt_object = CPT_OBJ_SIGHAND_STRUCT;
++ v->cpt_hdrlen = sizeof(*v);
++ v->cpt_content = CPT_CONTENT_ARRAY;
++
++ ctx->write(v, sizeof(*v), ctx);
++ cpt_release_buf(ctx);
++
++ for (i=0; i< _NSIG; i++) {
++ if (sig->action[i].sa.sa_handler != SIG_DFL) {
++ loff_t saved_obj;
++ struct cpt_sighandler_image *o = cpt_get_buf(ctx);
++
++ cpt_push_object(&saved_obj, ctx);
++ cpt_open_object(NULL, ctx);
++
++ o->cpt_next = CPT_NULL;
++ o->cpt_object = CPT_OBJ_SIGHANDLER;
++ o->cpt_hdrlen = sizeof(*o);
++ o->cpt_content = CPT_CONTENT_VOID;
++
++ o->cpt_signo = i;
++ o->cpt_handler = (unsigned long)sig->action[i].sa.sa_handler;
++ o->cpt_restorer = (unsigned long)sig->action[i].sa.sa_restorer;
++ o->cpt_flags = sig->action[i].sa.sa_flags;
++ memcpy(&o->cpt_mask, &sig->action[i].sa.sa_mask, 8);
++ ctx->write(o, sizeof(*o), ctx);
++ cpt_release_buf(ctx);
++ cpt_close_object(ctx);
++ cpt_pop_object(&saved_obj, ctx);
++ }
++ }
++
++ cpt_close_object(ctx);
++ return 0;
++}
++
++int cpt_dump_sighand(struct cpt_context *ctx)
++{
++ cpt_object_t *obj;
++
++ cpt_open_section(ctx, CPT_SECT_SIGHAND_STRUCT);
++
++ for_each_object(obj, CPT_OBJ_SIGHAND_STRUCT) {
++ int err;
++
++ if ((err = dump_one_sighand_struct(obj, ctx)) != 0)
++ return err;
++ }
++
++ cpt_close_section(ctx);
++ return 0;
++}
+diff -upr linux-2.6.16.orig/kernel/cpt/cpt_process.h linux-2.6.16-026test015/kernel/cpt/cpt_process.h
+--- linux-2.6.16.orig/kernel/cpt/cpt_process.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/cpt/cpt_process.h 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,10 @@
++int cpt_collect_signals(cpt_context_t *);
++int cpt_dump_signal(struct cpt_context *);
++int cpt_dump_sighand(struct cpt_context *);
++int cpt_dump_tasks(struct cpt_context *);
++
++int rst_signal_complete(struct cpt_task_image *ti, struct cpt_context *ctx);
++__u32 rst_signal_flag(struct cpt_task_image *ti, struct cpt_context *ctx);
++
++int rst_restore_process(struct cpt_context *ctx);
++int rst_process_linkage(struct cpt_context *ctx);
+diff -upr linux-2.6.16.orig/kernel/cpt/cpt_socket.c linux-2.6.16-026test015/kernel/cpt/cpt_socket.c
+--- linux-2.6.16.orig/kernel/cpt/cpt_socket.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/cpt/cpt_socket.c 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,779 @@
++/*
++ *
++ * kernel/cpt/cpt_socket.c
++ *
++ * Copyright (C) 2000-2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/config.h>
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/errno.h>
++#include <linux/fs.h>
++#include <linux/socket.h>
++#include <linux/un.h>
++#include <linux/tcp.h>
++#include <net/sock.h>
++#include <net/scm.h>
++#include <net/af_unix.h>
++#include <net/tcp.h>
++#include <net/netlink_sock.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_mm.h"
++#include "cpt_socket.h"
++#include "cpt_files.h"
++#include "cpt_kernel.h"
++
++static int dump_rqueue(int owner, struct sock *sk, struct cpt_context *ctx);
++
++
++/* Sockets are quite different from other kinds of files.
++ * There is one simplification: only one struct file can refer to a socket,
++ * so we could store information about a socket directly in section FILES as
++ * a description of a file and append e.g. an array of not-yet-accepted
++ * connections of a listening socket as auxiliary data.
++ *
++ * Complications are:
++ * 1. TCP sockets can be orphans. We have to relocate orphans as well,
++ *    so we have to create a special section for orphans.
++ * 2. AF_UNIX sockets are distinguished objects: the set of links between
++ *    AF_UNIX sockets is quite arbitrary.
++ *    A. Each socket can refer to many files due to FD passing.
++ *    B. Each socket, except for connected ones, can have queued skbs
++ *       sent by any other socket.
++ *
++ * 2A is relatively easy: after our tasks are frozen we make an additional
++ * recursive pass through the set of collected files and pick up references
++ * to FD-passed files. Once the recursion ends, all the files are treated
++ * in the same way and will all be stored in section FILES.
++ *
++ * 2B. We have to resolve all those references at some point.
++ * It is the place where the pipe-like approach to the image fails.
++ *
++ * All this makes socket checkpointing quite cumbersome.
++ * Right now we collect all the sockets and assign some numeric index value
++ * to each of them. The socket section is separate and put after section FILES,
++ * so section FILES refers to sockets by index, while section SOCKET refers to
++ * FILES as usual by position in the image. All the refs inside the socket
++ * section are by index. When restoring we read the socket section and create
++ * objects to hold the index <-> pos mappings. On the second pass we open the
++ * sockets (simultaneously with their pairs) and create FILE objects.
++ */
++
++
++/* ====== FD passing ====== */
++
++/* Almost nobody does FD passing via AF_UNIX sockets; nevertheless we
++ * have to implement this. A problem is that in the general case we receive
++ * skbs from an unknown context, so new files can arrive in the checkpointed
++ * set of processes even after they are stopped. We simply ignore unknown
++ * fds while doing the real checkpointing. This is fair because links
++ * outside the checkpointed set are going to fail anyway.
++ *
++ * ATTN: the procedure is recursive. We linearize the recursion by adding
++ * newly found files to the end of the file list, so they will be analyzed
++ * in the same loop.
++ */
++
++static int collect_one_passedfd(struct file *file, cpt_context_t * ctx)
++{
++ struct inode *inode = file->f_dentry->d_inode;
++ struct socket *sock;
++ struct sock *sk;
++ struct sk_buff *skb;
++
++ if (!S_ISSOCK(inode->i_mode))
++ return -ENOTSOCK;
++
++ sock = &container_of(inode, struct socket_alloc, vfs_inode)->socket;
++
++ if (sock->ops->family != AF_UNIX)
++ return 0;
++
++ sk = sock->sk;
++
++	/* Subtle locking issue. skbs cannot be removed while
++	 * we are scanning, because all the processes are stopped.
++	 * They can still be added to the tail of the queue. Locking while
++	 * we dereference skb->next is enough to resolve this.
++	 * See above about the collision with skbs added after we started
++	 * checkpointing.
++	 */
++
++ skb = skb_peek(&sk->sk_receive_queue);
++ while (skb && skb != (struct sk_buff*)&sk->sk_receive_queue) {
++ if (UNIXCB(skb).fp && skb->sk &&
++ (!sock_flag(skb->sk, SOCK_DEAD) || unix_peer(sk) == skb->sk)) {
++ struct scm_fp_list *fpl = UNIXCB(skb).fp;
++ int i;
++
++ for (i = fpl->count-1; i >= 0; i--) {
++ if (cpt_object_add(CPT_OBJ_FILE, fpl->fp[i], ctx) == NULL)
++ return -ENOMEM;
++ }
++ }
++
++ spin_lock_irq(&sk->sk_receive_queue.lock);
++ skb = skb->next;
++ spin_unlock_irq(&sk->sk_receive_queue.lock);
++ }
++
++ return 0;
++}
++
++int cpt_collect_passedfds(cpt_context_t * ctx)
++{
++ cpt_object_t *obj;
++
++ for_each_object(obj, CPT_OBJ_FILE) {
++ struct file *file = obj->o_obj;
++
++ if (S_ISSOCK(file->f_dentry->d_inode->i_mode)) {
++ int err;
++
++ if ((err = collect_one_passedfd(file, ctx)) < 0)
++ return err;
++ }
++ }
++
++ return 0;
++}
++
++/* ====== End of FD passing ====== */
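For context on what collect_one_passedfd() scans for: a descriptor travels between AF_UNIX sockets as SCM_RIGHTS ancillary data, so until the message is received it sits attached to an skb in the peer's receive queue. A minimal user-space sender illustrating that mechanism, using only the standard sockets API (nothing specific to this patch):

/* Minimal SCM_RIGHTS sender: the passed fd ends up queued on the peer's
 * receive queue, which is exactly where the collector above looks for it. */
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
	int sv[2], fd_to_pass = 0;		/* pass our stdin, just as a demo */
	char data = 'x', cbuf[CMSG_SPACE(sizeof(int))];
	struct iovec iov = { .iov_base = &data, .iov_len = 1 };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
	};
	struct cmsghdr *cmsg;

	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sv))
		return 1;

	memset(cbuf, 0, sizeof(cbuf));
	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;		/* ancillary payload is file descriptors */
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &fd_to_pass, sizeof(int));

	/* Until sv[1] is read from, the passed fd lives in an skb on its
	 * receive queue -- the situation collect_one_passedfd() handles. */
	if (sendmsg(sv[0], &msg, 0) != 1)
		perror("sendmsg");

	close(sv[0]);
	close(sv[1]);
	return 0;
}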
++
++/* Must be called under bh_lock_sock() */
++
++void clear_backlog(struct sock *sk)
++{
++ struct sk_buff *skb = sk->sk_backlog.head;
++
++ sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
++ while (skb) {
++ struct sk_buff *next = skb->next;
++
++ skb->next = NULL;
++ kfree_skb(skb);
++ skb = next;
++ }
++}
++
++void release_sock_nobacklog(struct sock *sk)
++{
++ spin_lock_bh(&(sk->sk_lock.slock));
++ clear_backlog(sk);
++ sk->sk_lock.owner = NULL;
++ if (waitqueue_active(&(sk->sk_lock.wq)))
++ wake_up(&(sk->sk_lock.wq));
++ spin_unlock_bh(&(sk->sk_lock.slock));
++}
++
++int cpt_dump_skb(int type, int owner, struct sk_buff *skb,
++ struct cpt_context *ctx)
++{
++ struct cpt_skb_image *v = cpt_get_buf(ctx);
++ loff_t saved_obj;
++ struct timeval tmptv;
++
++ cpt_push_object(&saved_obj, ctx);
++ cpt_open_object(NULL, ctx);
++
++ v->cpt_next = CPT_NULL;
++ v->cpt_object = CPT_OBJ_SKB;
++ v->cpt_hdrlen = sizeof(*v);
++ v->cpt_content = CPT_CONTENT_ARRAY;
++
++ v->cpt_owner = owner;
++ v->cpt_queue = type;
++ skb_get_timestamp(skb, &tmptv);
++ v->cpt_stamp = cpt_timeval_export(&tmptv);
++ v->cpt_hspace = skb->data - skb->head;
++ v->cpt_tspace = skb->end - skb->tail;
++ v->cpt_h = skb->h.raw - skb->head;
++ v->cpt_nh = skb->nh.raw - skb->head;
++ v->cpt_mac = skb->mac.raw - skb->head;
++ if (sizeof(skb->cb) < sizeof(v->cpt_cb)) BUG();
++ memcpy(v->cpt_cb, skb->cb, sizeof(v->cpt_cb));
++ if (sizeof(skb->cb) > sizeof(v->cpt_cb)) {
++ int i;
++ for (i=sizeof(v->cpt_cb); i<sizeof(skb->cb); i++) {
++ if (skb->cb[i]) {
++ wprintk_ctx("dirty skb cb");
++ break;
++ }
++ }
++ }
++ v->cpt_len = skb->len;
++ v->cpt_mac_len = skb->mac_len;
++ v->cpt_csum = skb->csum;
++ v->cpt_local_df = skb->local_df;
++ v->cpt_pkt_type = skb->pkt_type;
++ v->cpt_ip_summed = skb->ip_summed;
++ v->cpt_priority = skb->priority;
++ v->cpt_protocol = skb->protocol;
++ v->cpt_security = 0;
++ v->cpt_tso_segs = skb_shinfo(skb)->tso_segs;
++ v->cpt_tso_size = skb_shinfo(skb)->tso_size;
++ if (skb_shinfo(skb)->ufo_size) {
++ eprintk_ctx("skb ufo is not supported\n");
++ return -EINVAL;
++ }
++
++ ctx->write(v, sizeof(*v), ctx);
++ cpt_release_buf(ctx);
++
++ if (skb->len + (skb->data - skb->head) > 0) {
++ struct cpt_obj_bits ob;
++ loff_t saved_obj2;
++
++ cpt_push_object(&saved_obj2, ctx);
++ cpt_open_object(NULL, ctx);
++ ob.cpt_next = CPT_NULL;
++ ob.cpt_object = CPT_OBJ_BITS;
++ ob.cpt_hdrlen = sizeof(ob);
++ ob.cpt_content = CPT_CONTENT_DATA;
++ ob.cpt_size = skb->len + v->cpt_hspace;
++
++ ctx->write(&ob, sizeof(ob), ctx);
++
++ ctx->write(skb->head, (skb->data-skb->head) + (skb->len-skb->data_len), ctx);
++ if (skb->data_len) {
++ int offset = skb->len - skb->data_len;
++ while (offset < skb->len) {
++ int copy = skb->len - offset;
++ if (copy > PAGE_SIZE)
++ copy = PAGE_SIZE;
++ (void)cpt_get_buf(ctx);
++ if (skb_copy_bits(skb, offset, ctx->tmpbuf, copy))
++ BUG();
++ ctx->write(ctx->tmpbuf, copy, ctx);
++ __cpt_release_buf(ctx);
++ offset += copy;
++ }
++ }
++
++ ctx->align(ctx);
++ cpt_close_object(ctx);
++ cpt_pop_object(&saved_obj2, ctx);
++ }
++
++ if (skb->sk && skb->sk->sk_family == AF_UNIX) {
++ struct scm_fp_list *fpl = UNIXCB(skb).fp;
++
++ if (fpl) {
++ int i;
++
++ for (i = 0; i < fpl->count; i++) {
++ struct cpt_fd_image v;
++ cpt_object_t *obj;
++ loff_t saved_obj2;
++
++ obj = lookup_cpt_object(CPT_OBJ_FILE, fpl->fp[i], ctx);
++
++ if (!obj) {
++ eprintk_ctx("lost passed FD\n");
++ return -EINVAL;
++ }
++
++ cpt_push_object(&saved_obj2, ctx);
++ cpt_open_object(NULL, ctx);
++ v.cpt_next = CPT_NULL;
++ v.cpt_object = CPT_OBJ_FILEDESC;
++ v.cpt_hdrlen = sizeof(v);
++ v.cpt_content = CPT_CONTENT_VOID;
++
++ v.cpt_fd = i;
++ v.cpt_file = obj->o_pos;
++ v.cpt_flags = 0;
++ ctx->write(&v, sizeof(v), ctx);
++ cpt_close_object(ctx);
++ cpt_pop_object(&saved_obj2, ctx);
++ }
++ }
++ }
++
++ cpt_close_object(ctx);
++ cpt_pop_object(&saved_obj, ctx);
++ return 0;
++}
++
++static int dump_rqueue(int idx, struct sock *sk, struct cpt_context *ctx)
++{
++ struct sk_buff *skb;
++ struct sock *sk_cache = NULL;
++
++ skb = skb_peek(&sk->sk_receive_queue);
++ while (skb && skb != (struct sk_buff*)&sk->sk_receive_queue) {
++ int err;
++
++ if (sk->sk_family == AF_UNIX) {
++ cpt_object_t *obj;
++ if (skb->sk != sk_cache) {
++ idx = -1;
++ sk_cache = NULL;
++ obj = lookup_cpt_object(CPT_OBJ_SOCKET, skb->sk, ctx);
++ if (obj) {
++ idx = obj->o_index;
++ sk_cache = skb->sk;
++ } else if (unix_peer(sk) != skb->sk)
++ goto next_skb;
++ }
++ }
++
++ err = cpt_dump_skb(CPT_SKB_RQ, idx, skb, ctx);
++ if (err)
++ return err;
++
++next_skb:
++ spin_lock_irq(&sk->sk_receive_queue.lock);
++ skb = skb->next;
++ spin_unlock_irq(&sk->sk_receive_queue.lock);
++ }
++ return 0;
++}
++
++static int dump_wqueue(int idx, struct sock *sk, struct cpt_context *ctx)
++{
++ struct sk_buff *skb;
++
++ skb = skb_peek(&sk->sk_write_queue);
++ while (skb && skb != (struct sk_buff*)&sk->sk_write_queue) {
++ int err = cpt_dump_skb(CPT_SKB_WQ, idx, skb, ctx);
++ if (err)
++ return err;
++
++ spin_lock_irq(&sk->sk_write_queue.lock);
++ skb = skb->next;
++ spin_unlock_irq(&sk->sk_write_queue.lock);
++ }
++ return 0;
++}
++
++void cpt_dump_sock_attr(struct sock *sk, cpt_context_t *ctx)
++{
++ loff_t saved_obj;
++ if (sk->sk_filter) {
++ struct cpt_obj_bits v;
++
++ cpt_push_object(&saved_obj, ctx);
++ cpt_open_object(NULL, ctx);
++
++ v.cpt_next = CPT_NULL;
++ v.cpt_object = CPT_OBJ_SKFILTER;
++ v.cpt_hdrlen = sizeof(v);
++ v.cpt_content = CPT_CONTENT_DATA;
++ v.cpt_size = sk->sk_filter->len*sizeof(struct sock_filter);
++
++ ctx->write(&v, sizeof(v), ctx);
++ ctx->write(sk->sk_filter->insns, v.cpt_size, ctx);
++ cpt_close_object(ctx);
++ cpt_pop_object(&saved_obj, ctx);
++ }
++ if (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) {
++ cpt_push_object(&saved_obj, ctx);
++ cpt_dump_mcfilter(sk, ctx);
++ cpt_pop_object(&saved_obj, ctx);
++ }
++}
++
++/* Dump socket content */
++
++int cpt_dump_socket(cpt_object_t *obj, struct sock *sk, int index, int parent, struct cpt_context *ctx)
++{
++ struct cpt_sock_image *v = cpt_get_buf(ctx);
++ struct socket *sock;
++
++ cpt_open_object(obj, ctx);
++
++ v->cpt_next = CPT_NULL;
++ v->cpt_object = CPT_OBJ_SOCKET;
++ v->cpt_hdrlen = sizeof(*v);
++ v->cpt_content = CPT_CONTENT_ARRAY;
++
++ v->cpt_file = CPT_NULL;
++ sock = sk->sk_socket;
++ if (sock && sock->file) {
++ cpt_object_t *tobj;
++ tobj = lookup_cpt_object(CPT_OBJ_FILE, sock->file, ctx);
++ if (tobj)
++ v->cpt_file = tobj->o_pos;
++ }
++ v->cpt_index = index;
++ v->cpt_parent = parent;
++
++ if (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) {
++ if (sock && !obj->o_lock) {
++ lock_sock(sk);
++ obj->o_lock = 1;
++ }
++ }
++
++ /* Some bits stored in inode */
++ v->cpt_ssflags = sock ? sock->flags : 0;
++ v->cpt_sstate = sock ? sock->state : 0;
++ v->cpt_passcred = sock ? test_bit(SOCK_PASSCRED, &sock->flags) : 0;
++
++ /* Common data */
++ v->cpt_family = sk->sk_family;
++ v->cpt_type = sk->sk_type;
++ v->cpt_state = sk->sk_state;
++ v->cpt_reuse = sk->sk_reuse;
++ v->cpt_zapped = sock_flag(sk, SOCK_ZAPPED);
++ v->cpt_shutdown = sk->sk_shutdown;
++ v->cpt_userlocks = sk->sk_userlocks;
++ v->cpt_no_check = sk->sk_no_check;
++ v->cpt_zapped = sock_flag(sk, SOCK_DBG);
++ v->cpt_rcvtstamp = sock_flag(sk, SOCK_RCVTSTAMP);
++ v->cpt_localroute = sock_flag(sk, SOCK_LOCALROUTE);
++ v->cpt_protocol = sk->sk_protocol;
++ v->cpt_err = sk->sk_err;
++ v->cpt_err_soft = sk->sk_err_soft;
++ v->cpt_max_ack_backlog = sk->sk_max_ack_backlog;
++ v->cpt_priority = sk->sk_priority;
++ v->cpt_rcvlowat = sk->sk_rcvlowat;
++ v->cpt_rcvtimeo = CPT_NULL;
++ if (sk->sk_rcvtimeo != MAX_SCHEDULE_TIMEOUT)
++ v->cpt_rcvtimeo = sk->sk_rcvtimeo > INT_MAX ? INT_MAX : sk->sk_rcvtimeo;
++ v->cpt_sndtimeo = CPT_NULL;
++ if (sk->sk_sndtimeo != MAX_SCHEDULE_TIMEOUT)
++ v->cpt_sndtimeo = sk->sk_sndtimeo > INT_MAX ? INT_MAX : sk->sk_sndtimeo;
++ v->cpt_rcvbuf = sk->sk_rcvbuf;
++ v->cpt_sndbuf = sk->sk_sndbuf;
++ v->cpt_bound_dev_if = sk->sk_bound_dev_if;
++ v->cpt_flags = sk->sk_flags;
++ v->cpt_lingertime = CPT_NULL;
++ if (sk->sk_lingertime != MAX_SCHEDULE_TIMEOUT)
++ v->cpt_lingertime = sk->sk_lingertime > INT_MAX ? INT_MAX : sk->sk_lingertime;
++ v->cpt_peer_pid = sk->sk_peercred.pid;
++ v->cpt_peer_uid = sk->sk_peercred.uid;
++ v->cpt_peer_gid = sk->sk_peercred.gid;
++ v->cpt_stamp = cpt_timeval_export(&sk->sk_stamp);
++
++ v->cpt_peer = -1;
++ v->cpt_socketpair = 0;
++ v->cpt_deleted = 0;
++
++ v->cpt_laddrlen = 0;
++ if (sock) {
++ int alen = sizeof(v->cpt_laddr);
++ int err = sock->ops->getname(sock, (struct sockaddr*)&v->cpt_laddr, &alen, 0);
++ if (err) {
++ cpt_release_buf(ctx);
++ return err;
++ }
++ v->cpt_laddrlen = alen;
++ }
++ v->cpt_raddrlen = 0;
++ if (sock) {
++ int alen = sizeof(v->cpt_raddr);
++ int err = sock->ops->getname(sock, (struct sockaddr*)&v->cpt_raddr, &alen, 2);
++ if (!err)
++ v->cpt_raddrlen = alen;
++ }
++
++ if (sk->sk_family == AF_UNIX) {
++ if (unix_sk(sk)->dentry) {
++ struct dentry *d = unix_sk(sk)->dentry;
++ v->cpt_deleted = !IS_ROOT(d) && d_unhashed(d);
++ if (!v->cpt_deleted) {
++ int err = 0;
++ char *path;
++ unsigned long pg = __get_free_page(GFP_KERNEL);
++
++ if (!pg) {
++ cpt_release_buf(ctx);
++ return -ENOMEM;
++ }
++
++ path = d_path(d, unix_sk(sk)->mnt, (char *)pg, PAGE_SIZE);
++
++ if (!IS_ERR(path)) {
++ int len = strlen(path);
++ if (len < 126) {
++ strcpy(((char*)v->cpt_laddr)+2, path);
++ v->cpt_laddrlen = len + 2;
++ } else {
++ wprintk_ctx("af_unix path is too long: %s (%s)\n", path, ((char*)v->cpt_laddr)+2);
++ }
++ err = cpt_verify_overmount(path, d, unix_sk(sk)->mnt, ctx);
++ } else {
++ eprintk_ctx("cannot get path of an af_unix socket\n");
++ err = PTR_ERR(path);
++ }
++ free_page(pg);
++ if (err) {
++ cpt_release_buf(ctx);
++ return err;
++ }
++ }
++ }
++
++		/* If the socket is connected, find its peer. If the peer is not
++		 * in our table, the socket is connected to an external process
++		 * and we consider it disconnected.
++		 */
++ if (unix_peer(sk)) {
++ cpt_object_t *pobj;
++ pobj = lookup_cpt_object(CPT_OBJ_SOCKET, unix_peer(sk), ctx);
++ if (pobj)
++ v->cpt_peer = pobj->o_index;
++ else
++ v->cpt_shutdown = SHUTDOWN_MASK;
++
++ if (unix_peer(unix_peer(sk)) == sk)
++ v->cpt_socketpair = 1;
++ }
++
++		/* If the socket shares an address with another socket, it is
++		 * a child of some listening socket. Find and record it. */
++ if (unix_sk(sk)->addr &&
++ atomic_read(&unix_sk(sk)->addr->refcnt) > 1 &&
++ sk->sk_state != TCP_LISTEN) {
++ cpt_object_t *pobj;
++ for_each_object(pobj, CPT_OBJ_SOCKET) {
++ struct sock *psk = pobj->o_obj;
++ if (psk->sk_family == AF_UNIX &&
++ psk->sk_state == TCP_LISTEN &&
++ unix_sk(psk)->addr == unix_sk(sk)->addr) {
++ v->cpt_parent = pobj->o_index;
++ break;
++ }
++ }
++ }
++ }
++
++ if (sk->sk_family == AF_INET || sk->sk_family == AF_INET6)
++ cpt_dump_socket_in(v, sk, ctx);
++
++ ctx->write(v, sizeof(*v), ctx);
++ cpt_release_buf(ctx);
++
++ cpt_dump_sock_attr(sk, ctx);
++
++ dump_rqueue(index, sk, ctx);
++ if (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) {
++ dump_wqueue(index, sk, ctx);
++ cpt_dump_ofo_queue(index, sk, ctx);
++ }
++
++ if ((sk->sk_family == AF_INET || sk->sk_family == AF_INET6)
++ && sk->sk_state == TCP_LISTEN)
++ cpt_dump_synwait_queue(sk, index, ctx);
++
++ cpt_close_object(ctx);
++
++ if ((sk->sk_family == AF_INET || sk->sk_family == AF_INET6)
++ && sk->sk_state == TCP_LISTEN)
++ cpt_dump_accept_queue(sk, index, ctx);
++
++ return 0;
++}
++
++int cpt_dump_orphaned_sockets(struct cpt_context *ctx)
++{
++ int i;
++
++ cpt_open_section(ctx, CPT_SECT_ORPHANS);
++
++ for (i = 0; i < tcp_hashinfo.ehash_size; i++) {
++ struct sock *sk;
++ struct hlist_node *node;
++
++retry:
++ read_lock_bh(&tcp_hashinfo.ehash[i].lock);
++ sk_for_each(sk, node, &tcp_hashinfo.ehash[i].chain) {
++
++ if (VE_OWNER_SK(sk) != get_exec_env())
++ continue;
++ if (sk->sk_socket)
++ continue;
++ if (!sock_flag(sk, SOCK_DEAD))
++ continue;
++ if (lookup_cpt_object(CPT_OBJ_SOCKET, sk, ctx))
++ continue;
++ sock_hold(sk);
++ read_unlock_bh(&tcp_hashinfo.ehash[i].lock);
++
++ local_bh_disable();
++ bh_lock_sock(sk);
++ if (sock_owned_by_user(sk))
++ eprintk_ctx("BUG: sk locked by whom?\n");
++ sk->sk_lock.owner = (void *)1;
++ bh_unlock_sock(sk);
++ local_bh_enable();
++
++ cpt_dump_socket(NULL, sk, -1, -1, ctx);
++
++ local_bh_disable();
++ bh_lock_sock(sk);
++ sk->sk_lock.owner = NULL;
++ clear_backlog(sk);
++ tcp_done(sk);
++ bh_unlock_sock(sk);
++ local_bh_enable();
++ sock_put(sk);
++
++ goto retry;
++ }
++ read_unlock_bh(&tcp_hashinfo.ehash[i].lock);
++ }
++ cpt_close_section(ctx);
++ return 0;
++}
++
++static int can_dump(struct sock *sk, cpt_context_t *ctx)
++{
++ switch (sk->sk_family) {
++ case AF_NETLINK:
++ if (((struct netlink_sock *)sk)->cb) {
++ eprintk_ctx("netlink socket has active callback\n");
++ return 0;
++ }
++ break;
++ }
++ return 1;
++}
++
++/* We are not going to block suspend when we have external AF_UNIX connections.
++ * But we cannot stop the feed of new packets/connections to our environment
++ * from outside. Taking into account that this is intrinsically unreliable,
++ * we collect some amount of data, but when checkpointing/restoring we
++ * are going to drop everything that does not make sense: skbs sent
++ * by outside processes, connections from outside, etc.
++ */
++
++/* The first pass. When we see a socket referenced by a file, we just
++ * add it to the socket table. */
++int cpt_collect_socket(struct file *file, cpt_context_t * ctx)
++{
++ cpt_object_t *obj;
++ struct socket *sock;
++ struct sock *sk;
++
++ if (!S_ISSOCK(file->f_dentry->d_inode->i_mode))
++ return -ENOTSOCK;
++ sock = &container_of(file->f_dentry->d_inode, struct socket_alloc, vfs_inode)->socket;
++ sk = sock->sk;
++ if (!can_dump(sk, ctx))
++ return -EBUSY;
++ if ((obj = cpt_object_add(CPT_OBJ_SOCKET, sk, ctx)) == NULL)
++ return -ENOMEM;
++ obj->o_parent = file;
++
++ return 0;
++}
++
++/*
++ * We should end up with a table containing:
++ *  * all sockets opened by our processes;
++ *  * all sockets queued in the listening queues of _our_ listening sockets
++ *    that are connected to our opened sockets.
++ */
++
++static int collect_one_unix_listening_sock(cpt_object_t *obj, cpt_context_t * ctx)
++{
++ struct sock *sk = obj->o_obj;
++ cpt_object_t *cobj;
++ struct sk_buff *skb;
++
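++	/* Each skb queued on an AF_UNIX listening socket carries the
++	 * not-yet-accepted child socket in skb->sk; pick it up if its
++	 * peer is already in our table. */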
++ skb = skb_peek(&sk->sk_receive_queue);
++ while (skb && skb != (struct sk_buff*)&sk->sk_receive_queue) {
++ struct sock *lsk = skb->sk;
++ if (unix_peer(lsk) &&
++ lookup_cpt_object(CPT_OBJ_SOCKET, unix_peer(lsk), ctx)) {
++ if ((cobj = cpt_object_add(CPT_OBJ_SOCKET, lsk, ctx)) == NULL)
++ return -ENOMEM;
++ cobj->o_parent = obj->o_parent;
++ }
++ spin_lock_irq(&sk->sk_receive_queue.lock);
++ skb = skb->next;
++ spin_unlock_irq(&sk->sk_receive_queue.lock);
++ }
++
++ return 0;
++}
++
++int cpt_index_sockets(cpt_context_t * ctx)
++{
++ cpt_object_t *obj;
++ unsigned long index = 0;
++
++ /* Collect not-yet-accepted children of listening sockets. */
++ for_each_object(obj, CPT_OBJ_SOCKET) {
++ struct sock *sk = obj->o_obj;
++
++ if (sk->sk_state != TCP_LISTEN)
++ continue;
++
++ if (sk->sk_family == AF_UNIX)
++ collect_one_unix_listening_sock(obj, ctx);
++ }
++
++ /* Assign indices to all the sockets. */
++ for_each_object(obj, CPT_OBJ_SOCKET) {
++ struct sock *sk = obj->o_obj;
++ cpt_obj_setindex(obj, index++, ctx);
++
++ if (sk->sk_socket && sk->sk_socket->file) {
++ cpt_object_t *tobj;
++ tobj = lookup_cpt_object(CPT_OBJ_FILE, sk->sk_socket->file, ctx);
++ if (tobj)
++ cpt_obj_setindex(tobj, obj->o_index, ctx);
++ }
++ }
++
++ return 0;
++}
++
++void cpt_unlock_sockets(cpt_context_t * ctx)
++{
++ cpt_object_t *obj;
++
++ for_each_object(obj, CPT_OBJ_SOCKET) {
++ struct sock *sk = obj->o_obj;
++ if (sk && obj->o_lock) {
++ if (sk->sk_socket)
++ release_sock(sk);
++ }
++ }
++}
++
++void cpt_kill_sockets(cpt_context_t * ctx)
++{
++ cpt_object_t *obj;
++
++ for_each_object(obj, CPT_OBJ_SOCKET) {
++ struct sock *sk = obj->o_obj;
++ if (sk && obj->o_lock) {
++ cpt_kill_socket(sk, ctx);
++ if (sk->sk_socket)
++ release_sock_nobacklog(sk);
++ }
++ }
++}
++
++__u32 cpt_socket_fasync(struct file *file, struct cpt_context *ctx)
++{
++ struct fasync_struct *fa;
++ struct inode *inode = file->f_dentry->d_inode;
++ struct socket *sock;
++
++ sock = &container_of(inode, struct socket_alloc, vfs_inode)->socket;
++
++ for (fa = sock->fasync_list; fa; fa = fa->fa_next) {
++ if (fa->fa_file == file)
++ return fa->fa_fd;
++ }
++ return -1;
++}
+diff -upr linux-2.6.16.orig/kernel/cpt/cpt_socket.h linux-2.6.16-026test015/kernel/cpt/cpt_socket.h
+--- linux-2.6.16.orig/kernel/cpt/cpt_socket.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/cpt/cpt_socket.h 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,33 @@
++struct sock;
++
++int cpt_collect_passedfds(cpt_context_t *);
++int cpt_index_sockets(cpt_context_t *);
++int cpt_collect_socket(struct file *, cpt_context_t *);
++int cpt_dump_socket(cpt_object_t *obj, struct sock *sk, int index, int parent, struct cpt_context *ctx);
++int cpt_dump_accept_queue(struct sock *sk, int index, struct cpt_context *ctx);
++int cpt_dump_synwait_queue(struct sock *sk, int index, struct cpt_context *ctx);
++int rst_sockets(struct cpt_context *ctx);
++int rst_sockets_complete(struct cpt_context *ctx);
++int cpt_dump_orphaned_sockets(struct cpt_context *ctx);
++
++int rst_sock_attr(loff_t *pos_p, struct sock *sk, cpt_context_t *ctx);
++struct sk_buff * rst_skb(loff_t *pos_p, __u32 *owner, __u32 *queue, struct cpt_context *ctx);
++
++void cpt_unlock_sockets(cpt_context_t *);
++void cpt_kill_sockets(cpt_context_t *);
++
++
++int cpt_kill_socket(struct sock *, cpt_context_t *);
++int cpt_dump_socket_in(struct cpt_sock_image *, struct sock *, struct cpt_context*);
++int rst_socket_in(struct cpt_sock_image *si, loff_t pos, struct sock *, struct cpt_context *ctx);
++__u32 cpt_socket_fasync(struct file *file, struct cpt_context *ctx);
++int cpt_attach_accept(struct sock *lsk, struct sock *sk, cpt_context_t *);
++int rst_restore_synwait_queue(struct sock *sk, struct cpt_sock_image *si, loff_t pos, struct cpt_context *ctx);
++int cpt_dump_ofo_queue(int idx, struct sock *sk, struct cpt_context *ctx);
++int cpt_dump_skb(int type, int owner, struct sk_buff *skb, struct cpt_context *ctx);
++int cpt_dump_mcfilter(struct sock *sk, struct cpt_context *ctx);
++
++int rst_sk_mcfilter_in(struct sock *sk, struct cpt_sockmc_image *v,
++ loff_t pos, cpt_context_t *ctx);
++int rst_sk_mcfilter_in6(struct sock *sk, struct cpt_sockmc_image *v,
++ loff_t pos, cpt_context_t *ctx);
+diff -upr linux-2.6.16.orig/kernel/cpt/cpt_socket_in.c linux-2.6.16-026test015/kernel/cpt/cpt_socket_in.c
+--- linux-2.6.16.orig/kernel/cpt/cpt_socket_in.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/cpt/cpt_socket_in.c 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,443 @@
++/*
++ *
++ * kernel/cpt/cpt_socket_in.c
++ *
++ * Copyright (C) 2000-2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/config.h>
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/errno.h>
++#include <linux/fs.h>
++#include <linux/socket.h>
++#include <linux/tcp.h>
++#include <net/sock.h>
++#include <net/tcp.h>
++#include <linux/igmp.h>
++#include <linux/ipv6.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_mm.h"
++#include "cpt_socket.h"
++#include "cpt_kernel.h"
++
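++/* Timer expiries are exported as signed deltas relative to the current
++ * jiffies (or tcp_time_stamp), so the values stay meaningful when the
++ * image is restored under a different clock base. */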
++static inline __u32 jiffies_export(unsigned long tmo)
++{
++ __s32 delta = (long)(tmo - jiffies);
++ return delta;
++}
++
++static inline __u32 tcp_jiffies_export(__u32 tmo)
++{
++ __s32 delta = tmo - tcp_time_stamp;
++ return delta;
++}
++
++int cpt_dump_ofo_queue(int idx, struct sock *sk, struct cpt_context *ctx)
++{
++ struct sk_buff *skb;
++ struct tcp_sock *tp;
++
++ if (sk->sk_type != SOCK_STREAM || sk->sk_protocol != IPPROTO_TCP)
++ return 0;
++
++ tp = tcp_sk(sk);
++
++ skb = skb_peek(&tp->out_of_order_queue);
++ while (skb && skb != (struct sk_buff*)&tp->out_of_order_queue) {
++ int err;
++
++ err = cpt_dump_skb(CPT_SKB_OFOQ, idx, skb, ctx);
++ if (err)
++ return err;
++
++ spin_lock_irq(&tp->out_of_order_queue.lock);
++ skb = skb->next;
++ spin_unlock_irq(&tp->out_of_order_queue.lock);
++ }
++ return 0;
++}
++
++static int cpt_dump_socket_tcp(struct cpt_sock_image *si, struct sock *sk,
++ struct cpt_context *ctx)
++{
++ struct tcp_sock *tp = tcp_sk(sk);
++
++ si->cpt_pred_flags = tp->pred_flags;
++ si->cpt_rcv_nxt = tp->rcv_nxt;
++ si->cpt_snd_nxt = tp->snd_nxt;
++ si->cpt_snd_una = tp->snd_una;
++ si->cpt_snd_sml = tp->snd_sml;
++ si->cpt_rcv_tstamp = tcp_jiffies_export(tp->rcv_tstamp);
++ si->cpt_lsndtime = tcp_jiffies_export(tp->lsndtime);
++ si->cpt_tcp_header_len = tp->tcp_header_len;
++ si->cpt_ack_pending = inet_csk(sk)->icsk_ack.pending;
++ si->cpt_quick = inet_csk(sk)->icsk_ack.quick;
++ si->cpt_pingpong = inet_csk(sk)->icsk_ack.pingpong;
++ si->cpt_blocked = inet_csk(sk)->icsk_ack.blocked;
++ si->cpt_ato = inet_csk(sk)->icsk_ack.ato;
++ si->cpt_ack_timeout = jiffies_export(inet_csk(sk)->icsk_ack.timeout);
++ si->cpt_lrcvtime = tcp_jiffies_export(inet_csk(sk)->icsk_ack.lrcvtime);
++ si->cpt_last_seg_size = inet_csk(sk)->icsk_ack.last_seg_size;
++ si->cpt_rcv_mss = inet_csk(sk)->icsk_ack.rcv_mss;
++ si->cpt_snd_wl1 = tp->snd_wl1;
++ si->cpt_snd_wnd = tp->snd_wnd;
++ si->cpt_max_window = tp->max_window;
++ si->cpt_pmtu_cookie = inet_csk(sk)->icsk_pmtu_cookie;
++ si->cpt_mss_cache = tp->mss_cache;
++	si->cpt_mss_cache_std = tp->mss_cache; /* FIXME: was tp->mss_cache_std */
++ si->cpt_mss_clamp = tp->rx_opt.mss_clamp;
++ si->cpt_ext_header_len = inet_csk(sk)->icsk_ext_hdr_len;
++ si->cpt_ext2_header_len = 0;
++ si->cpt_ca_state = inet_csk(sk)->icsk_ca_state;
++ si->cpt_retransmits = inet_csk(sk)->icsk_retransmits;
++ si->cpt_reordering = tp->reordering;
++ si->cpt_frto_counter = tp->frto_counter;
++ si->cpt_frto_highmark = tp->frto_highmark;
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9)
++ // // si->cpt_adv_cong = tp->adv_cong;
++#endif
++ si->cpt_defer_accept = inet_csk(sk)->icsk_accept_queue.rskq_defer_accept;
++ si->cpt_backoff = inet_csk(sk)->icsk_backoff;
++ si->cpt_srtt = tp->srtt;
++ si->cpt_mdev = tp->mdev;
++ si->cpt_mdev_max = tp->mdev_max;
++ si->cpt_rttvar = tp->rttvar;
++ si->cpt_rtt_seq = tp->rtt_seq;
++ si->cpt_rto = inet_csk(sk)->icsk_rto;
++ si->cpt_packets_out = tp->packets_out;
++ si->cpt_left_out = tp->left_out;
++ si->cpt_retrans_out = tp->retrans_out;
++ si->cpt_lost_out = tp->lost_out;
++ si->cpt_sacked_out = tp->sacked_out;
++ si->cpt_fackets_out = tp->fackets_out;
++ si->cpt_snd_ssthresh = tp->snd_ssthresh;
++ si->cpt_snd_cwnd = tp->snd_cwnd;
++ si->cpt_snd_cwnd_cnt = tp->snd_cwnd_cnt;
++ si->cpt_snd_cwnd_clamp = tp->snd_cwnd_clamp;
++ si->cpt_snd_cwnd_used = tp->snd_cwnd_used;
++ si->cpt_snd_cwnd_stamp = tcp_jiffies_export(tp->snd_cwnd_stamp);
++ si->cpt_timeout = jiffies_export(inet_csk(sk)->icsk_timeout);
++ si->cpt_ka_timeout = 0;
++ si->cpt_rcv_wnd = tp->rcv_wnd;
++ si->cpt_rcv_wup = tp->rcv_wup;
++ si->cpt_write_seq = tp->write_seq;
++ si->cpt_pushed_seq = tp->pushed_seq;
++ si->cpt_copied_seq = tp->copied_seq;
++ si->cpt_tstamp_ok = tp->rx_opt.tstamp_ok;
++ si->cpt_wscale_ok = tp->rx_opt.wscale_ok;
++ si->cpt_sack_ok = tp->rx_opt.sack_ok;
++ si->cpt_saw_tstamp = tp->rx_opt.saw_tstamp;
++ si->cpt_snd_wscale = tp->rx_opt.snd_wscale;
++ si->cpt_rcv_wscale = tp->rx_opt.rcv_wscale;
++ si->cpt_nonagle = tp->nonagle;
++ si->cpt_keepalive_probes = tp->keepalive_probes;
++ si->cpt_rcv_tsval = tp->rx_opt.rcv_tsval;
++ si->cpt_rcv_tsecr = tp->rx_opt.rcv_tsecr;
++ si->cpt_ts_recent = tp->rx_opt.ts_recent;
++ si->cpt_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
++ si->cpt_user_mss = tp->rx_opt.user_mss;
++ si->cpt_dsack = tp->rx_opt.dsack;
++ si->cpt_eff_sacks = tp->rx_opt.eff_sacks;
++ si->cpt_sack_array[0] = tp->duplicate_sack[0].start_seq;
++ si->cpt_sack_array[1] = tp->duplicate_sack[0].end_seq;
++ si->cpt_sack_array[2] = tp->selective_acks[0].start_seq;
++ si->cpt_sack_array[3] = tp->selective_acks[0].end_seq;
++ si->cpt_sack_array[4] = tp->selective_acks[1].start_seq;
++ si->cpt_sack_array[5] = tp->selective_acks[1].end_seq;
++ si->cpt_sack_array[6] = tp->selective_acks[2].start_seq;
++ si->cpt_sack_array[7] = tp->selective_acks[2].end_seq;
++ si->cpt_sack_array[8] = tp->selective_acks[3].start_seq;
++ si->cpt_sack_array[9] = tp->selective_acks[3].end_seq;
++ si->cpt_window_clamp = tp->window_clamp;
++ si->cpt_rcv_ssthresh = tp->rcv_ssthresh;
++ si->cpt_probes_out = inet_csk(sk)->icsk_probes_out;
++ si->cpt_num_sacks = tp->rx_opt.num_sacks;
++ si->cpt_advmss = tp->advmss;
++ si->cpt_syn_retries = inet_csk(sk)->icsk_syn_retries;
++ si->cpt_ecn_flags = tp->ecn_flags;
++ si->cpt_prior_ssthresh = tp->prior_ssthresh;
++ si->cpt_high_seq = tp->high_seq;
++ si->cpt_retrans_stamp = tp->retrans_stamp;
++ si->cpt_undo_marker = tp->undo_marker;
++ si->cpt_undo_retrans = tp->undo_retrans;
++ si->cpt_urg_seq = tp->urg_seq;
++ si->cpt_urg_data = tp->urg_data;
++ si->cpt_pending = inet_csk(sk)->icsk_pending;
++ si->cpt_urg_mode = tp->urg_mode;
++ si->cpt_snd_up = tp->snd_up;
++ si->cpt_keepalive_time = tp->keepalive_time;
++ si->cpt_keepalive_intvl = tp->keepalive_intvl;
++ si->cpt_linger2 = tp->linger2;
++
++ if (sk->sk_state != TCP_LISTEN &&
++ sk->sk_state != TCP_CLOSE &&
++ sock_flag(sk, SOCK_KEEPOPEN)) {
++ si->cpt_ka_timeout = jiffies_export(sk->sk_timer.expires);
++ }
++
++#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
++ {
++ extern struct inet_connection_sock_af_ops ipv6_mapped;
++ if (sk->sk_family == AF_INET6 &&
++ inet_csk(sk)->icsk_af_ops == &ipv6_mapped)
++ si->cpt_mapped = 1;
++ }
++#endif
++
++ return 0;
++}
++
++
++int cpt_dump_socket_in(struct cpt_sock_image *si, struct sock *sk,
++ struct cpt_context *ctx)
++{
++ struct inet_sock *inet = inet_sk(sk);
++ struct ipv6_pinfo *np = inet6_sk(sk);
++
++ if (sk->sk_family == AF_INET) {
++ struct sockaddr_in *sin = ((struct sockaddr_in*)si->cpt_laddr);
++ sin->sin_family = AF_INET;
++ sin->sin_port = inet->sport;
++ sin->sin_addr.s_addr = inet->rcv_saddr;
++ si->cpt_laddrlen = sizeof(*sin);
++ } else if (sk->sk_family == AF_INET6) {
++ struct sockaddr_in6 *sin6 = ((struct sockaddr_in6*)si->cpt_laddr);
++ sin6->sin6_family = AF_INET6;
++ sin6->sin6_port = inet->sport;
++ memcpy(&sin6->sin6_addr, &np->rcv_saddr, 16);
++ si->cpt_laddrlen = sizeof(*sin6);
++ }
++ if (!inet->num)
++ si->cpt_laddrlen = 0;
++
++ si->cpt_daddr = inet->daddr;
++ si->cpt_dport = inet->dport;
++ si->cpt_saddr = inet->saddr;
++ si->cpt_rcv_saddr = inet->rcv_saddr;
++ si->cpt_sport = inet->sport;
++ si->cpt_uc_ttl = inet->uc_ttl;
++ si->cpt_tos = inet->tos;
++ si->cpt_cmsg_flags = inet->cmsg_flags;
++ si->cpt_mc_index = inet->mc_index;
++ si->cpt_mc_addr = inet->mc_addr;
++ si->cpt_hdrincl = inet->hdrincl;
++ si->cpt_mc_ttl = inet->mc_ttl;
++ si->cpt_mc_loop = inet->mc_loop;
++ si->cpt_pmtudisc = inet->pmtudisc;
++ si->cpt_recverr = inet->recverr;
++ si->cpt_freebind = inet->freebind;
++ si->cpt_idcounter = inet->id;
++
++ si->cpt_cork_flags = inet->cork.flags;
++ si->cpt_cork_fragsize = 0;
++ si->cpt_cork_length = inet->cork.length;
++ si->cpt_cork_addr = inet->cork.addr;
++ si->cpt_cork_saddr = inet->cork.fl.fl4_src;
++ si->cpt_cork_daddr = inet->cork.fl.fl4_dst;
++ si->cpt_cork_oif = inet->cork.fl.oif;
++ if (inet->cork.rt) {
++ si->cpt_cork_fragsize = inet->cork.fragsize;
++ si->cpt_cork_saddr = inet->cork.rt->fl.fl4_src;
++ si->cpt_cork_daddr = inet->cork.rt->fl.fl4_dst;
++ si->cpt_cork_oif = inet->cork.rt->fl.oif;
++ }
++
++ if (sk->sk_type == SOCK_DGRAM && sk->sk_protocol == IPPROTO_UDP) {
++ struct udp_sock *up = udp_sk(sk);
++ si->cpt_udp_pending = up->pending;
++ si->cpt_udp_corkflag = up->corkflag;
++ si->cpt_udp_encap = up->encap_type;
++ si->cpt_udp_len = up->len;
++ }
++
++ if (sk->sk_family == AF_INET6) {
++ memcpy(si->cpt_saddr6, &np->saddr, 16);
++ memcpy(si->cpt_rcv_saddr6, &np->rcv_saddr, 16);
++ memcpy(si->cpt_daddr6, &np->daddr, 16);
++ si->cpt_flow_label6 = np->flow_label;
++ si->cpt_frag_size6 = np->frag_size;
++ si->cpt_hop_limit6 = np->hop_limit;
++ si->cpt_mcast_hops6 = np->mcast_hops;
++ si->cpt_mcast_oif6 = np->mcast_oif;
++ si->cpt_rxopt6 = np->rxopt.all;
++ si->cpt_mc_loop6 = np->mc_loop;
++ si->cpt_recverr6 = np->recverr;
++ si->cpt_sndflow6 = np->sndflow;
++ si->cpt_pmtudisc6 = np->pmtudisc;
++ si->cpt_ipv6only6 = np->ipv6only;
++ si->cpt_mapped = 0;
++ }
++
++ if (sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP)
++ cpt_dump_socket_tcp(si, sk, ctx);
++
++ return 0;
++}
++
++int cpt_dump_accept_queue(struct sock *sk, int index, struct cpt_context *ctx)
++{
++ struct request_sock *req;
++
++ for (req=inet_csk(sk)->icsk_accept_queue.rskq_accept_head; req; req=req->dl_next)
++ cpt_dump_socket(NULL, req->sk, -1, index, ctx);
++ return 0;
++}
++
++
++static int dump_openreq(struct request_sock *req, struct sock *sk, int index,
++ struct cpt_context *ctx)
++{
++ struct cpt_openreq_image *v = cpt_get_buf(ctx);
++
++ cpt_open_object(NULL, ctx);
++
++ v->cpt_next = CPT_NULL;
++ v->cpt_object = CPT_OBJ_OPENREQ;
++ v->cpt_hdrlen = sizeof(*v);
++ v->cpt_content = CPT_CONTENT_VOID;
++
++ v->cpt_rcv_isn = tcp_rsk(req)->rcv_isn;
++ v->cpt_snt_isn = tcp_rsk(req)->snt_isn;
++ v->cpt_rmt_port = inet_rsk(req)->rmt_port;
++ v->cpt_mss = req->mss;
++ // // v->cpt_family = (req->class == &or_ipv4 ? AF_INET : AF_INET6);
++ v->cpt_retrans = req->retrans;
++ v->cpt_snd_wscale = inet_rsk(req)->snd_wscale;
++ v->cpt_rcv_wscale = inet_rsk(req)->rcv_wscale;
++ v->cpt_tstamp_ok = inet_rsk(req)->tstamp_ok;
++ v->cpt_sack_ok = inet_rsk(req)->sack_ok;
++ v->cpt_wscale_ok = inet_rsk(req)->wscale_ok;
++ v->cpt_ecn_ok = inet_rsk(req)->ecn_ok;
++ v->cpt_acked = inet_rsk(req)->acked;
++ v->cpt_window_clamp = req->window_clamp;
++ v->cpt_rcv_wnd = req->rcv_wnd;
++ v->cpt_ts_recent = req->ts_recent;
++ v->cpt_expires = jiffies_export(req->expires);
++
++ if (v->cpt_family == AF_INET) {
++ memcpy(v->cpt_loc_addr, &inet_rsk(req)->loc_addr, 4);
++ memcpy(v->cpt_rmt_addr, &inet_rsk(req)->rmt_addr, 4);
++ } else {
++#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
++ memcpy(v->cpt_loc_addr, &inet6_rsk(req)->loc_addr, 16);
++ memcpy(v->cpt_rmt_addr, &inet6_rsk(req)->rmt_addr, 16);
++ v->cpt_iif = inet6_rsk(req)->iif;
++#endif
++ }
++
++ ctx->write(v, sizeof(*v), ctx);
++ cpt_release_buf(ctx);
++
++ cpt_close_object(ctx);
++ return 0;
++}
++
++int cpt_dump_synwait_queue(struct sock *sk, int index, struct cpt_context *ctx)
++{
++ struct listen_sock *lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
++ struct request_sock *req;
++ int i;
++
++ for (i=0; i<TCP_SYNQ_HSIZE; i++) {
++ for (req=lopt->syn_table[i]; req; req=req->dl_next) {
++ loff_t saved_obj;
++ cpt_push_object(&saved_obj, ctx);
++ dump_openreq(req, sk, index, ctx);
++ cpt_pop_object(&saved_obj, ctx);
++ }
++ }
++ return 0;
++}
++
++
++int cpt_kill_socket(struct sock *sk, cpt_context_t * ctx)
++{
++ if (sk->sk_state != TCP_CLOSE &&
++ (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) &&
++ sk->sk_protocol == IPPROTO_TCP) {
++ if (sk->sk_state != TCP_LISTEN)
++ tcp_set_state(sk, TCP_CLOSE);
++ else
++ sk->sk_prot->disconnect(sk, 0);
++ }
++ return 0;
++}
++
++int cpt_dump_mcfilter(struct sock *sk, cpt_context_t *ctx)
++{
++ struct inet_sock *inet = inet_sk(sk);
++ struct ip_mc_socklist *iml;
++
++ for (iml = inet->mc_list; iml; iml = iml->next) {
++ struct cpt_sockmc_image smi;
++ int scnt = 0;
++ int i;
++
++ if (iml->sflist)
++ scnt = iml->sflist->sl_count*16;
++
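++		/* Each source-filter address is written as a 16-byte record;
++		 * IPv4 addresses are zero-padded below. */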
++ smi.cpt_next = sizeof(smi) + scnt;
++ smi.cpt_object = CPT_OBJ_SOCK_MCADDR;
++ smi.cpt_hdrlen = sizeof(smi);
++ smi.cpt_content = CPT_CONTENT_DATA;
++
++ smi.cpt_family = AF_INET;
++ smi.cpt_mode = iml->sfmode;
++ smi.cpt_ifindex = iml->multi.imr_ifindex;
++ memset(&smi.cpt_mcaddr, 0, sizeof(smi.cpt_mcaddr));
++ smi.cpt_mcaddr[0] = iml->multi.imr_multiaddr.s_addr;
++
++ ctx->write(&smi, sizeof(smi), ctx);
++
++ for (i = 0; i < scnt; i++) {
++ u32 addr[4];
++ memset(&addr, 0, sizeof(addr));
++ addr[0] = iml->sflist->sl_addr[i];
++ ctx->write(&addr, sizeof(addr), ctx);
++ }
++ }
++
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ if (sk->sk_family == AF_INET6) {
++ struct ipv6_mc_socklist *mcl;
++ struct ipv6_pinfo *np = inet6_sk(sk);
++
++ for (mcl = np->ipv6_mc_list; mcl; mcl = mcl->next) {
++ struct cpt_sockmc_image smi;
++ int scnt = 0;
++ int i;
++
++ if (mcl->sflist)
++ scnt = mcl->sflist->sl_count*16;
++
++ smi.cpt_next = sizeof(smi) + scnt;
++ smi.cpt_object = CPT_OBJ_SOCK_MCADDR;
++ smi.cpt_hdrlen = sizeof(smi);
++ smi.cpt_content = CPT_CONTENT_DATA;
++
++ smi.cpt_family = AF_INET6;
++ smi.cpt_mode = mcl->sfmode;
++ smi.cpt_ifindex = mcl->ifindex;
++ memcpy(&smi.cpt_mcaddr, &mcl->addr, sizeof(smi.cpt_mcaddr));
++
++ ctx->write(&smi, sizeof(smi), ctx);
++ for (i = 0; i < scnt; i++)
++ ctx->write(&mcl->sflist->sl_addr[i], 16, ctx);
++ }
++ }
++#endif
++ return 0;
++}
+diff -upr linux-2.6.16.orig/kernel/cpt/cpt_syscalls.h linux-2.6.16-026test015/kernel/cpt/cpt_syscalls.h
+--- linux-2.6.16.orig/kernel/cpt/cpt_syscalls.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/cpt/cpt_syscalls.h 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,95 @@
++#include <linux/unistd.h>
++#include <linux/syscalls.h>
++#include <asm/uaccess.h>
++
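++/* WRAP calls the syscall directly; WRAP2 temporarily switches to KERNEL_DS
++ * so that path arguments located in kernel memory pass the user-pointer
++ * checks inside the syscall. */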
++#define WRAP(c, args) return sys_##c args
++#define WRAP2(c, args) int err; mm_segment_t oldfs; \
++ oldfs = get_fs(); set_fs(KERNEL_DS); \
++ err = sys_##c args ;\
++ set_fs(oldfs); \
++ return err
++
++static inline int sc_close(int fd)
++{
++ WRAP(close, (fd));
++}
++
++static inline int sc_dup2(int fd1, int fd2)
++{
++ WRAP(dup2, (fd1, fd2));
++}
++
++static inline int sc_unlink(char *name)
++{
++ WRAP2(unlink, (name));
++}
++
++static inline int sc_pipe(int *pfd)
++{
++ return do_pipe(pfd);
++}
++
++static inline int sc_mknod(char *name, int mode, int dev)
++{
++ WRAP2(mknod, (name, mode, dev));
++}
++
++static inline int sc_chmod(char *name, int mode)
++{
++	WRAP2(chmod, (name, mode));
++}
++
++static inline int sc_chown(char *name, int uid, int gid)
++{
++ WRAP2(chown, (name, uid, gid));
++}
++
++static inline int sc_mkdir(char *name, int mode)
++{
++ WRAP2(mkdir, (name, mode));
++}
++
++static inline int sc_rmdir(char *name)
++{
++ WRAP2(rmdir, (name));
++}
++
++static inline int sc_mount(char *mntdev, char *mntpnt, char *type, unsigned long flags)
++{
++ WRAP2(mount, (mntdev ? : "none", mntpnt, type, flags, NULL));
++}
++
++static inline int sc_mprotect(unsigned long start, size_t len,
++ unsigned long prot)
++{
++ WRAP(mprotect, (start, len, prot));
++}
++
++static inline int sc_mlock(unsigned long start, size_t len)
++{
++ WRAP(mlock, (start, len));
++}
++
++static inline int sc_munlock(unsigned long start, size_t len)
++{
++ WRAP(munlock, (start, len));
++}
++
++static inline int sc_remap_file_pages(unsigned long start, size_t len,
++ unsigned long prot, unsigned long pgoff,
++ unsigned long flags)
++{
++ WRAP(remap_file_pages, (start, len, prot, pgoff, flags));
++}
++
++static inline int sc_waitx(int pid, int opt)
++{
++ WRAP(wait4, (pid, NULL, opt, NULL));
++}
++
++static inline int sc_flock(int fd, int flags)
++{
++ WRAP(flock, (fd, flags));
++}
++
++extern int sc_execve(char *cms, char **argv, char **env);
+diff -upr linux-2.6.16.orig/kernel/cpt/cpt_sysvipc.c linux-2.6.16-026test015/kernel/cpt/cpt_sysvipc.c
+--- linux-2.6.16.orig/kernel/cpt/cpt_sysvipc.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/cpt/cpt_sysvipc.c 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,317 @@
++/*
++ *
++ * kernel/cpt/cpt_sysvipc.c
++ *
++ * Copyright (C) 2000-2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/config.h>
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/errno.h>
++#include <linux/major.h>
++#include <linux/pipe_fs_i.h>
++#include <linux/mman.h>
++#include <linux/shm.h>
++#include <linux/sem.h>
++#include <linux/msg.h>
++#include <asm/uaccess.h>
++#include <linux/cpt_image.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_kernel.h"
++
++struct _warg {
++ struct file *file;
++ struct cpt_sysvshm_image *v;
++};
++
++static int dump_one_shm(struct shmid_kernel *shp, void *arg)
++{
++ struct _warg *warg = arg;
++ struct cpt_sysvshm_image *v = (struct cpt_sysvshm_image *)warg->v;
++
++ if (shp->shm_file != warg->file)
++ return 0;
++
++ v->cpt_key = shp->shm_perm.key;
++ v->cpt_uid = shp->shm_perm.uid;
++ v->cpt_gid = shp->shm_perm.gid;
++ v->cpt_cuid = shp->shm_perm.cuid;
++ v->cpt_cgid = shp->shm_perm.cgid;
++ v->cpt_mode = shp->shm_perm.mode;
++ v->cpt_seq = shp->shm_perm.seq;
++
++ v->cpt_id = shp->id;
++ v->cpt_segsz = shp->shm_segsz;
++ v->cpt_atime = shp->shm_atim;
++ v->cpt_ctime = shp->shm_ctim;
++ v->cpt_dtime = shp->shm_dtim;
++ v->cpt_creator = shp->shm_cprid;
++ v->cpt_last = shp->shm_lprid;
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9)
++ v->cpt_mlockuser = shp->mlock_user ? shp->mlock_user->uid : -1;
++#else
++ v->cpt_mlockuser = -1;
++#endif
++ return 1;
++}
++
++int cpt_dump_content_sysvshm(struct file *file, struct cpt_context *ctx)
++{
++ struct cpt_sysvshm_image *v = cpt_get_buf(ctx);
++ struct _warg warg;
++
++ v->cpt_next = sizeof(*v);
++ v->cpt_object = CPT_OBJ_SYSV_SHM;
++ v->cpt_hdrlen = sizeof(*v);
++ v->cpt_content = CPT_CONTENT_VOID;
++
++ warg.file = file;
++ warg.v = v;
++ if (sysvipc_walk_shm(dump_one_shm, &warg) == 0) {
++ cpt_release_buf(ctx);
++ return -ESRCH;
++ }
++
++ ctx->write(v, sizeof(*v), ctx);
++ cpt_release_buf(ctx);
++ return 0;
++}
++
++
++int match_sem(int id, struct sem_array *sema, void *arg)
++{
++ if (id != (unsigned long)arg)
++ return 0;
++ return sema->sem_nsems + 1;
++}
++
++static int get_sem_nsem(int id, cpt_context_t *ctx)
++{
++ int res;
++ res = sysvipc_walk_sem(match_sem, (void*)(unsigned long)id);
++ if (res > 0)
++ return res - 1;
++ eprintk_ctx("get_sem_nsem: SYSV semaphore %d not found\n", id);
++ return -ESRCH;
++}
++
++static int dump_one_semundo(struct sem_undo *su, struct cpt_context *ctx)
++{
++ struct cpt_sysvsem_undo_image v;
++ loff_t saved_obj;
++
++ cpt_open_object(NULL, ctx);
++
++ v.cpt_next = CPT_NULL;
++ v.cpt_object = CPT_OBJ_SYSVSEM_UNDO_REC;
++ v.cpt_hdrlen = sizeof(v);
++ v.cpt_content = CPT_CONTENT_SEMUNDO;
++ v.cpt_id = su->semid;
++ v.cpt_nsem = get_sem_nsem(su->semid, ctx);
++ if ((int)v.cpt_nsem < 0)
++ return -ESRCH;
++
++ ctx->write(&v, sizeof(v), ctx);
++
++ cpt_push_object(&saved_obj, ctx);
++ ctx->write(su->semadj, v.cpt_nsem*sizeof(short), ctx);
++ cpt_pop_object(&saved_obj, ctx);
++
++ cpt_close_object(ctx);
++ return 0;
++}
++
++struct sem_warg {
++ int last_id;
++ struct cpt_sysvsem_image *v;
++};
++
++static int dump_one_sem(int id, struct sem_array *sma, void *arg)
++{
++ struct sem_warg * warg = (struct sem_warg *)arg;
++ struct cpt_sysvsem_image *v = warg->v;
++ int i;
++
++ if (warg->last_id != -1) {
++ if ((id % IPCMNI) <= warg->last_id)
++ return 0;
++ }
++
++ v->cpt_next = sizeof(*v);
++ v->cpt_object = CPT_OBJ_SYSV_SEM;
++ v->cpt_hdrlen = sizeof(*v);
++ v->cpt_content = CPT_CONTENT_SEMARRAY;
++
++ v->cpt_key = sma->sem_perm.key;
++ v->cpt_uid = sma->sem_perm.uid;
++ v->cpt_gid = sma->sem_perm.gid;
++ v->cpt_cuid = sma->sem_perm.cuid;
++ v->cpt_cgid = sma->sem_perm.cgid;
++ v->cpt_mode = sma->sem_perm.mode;
++ v->cpt_seq = sma->sem_perm.seq;
++
++ v->cpt_id = id;
++ v->cpt_ctime = sma->sem_ctime;
++ v->cpt_otime = sma->sem_otime;
++
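++	/* Per-semaphore {semval, sempid} pairs are packed right after the
++	 * header in the same buffer; bail out before overflowing the page. */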
++ for (i=0; i<sma->sem_nsems; i++) {
++ struct {
++ __u32 semval;
++ __u32 sempid;
++ } *s = (void*)v + v->cpt_next;
++ if (v->cpt_next >= PAGE_SIZE - sizeof(*s))
++ return -EINVAL;
++ s->semval = sma->sem_base[i].semval;
++ s->sempid = sma->sem_base[i].sempid;
++ v->cpt_next += sizeof(*s);
++ }
++
++ warg->last_id = id % IPCMNI;
++ return 1;
++}
++
++
++int cpt_dump_sysvsem(struct cpt_context *ctx)
++{
++ cpt_object_t *obj;
++ struct sem_warg warg;
++
++	/* Dumping semaphores is quite tricky because we cannot
++	 * write to the dump file while holding the lock inside sysvipc_walk_sem().
++	 */
++ cpt_open_section(ctx, CPT_SECT_SYSV_SEM);
++ warg.last_id = -1;
++ warg.v = cpt_get_buf(ctx);
++ for (;;) {
++ if (sysvipc_walk_sem(dump_one_sem, &warg) <= 0)
++ break;
++ ctx->write(warg.v, warg.v->cpt_next, ctx);
++ }
++ cpt_release_buf(ctx);
++ cpt_close_section(ctx);
++
++ cpt_open_section(ctx, CPT_SECT_SYSVSEM_UNDO);
++ for_each_object(obj, CPT_OBJ_SYSVSEM_UNDO) {
++ struct sem_undo_list *semu = obj->o_obj;
++ struct sem_undo *su;
++ struct cpt_object_hdr v;
++ loff_t saved_obj;
++
++ cpt_open_object(obj, ctx);
++
++ v.cpt_next = CPT_NULL;
++ v.cpt_object = CPT_OBJ_SYSVSEM_UNDO;
++ v.cpt_hdrlen = sizeof(v);
++ v.cpt_content = CPT_CONTENT_ARRAY;
++
++ ctx->write(&v, sizeof(v), ctx);
++
++ cpt_push_object(&saved_obj, ctx);
++ for (su = semu->proc_list; su; su = su->proc_next) {
++ if (su->semid != -1) {
++ int err;
++ err = dump_one_semundo(su, ctx);
++ if (err < 0)
++ return err;
++ }
++ }
++ cpt_pop_object(&saved_obj, ctx);
++
++ cpt_close_object(ctx);
++ }
++ cpt_close_section(ctx);
++ return 0;
++}
++
++static int collect_one_msg(int id, struct msg_queue *msq, void *arg)
++{
++ int *retp = arg;
++ (*retp)++;
++ return 0;
++}
++
++int cpt_collect_sysvmsg(cpt_context_t * ctx)
++{
++ int ret = 0;
++ sysvipc_walk_msg(collect_one_msg, &ret);
++ if (ret) {
++ eprintk_ctx("SYSV msgqueues are not supported, found %d\n", ret);
++ return -EBUSY;
++ }
++ return 0;
++}
++
++static int cpt_collect_sysvsem_undo(cpt_context_t *ctx)
++{
++ cpt_object_t *obj;
++
++ for_each_object(obj, CPT_OBJ_TASK) {
++ task_t *tsk = obj->o_obj;
++ if (tsk->exit_state) {
++ /* ipc/sem.c forgets to clear tsk->sysvsem.undo_list
++ * on exit. Grrr... */
++ continue;
++ }
++ if (tsk->sysvsem.undo_list &&
++ cpt_object_add(CPT_OBJ_SYSVSEM_UNDO, tsk->sysvsem.undo_list, ctx) == NULL)
++ return -ENOMEM;
++ }
++
++ for_each_object(obj, CPT_OBJ_SYSVSEM_UNDO) {
++ struct sem_undo_list *semu = obj->o_obj;
++
++ if (atomic_read(&semu->refcnt) != obj->o_count) {
++ eprintk_ctx("sem_undo_list is referenced outside %d %d\n", obj->o_count, atomic_read(&semu->refcnt));
++ return -EBUSY;
++ }
++ }
++ return 0;
++}
++
++static int collect_one_shm(struct shmid_kernel *shp, void *arg)
++{
++ cpt_context_t *ctx = arg;
++
++ if (__cpt_object_add(CPT_OBJ_FILE, shp->shm_file, GFP_ATOMIC, ctx) == NULL)
++ return -ENOMEM;
++ return 0;
++}
++
++int cpt_collect_sysvshm(cpt_context_t * ctx)
++{
++ int err;
++
++ err = sysvipc_walk_shm(collect_one_shm, ctx);
++
++ return err < 0 ? err : 0;
++}
++
++int cpt_collect_sysv(cpt_context_t * ctx)
++{
++ int err;
++
++ err = cpt_collect_sysvsem_undo(ctx);
++ if (err)
++ return err;
++ err = cpt_collect_sysvmsg(ctx);
++ if (err)
++ return err;
++ err = cpt_collect_sysvshm(ctx);
++ if (err)
++ return err;
++
++ return 0;
++}
+diff -upr linux-2.6.16.orig/kernel/cpt/cpt_tty.c linux-2.6.16-026test015/kernel/cpt/cpt_tty.c
+--- linux-2.6.16.orig/kernel/cpt/cpt_tty.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/cpt/cpt_tty.c 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,216 @@
++/*
++ *
++ * kernel/cpt/cpt_tty.c
++ *
++ * Copyright (C) 2000-2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/config.h>
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/errno.h>
++#include <linux/major.h>
++#include <linux/tty.h>
++#include <asm/uaccess.h>
++#include <linux/cpt_image.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++
++/* We must support at least N_TTY. */
++
++int cpt_dump_content_tty(struct file *file, struct cpt_context *ctx)
++{
++ struct tty_struct *tty = file->private_data;
++ cpt_object_t *obj;
++ struct cpt_obj_ref o;
++ loff_t saved_pos;
++
++ obj = lookup_cpt_object(CPT_OBJ_TTY, tty, ctx);
++ if (!obj)
++ return -EINVAL;
++
++ cpt_push_object(&saved_pos, ctx);
++
++ o.cpt_next = sizeof(o);
++ o.cpt_object = CPT_OBJ_REF;
++ o.cpt_hdrlen = sizeof(o);
++ o.cpt_content = CPT_CONTENT_VOID;
++ o.cpt_pos = obj->o_pos;
++ ctx->write(&o, sizeof(o), ctx);
++
++ cpt_pop_object(&saved_pos, ctx);
++
++ return 0;
++}
++
++int cpt_collect_tty(struct file *file, cpt_context_t * ctx)
++{
++ struct tty_struct *tty = file->private_data;
++
++ if (tty) {
++ if (cpt_object_add(CPT_OBJ_TTY, tty, ctx) == NULL)
++ return -ENOMEM;
++ if (tty->link) {
++ cpt_object_t *obj;
++
++ obj = cpt_object_add(CPT_OBJ_TTY, tty->link, ctx);
++ if (obj == NULL)
++ return -ENOMEM;
++ /* Undo o_count, tty->link is not a reference */
++ obj->o_count--;
++ }
++ }
++ return 0;
++}
++
++int cpt_dump_tty(cpt_object_t *obj, struct cpt_context *ctx)
++{
++ struct tty_struct *tty = obj->o_obj;
++ struct cpt_tty_image *v;
++
++ if (tty->link) {
++ if (lookup_cpt_object(CPT_OBJ_TTY, tty->link, ctx) == NULL) {
++ eprintk_ctx("orphan pty %s %d\n", tty->name, tty->driver->subtype == PTY_TYPE_SLAVE);
++ return -EINVAL;
++ }
++ if (tty->link->link != tty) {
++ eprintk_ctx("bad pty pair\n");
++ return -EINVAL;
++ }
++ if (tty->driver->type == TTY_DRIVER_TYPE_PTY &&
++ tty->driver->subtype == PTY_TYPE_SLAVE &&
++ tty->link->count)
++ obj->o_count++;
++ }
++ if (obj->o_count != tty->count) {
++ eprintk_ctx("tty %s is referenced outside %d %d\n", tty->name, obj->o_count, tty->count);
++ return -EBUSY;
++ }
++
++ cpt_open_object(obj, ctx);
++
++ v = cpt_get_buf(ctx);
++ v->cpt_next = -1;
++ v->cpt_object = CPT_OBJ_TTY;
++ v->cpt_hdrlen = sizeof(*v);
++ v->cpt_content = CPT_CONTENT_ARRAY;
++
++ v->cpt_index = tty->index;
++ v->cpt_link = -1;
++ if (tty->link)
++ v->cpt_link = tty->link->index;
++ v->cpt_drv_type = tty->driver->type;
++ v->cpt_drv_subtype = tty->driver->subtype;
++ v->cpt_drv_flags = tty->driver->flags;
++ v->cpt_packet = tty->packet;
++ v->cpt_stopped = tty->stopped;
++ v->cpt_hw_stopped = tty->hw_stopped;
++ v->cpt_flow_stopped = tty->flow_stopped;
++ v->cpt_flags = tty->flags;
++ v->cpt_ctrl_status = tty->ctrl_status;
++ v->cpt_canon_data = tty->canon_data;
++ v->cpt_canon_head = tty->canon_head - tty->read_tail;
++ v->cpt_canon_column = tty->canon_column;
++ v->cpt_column = tty->column;
++ v->cpt_erasing = tty->erasing;
++ v->cpt_lnext = tty->lnext;
++ v->cpt_icanon = tty->icanon;
++ v->cpt_raw = tty->raw;
++ v->cpt_real_raw = tty->real_raw;
++ v->cpt_closing = tty->closing;
++ v->cpt_minimum_to_wake = tty->minimum_to_wake;
++ v->cpt_pgrp = 0;
++ if (tty->pgrp > 0) {
++ v->cpt_pgrp = _pid_type_to_vpid(PIDTYPE_PGID, tty->pgrp);
++ if ((int)v->cpt_pgrp < 0) {
++ dprintk_ctx("cannot map tty->pgrp %d -> %d\n", tty->pgrp, (int)v->cpt_pgrp);
++ v->cpt_pgrp = -1;
++ }
++ }
++ v->cpt_session = 0;
++ if (tty->session > 0) {
++ v->cpt_session = _pid_type_to_vpid(PIDTYPE_SID, tty->session);
++ if ((int)v->cpt_session < 0) {
++ eprintk_ctx("cannot map tty->session %d -> %d\n", tty->session, (int)v->cpt_session);
++ cpt_release_buf(ctx);
++ return -EINVAL;
++ }
++ }
++ memcpy(v->cpt_name, tty->name, 64);
++ v->cpt_ws_row = tty->winsize.ws_row;
++ v->cpt_ws_col = tty->winsize.ws_col;
++ v->cpt_ws_prow = tty->winsize.ws_ypixel;
++ v->cpt_ws_pcol = tty->winsize.ws_xpixel;
++ if (tty->termios == NULL) {
++		eprintk_ctx("NULL termios\n");
++ cpt_release_buf(ctx);
++ return -EINVAL;
++ }
++ v->cpt_c_line = tty->termios->c_line;
++ v->cpt_c_iflag = tty->termios->c_iflag;
++ v->cpt_c_oflag = tty->termios->c_oflag;
++ v->cpt_c_cflag = tty->termios->c_cflag;
++ v->cpt_c_lflag = tty->termios->c_lflag;
++ memcpy(v->cpt_c_cc, tty->termios->c_cc, NCCS);
++ if (NCCS < 32)
++ memset(v->cpt_c_cc + NCCS, 255, 32 - NCCS);
++ memcpy(v->cpt_read_flags, tty->read_flags, sizeof(v->cpt_read_flags));
++
++ ctx->write(v, sizeof(*v), ctx);
++ cpt_release_buf(ctx);
++
++ if (tty->read_buf && tty->read_cnt) {
++ struct cpt_obj_bits *v = cpt_get_buf(ctx);
++ loff_t saved_pos;
++
++ cpt_push_object(&saved_pos, ctx);
++ cpt_open_object(NULL, ctx);
++ v->cpt_next = CPT_NULL;
++ v->cpt_object = CPT_OBJ_BITS;
++ v->cpt_hdrlen = sizeof(*v);
++ v->cpt_content = CPT_CONTENT_DATA;
++ v->cpt_size = tty->read_cnt;
++ ctx->write(v, sizeof(*v), ctx);
++ cpt_release_buf(ctx);
++
++ if (tty->read_cnt) {
++ int n = min(tty->read_cnt, N_TTY_BUF_SIZE - tty->read_tail);
++ ctx->write(tty->read_buf + tty->read_tail, n, ctx);
++ if (tty->read_cnt > n)
++ ctx->write(tty->read_buf, tty->read_cnt-n, ctx);
++ ctx->align(ctx);
++ }
++
++ cpt_close_object(ctx);
++ cpt_pop_object(&saved_pos, ctx);
++ }
++
++ cpt_close_object(ctx);
++
++ return 0;
++}
++
++__u32 cpt_tty_fasync(struct file *file, struct cpt_context *ctx)
++{
++ struct tty_struct * tty;
++ struct fasync_struct *fa;
++
++ tty = (struct tty_struct *)file->private_data;
++
++ for (fa = tty->fasync; fa; fa = fa->fa_next) {
++ if (fa->fa_file == file)
++ return fa->fa_fd;
++ }
++ return -1;
++}
+diff -upr linux-2.6.16.orig/kernel/cpt/cpt_ubc.c linux-2.6.16-026test015/kernel/cpt/cpt_ubc.c
+--- linux-2.6.16.orig/kernel/cpt/cpt_ubc.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/cpt/cpt_ubc.c 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,132 @@
++/*
++ *
++ * kernel/cpt/cpt_ubc.c
++ *
++ * Copyright (C) 2000-2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/types.h>
++#include <ub/beancounter.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++
++cpt_object_t *cpt_add_ubc(struct user_beancounter *bc, struct cpt_context *ctx)
++{
++ cpt_object_t *obj;
++
++ obj = cpt_object_add(CPT_OBJ_UBC, bc, ctx);
++ if (obj != NULL) {
++ if (obj->o_count == 1)
++ get_beancounter(bc);
++ if (bc->parent != NULL && obj->o_parent == NULL)
++ obj->o_parent = cpt_add_ubc(bc->parent, ctx);
++ }
++ return obj;
++}
++
++__u64 cpt_lookup_ubc(struct user_beancounter *bc, struct cpt_context *ctx)
++{
++ cpt_object_t *obj;
++
++ obj = lookup_cpt_object(CPT_OBJ_UBC, bc, ctx);
++ if (obj == NULL) {
++ char buf[48];
++ print_ub_uid(bc, buf, sizeof(buf));
++ printk(KERN_ERR "CPT: unknown ub %s (%p)\n", buf, bc);
++ dump_stack();
++ return CPT_NULL;
++ }
++ return obj->o_pos;
++}
++
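++/* A barrier or limit equal to UB_MAXVALUE (unlimited) is recorded as
++ * CPT_NULL; held values are dumped only when requested. */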
++static void dump_one_bc_parm(__u64 *dmp, struct ubparm *prm, int held)
++{
++ dmp[0] = (prm->barrier < UB_MAXVALUE ? prm->barrier : CPT_NULL);
++ dmp[1] = (prm->limit < UB_MAXVALUE ? prm->limit : CPT_NULL);
++ dmp[2] = (held ? prm->held : CPT_NULL);
++ dmp[3] = prm->maxheld;
++ dmp[4] = prm->minheld;
++ dmp[5] = prm->failcnt;
++}
++
++static int dump_one_bc(cpt_object_t *obj, struct cpt_context *ctx)
++{
++ struct user_beancounter *bc;
++ struct cpt_beancounter_image *v;
++ int i;
++
++ bc = obj->o_obj;
++ v = cpt_get_buf(ctx);
++
++ v->cpt_next = CPT_NULL;
++ v->cpt_object = CPT_OBJ_UBC;
++ v->cpt_hdrlen = sizeof(*v);
++ v->cpt_content = CPT_CONTENT_VOID;
++
++ if (obj->o_parent != NULL)
++ v->cpt_parent = ((cpt_object_t *)obj->o_parent)->o_pos;
++ else
++ v->cpt_parent = CPT_NULL;
++ v->cpt_id = (obj->o_parent != NULL) ? bc->ub_uid : 0;
++	for (i = 0; i < UB_RESOURCES; i++)
++		dump_one_bc_parm(v->cpt_parms + i * 6, bc->ub_parms + i, 0);
++	for (i = 0; i < UB_RESOURCES; i++)
++		dump_one_bc_parm(v->cpt_parms + (UB_RESOURCES + i) * 6,
++				bc->ub_store + i, 1);
++ memset(v->cpt_parms + UB_RESOURCES * 12, 0,
++ sizeof(v->cpt_parms)
++ - UB_RESOURCES * 12 * sizeof(v->cpt_parms[0]));
++
++ cpt_open_object(obj, ctx);
++ ctx->write(v, sizeof(*v), ctx);
++ cpt_close_object(ctx);
++
++ cpt_release_buf(ctx);
++ return 0;
++}
++
++int cpt_dump_ubc(struct cpt_context *ctx)
++{
++ cpt_object_t *obj;
++ int skipped;
++ int top;
++
++ cpt_open_section(ctx, CPT_SECT_UBC);
++
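++	/* Dump beancounters parents-first: a child whose parent has not been
++	 * written yet (parent o_pos == CPT_NULL) is skipped and retried on
++	 * the next pass. */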
++ do {
++ skipped = 0;
++ top = 0;
++ for_each_object(obj, CPT_OBJ_UBC) {
++ if (obj->o_parent == NULL)
++ top++;
++ if (obj->o_pos != CPT_NULL)
++ continue;
++ if (obj->o_parent != NULL &&
++ ((cpt_object_t *)obj->o_parent)->o_pos == CPT_NULL)
++ skipped++;
++ else
++ dump_one_bc(obj, ctx);
++ }
++ } while (skipped && (top < 2));
++
++ cpt_close_section(ctx);
++ if (top > 1) {
++		eprintk_ctx("More than one top-level ub exists\n");
++ return -EINVAL;
++ }
++
++ return 0;
++}
++
++void cpt_finish_ubc(struct cpt_context *ctx)
++{
++ cpt_object_t *obj;
++
++ for_each_object(obj, CPT_OBJ_UBC)
++ put_beancounter(obj->o_obj);
++}
+diff -upr linux-2.6.16.orig/kernel/cpt/cpt_ubc.h linux-2.6.16-026test015/kernel/cpt/cpt_ubc.h
+--- linux-2.6.16.orig/kernel/cpt/cpt_ubc.h 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/cpt/cpt_ubc.h 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,9 @@
++cpt_object_t *cpt_add_ubc(struct user_beancounter *bc, struct cpt_context *ctx);
++__u64 cpt_lookup_ubc(struct user_beancounter *bc, struct cpt_context *ctx);
++int cpt_dump_ubc(struct cpt_context *ctx);
++
++struct user_beancounter *rst_lookup_ubc(__u64 pos, struct cpt_context *ctx);
++int rst_undump_ubc(struct cpt_context *ctx);
++
++void cpt_finish_ubc(struct cpt_context *ctx);
++void rst_finish_ubc(struct cpt_context *ctx);
+diff -upr linux-2.6.16.orig/kernel/cpt/cpt_x8664.S linux-2.6.16-026test015/kernel/cpt/cpt_x8664.S
+--- linux-2.6.16.orig/kernel/cpt/cpt_x8664.S 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/cpt/cpt_x8664.S 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,61 @@
++#define ASSEMBLY 1
++#include <linux/config.h>
++#include <linux/linkage.h>
++#include <asm/segment.h>
++#include <asm/smp.h>
++#include <asm/cache.h>
++#include <asm/errno.h>
++#include <asm/dwarf2.h>
++#include <asm/calling.h>
++#include <asm/msr.h>
++#include <asm/unistd.h>
++#include <asm/thread_info.h>
++#include <asm/hw_irq.h>
++#include <asm/errno.h>
++
++ .code64
++
++ .macro FAKE_STACK_FRAME child_rip
++ /* push in order ss, rsp, eflags, cs, rip */
++ xorq %rax, %rax
++ pushq %rax /* ss */
++ pushq %rax /* rsp */
++ pushq $(1<<9) /* eflags - interrupts on */
++ pushq $__KERNEL_CS /* cs */
++ pushq \child_rip /* rip */
++ pushq %rax /* orig rax */
++ .endm
++
++ .macro UNFAKE_STACK_FRAME
++ addq $8*6, %rsp
++ .endm
++
++ENTRY(asm_kernel_thread)
++ FAKE_STACK_FRAME $child_rip
++ SAVE_ALL
++
++ # rdi: flags, rsi: usp, rdx: will be &pt_regs
++ movq %rdx,%rdi
++ orq $0x00800000,%rdi
++ movq $-1, %rsi
++ movq %rsp, %rdx
++
++ xorl %r8d,%r8d
++ xorl %r9d,%r9d
++ pushq %rcx
++ call do_fork_pid
++ addq $8, %rsp
++ /* call do_fork */
++ movq %rax,RAX(%rsp)
++ xorl %edi,%edi
++ RESTORE_ALL
++ UNFAKE_STACK_FRAME
++ ret
++
++child_rip:
++ movq %rdi, %rax
++ movq %rsi, %rdi
++ call *%rax
++ xorq %rdi, %rdi
++ xorq %rsi, %rsi
++ call complete_and_exit
+diff -upr linux-2.6.16.orig/kernel/cpt/rst_conntrack.c linux-2.6.16-026test015/kernel/cpt/rst_conntrack.c
+--- linux-2.6.16.orig/kernel/cpt/rst_conntrack.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/cpt/rst_conntrack.c 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,294 @@
++/*
++ *
++ * kernel/cpt/rst_conntrack.c
++ *
++ * Copyright (C) 2000-2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/config.h>
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/errno.h>
++#include <linux/fs.h>
++#include <linux/socket.h>
++#include <linux/netdevice.h>
++#include <linux/inetdevice.h>
++#include <linux/rtnetlink.h>
++#include <linux/unistd.h>
++#include <linux/ve.h>
++#include <linux/vzcalluser.h>
++#include <linux/cpt_image.h>
++#include <linux/icmp.h>
++#include <linux/ip.h>
++
++#if defined(CONFIG_VE_IPTABLES) && \
++ (defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE))
++
++#include <linux/netfilter.h>
++#include <linux/netfilter_ipv4/ip_conntrack.h>
++#include <linux/netfilter_ipv4/ip_nat.h>
++#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
++#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
++#include <linux/netfilter_ipv4/ip_conntrack_core.h>
++#include <linux/netfilter_ipv4/ip_nat_helper.h>
++#include <linux/netfilter_ipv4/ip_nat_core.h>
++
++#define ASSERT_READ_LOCK(x) do { } while (0)
++#define ASSERT_WRITE_LOCK(x) do { } while (0)
++
++#include <linux/netfilter_ipv4/listhelp.h>
++
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++
++struct ct_holder
++{
++ struct ct_holder *next;
++ struct ip_conntrack *ct;
++ int index;
++};
++
++static void decode_tuple(struct cpt_ipct_tuple *v, struct ip_conntrack_tuple *tuple, int dir)
++{
++ tuple->dst.ip = v->cpt_dst;
++ tuple->dst.u.all = v->cpt_dstport;
++ tuple->dst.protonum = v->cpt_protonum;
++ tuple->dst.dir = v->cpt_dir;
++ if (dir != tuple->dst.dir)
++ wprintk("dir != tuple->dst.dir\n");
++
++ tuple->src.ip = v->cpt_src;
++ tuple->src.u.all = v->cpt_srcport;
++}
++
++
++static int undump_expect_list(struct ip_conntrack *ct,
++ struct cpt_ip_conntrack_image *ci,
++ loff_t pos, struct ct_holder *ct_list,
++ cpt_context_t *ctx)
++{
++ loff_t end;
++ int err;
++
++ end = pos + ci->cpt_next;
++ pos += ci->cpt_hdrlen;
++ while (pos < end) {
++ struct cpt_ip_connexpect_image v;
++ struct ip_conntrack_expect *exp;
++ struct ip_conntrack *sibling;
++
++ err = rst_get_object(CPT_OBJ_NET_CONNTRACK_EXPECT, pos, &v, ctx);
++ if (err)
++ return err;
++
++ sibling = NULL;
++ if (v.cpt_sibling_conntrack) {
++ struct ct_holder *c;
++
++ for (c = ct_list; c; c = c->next) {
++ if (c->index == v.cpt_sibling_conntrack) {
++ sibling = c->ct;
++ break;
++ }
++ }
++ if (!sibling) {
++ eprintk_ctx("lost sibling of expectation\n");
++ return -EINVAL;
++ }
++ }
++
++ write_lock_bh(&ip_conntrack_lock);
++
++		/* This can happen: the helper module may have just been
++		 * unregistered; if the expectation were still on the list,
++		 * it would have been destroyed. */
++ if (ct->helper == NULL) {
++ write_unlock_bh(&ip_conntrack_lock);
++ dprintk_ctx("conntrack: no helper and non-trivial expectation\n");
++ continue;
++ }
++
++ exp = ip_conntrack_expect_alloc(NULL);
++ if (exp == NULL) {
++ write_unlock_bh(&ip_conntrack_lock);
++ return -ENOMEM;
++ }
++
++ if (ct->helper->timeout && !del_timer(&exp->timeout)) {
++ /* Dying already. We can do nothing. */
++ write_unlock_bh(&ip_conntrack_lock);
++ dprintk_ctx("conntrack expectation is dying\n");
++ continue;
++ }
++
++ decode_tuple(&v.cpt_tuple, &exp->tuple, 0);
++ decode_tuple(&v.cpt_mask, &exp->mask, 0);
++
++ exp->master = ct;
++ nf_conntrack_get(&ct->ct_general);
++ ip_conntrack_expect_insert(exp);
++#if 0
++ if (sibling) {
++ exp->sibling = sibling;
++ sibling->master = exp;
++ LIST_DELETE(&ve_ip_conntrack_expect_list, exp);
++ ct->expecting--;
++ nf_conntrack_get(&master_ct(sibling)->infos[0]);
++ } else
++#endif
++ if (ct->helper->timeout) {
++ exp->timeout.expires = jiffies + v.cpt_timeout;
++ add_timer(&exp->timeout);
++ }
++ write_unlock_bh(&ip_conntrack_lock);
++
++ pos += v.cpt_next;
++ }
++ return 0;
++}
++
++static int undump_one_ct(struct cpt_ip_conntrack_image *ci, loff_t pos,
++ struct ct_holder **ct_list, cpt_context_t *ctx)
++{
++ int err = 0;
++ struct ip_conntrack *conntrack;
++ struct ct_holder *c;
++ struct ip_conntrack_tuple orig, repl;
++
++ c = kmalloc(sizeof(struct ct_holder), GFP_KERNEL);
++ if (c == NULL)
++ return -ENOMEM;
++
++ decode_tuple(&ci->cpt_tuple[0], &orig, 0);
++ decode_tuple(&ci->cpt_tuple[1], &repl, 1);
++
++ conntrack = ip_conntrack_alloc(&orig, &repl, get_exec_env()->_ip_conntrack->ub);
++ if (!conntrack || IS_ERR(conntrack)) {
++ kfree(c);
++ return -ENOMEM;
++ }
++
++ c->ct = conntrack;
++ c->next = *ct_list;
++ *ct_list = c;
++ c->index = ci->cpt_index;
++
++ decode_tuple(&ci->cpt_tuple[0], &conntrack->tuplehash[0].tuple, 0);
++ decode_tuple(&ci->cpt_tuple[1], &conntrack->tuplehash[1].tuple, 1);
++
++ conntrack->status = ci->cpt_status;
++
++ memcpy(&conntrack->proto, ci->cpt_proto_data, sizeof(conntrack->proto));
++ memcpy(&conntrack->help, ci->cpt_help_data, sizeof(conntrack->help));
++
++#ifdef CONFIG_IP_NF_NAT_NEEDED
++#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
++ defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
++ conntrack->nat.masq_index = ci->cpt_masq_index;
++#endif
++ if (ci->cpt_initialized) {
++ conntrack->nat.info.seq[0].correction_pos = ci->cpt_nat_seq[0].cpt_correction_pos;
++ conntrack->nat.info.seq[0].offset_before = ci->cpt_nat_seq[0].cpt_offset_before;
++ conntrack->nat.info.seq[0].offset_after = ci->cpt_nat_seq[0].cpt_offset_after;
++ conntrack->nat.info.seq[1].correction_pos = ci->cpt_nat_seq[1].cpt_correction_pos;
++ conntrack->nat.info.seq[1].offset_before = ci->cpt_nat_seq[1].cpt_offset_before;
++ conntrack->nat.info.seq[1].offset_after = ci->cpt_nat_seq[1].cpt_offset_after;
++ }
++ if (conntrack->status & IPS_NAT_DONE_MASK)
++ ip_nat_hash_conntrack(conntrack);
++#endif
++
++ write_lock_bh(&ip_conntrack_lock);
++
++ if (ci->cpt_ct_helper) {
++ conntrack->helper = ip_conntrack_helper_find_get(&conntrack->tuplehash[1].tuple);
++ if (conntrack->helper == NULL) {
++ eprintk_ctx("conntrack: cannot find helper, some module is not loaded\n");
++ err = -EINVAL;
++ }
++ }
++
++ ip_conntrack_hash_insert(conntrack);
++ conntrack->timeout.expires = jiffies + ci->cpt_timeout;
++
++ write_unlock_bh(&ip_conntrack_lock);
++
++ if (err == 0 && ci->cpt_next > ci->cpt_hdrlen)
++ err = undump_expect_list(conntrack, ci, pos, *ct_list, ctx);
++
++ return err;
++}
++
++int rst_restore_ip_conntrack(struct cpt_context * ctx)
++{
++ int err = 0;
++ loff_t sec = ctx->sections[CPT_SECT_NET_CONNTRACK];
++ loff_t endsec;
++ struct cpt_section_hdr h;
++ struct cpt_ip_conntrack_image ci;
++ struct ct_holder *c;
++ struct ct_holder *ct_list = NULL;
++
++ if (sec == CPT_NULL)
++ return 0;
++
++ if (sizeof(ci.cpt_proto_data) != sizeof(union ip_conntrack_proto)) {
++ eprintk_ctx("conntrack module ct->proto version mismatch\n");
++ return -EINVAL;
++ }
++ if (sizeof(ci.cpt_help_data) != sizeof(union ip_conntrack_help)) {
++ eprintk_ctx("conntrack module ct->help version mismatch\n");
++ return -EINVAL;
++ }
++
++ err = ctx->pread(&h, sizeof(h), ctx, sec);
++ if (err)
++ return err;
++ if (h.cpt_section != CPT_SECT_NET_CONNTRACK || h.cpt_hdrlen < sizeof(h))
++ return -EINVAL;
++
++ endsec = sec + h.cpt_next;
++ sec += h.cpt_hdrlen;
++ while (sec < endsec) {
++ err = rst_get_object(CPT_OBJ_NET_CONNTRACK, sec, &ci, ctx);
++ if (err)
++ break;
++ err = undump_one_ct(&ci, sec, &ct_list, ctx);
++ if (err)
++ break;
++ sec += ci.cpt_next;
++ }
++
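++	/* Arm the conntrack timeouts only after all entries have been
++	 * restored, so nothing expires while the table is half-built. */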
++ while ((c = ct_list) != NULL) {
++ ct_list = c->next;
++ if (c->ct)
++ add_timer(&c->ct->timeout);
++ kfree(c);
++ }
++
++ return err;
++}
++
++#else
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++
++int rst_restore_ip_conntrack(struct cpt_context * ctx)
++{
++ if (ctx->sections[CPT_SECT_NET_CONNTRACK] != CPT_NULL)
++ return -EINVAL;
++ return 0;
++}
++
++#endif
+diff -upr linux-2.6.16.orig/kernel/cpt/rst_context.c linux-2.6.16-026test015/kernel/cpt/rst_context.c
+--- linux-2.6.16.orig/kernel/cpt/rst_context.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/cpt/rst_context.c 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,315 @@
++/*
++ *
++ * kernel/cpt/rst_context.c
++ *
++ * Copyright (C) 2000-2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/config.h>
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/errno.h>
++#include <linux/pagemap.h>
++#include <linux/cpt_image.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++
++static ssize_t file_read(void *addr, size_t count, struct cpt_context *ctx)
++{
++ mm_segment_t oldfs;
++ ssize_t err = -EBADF;
++ struct file *file = ctx->file;
++
++ oldfs = get_fs(); set_fs(KERNEL_DS);
++ if (file)
++ err = file->f_op->read(file, addr, count, &file->f_pos);
++ set_fs(oldfs);
++ if (err != count)
++ return err >= 0 ? -EIO : err;
++ return 0;
++}
++
++static ssize_t file_pread(void *addr, size_t count, struct cpt_context *ctx, loff_t pos)
++{
++ mm_segment_t oldfs;
++ ssize_t err = -EBADF;
++ struct file *file = ctx->file;
++
++ oldfs = get_fs(); set_fs(KERNEL_DS);
++ if (file)
++ err = file->f_op->read(file, addr, count, &pos);
++ set_fs(oldfs);
++ if (err != count)
++ return err >= 0 ? -EIO : err;
++ return 0;
++}
++
++static void file_align(struct cpt_context *ctx)
++{
++ struct file *file = ctx->file;
++
++ if (file)
++ file->f_pos = CPT_ALIGN(file->f_pos);
++}
++
++int rst_get_section(int type, struct cpt_context *ctx, loff_t *start, loff_t *end)
++{
++ struct cpt_section_hdr hdr;
++ int err;
++ loff_t pos;
++
++ pos = ctx->sections[type];
++ *start = *end = pos;
++
++ if (pos != CPT_NULL) {
++ if ((err = ctx->pread(&hdr, sizeof(hdr), ctx, pos)) != 0)
++ return err;
++ if (hdr.cpt_section != type || hdr.cpt_hdrlen < sizeof(hdr))
++ return -EINVAL;
++ *start = pos + hdr.cpt_hdrlen;
++ *end = pos + hdr.cpt_next;
++ }
++ return 0;
++}
++EXPORT_SYMBOL(rst_get_section);
++
++void rst_context_init(struct cpt_context *ctx)
++{
++ int i;
++
++ memset(ctx, 0, sizeof(*ctx));
++
++ init_MUTEX(&ctx->main_sem);
++ ctx->refcount = 1;
++
++ ctx->current_section = -1;
++ ctx->current_object = -1;
++ ctx->pagesize = PAGE_SIZE;
++ ctx->read = file_read;
++ ctx->pread = file_pread;
++ ctx->align = file_align;
++ for (i=0; i < CPT_SECT_MAX; i++)
++ ctx->sections[i] = CPT_NULL;
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++ init_completion(&ctx->pgin_notify);
++#endif
++ cpt_object_init(ctx);
++}
++
++static int parse_sections(loff_t start, loff_t end, cpt_context_t *ctx)
++{
++ struct cpt_section_hdr h;
++
++ while (start < end) {
++ int err;
++
++ err = ctx->pread(&h, sizeof(h), ctx, start);
++ if (err)
++ return err;
++ if (h.cpt_hdrlen < sizeof(h) ||
++ h.cpt_next < h.cpt_hdrlen ||
++ start + h.cpt_next > end)
++ return -EINVAL;
++ if (h.cpt_section >= CPT_SECT_MAX)
++ return -EINVAL;
++ ctx->sections[h.cpt_section] = start;
++ start += h.cpt_next;
++ }
++ return 0;
++}
++
++int rst_open_dumpfile(struct cpt_context *ctx)
++{
++ int err;
++ struct cpt_major_tail *v;
++ struct cpt_major_hdr h;
++ unsigned long size;
++
++ err = -EBADF;
++ if (!ctx->file)
++ goto err_out;
++
++ err = -ENOMEM;
++ ctx->tmpbuf = (char*)__get_free_page(GFP_KERNEL);
++ if (ctx->tmpbuf == NULL)
++ goto err_out;
++ __cpt_release_buf(ctx);
++
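++	/* Image layout: cpt_major_hdr at offset 0, a chain of sections,
++	 * and cpt_major_tail at the very end of the file. */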
++ size = ctx->file->f_dentry->d_inode->i_size;
++
++ if (size & 7) {
++ err = -EINVAL;
++ goto err_out;
++ }
++ if (size < sizeof(struct cpt_major_hdr) +
++ sizeof(struct cpt_major_tail)) {
++ err = -EINVAL;
++ goto err_out;
++ }
++ err = ctx->pread(&h, sizeof(h), ctx, 0);
++ if (err) {
++ eprintk_ctx("too short image 1 %d\n", err);
++ goto err_out;
++ }
++ if (h.cpt_signature[0] != CPT_SIGNATURE0 ||
++ h.cpt_signature[1] != CPT_SIGNATURE1 ||
++ h.cpt_signature[2] != CPT_SIGNATURE2 ||
++ h.cpt_signature[3] != CPT_SIGNATURE3) {
++ err = -EINVAL;
++ goto err_out;
++ }
++ if (h.cpt_hz != HZ) {
++ err = -EINVAL;
++ eprintk_ctx("HZ mismatch: %d != %d\n", h.cpt_hz, HZ);
++ goto err_out;
++ }
++ ctx->virt_jiffies64 = h.cpt_start_jiffies64;
++ ctx->start_time.tv_sec = h.cpt_start_sec;
++ ctx->start_time.tv_nsec = h.cpt_start_nsec;
++ ctx->kernel_config_flags = h.cpt_kernel_config[0];
++ ctx->iptables_mask = h.cpt_iptables_mask;
++ ctx->image_version = h.cpt_image_version;
++
++ v = cpt_get_buf(ctx);
++ err = ctx->pread(v, sizeof(*v), ctx, size - sizeof(*v));
++ if (err) {
++ eprintk_ctx("too short image 2 %d\n", err);
++ cpt_release_buf(ctx);
++ goto err_out;
++ }
++ if (v->cpt_signature[0] != CPT_SIGNATURE0 ||
++ v->cpt_signature[1] != CPT_SIGNATURE1 ||
++ v->cpt_signature[2] != CPT_SIGNATURE2 ||
++ v->cpt_signature[3] != CPT_SIGNATURE3 ||
++ v->cpt_nsect != CPT_SECT_MAX_INDEX) {
++ err = -EINVAL;
++ cpt_release_buf(ctx);
++ goto err_out;
++ }
++ if ((err = parse_sections(h.cpt_hdrlen, size - sizeof(*v) - sizeof(struct cpt_section_hdr), ctx)) < 0) {
++ cpt_release_buf(ctx);
++ goto err_out;
++ }
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++ ctx->lazypages = v->cpt_lazypages;
++#endif
++ ctx->tasks64 = v->cpt_64bit;
++ cpt_release_buf(ctx);
++ return 0;
++
++err_out:
++ if (ctx->tmpbuf) {
++ free_page((unsigned long)ctx->tmpbuf);
++ ctx->tmpbuf = NULL;
++ }
++ return err;
++}
++
++void rst_close_dumpfile(struct cpt_context *ctx)
++{
++ if (ctx->file) {
++ fput(ctx->file);
++ ctx->file = NULL;
++ }
++ if (ctx->tmpbuf) {
++ free_page((unsigned long)ctx->tmpbuf);
++ ctx->tmpbuf = NULL;
++ }
++}
++
++int _rst_get_object(int type, loff_t pos, void *tmp, int size, struct cpt_context *ctx)
++{
++ int err;
++ struct cpt_object_hdr *hdr = tmp;
++ err = ctx->pread(hdr, sizeof(struct cpt_object_hdr), ctx, pos);
++ if (err)
++ return err;
++ if (type > 0 && type != hdr->cpt_object)
++ return -EINVAL;
++ if (hdr->cpt_hdrlen > hdr->cpt_next)
++ return -EINVAL;
++ if (hdr->cpt_hdrlen < sizeof(struct cpt_object_hdr))
++ return -EINVAL;
++ if (size < sizeof(*hdr))
++ return -EINVAL;
++ if (size > hdr->cpt_hdrlen)
++ size = hdr->cpt_hdrlen;
++ if (size > sizeof(*hdr))
++ err = ctx->pread(hdr+1, size - sizeof(*hdr),
++ ctx, pos + sizeof(*hdr));
++ return err;
++}
++EXPORT_SYMBOL(_rst_get_object);
++
++void * __rst_get_object(int type, loff_t pos, struct cpt_context *ctx)
++{
++ int err;
++ void *tmp;
++ struct cpt_object_hdr hdr;
++ err = ctx->pread(&hdr, sizeof(hdr), ctx, pos);
++ if (err)
++ return NULL;
++ if (type > 0 && type != hdr.cpt_object)
++ return NULL;
++ if (hdr.cpt_hdrlen > hdr.cpt_next)
++ return NULL;
++ if (hdr.cpt_hdrlen < sizeof(struct cpt_object_hdr))
++ return NULL;
++ tmp = kmalloc(hdr.cpt_hdrlen, GFP_KERNEL);
++ if (!tmp)
++ return NULL;
++ err = ctx->pread(tmp, hdr.cpt_hdrlen, ctx, pos);
++ if (!err)
++ return tmp;
++ kfree(tmp);
++ return NULL;
++}
++EXPORT_SYMBOL(__rst_get_object);
++
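++/* Read a CPT_OBJ_NAME object into a fresh page, advance *pos_p past it and return the name; release with rst_put_name(). */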
++__u8 *__rst_get_name(loff_t *pos_p, struct cpt_context *ctx)
++{
++ int err;
++ struct cpt_object_hdr hdr;
++ __u8 *name;
++
++ err = rst_get_object(CPT_OBJ_NAME, *pos_p, &hdr, ctx);
++ if (err)
++ return NULL;
++ if (hdr.cpt_next - hdr.cpt_hdrlen > PAGE_SIZE)
++ return NULL;
++ name = (void*)__get_free_page(GFP_KERNEL);
++ if (!name)
++ return NULL;
++ err = ctx->pread(name, hdr.cpt_next - hdr.cpt_hdrlen,
++ ctx, *pos_p + hdr.cpt_hdrlen);
++ if (err) {
++ free_page((unsigned long)name);
++ return NULL;
++ }
++ *pos_p += hdr.cpt_next;
++ return name;
++}
++
++__u8 *rst_get_name(loff_t pos, struct cpt_context *ctx)
++{
++ return __rst_get_name(&pos, ctx);
++}
++
++void rst_put_name(__u8 *name, struct cpt_context *ctx)
++{
++ unsigned long addr = (unsigned long)name;
++
++ if (addr)
++ free_page(addr&~(PAGE_SIZE-1));
++}
+diff -upr linux-2.6.16.orig/kernel/cpt/rst_epoll.c linux-2.6.16-026test015/kernel/cpt/rst_epoll.c
+--- linux-2.6.16.orig/kernel/cpt/rst_epoll.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/cpt/rst_epoll.c 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,173 @@
++/*
++ *
++ * kernel/cpt/rst_epoll.c
++ *
++ * Copyright (C) 2000-2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/config.h>
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/errno.h>
++#include <linux/major.h>
++#include <linux/pipe_fs_i.h>
++#include <linux/mman.h>
++#include <linux/namespace.h>
++#include <linux/mount.h>
++#include <linux/namei.h>
++#include <linux/smp_lock.h>
++#include <asm/uaccess.h>
++#include <linux/vzcalluser.h>
++#include <linux/eventpoll.h>
++#include <linux/cpt_image.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_mm.h"
++#include "cpt_files.h"
++#include "cpt_kernel.h"
++#include "cpt_fsmagic.h"
++#include "cpt_syscalls.h"
++
++/* These functions are static in fs/eventpoll.c */
++extern struct file_operations eventpoll_fops;
++extern int ep_insert(struct eventpoll *ep, struct epoll_event *event,
++ struct file *tfile, int fd);
++extern struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd);
++extern void ep_release_epitem(struct epitem *epi);
++
++
++struct file *cpt_open_epolldev(struct cpt_file_image *fi,
++ unsigned flags,
++ struct cpt_context *ctx)
++{
++ struct file *file;
++ int efd;
++
++ /* Argument "size" is ignored, use just 1 */
++ efd = sys_epoll_create(1);
++ if (efd < 0)
++ return ERR_PTR(efd);
++
++ file = fget(efd);
++ sys_close(efd);
++ return file;
++}
++
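++/* Replay every CPT_OBJ_EPOLL_FILE entry of one epoll image: re-insert the watched files and re-arm the ready list. */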
++static int restore_one_epoll(cpt_object_t *obj,
++ loff_t pos,
++ struct cpt_epoll_image *ebuf,
++ cpt_context_t *ctx)
++{
++ int err = 0;
++ loff_t endpos;
++ struct file *file = obj->o_obj;
++ struct eventpoll *ep;
++
++ if (file->f_op != &eventpoll_fops) {
++ eprintk_ctx("bad epoll file\n");
++ return -EINVAL;
++ }
++
++ ep = file->private_data;
++
++ if (unlikely(ep == NULL)) {
++ eprintk_ctx("bad epoll device\n");
++ return -EINVAL;
++ }
++
++ endpos = pos + ebuf->cpt_next;
++ pos += ebuf->cpt_hdrlen;
++ while (pos < endpos) {
++ struct cpt_epoll_file_image efi;
++ struct epoll_event epds;
++
++ cpt_object_t *tobj;
++
++ err = rst_get_object(CPT_OBJ_EPOLL_FILE, pos, &efi, ctx);
++ if (err)
++ return err;
++ tobj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, efi.cpt_file, ctx);
++ if (!tobj) {
++ eprintk_ctx("epoll file not found\n");
++ return -EINVAL;
++ }
++ epds.events = efi.cpt_events;
++ epds.data = efi.cpt_data;
++ down_write(&ep->sem);
++ err = ep_insert(ep, &epds, tobj->o_obj, efi.cpt_fd);
++ if (!err) {
++ struct epitem *epi;
++ epi = ep_find(ep, tobj->o_obj, efi.cpt_fd);
++ if (epi) {
++ epi->revents = efi.cpt_revents;
++ if (efi.cpt_ready) {
++ unsigned long flags;
++ write_lock_irqsave(&ep->lock, flags);
++ if (list_empty(&epi->rdllink))
++ list_add_tail(&epi->rdllink, &ep->rdllist);
++ write_unlock_irqrestore(&ep->lock, flags);
++ }
++ ep_release_epitem(epi);
++ }
++ }
++ up_write(&ep->sem);
++ if (err)
++ break;
++ pos += efi.cpt_next;
++ }
++ return err;
++}
++
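++/* Walk the CPT_SECT_EPOLL section and restore each saved epoll descriptor. */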
++int rst_eventpoll(cpt_context_t *ctx)
++{
++ int err;
++ loff_t sec = ctx->sections[CPT_SECT_EPOLL];
++ loff_t endsec;
++ struct cpt_section_hdr h;
++
++ if (sec == CPT_NULL)
++ return 0;
++
++ err = ctx->pread(&h, sizeof(h), ctx, sec);
++ if (err)
++ return err;
++ if (h.cpt_section != CPT_SECT_EPOLL || h.cpt_hdrlen < sizeof(h))
++ return -EINVAL;
++
++ endsec = sec + h.cpt_next;
++ sec += h.cpt_hdrlen;
++ while (sec < endsec) {
++ cpt_object_t *obj;
++ struct cpt_epoll_image *ebuf = cpt_get_buf(ctx);
++ err = rst_get_object(CPT_OBJ_EPOLL, sec, ebuf, ctx);
++ if (err) {
++ cpt_release_buf(ctx);
++ return err;
++ }
++ obj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, ebuf->cpt_file, ctx);
++ if (obj == NULL) {
++ eprintk_ctx("cannot find epoll file object\n");
++ cpt_release_buf(ctx);
++ return -EINVAL;
++ }
++ err = restore_one_epoll(obj, sec, ebuf, ctx);
++ cpt_release_buf(ctx);
++ if (err)
++ return err;
++ sec += ebuf->cpt_next;
++ }
++
++ return 0;
++
++}
+diff -upr linux-2.6.16.orig/kernel/cpt/rst_files.c linux-2.6.16-026test015/kernel/cpt/rst_files.c
+--- linux-2.6.16.orig/kernel/cpt/rst_files.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/cpt/rst_files.c 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,1453 @@
++/*
++ *
++ * kernel/cpt/rst_files.c
++ *
++ * Copyright (C) 2000-2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/config.h>
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/errno.h>
++#include <linux/major.h>
++#include <linux/pipe_fs_i.h>
++#include <linux/mman.h>
++#include <linux/mount.h>
++#include <linux/tty.h>
++#include <linux/namei.h>
++#include <linux/vmalloc.h>
++#include <linux/smp_lock.h>
++#include <linux/vmalloc.h>
++#include <linux/pagemap.h>
++#include <asm/uaccess.h>
++#include <ub/ub_mem.h>
++#include <linux/cpt_image.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_mm.h"
++#include "cpt_files.h"
++#include "cpt_kernel.h"
++#include "cpt_fsmagic.h"
++
++#include "cpt_syscalls.h"
++
++
++struct filejob {
++ struct filejob *next;
++ int pid;
++ loff_t fdi;
++};
++
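++/* Remember a descriptor that cannot be restored yet; rst_do_filejobs() retries it later for the current pid. */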
++static int rst_filejob_queue(loff_t pos, cpt_context_t *ctx)
++{
++ struct filejob *j;
++
++ j = kmalloc(sizeof(*j), GFP_KERNEL);
++ if (j == NULL)
++ return -ENOMEM;
++ j->pid = current->pid;
++ j->fdi = pos;
++ j->next = ctx->filejob_queue;
++ ctx->filejob_queue = j;
++ return 0;
++}
++
++static void _anon_pipe_buf_release(struct pipe_inode_info *info, struct pipe_buffer *buf)
++{
++ struct page *page = buf->page;
++
++ if (info->tmp_page) {
++ __free_page(page);
++ } else {
++ info->tmp_page = page;
++ }
++ module_put(THIS_MODULE);
++}
++
++static void *_anon_pipe_buf_map(struct file *file, struct pipe_inode_info *info, struct pipe_buffer *buf)
++{
++ return kmap(buf->page);
++}
++
++static void _anon_pipe_buf_unmap(struct pipe_inode_info *info, struct pipe_buffer *buf)
++{
++ kunmap(buf->page);
++}
++
++static struct pipe_buf_operations _anon_pipe_buf_ops = {
++ .can_merge = 1,
++ .map = _anon_pipe_buf_map,
++ .unmap = _anon_pipe_buf_unmap,
++ .release = _anon_pipe_buf_release,
++};
++
++/* Sorta ugly... Multiple readers/writers of a named pipe rewrite the buffer
++ * many times. We need to mark it in the CPT_OBJ_INODE table in some way.
++ */
++static int fixup_pipe_data(struct file *file, struct cpt_file_image *fi,
++ struct cpt_context *ctx)
++{
++ struct inode *ino = file->f_dentry->d_inode;
++ struct cpt_inode_image ii;
++ struct cpt_obj_bits b;
++ struct pipe_inode_info *info;
++ int err;
++ int count;
++
++ if (!S_ISFIFO(ino->i_mode)) {
++ eprintk_ctx("fixup_pipe_data: not a pipe %Ld\n", fi->cpt_inode);
++ return -EINVAL;
++ }
++ if (fi->cpt_inode == CPT_NULL)
++ return 0;
++
++ err = rst_get_object(CPT_OBJ_INODE, fi->cpt_inode, &ii, ctx);
++ if (err)
++ return err;
++
++ if (ii.cpt_next <= ii.cpt_hdrlen)
++ return 0;
++
++ err = rst_get_object(CPT_OBJ_BITS, fi->cpt_inode + ii.cpt_hdrlen, &b, ctx);
++ if (err)
++ return err;
++
++ if (b.cpt_size == 0)
++ return 0;
++
++ mutex_lock(PIPE_MUTEX(*ino));
++ info = ino->i_pipe;
++ if (info->nrbufs) {
++ mutex_unlock(PIPE_MUTEX(*ino));
++ eprintk("pipe buffer is restored already\n");
++ return -EINVAL;
++ }
++ info->curbuf = 0;
++ count = 0;
++ while (count < b.cpt_size) {
++ struct pipe_buffer *buf = info->bufs + info->nrbufs;
++ void * addr;
++ int chars;
++
++ chars = b.cpt_size - count;
++ if (chars > PAGE_SIZE)
++ chars = PAGE_SIZE;
++ if (!try_module_get(THIS_MODULE)) {
++ err = -EBUSY;
++ break;
++ }
++
++ buf->page = alloc_page(GFP_HIGHUSER);
++ if (buf->page == NULL) {
++ err = -ENOMEM;
++ break;
++ }
++ buf->ops = &_anon_pipe_buf_ops;
++ buf->offset = 0;
++ buf->len = chars;
++ info->nrbufs++;
++ addr = kmap(buf->page);
++ err = ctx->pread(addr, chars, ctx,
++ fi->cpt_inode + ii.cpt_hdrlen + b.cpt_hdrlen + count);
++ if (err)
++ break;
++ count += chars;
++ }
++ mutex_unlock(PIPE_MUTEX(*ino));
++
++ return err;
++}
++
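++/* Translate the saved f_mode/f_flags into flags suitable for re-opening the file. */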
++static int make_flags(struct cpt_file_image *fi)
++{
++ int flags = O_NOFOLLOW;
++ switch (fi->cpt_mode&(FMODE_READ|FMODE_WRITE)) {
++ case FMODE_READ|FMODE_WRITE:
++ flags |= O_RDWR; break;
++ case FMODE_WRITE:
++ flags |= O_WRONLY; break;
++ case FMODE_READ:
++ flags |= O_RDONLY; break;
++ default: break;
++ }
++ flags |= fi->cpt_flags&~(O_ACCMODE|O_CREAT|O_TRUNC|O_EXCL|FASYNC);
++ flags |= O_NONBLOCK|O_NOCTTY;
++ return flags;
++}
++
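++/* Restore one end of a pipe: create a fresh pipe pair for pipefs inodes, or re-open the named FIFO on disk. */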
++static struct file *open_pipe(char *name,
++ struct cpt_file_image *fi,
++ unsigned flags,
++ struct cpt_context *ctx)
++{
++ int err;
++ cpt_object_t *obj;
++ struct cpt_inode_image ii;
++ struct file *rf, *wf;
++
++ err = rst_get_object(CPT_OBJ_INODE, fi->cpt_inode, &ii, ctx);
++ if (err)
++ return ERR_PTR(err);
++
++ if (ii.cpt_sb == FSMAGIC_PIPEFS) {
++ int pfd[2];
++
++ if ((err = sc_pipe(pfd)) < 0)
++ return ERR_PTR(err);
++
++ rf = fcheck(pfd[0]);
++ wf = fcheck(pfd[1]);
++ get_file(rf);
++ get_file(wf);
++ sc_close(pfd[0]);
++ sc_close(pfd[1]);
++
++ if (fi->cpt_mode&FMODE_READ) {
++ struct file *tf;
++ tf = wf; wf = rf; rf = tf;
++ }
++ } else {
++ if (fi->cpt_mode&FMODE_READ) {
++ rf = filp_open(name, flags, 0);
++ if (IS_ERR(rf)) {
++ dprintk_ctx("filp_open\n");
++ return rf;
++ }
++ dprintk_ctx(CPT_FID "open RDONLY fifo ino %Ld %p %x\n", CPT_TID(current), fi->cpt_inode, rf, rf->f_dentry->d_inode->i_mode);
++ return rf;
++ }
++
++ dprintk_ctx(CPT_FID "open WRONLY fifo ino %Ld\n", CPT_TID(current), fi->cpt_inode);
++
++ rf = filp_open(name, O_RDWR|O_NONBLOCK, 0);
++ if (IS_ERR(rf))
++ return rf;
++ wf = dentry_open(dget(rf->f_dentry),
++ mntget(rf->f_vfsmnt), flags);
++ }
++
++ /* Add pipe inode to obj table. */
++ obj = cpt_object_add(CPT_OBJ_INODE, wf->f_dentry->d_inode, ctx);
++ if (obj == NULL) {
++ fput(rf); fput(wf);
++ return ERR_PTR(-ENOMEM);
++ }
++ cpt_obj_setpos(obj, fi->cpt_inode, ctx);
++ obj->o_parent = rf;
++
++ /* Add the other side of the pipe to the obj table. It will not be used
++ * (o_pos = CPT_NULL); other processes opening the pipe will find the
++ * inode and open it with dentry_open(). */
++ obj = cpt_object_add(CPT_OBJ_FILE, rf, ctx);
++ if (obj == NULL) {
++ fput(wf);
++ return ERR_PTR(-ENOMEM);
++ }
++ return wf;
++}
++
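++/* Re-open character devices (mainly ttys); directories and FIFOs are not handled here, block devices and sockets are rejected. */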
++static struct file *open_special(struct cpt_file_image *fi,
++ unsigned flags,
++ int deleted,
++ struct cpt_context *ctx)
++{
++ struct cpt_inode_image *ii;
++ struct file *file;
++
++ /* Directories and named pipes are not actually special */
++ if (S_ISDIR(fi->cpt_i_mode) || S_ISFIFO(fi->cpt_i_mode))
++ return NULL;
++
++ /* No support for block devices at the moment. */
++ if (S_ISBLK(fi->cpt_i_mode))
++ return ERR_PTR(-EINVAL);
++
++ if (S_ISSOCK(fi->cpt_i_mode)) {
++ eprintk_ctx("bug: socket is not open\n");
++ return ERR_PTR(-EINVAL);
++ }
++
++ /* Support only (some) character devices at the moment. */
++ if (!S_ISCHR(fi->cpt_i_mode))
++ return ERR_PTR(-EINVAL);
++
++ ii = __rst_get_object(CPT_OBJ_INODE, fi->cpt_inode, ctx);
++ if (ii == NULL)
++ return ERR_PTR(-ENOMEM);
++
++ /* Do not worry about this right now. /dev/null, zero and *random are here.
++ * Should we at least prohibit /dev/mem?
++ */
++ if (MAJOR(ii->cpt_rdev) == MEM_MAJOR) {
++ kfree(ii);
++ return NULL;
++ }
++
++ file = rst_open_tty(fi, ii, flags, ctx);
++ kfree(ii);
++ return file;
++}
++
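++/* Re-apply one POSIX (fcntl) lock from its image onto the restored file. */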
++static int restore_posix_lock(struct file *file, struct cpt_flock_image *fli, cpt_context_t *ctx)
++{
++ struct file_lock lock;
++ cpt_object_t *obj;
++
++ memset(&lock, 0, sizeof(lock));
++ lock.fl_type = fli->cpt_type;
++ lock.fl_flags = fli->cpt_flags & ~FL_SLEEP;
++ lock.fl_start = fli->cpt_start;
++ lock.fl_end = fli->cpt_end;
++ obj = lookup_cpt_obj_byindex(CPT_OBJ_FILES, fli->cpt_owner, ctx);
++ if (!obj) {
++ eprintk_ctx("unknown lock owner %d\n", (int)fli->cpt_owner);
++ return -EINVAL;
++ }
++ lock.fl_owner = obj->o_obj;
++ lock.fl_pid = vpid_to_pid(fli->cpt_pid);
++ if (lock.fl_pid < 0) {
++ eprintk_ctx("unknown lock pid %d\n", lock.fl_pid);
++ return -EINVAL;
++ }
++ lock.fl_file = file;
++
++ if (lock.fl_owner == NULL)
++ eprintk_ctx("no lock owner\n");
++ return posix_lock_file(file, &lock);
++}
++
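++/* Re-apply one BSD flock by temporarily installing the file into a descriptor and calling flock(). */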
++static int restore_flock(struct file *file, struct cpt_flock_image *fli,
++ cpt_context_t *ctx)
++{
++ int cmd, err, fd;
++ fd = get_unused_fd();
++ if (fd < 0) {
++ eprintk_ctx("BSD flock cannot be restored\n");
++ return fd;
++ }
++ get_file(file);
++ fd_install(fd, file);
++ if (fli->cpt_type == F_RDLCK) {
++ cmd = LOCK_SH;
++ } else if (fli->cpt_type == F_WRLCK) {
++ cmd = LOCK_EX;
++ } else {
++ eprintk_ctx("flock flavor is unknown: %u\n", fli->cpt_type);
++ sc_close(fd);
++ return -EINVAL;
++ }
++
++ err = sc_flock(fd, LOCK_NB | cmd);
++ sc_close(fd);
++ return err;
++}
++
++
++static int fixup_posix_locks(struct file *file,
++ struct cpt_file_image *fi,
++ loff_t pos, struct cpt_context *ctx)
++{
++ int err;
++ loff_t end;
++ struct cpt_flock_image fli;
++
++ end = pos + fi->cpt_next;
++ pos += fi->cpt_hdrlen;
++ while (pos < end) {
++ err = rst_get_object(-1, pos, &fli, ctx);
++ if (err)
++ return err;
++ if (fli.cpt_object == CPT_OBJ_FLOCK &&
++ (fli.cpt_flags&FL_POSIX)) {
++ err = restore_posix_lock(file, &fli, ctx);
++ if (err)
++ return err;
++ dprintk_ctx("posix lock restored\n");
++ }
++ pos += fli.cpt_next;
++ }
++ return 0;
++}
++
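++/* Walk all restored files and re-apply the POSIX locks recorded in their images. */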
++int rst_posix_locks(struct cpt_context *ctx)
++{
++ int err;
++ cpt_object_t *obj;
++
++ for_each_object(obj, CPT_OBJ_FILE) {
++ struct file *file = obj->o_obj;
++ struct cpt_file_image fi;
++
++ if (obj->o_pos == CPT_NULL)
++ continue;
++
++ err = rst_get_object(CPT_OBJ_FILE, obj->o_pos, &fi, ctx);
++ if (err < 0)
++ return err;
++ if (fi.cpt_next > fi.cpt_hdrlen)
++ fixup_posix_locks(file, &fi, obj->o_pos, ctx);
++ }
++ return 0;
++}
++
++static int fixup_flocks(struct file *file,
++ struct cpt_file_image *fi,
++ loff_t pos, struct cpt_context *ctx)
++{
++ int err;
++ loff_t end;
++ struct cpt_flock_image fli;
++
++ end = pos + fi->cpt_next;
++ pos += fi->cpt_hdrlen;
++ while (pos < end) {
++ err = rst_get_object(-1, pos, &fli, ctx);
++ if (err)
++ return err;
++ if (fli.cpt_object == CPT_OBJ_FLOCK &&
++ (fli.cpt_flags&FL_FLOCK)) {
++ err = restore_flock(file, &fli, ctx);
++ if (err)
++ return err;
++ dprintk_ctx("bsd lock restored\n");
++ }
++ pos += fli.cpt_next;
++ }
++ return 0;
++}
++
++
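++/* Write the saved CPT_OBJ_PAGES blocks back into a regular file using its own ->write method. */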
++static int fixup_reg_data(struct file *file, loff_t pos, loff_t end,
++ struct cpt_context *ctx)
++{
++ int err;
++ struct cpt_page_block pgb;
++ ssize_t (*do_write)(struct file *, const char __user *, size_t, loff_t *ppos);
++
++ do_write = file->f_op->write;
++ if (do_write == NULL) {
++ eprintk_ctx("no write method. Cannot restore contents of the file.\n");
++ return -EINVAL;
++ }
++
++ atomic_inc(&file->f_count);
++
++ while (pos < end) {
++ loff_t opos;
++ loff_t ipos;
++ int count;
++
++ err = rst_get_object(CPT_OBJ_PAGES, pos, &pgb, ctx);
++ if (err)
++ goto out;
++ dprintk_ctx("restoring file data block: %08x-%08x\n",
++ (__u32)pgb.cpt_start, (__u32)pgb.cpt_end);
++ ipos = pos + pgb.cpt_hdrlen;
++ opos = pgb.cpt_start;
++ count = pgb.cpt_end-pgb.cpt_start;
++ while (count > 0) {
++ mm_segment_t oldfs;
++ int copy = count;
++
++ if (copy > PAGE_SIZE)
++ copy = PAGE_SIZE;
++ (void)cpt_get_buf(ctx);
++ oldfs = get_fs(); set_fs(KERNEL_DS);
++ err = ctx->pread(ctx->tmpbuf, copy, ctx, ipos);
++ set_fs(oldfs);
++ if (err) {
++ __cpt_release_buf(ctx);
++ goto out;
++ }
++ if (!(file->f_mode & FMODE_WRITE) ||
++ (file->f_flags&O_DIRECT)) {
++ fput(file);
++ file = dentry_open(dget(file->f_dentry),
++ mntget(file->f_vfsmnt), O_WRONLY);
++ if (IS_ERR(file)) {
++ __cpt_release_buf(ctx);
++ return PTR_ERR(file);
++ }
++ }
++ oldfs = get_fs(); set_fs(KERNEL_DS);
++ ipos += copy;
++ err = do_write(file, ctx->tmpbuf, copy, &opos);
++ set_fs(oldfs);
++ __cpt_release_buf(ctx);
++ if (err != copy) {
++ if (err >= 0)
++ err = -EIO;
++ goto out;
++ }
++ count -= copy;
++ }
++ pos += pgb.cpt_next;
++ }
++ err = 0;
++
++out:
++ fput(file);
++ return err;
++}
++
++
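++/* Restore the data, size and timestamps of a deleted regular file; a shmem file is set up when no file exists yet. */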
++static int fixup_file_content(struct file **file_p, struct cpt_file_image *fi,
++ struct cpt_context *ctx)
++{
++ int err;
++ struct cpt_inode_image ii;
++ struct file *file = *file_p;
++ struct iattr newattrs;
++
++ if (!S_ISREG(fi->cpt_i_mode))
++ return 0;
++
++ err = rst_get_object(CPT_OBJ_INODE, fi->cpt_inode, &ii, ctx);
++ if (err)
++ return err;
++
++ if (file == NULL) {
++ file = shmem_file_setup("dev/zero", ii.cpt_size, 0);
++ if (IS_ERR(file))
++ return PTR_ERR(file);
++ *file_p = file;
++ }
++
++ if (ii.cpt_next > ii.cpt_hdrlen) {
++ err = fixup_reg_data(file, fi->cpt_inode+ii.cpt_hdrlen,
++ fi->cpt_inode+ii.cpt_next, ctx);
++ if (err)
++ return err;
++ }
++
++ mutex_lock(&file->f_dentry->d_inode->i_mutex);
++ /* stage 1 - update size like do_truncate does */
++ newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
++ newattrs.ia_size = ii.cpt_size;
++ cpt_timespec_import(&newattrs.ia_ctime, ii.cpt_ctime);
++ err = notify_change(file->f_dentry, &newattrs);
++ if (err)
++ goto out;
++
++ /* stage 2 - update times */
++ newattrs.ia_valid = ATTR_MTIME | ATTR_ATIME |
++ ATTR_ATIME_SET | ATTR_MTIME_SET;
++ cpt_timespec_import(&newattrs.ia_atime, ii.cpt_atime);
++ cpt_timespec_import(&newattrs.ia_mtime, ii.cpt_mtime);
++ err = notify_change(file->f_dentry, &newattrs);
++
++out:
++ mutex_unlock(&file->f_dentry->d_inode->i_mutex);
++ return err;
++}
++
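++/* Restore per-file state: position, ownership, f_mode/f_flags and FASYNC registration, warning on mismatches. */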
++static int fixup_file_flags(struct file *file, struct cpt_file_image *fi,
++ int was_dentry_open, loff_t pos,
++ cpt_context_t *ctx)
++{
++ if (fi->cpt_pos != file->f_pos) {
++ int err = -ESPIPE;
++ if (file->f_op->llseek)
++ err = file->f_op->llseek(file, fi->cpt_pos, 0);
++ if (err < 0) {
++ dprintk_ctx("file %Ld lseek %Ld - %Ld\n", pos, file->f_pos, fi->cpt_pos);
++ file->f_pos = fi->cpt_pos;
++ }
++ }
++ file->f_uid = fi->cpt_uid;
++ file->f_gid = fi->cpt_gid;
++ file->f_owner.pid = 0;
++ if (fi->cpt_fown_pid) {
++ file->f_owner.pid = comb_vpid_to_pid(fi->cpt_fown_pid);
++ if (file->f_owner.pid == 0) {
++ wprintk_ctx("fixup_file_flags: owner %d does not exist anymore\n", file->f_owner.pid);
++ return -EINVAL;
++ }
++ }
++ file->f_owner.uid = fi->cpt_fown_uid;
++ file->f_owner.euid = fi->cpt_fown_euid;
++ file->f_owner.signum = fi->cpt_fown_signo;
++
++ if (file->f_mode != fi->cpt_mode) {
++ if (was_dentry_open &&
++ ((file->f_mode^fi->cpt_mode)&(FMODE_PREAD|FMODE_LSEEK))) {
++ file->f_mode &= ~(FMODE_PREAD|FMODE_LSEEK);
++ file->f_mode |= fi->cpt_mode&(FMODE_PREAD|FMODE_LSEEK);
++ }
++ if (file->f_mode != fi->cpt_mode)
++ wprintk_ctx("file %ld mode mismatch %08x %08x\n", (long)pos, file->f_mode, fi->cpt_mode);
++ }
++ if (file->f_flags != fi->cpt_flags) {
++ if (!(fi->cpt_flags&O_NOFOLLOW))
++ file->f_flags &= ~O_NOFOLLOW;
++ if ((file->f_flags^fi->cpt_flags)&O_NONBLOCK) {
++ file->f_flags &= ~O_NONBLOCK;
++ file->f_flags |= fi->cpt_flags&O_NONBLOCK;
++ }
++ if (fi->cpt_flags&FASYNC) {
++ if (fi->cpt_fown_fd == -1) {
++ wprintk_ctx("No fd for FASYNC\n");
++ return -EINVAL;
++ } else if (file->f_op && file->f_op->fasync) {
++ if (file->f_op->fasync(fi->cpt_fown_fd, file, 1) < 0) {
++ wprintk_ctx("FASYNC problem\n");
++ return -EINVAL;
++ } else {
++ file->f_flags |= FASYNC;
++ }
++ }
++ }
++ if (file->f_flags != fi->cpt_flags) {
++ eprintk_ctx("file %ld flags mismatch %08x %08x\n", (long)pos, file->f_flags, fi->cpt_flags);
++ return -EINVAL;
++ }
++ }
++ return 0;
++}
++
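++/* Recreate a deleted file under its original name (falling back to /tmp/rst<pid>), open it and unlink it again. */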
++static struct file *
++open_deleted(char *name, unsigned flags, struct cpt_file_image *fi,
++ cpt_context_t *ctx)
++{
++ struct file * file;
++ char *suffix = NULL;
++ int attempt = 0;
++ int tmp_pass = 0;
++ mode_t mode = fi->cpt_i_mode;
++
++ /* Strip (deleted) part... */
++ if (strlen(name) > strlen(" (deleted)")) {
++ if (strcmp(name + strlen(name) - strlen(" (deleted)"), " (deleted)") == 0) {
++ suffix = &name[strlen(name) - strlen(" (deleted)")];
++ *suffix = 0;
++ } else if (memcmp(name, "(deleted) ", strlen("(deleted) ")) == 0) {
++ memmove(name, name + strlen("(deleted) "), strlen(name) - strlen(" (deleted)") + 1);
++ suffix = name + strlen(name);
++ }
++ }
++
++try_again:
++ for (;;) {
++ if (attempt) {
++ if (attempt > 1000) {
++ eprintk_ctx("open_deleted: failed after %d attempts\n", attempt);
++ return ERR_PTR(-EEXIST);
++ }
++ if (suffix == NULL) {
++ eprintk_ctx("open_deleted: no suffix\n");
++ return ERR_PTR(-EEXIST);
++ }
++ sprintf(suffix, ".%08x", (unsigned)((xtime.tv_nsec>>10)+attempt));
++ }
++ attempt++;
++
++ if (S_ISFIFO(mode)) {
++ int err;
++ err = sc_mknod(name, S_IFIFO|(mode&017777), 0);
++ if (err == -EEXIST)
++ continue;
++ if (err < 0 && !tmp_pass)
++ goto change_dir;
++ if (err < 0)
++ return ERR_PTR(err);
++ file = open_pipe(name, fi, flags, ctx);
++ sc_unlink(name);
++ } else if (S_ISCHR(mode)) {
++ int err;
++ struct cpt_inode_image *ii;
++
++ ii = __rst_get_object(CPT_OBJ_INODE, fi->cpt_inode, ctx);
++ if (ii == NULL)
++ return ERR_PTR(-ENOMEM);
++ err = sc_mknod(name, S_IFCHR|(mode&017777), new_encode_dev(ii->cpt_rdev));
++ kfree(ii);
++ if (err == -EEXIST)
++ continue;
++ if (err < 0 && !tmp_pass)
++ goto change_dir;
++ if (err < 0)
++ return ERR_PTR(err);
++ file = filp_open(name, flags, mode&017777);
++ sc_unlink(name);
++ } else if (S_ISDIR(mode)) {
++ int err;
++ err = sc_mkdir(name, mode&017777);
++ if (err == -EEXIST)
++ continue;
++ if (err < 0 && !tmp_pass)
++ goto change_dir;
++ if (err < 0)
++ return ERR_PTR(err);
++ file = filp_open(name, flags, mode&017777);
++ sc_rmdir(name);
++ } else {
++ file = filp_open(name, O_CREAT|O_EXCL|flags, mode&017777);
++ if (IS_ERR(file)) {
++ if (PTR_ERR(file) == -EEXIST)
++ continue;
++ if (!tmp_pass)
++ goto change_dir;
++ } else {
++ sc_unlink(name);
++ }
++ }
++ break;
++ }
++
++ if (IS_ERR(file)) {
++ eprintk_ctx("filp_open %s: %ld\n", name, PTR_ERR(file));
++ return file;
++ } else {
++ dprintk_ctx("deleted file created as %s, %p, %x\n", name, file, file->f_dentry->d_inode->i_mode);
++ }
++ return file;
++
++change_dir:
++ sprintf(name, "/tmp/rst%u", current->pid);
++ suffix = name + strlen(name);
++ attempt = 1;
++ tmp_pass = 1;
++ goto try_again;
++}
++
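++/* Restore a single struct file from its image at @pos, reusing objects that have already been restored. */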
++struct file *rst_file(loff_t pos, int fd, struct cpt_context *ctx)
++{
++ int err;
++ int was_dentry_open = 0;
++ cpt_object_t *obj;
++ cpt_object_t *iobj;
++ struct cpt_file_image fi;
++ __u8 *name = NULL;
++ struct file *file;
++ int flags;
++
++ obj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, pos, ctx);
++ if (obj) {
++ file = obj->o_obj;
++ if (obj->o_index >= 0) {
++ dprintk_ctx("file is attached to a socket\n");
++ err = rst_get_object(CPT_OBJ_FILE, pos, &fi, ctx);
++ if (err < 0)
++ goto err_out;
++ fixup_file_flags(file, &fi, 0, pos, ctx);
++ }
++ get_file(file);
++ return file;
++ }
++
++ err = rst_get_object(CPT_OBJ_FILE, pos, &fi, ctx);
++ if (err < 0)
++ goto err_out;
++
++ flags = make_flags(&fi);
++
++ /* Easy way, inode has been already open. */
++ if (fi.cpt_inode != CPT_NULL &&
++ !(fi.cpt_lflags & CPT_DENTRY_CLONING) &&
++ (iobj = lookup_cpt_obj_bypos(CPT_OBJ_INODE, fi.cpt_inode, ctx)) != NULL &&
++ iobj->o_parent) {
++ struct file *filp = iobj->o_parent;
++ file = dentry_open(dget(filp->f_dentry),
++ mntget(filp->f_vfsmnt), flags);
++ dprintk_ctx("rst_file: file obtained by dentry_open\n");
++ was_dentry_open = 1;
++ goto map_file;
++ }
++
++ if (fi.cpt_next > fi.cpt_hdrlen)
++ name = rst_get_name(pos + sizeof(fi), ctx);
++
++ if (fi.cpt_lflags == CPT_DENTRY_DELETED) {
++ if (fi.cpt_inode == CPT_NULL) {
++ eprintk_ctx("deleted file and no inode.\n");
++ err = -EINVAL;
++ goto err_out;
++ }
++
++ /* One very special case... */
++ if (S_ISREG(fi.cpt_i_mode) &&
++ (!name || !name[0] || strcmp(name, "/dev/zero (deleted)") == 0)) {
++ /* MAP_ANON|MAP_SHARED mapping.
++ * the kernel handles this in a rather ugly way: the file
++ * passed to mmap by the user does not match the file that
++ * is finally attached to the VMA. OK, rst_mm has to take
++ * care of this. Otherwise, it will fail.
++ */
++ file = NULL;
++ } else if (S_ISREG(fi.cpt_i_mode) ||
++ S_ISCHR(fi.cpt_i_mode) ||
++ S_ISFIFO(fi.cpt_i_mode) ||
++ S_ISDIR(fi.cpt_i_mode)) {
++ if (S_ISCHR(fi.cpt_i_mode)) {
++ file = open_special(&fi, flags, 1, ctx);
++ if (file != NULL)
++ goto map_file;
++ }
++ file = open_deleted(name, flags, &fi, ctx);
++ if (IS_ERR(file))
++ goto out;
++ } else {
++ eprintk_ctx("not a regular deleted file.\n");
++ err = -EINVAL;
++ goto err_out;
++ }
++
++ err = fixup_file_content(&file, &fi, ctx);
++ if (err)
++ goto err_put;
++ goto map_file;
++ } else {
++ if (!name || !name[0]) {
++ eprintk_ctx("no name for file?\n");
++ err = -EINVAL;
++ goto err_out;
++ }
++ if ((fi.cpt_lflags & CPT_DENTRY_EPOLL) &&
++ (file = cpt_open_epolldev(&fi, flags, ctx)) != NULL)
++ goto map_file;
++ if (S_ISFIFO(fi.cpt_i_mode) &&
++ (file = open_pipe(name, &fi, flags, ctx)) != NULL)
++ goto map_file;
++ if (!S_ISREG(fi.cpt_i_mode) &&
++ (file = open_special(&fi, flags, 0, ctx)) != NULL)
++ goto map_file;
++ }
++
++ file = filp_open(name, flags, 0);
++
++map_file:
++ if (!IS_ERR(file)) {
++ fixup_file_flags(file, &fi, was_dentry_open, pos, ctx);
++
++ if (S_ISFIFO(fi.cpt_i_mode) && !was_dentry_open) {
++ err = fixup_pipe_data(file, &fi, ctx);
++ if (err)
++ goto err_put;
++ }
++
++ obj = cpt_object_get(CPT_OBJ_FILE, file, ctx);
++ if (!obj) {
++ obj = cpt_object_add(CPT_OBJ_FILE, file, ctx);
++ if (obj)
++ get_file(file);
++ }
++ if (obj)
++ cpt_obj_setpos(obj, pos, ctx);
++
++ obj = cpt_object_add(CPT_OBJ_INODE, file->f_dentry->d_inode, ctx);
++ if (obj) {
++ cpt_obj_setpos(obj, fi.cpt_inode, ctx);
++ if (!obj->o_parent || fi.cpt_lflags != CPT_DENTRY_DELETED)
++ obj->o_parent = file;
++ }
++
++ if (fi.cpt_next > fi.cpt_hdrlen) {
++ err = fixup_flocks(file, &fi, pos, ctx);
++ if (err)
++ goto err_put;
++ }
++ } else {
++ if (fi.cpt_lflags & CPT_DENTRY_PROC) {
++ dprintk_ctx("rst_file /proc delayed\n");
++ file = NULL;
++ }
++ }
++
++out:
++ if (name)
++ rst_put_name(name, ctx);
++ return file;
++
++err_put:
++ if (file)
++ fput(file);
++err_out:
++ if (name)
++ rst_put_name(name, ctx);
++ return ERR_PTR(err);
++}
++
++
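++/* Tell the caller whether the restored task can share files_struct/fs_struct (CLONE_FILES/CLONE_FS) with an existing one. */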
++__u32 rst_files_flag(struct cpt_task_image *ti, struct cpt_context *ctx)
++{
++ __u32 flag = 0;
++
++ if (ti->cpt_files == CPT_NULL ||
++ lookup_cpt_obj_bypos(CPT_OBJ_FILES, ti->cpt_files, ctx))
++ flag |= CLONE_FILES;
++ if (ti->cpt_fs == CPT_NULL ||
++ lookup_cpt_obj_bypos(CPT_OBJ_FS, ti->cpt_fs, ctx))
++ flag |= CLONE_FS;
++ return flag;
++}
++
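++/* Close every descriptor currently open in @files before repopulating the table from the image. */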
++static void local_close_files(struct files_struct * files)
++{
++ int i, j;
++
++ j = 0;
++ for (;;) {
++ unsigned long set;
++ i = j * __NFDBITS;
++ if (i >= files->fdt->max_fdset || i >= files->fdt->max_fds)
++ break;
++ set = files->fdt->open_fds->fds_bits[j];
++ while (set) {
++ if (set & 1) {
++ struct file * file = xchg(&files->fdt->fd[i], NULL);
++ if (file)
++ filp_close(file, files);
++ }
++ i++;
++ set >>= 1;
++ }
++ files->fdt->open_fds->fds_bits[j] = 0;
++ files->fdt->close_on_exec->fds_bits[j] = 0;
++ j++;
++ }
++}
++
++extern int expand_fdtable(struct files_struct *files, int nr);
++
++
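++/* Rebuild the files_struct of the current task: share an already restored one or restore every descriptor from the image. */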
++int rst_files_complete(struct cpt_task_image *ti, struct cpt_context *ctx)
++{
++ struct cpt_files_struct_image fi;
++ struct files_struct *f = current->files;
++ cpt_object_t *obj;
++ loff_t pos, endpos;
++ int err;
++
++ if (ti->cpt_files == CPT_NULL) {
++ current->files = NULL;
++ if (f)
++ put_files_struct(f);
++ return 0;
++ }
++
++ obj = lookup_cpt_obj_bypos(CPT_OBJ_FILES, ti->cpt_files, ctx);
++ if (obj) {
++ if (obj->o_obj != f) {
++ put_files_struct(f);
++ f = obj->o_obj;
++ atomic_inc(&f->count);
++ current->files = f;
++ }
++ return 0;
++ }
++
++ err = rst_get_object(CPT_OBJ_FILES, ti->cpt_files, &fi, ctx);
++ if (err)
++ return err;
++
++ local_close_files(f);
++
++ if (fi.cpt_max_fds > f->fdt->max_fds) {
++ spin_lock(&f->file_lock);
++ err = expand_fdtable(f, fi.cpt_max_fds-1);
++ spin_unlock(&f->file_lock);
++ if (err)
++ return err;
++ }
++
++ pos = ti->cpt_files + fi.cpt_hdrlen;
++ endpos = ti->cpt_files + fi.cpt_next;
++ while (pos < endpos) {
++ struct cpt_fd_image fdi;
++ struct file *filp;
++
++ err = rst_get_object(CPT_OBJ_FILEDESC, pos, &fdi, ctx);
++ if (err)
++ return err;
++ filp = rst_file(fdi.cpt_file, fdi.cpt_fd, ctx);
++ if (IS_ERR(filp)) {
++ eprintk_ctx("rst_file: %ld %Lu\n", PTR_ERR(filp), fdi.cpt_file);
++ return PTR_ERR(filp);
++ }
++ if (filp == NULL) {
++ int err = rst_filejob_queue(pos, ctx);
++ if (err)
++ return err;
++ } else {
++ if (fdi.cpt_fd >= f->fdt->max_fds) BUG();
++ f->fdt->fd[fdi.cpt_fd] = filp;
++ FD_SET(fdi.cpt_fd, f->fdt->open_fds);
++ if (fdi.cpt_flags&CPT_FD_FLAG_CLOSEEXEC)
++ FD_SET(fdi.cpt_fd, f->fdt->close_on_exec);
++ }
++ pos += fdi.cpt_next;
++ }
++ f->fdt->next_fd = fi.cpt_next_fd;
++
++ obj = cpt_object_add(CPT_OBJ_FILES, f, ctx);
++ if (obj) {
++ cpt_obj_setpos(obj, ti->cpt_files, ctx);
++ cpt_obj_setindex(obj, fi.cpt_index, ctx);
++ }
++ return 0;
++}
++
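++/* Process the queue of delayed descriptors and install them into their owner tasks. */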
++int rst_do_filejobs(cpt_context_t *ctx)
++{
++ struct filejob *j;
++
++ while ((j = ctx->filejob_queue) != NULL) {
++ int err;
++ task_t *tsk;
++ struct cpt_fd_image fdi;
++ struct file *filp;
++
++ read_lock(&tasklist_lock);
++ tsk = find_task_by_pid_ve(j->pid);
++ if (tsk)
++ get_task_struct(tsk);
++ read_unlock(&tasklist_lock);
++ if (!tsk)
++ return -EINVAL;
++
++ err = rst_get_object(CPT_OBJ_FILEDESC, j->fdi, &fdi, ctx);
++ if (err) {
++ put_task_struct(tsk);
++ return err;
++ }
++
++ if (fdi.cpt_fd >= tsk->files->fdt->max_fds) BUG();
++ if (tsk->files->fdt->fd[fdi.cpt_fd] ||
++ FD_ISSET(fdi.cpt_fd, tsk->files->fdt->open_fds)) {
++ eprintk_ctx("doing filejob %Ld: fd is busy\n", j->fdi);
++ put_task_struct(tsk);
++ return -EBUSY;
++ }
++
++ filp = rst_file(fdi.cpt_file, fdi.cpt_fd, ctx);
++ if (IS_ERR(filp)) {
++ eprintk_ctx("rst_do_filejobs: 1: %ld %Lu\n", PTR_ERR(filp), fdi.cpt_file);
++ put_task_struct(tsk);
++ return PTR_ERR(filp);
++ }
++ if (fdi.cpt_fd >= tsk->files->fdt->max_fds) BUG();
++ tsk->files->fdt->fd[fdi.cpt_fd] = filp;
++ FD_SET(fdi.cpt_fd, tsk->files->fdt->open_fds);
++ if (fdi.cpt_flags&CPT_FD_FLAG_CLOSEEXEC)
++ FD_SET(fdi.cpt_fd, tsk->files->fdt->close_on_exec);
++
++ dprintk_ctx("filejob %Ld done\n", j->fdi);
++
++ put_task_struct(tsk);
++ ctx->filejob_queue = j->next;
++ kfree(j);
++ }
++ return 0;
++}
++
++void rst_flush_filejobs(cpt_context_t *ctx)
++{
++ struct filejob *j;
++
++ while ((j = ctx->filejob_queue) != NULL) {
++ ctx->filejob_queue = j->next;
++ kfree(j);
++ }
++}
++
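++/* Restore or share the fs_struct of the current task; root, pwd and altroot are set later by rst_restore_fs(). */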
++int rst_fs_complete(struct cpt_task_image *ti, struct cpt_context *ctx)
++{
++ struct fs_struct *f = current->fs;
++ cpt_object_t *obj;
++
++ if (ti->cpt_fs == CPT_NULL) {
++ exit_fs(current);
++ return 0;
++ }
++
++ obj = lookup_cpt_obj_bypos(CPT_OBJ_FS, ti->cpt_fs, ctx);
++ if (obj) {
++ if (obj->o_obj != f) {
++ exit_fs(current);
++ f = obj->o_obj;
++ atomic_inc(&f->count);
++ current->fs = f;
++ }
++ return 0;
++ }
++
++ /* Do _not_ restore root. Image contains absolute pathnames.
++ * So, we fix it up in the context of the rst process.
++ */
++
++ obj = cpt_object_add(CPT_OBJ_FS, f, ctx);
++ if (obj)
++ cpt_obj_setpos(obj, ti->cpt_fs, ctx);
++
++ return 0;
++}
++
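++/* Open the file image at *pos and hand back its dentry/vfsmount pair, advancing *pos past the object. */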
++static int get_dir(struct dentry **dp, struct vfsmount **mp,
++ loff_t *pos, struct cpt_context *ctx)
++{
++ struct cpt_file_image fi;
++ struct file * file;
++ int err;
++
++ err = rst_get_object(CPT_OBJ_FILE, *pos, &fi, ctx);
++ if (err)
++ return err;
++
++ file = rst_file(*pos, -1, ctx);
++ if (IS_ERR(file))
++ return PTR_ERR(file);
++
++ *dp = dget(file->f_dentry);
++ *mp = mntget(file->f_vfsmnt);
++ *pos += fi.cpt_next;
++ fput(file);
++ return 0;
++}
++
++static void __set_fs_root(struct fs_struct *fs, struct vfsmount *mnt,
++ struct dentry *dentry)
++{
++ struct dentry *old_root;
++ struct vfsmount *old_rootmnt;
++ write_lock(&fs->lock);
++ old_root = fs->root;
++ old_rootmnt = fs->rootmnt;
++ fs->rootmnt = mnt;
++ fs->root = dentry;
++ write_unlock(&fs->lock);
++ if (old_root) {
++ dput(old_root);
++ mntput(old_rootmnt);
++ }
++}
++
++static void __set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt,
++ struct dentry *dentry)
++{
++ struct dentry *old_pwd;
++ struct vfsmount *old_pwdmnt;
++
++ write_lock(&fs->lock);
++ old_pwd = fs->pwd;
++ old_pwdmnt = fs->pwdmnt;
++ fs->pwdmnt = mnt;
++ fs->pwd = dentry;
++ write_unlock(&fs->lock);
++
++ if (old_pwd) {
++ dput(old_pwd);
++ mntput(old_pwdmnt);
++ }
++}
++
++
++int rst_restore_fs(struct cpt_context *ctx)
++{
++ loff_t pos;
++ cpt_object_t *obj;
++ int err = 0;
++
++ for_each_object(obj, CPT_OBJ_FS) {
++ struct cpt_fs_struct_image fi;
++ struct fs_struct *fs = obj->o_obj;
++ int i;
++ struct dentry *d[3];
++ struct vfsmount *m[3];
++
++ err = rst_get_object(CPT_OBJ_FS, obj->o_pos, &fi, ctx);
++ if (err)
++ return err;
++
++ fs->umask = fi.cpt_umask;
++
++ pos = obj->o_pos + fi.cpt_hdrlen;
++ d[0] = d[1] = d[2] = NULL;
++ m[0] = m[1] = m[2] = NULL;
++ i = 0;
++ while (pos < obj->o_pos + fi.cpt_next && i<3) {
++ err = get_dir(d+i, m+i, &pos, ctx);
++ if (err) {
++ eprintk_ctx("cannot get_dir: %d", err);
++ for (--i; i >= 0; i--) {
++ if (d[i])
++ dput(d[i]);
++ if (m[i])
++ mntput(m[i]);
++ }
++ return err;
++ }
++ i++;
++ }
++ if (d[0])
++ __set_fs_root(fs, m[0], d[0]);
++ if (d[1])
++ __set_fs_pwd(fs, m[1], d[1]);
++ if (d[2]) {
++ struct dentry *olddentry;
++ struct vfsmount *oldmnt;
++ write_lock(&fs->lock);
++ oldmnt = fs->altrootmnt;
++ olddentry = fs->altroot;
++ fs->altrootmnt = m[2];
++ fs->altroot = d[2];
++ write_unlock(&fs->lock);
++
++ if (olddentry) {
++ dput(olddentry);
++ mntput(oldmnt);
++ }
++ }
++ }
++ return err;
++}
++
++int do_one_mount(char *mntpnt, char *mnttype, char *mntbind, unsigned long flags, struct cpt_context *ctx)
++{
++ int err;
++
++ if (mntbind && (strcmp(mntbind, "/") == 0 || strcmp(mntbind, "") == 0))
++ mntbind = NULL;
++
++ if (mntbind)
++ flags |= MS_BIND;
++
++ err = sc_mount(mntbind, mntpnt, mnttype, flags);
++ if (err < 0) {
++ eprintk_ctx("%d mounting %s %s %08lx\n", err, mntpnt, mnttype, flags);
++ return err;
++ }
++ return 0;
++}
++
++static int undumptmpfs(void *arg)
++{
++ int i;
++ int *pfd = arg;
++ char *argv[] = { "tar", "x", "-C", "/", "-S", NULL };
++
++ if (pfd[0] != 0)
++ sc_dup2(pfd[0], 0);
++
++ for (i=1; i<current->files->fdt->max_fds; i++)
++ sc_close(i);
++
++ module_put(THIS_MODULE);
++
++ set_fs(KERNEL_DS);
++ i = sc_execve("/bin/tar", argv, NULL);
++ eprintk("failed to exec /bin/tar: %d\n", i);
++ return -1;
++}
++
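++/* Pipe the saved tmpfs tarball into a kernel thread running /bin/tar to repopulate the mount. */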
++static int rst_restore_tmpfs(loff_t *pos, struct cpt_context * ctx)
++{
++ int err;
++ int pfd[2];
++ struct file *f;
++ struct cpt_object_hdr v;
++ int n;
++ loff_t end;
++ int pid;
++
++ err = rst_get_object(CPT_OBJ_NAME, *pos, &v, ctx);
++ if (err < 0)
++ return err;
++
++ err = sc_pipe(pfd);
++ if (err < 0)
++ return err;
++ pid = err = local_kernel_thread(undumptmpfs, (void*)pfd, SIGCHLD, 0);
++ if (err < 0)
++ goto out;
++ f = fget(pfd[1]);
++ sc_close(pfd[1]);
++ sc_close(pfd[0]);
++
++ ctx->file->f_pos = *pos + v.cpt_hdrlen;
++ end = *pos + v.cpt_next;
++ *pos += v.cpt_next;
++ do {
++ char buf[16];
++ mm_segment_t oldfs;
++
++ n = end - ctx->file->f_pos;
++ if (n > sizeof(buf))
++ n = sizeof(buf);
++
++ if (ctx->read(buf, n, ctx))
++ break;
++ oldfs = get_fs(); set_fs(KERNEL_DS);
++ f->f_op->write(f, buf, n, &f->f_pos);
++ set_fs(oldfs);
++ } while (ctx->file->f_pos < end);
++
++ fput(f);
++
++ clear_tsk_thread_flag(current,TIF_SIGPENDING);
++
++ if ((err = sc_waitx(pid, 0)) < 0)
++ eprintk_ctx("wait4: %d\n", err);
++
++ return 0;
++
++out:
++ if (pfd[1] >= 0)
++ sc_close(pfd[1]);
++ if (pfd[0] >= 0)
++ sc_close(pfd[0]);
++ return err;
++}
++
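++/* Re-mount one saved vfsmount (device, mountpoint, type, bind source) and unpack tmpfs contents where needed. */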
++int restore_one_vfsmount(struct cpt_vfsmount_image *mi, loff_t pos, struct cpt_context *ctx)
++{
++ int err;
++ loff_t endpos;
++
++ endpos = pos + mi->cpt_next;
++ pos += mi->cpt_hdrlen;
++
++ while (pos < endpos) {
++ char *mntdev;
++ char *mntpnt;
++ char *mnttype;
++ char *mntbind;
++
++ mntdev = __rst_get_name(&pos, ctx);
++ mntpnt = __rst_get_name(&pos, ctx);
++ mnttype = __rst_get_name(&pos, ctx);
++ mntbind = __rst_get_name(&pos, ctx);
++ err = -EINVAL;
++ if (mnttype && mntpnt) {
++ err = 0;
++ if (strcmp(mntpnt, "/"))
++ err = do_one_mount(mntpnt, mnttype, mntbind, mi->cpt_flags, ctx);
++ if (strcmp(mnttype, "tmpfs") == 0) {
++ rst_restore_tmpfs(&pos, ctx);
++ }
++ }
++ if (mntdev)
++ rst_put_name(mntdev, ctx);
++ if (mntpnt)
++ rst_put_name(mntpnt, ctx);
++ if (mnttype)
++ rst_put_name(mnttype, ctx);
++ if (mntbind)
++ rst_put_name(mntbind, ctx);
++ if (err)
++ return err;
++ }
++ return 0;
++}
++
++int restore_one_namespace(loff_t pos, loff_t endpos, struct cpt_context *ctx)
++{
++ int err;
++ struct cpt_vfsmount_image mi;
++
++ while (pos < endpos) {
++ err = rst_get_object(CPT_OBJ_VFSMOUNT, pos, &mi, ctx);
++ if (err)
++ return err;
++ err = restore_one_vfsmount(&mi, pos, ctx);
++ if (err)
++ return err;
++ pos += mi.cpt_next;
++ }
++ return 0;
++}
++
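++/* Restore the mounts of the single namespace recorded in the CPT_SECT_NAMESPACE section. */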
++int rst_root_namespace(struct cpt_context *ctx)
++{
++ int err;
++ loff_t sec = ctx->sections[CPT_SECT_NAMESPACE];
++ loff_t endsec;
++ struct cpt_section_hdr h;
++ struct cpt_object_hdr sbuf;
++ int done = 0;
++
++ if (sec == CPT_NULL)
++ return 0;
++
++ err = ctx->pread(&h, sizeof(h), ctx, sec);
++ if (err)
++ return err;
++ if (h.cpt_section != CPT_SECT_NAMESPACE || h.cpt_hdrlen < sizeof(h))
++ return -EINVAL;
++
++ endsec = sec + h.cpt_next;
++ sec += h.cpt_hdrlen;
++ while (sec < endsec) {
++ err = rst_get_object(CPT_OBJ_NAMESPACE, sec, &sbuf, ctx);
++ if (err)
++ return err;
++ if (done) {
++ eprintk_ctx("multiple namespaces are not supported\n");
++ break;
++ }
++ done++;
++ err = restore_one_namespace(sec+sbuf.cpt_hdrlen, sec+sbuf.cpt_next, ctx);
++ if (err)
++ return err;
++ sec += sbuf.cpt_next;
++ }
++
++ return 0;
++}
++
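++/* Restore files from CPT_SECT_FILES that no descriptor references; currently these are SysV shared memory segments. */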
++int rst_stray_files(struct cpt_context *ctx)
++{
++ int err = 0;
++ loff_t sec = ctx->sections[CPT_SECT_FILES];
++ loff_t endsec;
++ struct cpt_section_hdr h;
++
++ if (sec == CPT_NULL)
++ return 0;
++
++ err = ctx->pread(&h, sizeof(h), ctx, sec);
++ if (err)
++ return err;
++ if (h.cpt_section != CPT_SECT_FILES || h.cpt_hdrlen < sizeof(h))
++ return -EINVAL;
++
++ endsec = sec + h.cpt_next;
++ sec += h.cpt_hdrlen;
++ while (sec < endsec) {
++ struct cpt_object_hdr sbuf;
++ cpt_object_t *obj;
++
++ err = _rst_get_object(CPT_OBJ_FILE, sec, &sbuf, sizeof(sbuf), ctx);
++ if (err)
++ break;
++
++ obj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, sec, ctx);
++ if (!obj) {
++ struct file *file;
++
++ dprintk_ctx("stray file %Ld\n", sec);
++
++ file = rst_sysv_shm(sec, ctx);
++
++ if (IS_ERR(file)) {
++ eprintk_ctx("rst_stray_files: %ld\n", PTR_ERR(file));
++ return PTR_ERR(file);
++ } else {
++ fput(file);
++ }
++ }
++ sec += sbuf.cpt_next;
++ }
++
++ return err;
++}
+diff -upr linux-2.6.16.orig/kernel/cpt/rst_i386.S linux-2.6.16-026test015/kernel/cpt/rst_i386.S
+--- linux-2.6.16.orig/kernel/cpt/rst_i386.S 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/cpt/rst_i386.S 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,40 @@
++#define ASSEMBLY 1
++
++#include <linux/config.h>
++#include <linux/linkage.h>
++#include <asm/thread_info.h>
++#include <asm/errno.h>
++#include <asm/segment.h>
++#include <asm/page.h>
++#include <asm/smp.h>
++#include <asm/page.h>
++
++ .section .text
++ .align 4
++ .global ret_last_siginfo
++ret_last_siginfo:
++ call rlsi
++ movl %eax,%esp
++ ret
++
++ .align 8
++ .global ret_child_tid
++ret_child_tid:
++ push %esp
++ call rct
++ movl %eax,%esp
++ ret
++
++ .align 4
++ .global ret_from_rst
++ret_from_rst:
++ pushl %eax
++ jmp ret_from_fork+6
++
++ .align 4
++ .global pre_ret_from_fork
++pre_ret_from_fork:
++ pushl %eax
++ call schedule_tail
++ popl %eax
++ ret
+diff -upr linux-2.6.16.orig/kernel/cpt/rst_mm.c linux-2.6.16-026test015/kernel/cpt/rst_mm.c
+--- linux-2.6.16.orig/kernel/cpt/rst_mm.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/cpt/rst_mm.c 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,986 @@
++/*
++ *
++ * kernel/cpt/rst_mm.c
++ *
++ * Copyright (C) 2000-2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/config.h>
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/virtinfo.h>
++#include <linux/hugetlb.h>
++#include <linux/errno.h>
++#include <linux/errno.h>
++#include <linux/pagemap.h>
++#include <linux/mman.h>
++#include <linux/vmalloc.h>
++#include <linux/rmap.h>
++#include <linux/hash.h>
++#include <asm/pgalloc.h>
++#include <asm/tlb.h>
++#include <asm/tlbflush.h>
++#include <asm/pgtable.h>
++#include <asm/mmu.h>
++#include <asm/ldt.h>
++#include <asm/desc.h>
++#include <asm/mmu_context.h>
++#include <linux/swapops.h>
++#include <linux/cpt_image.h>
++
++#ifdef CONFIG_VE
++#include <ub/beancounter.h>
++#include <ub/ub_vmpages.h>
++#endif
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_files.h"
++#include "cpt_ubc.h"
++#include "cpt_mm.h"
++#include "cpt_kernel.h"
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++#include "cpt_pagein.h"
++#endif
++
++#include "cpt_syscalls.h"
++
++#define __PAGE_NX (1ULL<<63)
++
++static unsigned long make_prot(struct cpt_vma_image *vmai)
++{
++ unsigned long prot = 0;
++
++ if (vmai->cpt_flags&VM_READ)
++ prot |= PROT_READ;
++ if (vmai->cpt_flags&VM_WRITE)
++ prot |= PROT_WRITE;
++ if (vmai->cpt_flags&VM_EXEC)
++ prot |= PROT_EXEC;
++ if (vmai->cpt_flags&VM_GROWSDOWN)
++ prot |= PROT_GROWSDOWN;
++ if (vmai->cpt_flags&VM_GROWSUP)
++ prot |= PROT_GROWSUP;
++ return prot;
++}
++
++static unsigned long make_flags(struct cpt_vma_image *vmai)
++{
++ unsigned long flags = MAP_FIXED;
++
++ if (vmai->cpt_flags&(VM_SHARED|VM_MAYSHARE))
++ flags |= MAP_SHARED;
++ else
++ flags |= MAP_PRIVATE;
++
++ if (vmai->cpt_file == CPT_NULL)
++ flags |= MAP_ANONYMOUS;
++ if (vmai->cpt_flags&VM_GROWSDOWN)
++ flags |= MAP_GROWSDOWN;
++ if (vmai->cpt_flags&VM_DENYWRITE)
++ flags |= MAP_DENYWRITE;
++ if (vmai->cpt_flags&VM_EXECUTABLE)
++ flags |= MAP_EXECUTABLE;
++ if (!(vmai->cpt_flags&VM_ACCOUNT))
++ flags |= MAP_NORESERVE;
++ return flags;
++}
++
++
++#if !defined(CONFIG_X86_64) && LINUX_VERSION_CODE < KERNEL_VERSION(2,6,15)
++static int __alloc_ldt(mm_context_t *pc, int mincount)
++{
++ int oldsize, newsize, i;
++
++ if (mincount <= pc->size)
++ return 0;
++ /*
++ * LDT got larger - reallocate if necessary.
++ */
++ oldsize = pc->size;
++ mincount = (mincount+511)&(~511);
++ newsize = mincount*LDT_ENTRY_SIZE;
++ for (i = 0; i < newsize; i += PAGE_SIZE) {
++ int nr = i/PAGE_SIZE;
++ BUG_ON(i >= 64*1024);
++ if (!pc->ldt_pages[nr]) {
++ pc->ldt_pages[nr] = alloc_page(GFP_HIGHUSER|__GFP_UBC);
++ if (!pc->ldt_pages[nr])
++ return -ENOMEM;
++ clear_highpage(pc->ldt_pages[nr]);
++ }
++ }
++ pc->size = mincount;
++ return 0;
++}
++
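++/* Read the saved LDT contents into the current mm and load the LDT. */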
++static int do_rst_ldt(struct cpt_obj_bits *li, loff_t pos, struct cpt_context *ctx)
++{
++ struct mm_struct *mm = current->mm;
++ int i;
++ int err;
++ int size;
++
++ err = __alloc_ldt(&mm->context, li->cpt_size/LDT_ENTRY_SIZE);
++ if (err)
++ return err;
++
++ size = mm->context.size*LDT_ENTRY_SIZE;
++
++ for (i = 0; i < size; i += PAGE_SIZE) {
++ int nr = i / PAGE_SIZE, bytes;
++ char *kaddr = kmap(mm->context.ldt_pages[nr]);
++
++ bytes = size - i;
++ if (bytes > PAGE_SIZE)
++ bytes = PAGE_SIZE;
++ err = ctx->pread(kaddr, bytes, ctx, pos + li->cpt_hdrlen + i);
++ kunmap(mm->context.ldt_pages[nr]);
++ if (err)
++ return err;
++ }
++
++ load_LDT(&mm->context);
++ return 0;
++}
++
++#else
++
++static int do_rst_ldt(struct cpt_obj_bits *li, loff_t pos, struct cpt_context *ctx)
++{
++ struct mm_struct *mm = current->mm;
++ int oldsize = mm->context.size;
++ void *oldldt;
++ void *newldt;
++ int err;
++
++ if (li->cpt_size > PAGE_SIZE)
++ newldt = vmalloc(li->cpt_size);
++ else
++ newldt = kmalloc(li->cpt_size, GFP_KERNEL);
++
++ if (!newldt)
++ return -ENOMEM;
++
++ err = ctx->pread(newldt, li->cpt_size, ctx, pos + li->cpt_hdrlen);
++ if (err)
++ return err;
++
++ oldldt = mm->context.ldt;
++ mm->context.ldt = newldt;
++ mm->context.size = li->cpt_size/LDT_ENTRY_SIZE;
++
++ load_LDT(&mm->context);
++
++ if (oldsize) {
++ if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
++ vfree(oldldt);
++ else
++ kfree(oldldt);
++ }
++ return 0;
++}
++#endif
++
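++/* Rebuild the AIO ring bookkeeping and re-pin the ring pages already mapped into the restored address space. */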
++static int
++restore_aio_ring(struct kioctx *aio_ctx, struct cpt_aio_ctx_image *aimg)
++{
++ struct aio_ring_info *info = &aio_ctx->ring_info;
++ unsigned nr_events = aio_ctx->max_reqs;
++ unsigned long size;
++ int nr_pages;
++
++ /* We recalculate the parameters of the ring exactly like
++ * fs/aio.c does and then compare the calculated values
++ * with the ones stored in the dump. They must be the same. */
++
++ nr_events += 2;
++
++ size = sizeof(struct aio_ring);
++ size += sizeof(struct io_event) * nr_events;
++ nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT;
++
++ if (nr_pages != aimg->cpt_ring_pages)
++ return -EINVAL;
++
++ info->nr_pages = nr_pages;
++
++ nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event);
++
++ if (nr_events != aimg->cpt_nr)
++ return -EINVAL;
++
++ info->nr = 0;
++ info->ring_pages = info->internal_pages;
++ if (nr_pages > AIO_RING_PAGES) {
++ info->ring_pages = kmalloc(sizeof(struct page *) * nr_pages, GFP_KERNEL);
++ if (!info->ring_pages)
++ return -ENOMEM;
++ memset(info->ring_pages, 0, sizeof(struct page *) * nr_pages);
++ }
++
++ info->mmap_size = nr_pages * PAGE_SIZE;
++
++ /* This mess is not entirely my fault. Kernel aio.c does
++ * something odd: it mmap()s some pages and then pins them.
++ * I guess it is just some mud left over from a failed attempt
++ * to show the ring to user space. The result is odd. :-)
++ * Immediately after creation of the AIO context, the kernel
++ * shares those pages with the user, who can read and even
++ * write there. But after the first fork the pages are marked
++ * COW, with evident consequences. I remember making the same
++ * mistake in the first version of the mmapped packet socket;
++ * luckily that crap never reached mainstream.
++ *
++ * So, what are we going to do? I could simulate this odd
++ * behaviour exactly, but I am not insane yet. For now just
++ * take the pages from user space. Alternatively, we could
++ * keep a kernel copy in the AIO context image, which would
++ * be more correct.
++ *
++ * What is wrong now? If the pages are COWed, the ring is
++ * transferred incorrectly.
++ */
++ down_read(&current->mm->mmap_sem);
++ info->mmap_base = aimg->cpt_mmap_base;
++ info->nr_pages = get_user_pages(current, current->mm,
++ info->mmap_base, nr_pages,
++ 1, 0, info->ring_pages, NULL);
++ up_read(&current->mm->mmap_sem);
++
++ if (unlikely(info->nr_pages != nr_pages)) {
++ int i;
++
++ for (i=0; i<info->nr_pages; i++)
++ put_page(info->ring_pages[i]);
++ if (info->ring_pages && info->ring_pages != info->internal_pages)
++ kfree(info->ring_pages);
++ return -EFAULT;
++ }
++
++ aio_ctx->user_id = info->mmap_base;
++
++ info->nr = nr_events;
++ info->tail = aimg->cpt_tail;
++
++ return 0;
++}
++
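++/* Allocate a kioctx matching the saved AIO context image and link it into the ioctx list of the current mm. */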
++static int do_rst_aio(struct cpt_aio_ctx_image *aimg, loff_t pos, cpt_context_t *ctx)
++{
++ int err;
++ struct kioctx *aio_ctx;
++ extern spinlock_t aio_nr_lock;
++
++ aio_ctx = kmem_cache_alloc(kioctx_cachep, GFP_KERNEL);
++ if (!aio_ctx)
++ return -ENOMEM;
++
++ memset(aio_ctx, 0, sizeof(*aio_ctx));
++ aio_ctx->max_reqs = aimg->cpt_max_reqs;
++
++ if ((err = restore_aio_ring(aio_ctx, aimg)) < 0) {
++ kmem_cache_free(kioctx_cachep, aio_ctx);
++ eprintk_ctx("AIO %Ld restore_aio_ring: %d\n", pos, err);
++ return err;
++ }
++
++ aio_ctx->mm = current->mm;
++ atomic_inc(&aio_ctx->mm->mm_count);
++ atomic_set(&aio_ctx->users, 1);
++ spin_lock_init(&aio_ctx->ctx_lock);
++ spin_lock_init(&aio_ctx->ring_info.ring_lock);
++ init_waitqueue_head(&aio_ctx->wait);
++ INIT_LIST_HEAD(&aio_ctx->active_reqs);
++ INIT_LIST_HEAD(&aio_ctx->run_list);
++ INIT_WORK(&aio_ctx->wq, aio_kick_handler, ctx);
++
++ spin_lock(&aio_nr_lock);
++ aio_nr += aio_ctx->max_reqs;
++ spin_unlock(&aio_nr_lock);
++
++ write_lock(&aio_ctx->mm->ioctx_list_lock);
++ aio_ctx->next = aio_ctx->mm->ioctx_list;
++ aio_ctx->mm->ioctx_list = aio_ctx;
++ write_unlock(&aio_ctx->mm->ioctx_list_lock);
++
++ return 0;
++}
++
++struct anonvma_map
++{
++ struct hlist_node list;
++ struct anon_vma *avma;
++ __u64 id;
++};
++
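++/* Make sure the vma at cpt_start gets the anon_vma recorded under cpt_anonvmaid, creating and caching it on first use. */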
++static int verify_create_anonvma(struct mm_struct *mm,
++ struct cpt_vma_image *vmai,
++ cpt_context_t *ctx)
++{
++ struct anon_vma *avma = NULL;
++ struct anon_vma *new_avma;
++ struct vm_area_struct *vma;
++ int h;
++
++ if (!ctx->anonvmas) {
++ if (CPT_ANONVMA_HSIZE*sizeof(struct hlist_head) > PAGE_SIZE)
++ return -EINVAL;
++ if ((ctx->anonvmas = (void*)__get_free_page(GFP_KERNEL)) == NULL)
++ return -ENOMEM;
++ for (h = 0; h < CPT_ANONVMA_HSIZE; h++)
++ INIT_HLIST_HEAD(&ctx->anonvmas[h]);
++ } else {
++ struct anonvma_map *map;
++ struct hlist_node *elem;
++
++ h = hash_long((unsigned long)vmai->cpt_anonvmaid, CPT_ANONVMA_HBITS);
++ hlist_for_each_entry(map, elem, &ctx->anonvmas[h], list) {
++ if (map->id == vmai->cpt_anonvmaid) {
++ avma = map->avma;
++ break;
++ }
++ }
++ }
++
++ down_read(&mm->mmap_sem);
++ if ((vma = find_vma(mm, vmai->cpt_start)) == NULL) {
++ up_read(&mm->mmap_sem);
++ return -ESRCH;
++ }
++ if (vma->vm_start != vmai->cpt_start) {
++ up_read(&mm->mmap_sem);
++ eprintk_ctx("vma start mismatch\n");
++ return -EINVAL;
++ }
++ if (vma->vm_pgoff != vmai->cpt_pgoff) {
++ dprintk_ctx("vma pgoff mismatch, fixing\n");
++ if (vma->vm_file || (vma->vm_flags&(VM_SHARED|VM_MAYSHARE))) {
++ eprintk_ctx("cannot fixup vma pgoff\n");
++ up_read(&mm->mmap_sem);
++ return -EINVAL;
++ }
++ vma->vm_pgoff = vmai->cpt_pgoff;
++ }
++
++ if (!vma->anon_vma) {
++ if (avma) {
++ vma->anon_vma = avma;
++ anon_vma_link(vma);
++ } else {
++ int err;
++
++ err = anon_vma_prepare(vma);
++
++ if (err) {
++ up_read(&mm->mmap_sem);
++ return err;
++ }
++ }
++ } else {
++ /* Note, we _can_ arrive at a situation where two different
++ * anonvmaids point to one anon_vma; this happens e.g. when
++ * mmap() merged a new area into the previous one, so they
++ * share one anon_vma even though they did not on the
++ * original host.
++ *
++ * THIS IS OK. As far as I understand, we may merge all the
++ * anon_vmas, and rmap can scan the whole huge list of vmas
++ * searching for a page. It is just "suboptimal".
++ *
++ * A real disaster would happen if the vma already got an
++ * anon_vma with a different id. That is a very rare case:
++ * the kernel makes its best effort to merge anon_vmas when
++ * some attributes are different. In this case we fall back
++ * to copying memory.
++ */
++ if (avma && vma->anon_vma != avma) {
++ up_read(&mm->mmap_sem);
++ wprintk_ctx("anon_vma mismatch\n");
++ return 0;
++ }
++ }
++
++ new_avma = vma->anon_vma;
++ up_read(&mm->mmap_sem);
++
++ if (!avma) {
++ struct anonvma_map *map;
++
++ if (!new_avma)
++ return -EINVAL;
++
++ if ((map = kmalloc(sizeof(*map), GFP_KERNEL)) == NULL)
++ return -ENOMEM;
++
++ map->id = vmai->cpt_anonvmaid;
++ map->avma = new_avma;
++ h = hash_long((unsigned long)vmai->cpt_anonvmaid, CPT_ANONVMA_HBITS);
++ hlist_add_head(&map->list, &ctx->anonvmas[h]);
++ }
++ return 0;
++}
++
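++/* Fallback for mismatched anon_vmas: copy the page range from the source mm into the current one, page by page. */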
++static int copy_mm_pages(struct mm_struct *src, unsigned long start,
++ unsigned long end)
++{
++ int err;
++
++ for (; start < end; start += PAGE_SIZE) {
++ struct page *page;
++ struct page *spage;
++ void *maddr, *srcaddr;
++
++ err = get_user_pages(current, current->mm,
++ start, 1, 1, 1, &page, NULL);
++ if (err == 0)
++ err = -EFAULT;
++ if (err < 0)
++ return err;
++
++ err = get_user_pages(current, src,
++ start, 1, 0, 1, &spage, NULL);
++
++ if (err == 0)
++ err = -EFAULT;
++ if (err < 0) {
++ page_cache_release(page);
++ return err;
++ }
++
++ srcaddr = kmap(spage);
++ maddr = kmap(page);
++ memcpy(maddr, srcaddr, PAGE_SIZE);
++ set_page_dirty_lock(page);
++ kunmap(page);
++ kunmap(spage);
++ page_cache_release(page);
++ page_cache_release(spage);
++ }
++ return 0;
++}
++
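++/* Recreate one vma: mmap it at the saved address, attach its anon_vma and restore page contents and remappings. */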
++static int do_rst_vma(struct cpt_vma_image *vmai, loff_t vmapos, loff_t mmpos, struct cpt_context *ctx)
++{
++ int err = 0;
++ unsigned long addr;
++ struct mm_struct *mm = current->mm;
++ struct vm_area_struct *vma;
++ struct file *file = NULL;
++ unsigned long prot;
++ int checked = 0;
++
++ prot = make_prot(vmai);
++
++ if (vmai->cpt_file != CPT_NULL) {
++ if (vmai->cpt_type == CPT_VMA_TYPE_0) {
++ file = rst_file(vmai->cpt_file, -1, ctx);
++ if (IS_ERR(file)) {
++ eprintk_ctx("do_rst_vma: rst_file: %Ld\n", vmai->cpt_file);
++ return PTR_ERR(file);
++ }
++ } else if (vmai->cpt_type == CPT_VMA_TYPE_SHM) {
++ file = rst_sysv_shm(vmai->cpt_file, ctx);
++ if (IS_ERR(file))
++ return PTR_ERR(file);
++ }
++ }
++
++ down_write(&mm->mmap_sem);
++ addr = do_mmap_pgoff(file, vmai->cpt_start,
++ vmai->cpt_end-vmai->cpt_start,
++ prot, make_flags(vmai),
++ vmai->cpt_pgoff);
++
++ if (addr != vmai->cpt_start) {
++ up_write(&mm->mmap_sem);
++
++ err = -EINVAL;
++ if (IS_ERR((void*)addr))
++ err = addr;
++ goto out;
++ }
++
++ vma = find_vma(mm, vmai->cpt_start);
++ if (vma == NULL) {
++ up_write(&mm->mmap_sem);
++ eprintk_ctx("cannot find mmapped vma\n");
++ err = -ESRCH;
++ goto out;
++ }
++
++ /* do_mmap_pgoff() can merge the new area into the previous one (not into
++ * the next one: we mmap in order, so the rest of the mm is still unmapped).
++ * This can happen e.g. if flags are to be adjusted later, or if we had
++ * different anon_vmas on two adjacent regions. Split it by brute force. */
++ if (vma->vm_start != vmai->cpt_start) {
++ dprintk_ctx("vma %Ld merged, split\n", vmapos);
++ err = split_vma(mm, vma, (unsigned long)vmai->cpt_start, 0);
++ if (err) {
++ up_write(&mm->mmap_sem);
++ eprintk_ctx("cannot split vma\n");
++ goto out;
++ }
++ }
++ up_write(&mm->mmap_sem);
++
++ if (vmai->cpt_anonvma && vmai->cpt_anonvmaid) {
++ err = verify_create_anonvma(mm, vmai, ctx);
++ if (err) {
++ eprintk_ctx("cannot verify_create_anonvma %Ld\n", vmapos);
++ goto out;
++ }
++ }
++
++ if (vmai->cpt_next > vmai->cpt_hdrlen) {
++ loff_t offset = vmapos + vmai->cpt_hdrlen;
++
++ do {
++ union {
++ struct cpt_page_block pb;
++ struct cpt_remappage_block rpb;
++ struct cpt_copypage_block cpb;
++ struct cpt_lazypage_block lpb;
++ } u;
++ loff_t pos;
++
++ err = rst_get_object(-1, offset, &u, ctx);
++ if (err) {
++ eprintk_ctx("vma fix object: %d\n", err);
++ goto out;
++ }
++ if (u.rpb.cpt_object == CPT_OBJ_REMAPPAGES) {
++ err = sc_remap_file_pages(u.rpb.cpt_start,
++ u.rpb.cpt_end-u.rpb.cpt_start,
++ 0, u.rpb.cpt_pgoff, 0);
++ if (err < 0) {
++ eprintk_ctx("remap_file_pages: %d (%08x,%u,%u)\n", err,
++ (__u32)u.rpb.cpt_start, (__u32)(u.rpb.cpt_end-u.rpb.cpt_start),
++ (__u32)u.rpb.cpt_pgoff);
++ goto out;
++ }
++ offset += u.rpb.cpt_next;
++ continue;
++ } else if (u.cpb.cpt_object == CPT_OBJ_LAZYPAGES) {
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++ unsigned long addr = u.lpb.cpt_start;
++
++ down_read(&mm->mmap_sem);
++ if ((vma = find_vma(mm, u.lpb.cpt_start)) == NULL) {
++ up_read(&mm->mmap_sem);
++ eprintk_ctx("lost vm_area_struct\n");
++ err = -ESRCH;
++ goto out;
++ }
++ err = anon_vma_prepare(vma);
++ if (err) {
++ up_read(&mm->mmap_sem);
++ goto out;
++ }
++ while (addr < u.lpb.cpt_end) {
++ err = rst_pagein(vma, u.lpb.cpt_index + (addr-u.lpb.cpt_start)/PAGE_SIZE,
++ addr, ctx);
++ if (err)
++ break;
++ addr += PAGE_SIZE;
++ }
++ up_read(&mm->mmap_sem);
++#else
++ err = -EINVAL;
++#endif
++ if (err)
++ goto out;
++ offset += u.cpb.cpt_next;
++ continue;
++ } else if (u.cpb.cpt_object == CPT_OBJ_COPYPAGES) {
++ struct vm_area_struct *vma, *vma1;
++ struct mm_struct *src;
++ struct anon_vma *src_anon;
++ cpt_object_t *mobj;
++
++ if (!vmai->cpt_anonvmaid) {
++ err = -EINVAL;
++ eprintk_ctx("CPT_OBJ_COPYPAGES in !anonvma\n");
++ goto out;
++ }
++
++ mobj = lookup_cpt_obj_bypos(CPT_OBJ_MM, u.cpb.cpt_source, ctx);
++ if (!mobj) {
++ eprintk_ctx("lost mm_struct to clone pages from\n");
++ err = -ESRCH;
++ goto out;
++ }
++ src = mobj->o_obj;
++
++ down_read(&src->mmap_sem);
++ src_anon = NULL;
++ vma1 = find_vma(src, u.cpb.cpt_start);
++ if (vma1)
++ src_anon = vma1->anon_vma;
++ up_read(&src->mmap_sem);
++
++ if (!vma1) {
++ eprintk_ctx("lost src vm_area_struct\n");
++ err = -ESRCH;
++ goto out;
++ }
++
++ down_read(&mm->mmap_sem);
++ if ((vma = find_vma(mm, u.cpb.cpt_start)) == NULL) {
++ up_read(&mm->mmap_sem);
++ eprintk_ctx("lost vm_area_struct\n");
++ err = -ESRCH;
++ goto out;
++ }
++
++ if (!src_anon ||
++ !vma->anon_vma ||
++ vma->anon_vma != src_anon ||
++ vma->vm_start - vma1->vm_start !=
++ (vma->vm_pgoff - vma1->vm_pgoff) << PAGE_SHIFT) {
++ up_read(&mm->mmap_sem);
++ wprintk_ctx("anon_vma mismatch in vm_area_struct %Ld\n", vmapos);
++ err = copy_mm_pages(mobj->o_obj,
++ u.cpb.cpt_start,
++ u.cpb.cpt_end);
++ } else {
++ err = __copy_page_range(vma, vma1,
++ u.cpb.cpt_start,
++ u.cpb.cpt_end-u.cpb.cpt_start);
++ up_read(&mm->mmap_sem);
++ }
++ if (err) {
++ eprintk_ctx("clone_page_range: %d (%08x,%u,%ld)\n", err,
++ (__u32)u.cpb.cpt_start, (__u32)(u.cpb.cpt_end-u.cpb.cpt_start),
++ (long)u.cpb.cpt_source);
++ goto out;
++ }
++
++ offset += u.cpb.cpt_next;
++ continue;
++ }
++ if (u.pb.cpt_object != CPT_OBJ_PAGES) {
++ eprintk_ctx("unknown vma fix object %d\n", u.pb.cpt_object);
++ err = -EINVAL;
++ goto out;
++ }
++ pos = offset + sizeof(u.pb);
++ if (!(vmai->cpt_flags&VM_ACCOUNT) && !(prot&PROT_WRITE)) {
++			/* Presumably get_user_pages() messed things up here;
++			 * this happens e.g. when gdb inserts breakpoints.
++			 */
++ int i;
++ for (i=0; i<(u.pb.cpt_end-u.pb.cpt_start)/PAGE_SIZE; i++) {
++ struct page *page;
++ void *maddr;
++ err = get_user_pages(current, current->mm,
++ (unsigned long)u.pb.cpt_start + i*PAGE_SIZE,
++ 1, 1, 1, &page, NULL);
++ if (err == 0)
++ err = -EFAULT;
++ if (err < 0) {
++ eprintk_ctx("get_user_pages: %d\n", err);
++ goto out;
++ }
++ err = 0;
++ maddr = kmap(page);
++ if (u.pb.cpt_content == CPT_CONTENT_VOID) {
++ memset(maddr, 0, PAGE_SIZE);
++ } else if (u.pb.cpt_content == CPT_CONTENT_DATA) {
++ err = ctx->pread(maddr, PAGE_SIZE,
++ ctx, pos + i*PAGE_SIZE);
++ if (err) {
++ kunmap(page);
++ goto out;
++ }
++ } else {
++ err = -EINVAL;
++ kunmap(page);
++ goto out;
++ }
++ set_page_dirty_lock(page);
++ kunmap(page);
++ page_cache_release(page);
++ }
++ } else {
++ if (!(prot&PROT_WRITE))
++ sc_mprotect(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start, prot | PROT_WRITE);
++ if (u.pb.cpt_content == CPT_CONTENT_VOID) {
++ int i;
++ for (i=0; i<(u.pb.cpt_end-u.pb.cpt_start)/sizeof(unsigned long); i++) {
++ err = __put_user(0UL, ((unsigned long __user*)(unsigned long)u.pb.cpt_start) + i);
++ if (err) {
++ eprintk_ctx("__put_user 2 %d\n", err);
++ goto out;
++ }
++ }
++ } else if (u.pb.cpt_content == CPT_CONTENT_DATA) {
++ loff_t tpos = pos;
++ err = ctx->file->f_op->read(ctx->file, cpt_ptr_import(u.pb.cpt_start),
++ u.pb.cpt_end-u.pb.cpt_start,
++ &tpos);
++ if (err != u.pb.cpt_end-u.pb.cpt_start) {
++ if (err >= 0)
++ err = -EIO;
++ goto out;
++ }
++ } else {
++ err = -EINVAL;
++ goto out;
++ }
++ if (!(prot&PROT_WRITE))
++ sc_mprotect(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start, prot);
++ }
++ err = 0;
++ offset += u.pb.cpt_next;
++ } while (offset < vmapos + vmai->cpt_next);
++ }
++
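++	/* Post-mmap fixups: re-read the VMA and reconcile read hints,
++	 * VM_LOCKED and VM_ACCOUNT/VM_EXEC flags with the saved image,
++	 * warning about pgprot mismatches we cannot repair. */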
++check:
++ do {
++ struct vm_area_struct *vma;
++ down_read(&mm->mmap_sem);
++ vma = find_vma(mm, addr);
++ if (vma) {
++ if ((vma->vm_flags^vmai->cpt_flags)&VM_READHINTMASK) {
++ VM_ClearReadHint(vma);
++ vma->vm_flags |= vmai->cpt_flags&VM_READHINTMASK;
++ }
++ if ((vma->vm_flags^vmai->cpt_flags)&VM_LOCKED) {
++ dprintk_ctx("fixing up VM_LOCKED %Ld\n", vmapos);
++ up_read(&mm->mmap_sem);
++ if (vma->vm_flags&VM_LOCKED)
++ err = sc_munlock(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start);
++ else
++ err = sc_mlock(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start);
++ if (err)
++ goto out;
++ goto check;
++ }
++ if ((vma->vm_page_prot.pgprot^vmai->cpt_pgprot)&~__PAGE_NX)
++ wprintk_ctx("VMA %08lx@%ld pgprot mismatch %08Lx %08Lx\n", addr, (long)vmapos,
++ (__u64)vma->vm_page_prot.pgprot, (__u64)vmai->cpt_pgprot);
++#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
++ if (((vma->vm_page_prot.pgprot^vmai->cpt_pgprot)&__PAGE_NX) &&
++ (ctx->kernel_config_flags&CPT_KERNEL_CONFIG_PAE))
++ wprintk_ctx("VMA %08lx@%ld pgprot mismatch %08Lx %08Lx\n", addr, (long)vmapos,
++ (__u64)vma->vm_page_prot.pgprot, (__u64)vmai->cpt_pgprot);
++#endif
++ if (vma->vm_flags != vmai->cpt_flags) {
++ unsigned long x = vma->vm_flags ^ vmai->cpt_flags;
++ if (x & VM_EXEC) {
++				/* On i386 this is OK: the flag cannot be
++				 * reproduced via mmap/mprotect, and exec.c
++				 * clears VM_EXEC on the stack anyway. */
++ vma->vm_flags &= ~VM_EXEC;
++ } else if ((x & VM_ACCOUNT) && !checked) {
++ checked = 1;
++ if (!(prot&PROT_WRITE)) {
++ up_read(&mm->mmap_sem);
++ sc_mprotect(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start, prot | PROT_WRITE);
++ sc_mprotect(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start, prot);
++ goto check;
++ }
++ wprintk_ctx("VMA %08lx@%ld flag mismatch %08x %08x\n", addr, (long)vmapos,
++ (__u32)vma->vm_flags, (__u32)vmai->cpt_flags);
++ } else {
++ wprintk_ctx("VMA %08lx@%ld flag mismatch %08x %08x\n", addr, (long)vmapos,
++ (__u32)vma->vm_flags, (__u32)vmai->cpt_flags);
++ }
++ }
++ } else {
++ wprintk_ctx("no VMA for %08lx@%ld\n", addr, (long)vmapos);
++ }
++ up_read(&mm->mmap_sem);
++ } while (0);
++
++out:
++ if (file)
++ fput(file);
++ return err;
++}
++
++static int do_rst_mm(struct cpt_mm_image *vmi, loff_t pos, struct cpt_context *ctx)
++{
++ int err = 0;
++ unsigned int def_flags;
++ struct mm_struct *mm = current->mm;
++
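++	/* Wipe the current address space completely and restore the mm
++	 * fields from the image; VMAs, LDT and AIO contexts follow below. */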
++ down_write(&mm->mmap_sem);
++ do_munmap(mm, 0, TASK_SIZE);
++
++ mm->start_code = vmi->cpt_start_code;
++ mm->end_code = vmi->cpt_end_code;
++ mm->start_data = vmi->cpt_start_data;
++ mm->end_data = vmi->cpt_end_data;
++ mm->start_brk = vmi->cpt_start_brk;
++ mm->brk = vmi->cpt_brk;
++ mm->start_stack = vmi->cpt_start_stack;
++ mm->arg_start = vmi->cpt_start_arg;
++ mm->arg_end = vmi->cpt_end_arg;
++ mm->env_start = vmi->cpt_start_env;
++ mm->env_end = vmi->cpt_end_env;
++ mm->def_flags = 0;
++ def_flags = vmi->cpt_def_flags;
++
++ mm->dumpable = (vmi->cpt_dumpable != 0);
++ mm->vps_dumpable = (vmi->cpt_vps_dumpable != 0);
++
++#if 0 /* def CONFIG_HUGETLB_PAGE*/
++/* NB: ? */
++ int used_hugetlb;
++#endif
++ up_write(&mm->mmap_sem);
++
++ if (vmi->cpt_next > vmi->cpt_hdrlen) {
++ loff_t offset = pos + vmi->cpt_hdrlen;
++ do {
++ union {
++ struct cpt_vma_image vmai;
++ struct cpt_aio_ctx_image aioi;
++ struct cpt_obj_bits bits;
++ } u;
++ err = rst_get_object(-1, offset, &u, ctx);
++ if (err)
++ goto out;
++ if (u.vmai.cpt_object == CPT_OBJ_VMA) {
++ err = do_rst_vma(&u.vmai, offset, pos, ctx);
++ if (err)
++ goto out;
++ } else if (u.bits.cpt_object == CPT_OBJ_BITS &&
++ u.bits.cpt_content == CPT_CONTENT_MM_CONTEXT) {
++ err = do_rst_ldt(&u.bits, offset, ctx);
++ if (err)
++ goto out;
++ } else if (u.aioi.cpt_object == CPT_OBJ_AIO_CONTEXT) {
++ err = do_rst_aio(&u.aioi, offset, ctx);
++ if (err)
++ goto out;
++ } else {
++ eprintk_ctx("unknown object %u in mm image\n", u.vmai.cpt_object);
++ err = -EINVAL;
++ goto out;
++ }
++ offset += u.vmai.cpt_next;
++ } while (offset < pos + vmi->cpt_next);
++ }
++
++ down_write(&mm->mmap_sem);
++ mm->def_flags = def_flags;
++ up_write(&mm->mmap_sem);
++
++
++out:
++ return err;
++}
++
++extern void exit_mm(struct task_struct * tsk);
++
++int rst_mm_complete(struct cpt_task_image *ti, struct cpt_context *ctx)
++{
++ int err = 0;
++ cpt_object_t *mobj;
++ void *tmp = (void*)__get_free_page(GFP_KERNEL);
++ struct cpt_mm_image *vmi = (struct cpt_mm_image *)tmp;
++
++ if (!tmp)
++ return -ENOMEM;
++
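++	/* CPT_NULL means the task had no mm at checkpoint time (e.g. a
++	 * kernel thread), so drop whatever mm we are running with now. */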
++ if (ti->cpt_mm == CPT_NULL) {
++ if (current->mm)
++ exit_mm(current);
++ goto out;
++ }
++
++ mobj = lookup_cpt_obj_bypos(CPT_OBJ_MM, ti->cpt_mm, ctx);
++ if (mobj) {
++ if (current->mm != mobj->o_obj) BUG();
++ goto out;
++ }
++
++ if (current->mm == NULL) {
++ struct mm_struct *mm = mm_alloc();
++ if (mm == NULL) {
++ err = -ENOMEM;
++ goto out;
++ }
++ err = init_new_context(current, mm);
++ if (err) {
++ mmdrop(mm);
++ goto out;
++ }
++ current->mm = mm;
++ }
++
++ if ((err = rst_get_object(CPT_OBJ_MM, ti->cpt_mm, vmi, ctx)) != 0)
++ goto out;
++ if ((err = do_rst_mm(vmi, ti->cpt_mm, ctx)) != 0) {
++ eprintk_ctx("do_rst_mm %Ld\n", ti->cpt_mm);
++ goto out;
++ }
++ err = -ENOMEM;
++ mobj = cpt_object_add(CPT_OBJ_MM, current->mm, ctx);
++ if (mobj != NULL) {
++ err = 0;
++ cpt_obj_setpos(mobj, ti->cpt_mm, ctx);
++ }
++
++out:
++ if (tmp)
++ free_page((unsigned long)tmp);
++ return err;
++}
++
++/* This is part of mm setup, done in the parent context. Mostly, it is the
++ * place where we graft the mm of another process onto the child.
++ */
++
++int rst_mm_basic(cpt_object_t *obj, struct cpt_task_image *ti, struct cpt_context *ctx)
++{
++ task_t *tsk = obj->o_obj;
++ cpt_object_t *mobj;
++
++ /* Task without mm. Just get rid of this. */
++ if (ti->cpt_mm == CPT_NULL) {
++ if (tsk->mm) {
++ mmput(tsk->mm);
++ tsk->mm = NULL;
++ }
++ return 0;
++ }
++
++ mobj = lookup_cpt_obj_bypos(CPT_OBJ_MM, ti->cpt_mm, ctx);
++ if (mobj) {
++ struct mm_struct *newmm = mobj->o_obj;
++ /* Good, the MM is already created. */
++ if (newmm == tsk->mm) {
++ /* Already done by clone(). */
++ return 0;
++ }
++ mmput(tsk->mm);
++ atomic_inc(&newmm->mm_users);
++ tsk->mm = newmm;
++ tsk->active_mm = newmm;
++ }
++ return 0;
++}
++
++/* We use CLONE_VM when the mm of the child is going to be shared with the
++ * parent. Otherwise the mm is copied.
++ */
++
++__u32 rst_mm_flag(struct cpt_task_image *ti, struct cpt_context *ctx)
++{
++ if (ti->cpt_mm == CPT_NULL ||
++ lookup_cpt_obj_bypos(CPT_OBJ_MM, ti->cpt_mm, ctx))
++ return CLONE_VM;
++ return 0;
++}
+diff -upr linux-2.6.16.orig/kernel/cpt/rst_net.c linux-2.6.16-026test015/kernel/cpt/rst_net.c
+--- linux-2.6.16.orig/kernel/cpt/rst_net.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/cpt/rst_net.c 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,481 @@
++/*
++ *
++ * kernel/cpt/rst_net.c
++ *
++ * Copyright (C) 2000-2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/config.h>
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/errno.h>
++#include <linux/fs.h>
++#include <linux/socket.h>
++#include <linux/netdevice.h>
++#include <linux/inetdevice.h>
++#include <linux/rtnetlink.h>
++#include <linux/ve.h>
++#include <linux/ve_proto.h>
++#include <net/route.h>
++#include <net/ip_fib.h>
++#include <net/addrconf.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_kernel.h"
++#include "cpt_net.h"
++
++#include "cpt_syscalls.h"
++
++extern struct in_ifaddr *inet_alloc_ifa(void);
++extern int inet_insert_ifa(struct in_ifaddr *ifa);
++
++int rst_restore_ifaddr(struct cpt_context *ctx)
++{
++ int err;
++ loff_t sec = ctx->sections[CPT_SECT_NET_IFADDR];
++ loff_t endsec;
++ struct cpt_section_hdr h;
++ struct cpt_ifaddr_image di;
++ struct net_device *dev;
++
++ if (sec == CPT_NULL)
++ return 0;
++
++ err = ctx->pread(&h, sizeof(h), ctx, sec);
++ if (err)
++ return err;
++ if (h.cpt_section != CPT_SECT_NET_IFADDR || h.cpt_hdrlen < sizeof(h))
++ return -EINVAL;
++
++ endsec = sec + h.cpt_next;
++ sec += h.cpt_hdrlen;
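++		/* Translate the checkpointed interface index to the index of
++		 * the freshly created lo/venet0 device, as recorded by
++		 * rst_restore_netdev(). */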
++ while (sec < endsec) {
++ int cindex = -1;
++ int err;
++ err = rst_get_object(CPT_OBJ_NET_IFADDR, sec, &di, ctx);
++ if (err)
++ return err;
++ if (di.cpt_index == ctx->lo_index_old)
++ cindex = ctx->lo_index;
++ else if (di.cpt_index == ctx->venet_index_old)
++ cindex = ctx->venet_index;
++ if (cindex <= 0)
++ eprintk_ctx("unknown ifaddr for %d\n", di.cpt_index);
++ rtnl_lock();
++ dev = __dev_get_by_index(cindex);
++ if (dev && di.cpt_family == AF_INET) {
++ struct in_device *in_dev;
++ struct in_ifaddr *ifa;
++ if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
++ in_dev = inetdev_init(dev);
++ ifa = inet_alloc_ifa();
++ if (ifa) {
++ ifa->ifa_local = di.cpt_address[0];
++ ifa->ifa_address = di.cpt_peer[0];
++ ifa->ifa_broadcast = di.cpt_broadcast[0];
++ ifa->ifa_prefixlen = di.cpt_masklen;
++ ifa->ifa_mask = inet_make_mask(ifa->ifa_prefixlen);
++ ifa->ifa_flags = di.cpt_flags;
++ ifa->ifa_scope = di.cpt_scope;
++ memcpy(ifa->ifa_label, di.cpt_label, IFNAMSIZ);
++ in_dev_hold(in_dev);
++ ifa->ifa_dev = in_dev;
++ err = inet_insert_ifa(ifa);
++ if (err && err != -EEXIST) {
++ rtnl_unlock();
++ eprintk_ctx("add ifaddr err %d for %d %s\n", err, di.cpt_index, di.cpt_label);
++ return err;
++ }
++ }
++#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
++ } else if (dev && di.cpt_family == AF_INET6) {
++ err = inet6_addr_add(dev->ifindex,
++ (struct in6_addr *)di.cpt_address,
++ di.cpt_masklen);
++ if (err && err != -EEXIST) {
++ rtnl_unlock();
++ eprintk_ctx("add ifaddr err %d for %d %s\n", err, di.cpt_index, di.cpt_label);
++ return err;
++ }
++#endif
++ } else {
++ rtnl_unlock();
++ eprintk_ctx("unknown ifaddr 2 for %d\n", di.cpt_index);
++ return -EINVAL;
++ }
++ rtnl_unlock();
++ sec += di.cpt_next;
++ }
++ return 0;
++}
++
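++/* Rewrite interface indexes inside a saved rtnetlink message.
++ * Returns 0 for an ordinary route, 1 for a kernel-generated route
++ * (EEXIST from the kernel is then ignored), 2 if the route must be
++ * skipped entirely, or a negative error. */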
++static int rewrite_rtmsg(struct nlmsghdr *nlh, struct cpt_context *ctx)
++{
++ int min_len = NLMSG_LENGTH(sizeof(struct rtmsg));
++ struct rtmsg *rtm = NLMSG_DATA(nlh);
++ int idx = -1;
++ __u32 prefix0 = 0;
++
++ if (nlh->nlmsg_len > min_len) {
++ int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len);
++ struct rtattr *rta = (void*)nlh + NLMSG_ALIGN(min_len);
++
++ while (RTA_OK(rta, attrlen)) {
++ if (rta->rta_type == RTA_OIF) {
++ idx = *(int*)RTA_DATA(rta);
++ if (idx == ctx->lo_index_old)
++ idx = ctx->lo_index;
++ else if (idx == ctx->venet_index_old)
++ idx = ctx->venet_index;
++ else {
++ eprintk_ctx("unknown iface %d\n", idx);
++ return -ENODEV;
++ }
++ *(int*)RTA_DATA(rta) = idx;
++ } else if (rta->rta_type == RTA_DST) {
++ prefix0 = *(__u32*)RTA_DATA(rta);
++ }
++ rta = RTA_NEXT(rta, attrlen);
++ }
++ }
++#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
++ if (rtm->rtm_family == AF_INET6) {
++ if (rtm->rtm_type == RTN_LOCAL)
++ return 2;
++ if (rtm->rtm_flags & RTM_F_CLONED)
++ return 2;
++ if (rtm->rtm_protocol == RTPROT_UNSPEC ||
++ rtm->rtm_protocol == RTPROT_RA ||
++ rtm->rtm_protocol == RTPROT_REDIRECT ||
++ rtm->rtm_protocol == RTPROT_KERNEL)
++ return 2;
++ if (rtm->rtm_protocol == RTPROT_BOOT &&
++ ((rtm->rtm_dst_len == 8 && prefix0 == htonl(0xFF000000)) ||
++ (rtm->rtm_dst_len == 64 && prefix0 == htonl(0xFE800000))))
++ return 2;
++ }
++#endif
++ return rtm->rtm_protocol == RTPROT_KERNEL;
++}
++
++int rst_restore_route(struct cpt_context *ctx)
++{
++ int err;
++ struct socket *sock;
++ struct msghdr msg;
++ struct iovec iov;
++ struct sockaddr_nl nladdr;
++ mm_segment_t oldfs;
++ loff_t sec = ctx->sections[CPT_SECT_NET_ROUTE];
++ loff_t endsec;
++ struct cpt_section_hdr h;
++ struct cpt_object_hdr v;
++ char *pg;
++
++ if (sec == CPT_NULL)
++ return 0;
++
++ err = ctx->pread(&h, sizeof(h), ctx, sec);
++ if (err)
++ return err;
++ if (h.cpt_section != CPT_SECT_NET_ROUTE || h.cpt_hdrlen < sizeof(h))
++ return -EINVAL;
++
++ if (h.cpt_hdrlen >= h.cpt_next)
++ return 0;
++
++ sec += h.cpt_hdrlen;
++ err = rst_get_object(CPT_OBJ_NET_ROUTE, sec, &v, ctx);
++ if (err < 0)
++ return err;
++
++ err = sock_create_kern(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE, &sock);
++ if (err)
++ return err;
++
++ pg = (char*)__get_free_page(GFP_KERNEL);
++ if (pg == NULL) {
++ err = -ENOMEM;
++ goto out_sock;
++ }
++
++ memset(&nladdr, 0, sizeof(nladdr));
++ nladdr.nl_family = AF_NETLINK;
++
++ endsec = sec + v.cpt_next;
++ sec += v.cpt_hdrlen;
++
++ while (sec < endsec) {
++ struct nlmsghdr *n;
++ struct nlmsghdr nh;
++ int kernel_flag;
++
++ err = ctx->pread(&nh, sizeof(nh), ctx, sec);
++ if (err)
++ goto out_sock_pg;
++ if (nh.nlmsg_len > PAGE_SIZE) {
++ err = -EINVAL;
++ goto out_sock_pg;
++ }
++ err = ctx->pread(pg, nh.nlmsg_len, ctx, sec);
++ if (err)
++ goto out_sock_pg;
++
++ n = (struct nlmsghdr*)pg;
++ n->nlmsg_flags = NLM_F_REQUEST|NLM_F_APPEND|NLM_F_CREATE;
++
++ err = rewrite_rtmsg(n, ctx);
++ if (err < 0)
++ goto out_sock_pg;
++ kernel_flag = err;
++
++ if (kernel_flag == 2)
++ goto do_next;
++
++ iov.iov_base=n;
++ iov.iov_len=nh.nlmsg_len;
++ msg.msg_name=&nladdr;
++ msg.msg_namelen=sizeof(nladdr);
++ msg.msg_iov=&iov;
++ msg.msg_iovlen=1;
++ msg.msg_control=NULL;
++ msg.msg_controllen=0;
++ msg.msg_flags=MSG_DONTWAIT;
++
++ oldfs = get_fs(); set_fs(KERNEL_DS);
++ err = sock_sendmsg(sock, &msg, nh.nlmsg_len);
++ set_fs(oldfs);
++
++ if (err < 0)
++ goto out_sock_pg;
++ err = 0;
++
++ iov.iov_base=pg;
++ iov.iov_len=PAGE_SIZE;
++
++ oldfs = get_fs(); set_fs(KERNEL_DS);
++ err = sock_recvmsg(sock, &msg, PAGE_SIZE, MSG_DONTWAIT);
++ set_fs(oldfs);
++ if (err != -EAGAIN) {
++ if (err == NLMSG_LENGTH(sizeof(struct nlmsgerr)) &&
++ n->nlmsg_type == NLMSG_ERROR) {
++ struct nlmsgerr *e = NLMSG_DATA(n);
++ if (e->error != -EEXIST || !kernel_flag)
++ eprintk_ctx("NLMERR: %d\n", e->error);
++ } else {
++ eprintk_ctx("Res: %d %d\n", err, n->nlmsg_type);
++ }
++ }
++do_next:
++ err = 0;
++ sec += NLMSG_ALIGN(nh.nlmsg_len);
++ }
++
++out_sock_pg:
++ free_page((unsigned long)pg);
++out_sock:
++ sock_release(sock);
++ return err;
++}
++
++int rst_resume_network(struct cpt_context *ctx)
++{
++ struct ve_struct *env;
++
++ env = get_ve_by_id(ctx->ve_id);
++ if (!env)
++ return -ESRCH;
++ env->disable_net = 0;
++ put_ve(env);
++ return 0;
++}
++
++int rst_restore_netdev(struct cpt_context *ctx)
++{
++ int err;
++ loff_t sec = ctx->sections[CPT_SECT_NET_DEVICE];
++ loff_t endsec;
++ struct cpt_section_hdr h;
++ struct cpt_netdev_image di;
++ struct net_device *dev;
++
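++	/* Networking in the VE stays disabled for the whole restore;
++	 * rst_resume_network() re-enables it afterwards. */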
++ get_exec_env()->disable_net = 1;
++
++ dev = __dev_get_by_name("lo");
++ if (!dev) {
++ eprintk_ctx("cannot find loopback netdevice\n");
++ return -EINVAL;
++ }
++ ctx->lo_index = dev->ifindex;
++ ctx->lo_index_old = -1;
++ dev = __dev_get_by_name("venet0");
++ if (!dev) {
++ eprintk_ctx("cannot find venet0 netdevice\n");
++ return -EINVAL;
++ }
++ ctx->venet_index = dev->ifindex;
++ ctx->venet_index_old = -1;
++
++ if (sec == CPT_NULL)
++ return 0;
++
++ err = ctx->pread(&h, sizeof(h), ctx, sec);
++ if (err)
++ return err;
++ if (h.cpt_section != CPT_SECT_NET_DEVICE || h.cpt_hdrlen < sizeof(h))
++ return -EINVAL;
++
++ endsec = sec + h.cpt_next;
++ sec += h.cpt_hdrlen;
++ while (sec < endsec) {
++ int err;
++ err = rst_get_object(CPT_OBJ_NET_DEVICE, sec, &di, ctx);
++ if (err)
++ return err;
++ if (strcmp(di.cpt_name, "lo") == 0) {
++ ctx->lo_index_old = di.cpt_index;
++ } else if (strcmp(di.cpt_name, "venet0") == 0) {
++ ctx->venet_index_old = di.cpt_index;
++ } else {
++ eprintk_ctx("unknown interface %s\n", di.cpt_name);
++ }
++ dev = __dev_get_by_name(di.cpt_name);
++ if (dev) {
++ if (di.cpt_flags^dev->flags) {
++ rtnl_lock();
++ err = dev_change_flags(dev, di.cpt_flags);
++ rtnl_unlock();
++ if (err)
++ eprintk_ctx("dev_change_flags err: %d\n", err);
++ }
++ } else {
++ eprintk_ctx("unknown interface 2 %s\n", di.cpt_name);
++ }
++ sec += di.cpt_next;
++ }
++ return 0;
++}
++
++static int dumpfn(void *arg)
++{
++ int i;
++ int *pfd = arg;
++ char *argv[] = { "iptables-restore", "-c", NULL };
++
++ if (pfd[0] != 0)
++ sc_dup2(pfd[0], 0);
++
++ for (i=1; i<current->files->fdt->max_fds; i++)
++ sc_close(i);
++
++ module_put(THIS_MODULE);
++
++ set_fs(KERNEL_DS);
++ i = sc_execve("/sbin/iptables-restore", argv, NULL);
++ eprintk("failed to exec /sbin/iptables-restore: %d\n", i);
++ return -1;
++}
++
++static int rst_restore_iptables(struct cpt_context * ctx)
++{
++ int err;
++ int pfd[2];
++ struct file *f;
++ struct cpt_object_hdr v;
++ int n;
++ struct cpt_section_hdr h;
++ loff_t sec = ctx->sections[CPT_SECT_NET_IPTABLES];
++ loff_t end;
++ int pid;
++
++ if (sec == CPT_NULL)
++ return 0;
++
++ err = ctx->pread(&h, sizeof(h), ctx, sec);
++ if (err)
++ return err;
++ if (h.cpt_section != CPT_SECT_NET_IPTABLES || h.cpt_hdrlen < sizeof(h))
++ return -EINVAL;
++
++ if (h.cpt_hdrlen == h.cpt_next)
++ return 0;
++ if (h.cpt_hdrlen > h.cpt_next)
++ return -EINVAL;
++ sec += h.cpt_hdrlen;
++ err = rst_get_object(CPT_OBJ_NAME, sec, &v, ctx);
++ if (err < 0)
++ return err;
++
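++	/* Stream the saved rules through a pipe into a kernel-spawned
++	 * "/sbin/iptables-restore -c" process (see dumpfn() above). */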
++ err = sc_pipe(pfd);
++ if (err < 0)
++ return err;
++ pid = err = local_kernel_thread(dumpfn, (void*)pfd, SIGCHLD, 0);
++ if (err < 0)
++ goto out;
++ f = fget(pfd[1]);
++ sc_close(pfd[1]);
++ sc_close(pfd[0]);
++
++ ctx->file->f_pos = sec + v.cpt_hdrlen;
++ end = sec + v.cpt_next;
++ do {
++ char *p;
++ char buf[16];
++ mm_segment_t oldfs;
++
++ n = end - ctx->file->f_pos;
++ if (n > sizeof(buf))
++ n = sizeof(buf);
++
++ if (ctx->read(buf, n, ctx))
++ break;
++ if ((p = memchr(buf, 0, n)) != NULL)
++ n = p - buf;
++ oldfs = get_fs(); set_fs(KERNEL_DS);
++ f->f_op->write(f, buf, n, &f->f_pos);
++ set_fs(oldfs);
++ } while (ctx->file->f_pos < end);
++
++ fput(f);
++
++ clear_tsk_thread_flag(current,TIF_SIGPENDING);
++
++ if ((err = sc_waitx(pid, 0)) < 0)
++ eprintk_ctx("wait4: %d\n", err);
++
++ return 0;
++
++out:
++ if (pfd[1] >= 0)
++ sc_close(pfd[1]);
++ if (pfd[0] >= 0)
++ sc_close(pfd[0]);
++ return err;
++}
++
++int rst_restore_net(struct cpt_context *ctx)
++{
++ int err;
++
++ err = rst_restore_netdev(ctx);
++ if (!err)
++ err = rst_restore_ifaddr(ctx);
++ if (!err)
++ err = rst_restore_route(ctx);
++ if (!err)
++ err = rst_restore_iptables(ctx);
++ if (!err)
++ err = rst_restore_ip_conntrack(ctx);
++ return err;
++}
+diff -upr linux-2.6.16.orig/kernel/cpt/rst_proc.c linux-2.6.16-026test015/kernel/cpt/rst_proc.c
+--- linux-2.6.16.orig/kernel/cpt/rst_proc.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/cpt/rst_proc.c 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,604 @@
++/*
++ *
++ * kernel/cpt/rst_proc.c
++ *
++ * Copyright (C) 2000-2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/config.h>
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/errno.h>
++#include <linux/mm.h>
++#include <linux/proc_fs.h>
++#include <linux/smp_lock.h>
++#include <asm/uaccess.h>
++#include <linux/cpt_ioctl.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_dump.h"
++#include "cpt_files.h"
++#include "cpt_mm.h"
++#include "cpt_kernel.h"
++
++MODULE_AUTHOR("Alexey Kuznetsov <alexey@sw.ru>");
++MODULE_LICENSE("GPL");
++
++/* List of contexts and lock protecting the list */
++static struct list_head cpt_context_list;
++static spinlock_t cpt_context_lock;
++
++static int proc_read(char *buffer, char **start, off_t offset,
++ int length, int *eof, void *data)
++{
++ off_t pos = 0;
++ off_t begin = 0;
++ int len = 0;
++ cpt_context_t *ctx;
++
++ len += sprintf(buffer, "Ctx Id VE State\n");
++
++ spin_lock(&cpt_context_lock);
++
++ list_for_each_entry(ctx, &cpt_context_list, ctx_list) {
++ len += sprintf(buffer+len,"%p %08x %-8u %d",
++ ctx,
++ ctx->contextid,
++ ctx->ve_id,
++ ctx->ctx_state
++ );
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++ len += pagein_info_printf(buffer+len, ctx);
++#endif
++
++ buffer[len++] = '\n';
++
++ pos = begin+len;
++ if (pos < offset) {
++ len = 0;
++ begin = pos;
++ }
++ if (pos > offset+length)
++ goto done;
++ }
++ *eof = 1;
++
++done:
++ spin_unlock(&cpt_context_lock);
++ *start = buffer + (offset - begin);
++ len -= (offset - begin);
++ if(len > length)
++ len = length;
++ if(len < 0)
++ len = 0;
++ return len;
++}
++
++void rst_context_release(cpt_context_t *ctx)
++{
++ list_del(&ctx->ctx_list);
++ spin_unlock(&cpt_context_lock);
++
++ if (ctx->ctx_state > 0)
++ rst_resume(ctx);
++ ctx->ctx_state = CPT_CTX_ERROR;
++
++ rst_close_dumpfile(ctx);
++
++ if (ctx->anonvmas) {
++ int h;
++ for (h = 0; h < CPT_ANONVMA_HSIZE; h++) {
++ while (!hlist_empty(&ctx->anonvmas[h])) {
++ struct hlist_node *elem = ctx->anonvmas[h].first;
++ hlist_del(elem);
++ kfree(elem);
++ }
++ }
++ free_page((unsigned long)ctx->anonvmas);
++ }
++ cpt_flush_error(ctx);
++ if (ctx->errorfile) {
++ fput(ctx->errorfile);
++ ctx->errorfile = NULL;
++ }
++ if (ctx->error_msg) {
++ free_page((unsigned long)ctx->error_msg);
++ ctx->error_msg = NULL;
++ }
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++ if (ctx->pagein_file_out)
++ fput(ctx->pagein_file_out);
++ if (ctx->pagein_file_in)
++ fput(ctx->pagein_file_in);
++ if (ctx->pgin_task)
++ put_task_struct(ctx->pgin_task);
++#endif
++ if (ctx->filejob_queue)
++ rst_flush_filejobs(ctx);
++ if (ctx->objcount)
++ eprintk_ctx("%d objects leaked\n", ctx->objcount);
++ kfree(ctx);
++
++ spin_lock(&cpt_context_lock);
++}
++
++static void __cpt_context_put(cpt_context_t *ctx)
++{
++ if (!--ctx->refcount)
++ rst_context_release(ctx);
++}
++
++static void cpt_context_put(cpt_context_t *ctx)
++{
++ spin_lock(&cpt_context_lock);
++ __cpt_context_put(ctx);
++ spin_unlock(&cpt_context_lock);
++}
++
++cpt_context_t * rst_context_open(void)
++{
++ cpt_context_t *ctx;
++
++ if ((ctx = kmalloc(sizeof(*ctx), GFP_KERNEL)) != NULL) {
++ rst_context_init(ctx);
++ spin_lock(&cpt_context_lock);
++ list_add_tail(&ctx->ctx_list, &cpt_context_list);
++ spin_unlock(&cpt_context_lock);
++ ctx->error_msg = (char*)__get_free_page(GFP_KERNEL);
++ if (ctx->error_msg != NULL)
++ ctx->error_msg[0] = 0;
++ }
++ return ctx;
++}
++
++void rst_report_error(int err, cpt_context_t *ctx)
++{
++ if (ctx->statusfile) {
++ mm_segment_t oldfs;
++ int status = 7 /* VZ_ENVCREATE_ERROR */;
++
++ oldfs = get_fs(); set_fs(KERNEL_DS);
++ if (ctx->statusfile->f_op && ctx->statusfile->f_op->write)
++ ctx->statusfile->f_op->write(ctx->statusfile, (char*)&status, sizeof(status), &ctx->statusfile->f_pos);
++ set_fs(oldfs);
++ fput(ctx->statusfile);
++ ctx->statusfile = NULL;
++ }
++}
++
++
++static cpt_context_t * cpt_context_lookup(unsigned int ctxid)
++{
++ cpt_context_t *ctx;
++
++ spin_lock(&cpt_context_lock);
++ list_for_each_entry(ctx, &cpt_context_list, ctx_list) {
++ if (ctx->contextid == ctxid) {
++ ctx->refcount++;
++ spin_unlock(&cpt_context_lock);
++ return ctx;
++ }
++ }
++ spin_unlock(&cpt_context_lock);
++ return NULL;
++}
++
++static int rst_ioctl(struct inode * inode, struct file * file, unsigned int cmd, unsigned long arg)
++{
++ int err = 0;
++ cpt_context_t *ctx;
++ struct file *dfile = NULL;
++
++ unlock_kernel();
++
++ if (cmd == CPT_TEST_CAPS) {
++ err = test_cpu_caps();
++ goto out_lock;
++ }
++
++ if (cmd == CPT_JOIN_CONTEXT || cmd == CPT_PUT_CONTEXT) {
++ cpt_context_t *old_ctx;
++
++ ctx = NULL;
++ if (cmd == CPT_JOIN_CONTEXT) {
++ err = -ENOENT;
++ ctx = cpt_context_lookup(arg);
++ if (!ctx)
++ goto out_lock;
++ }
++
++ spin_lock(&cpt_context_lock);
++ old_ctx = (cpt_context_t*)file->private_data;
++ file->private_data = ctx;
++
++ if (old_ctx) {
++ if (cmd == CPT_PUT_CONTEXT && old_ctx->sticky) {
++ old_ctx->sticky = 0;
++ old_ctx->refcount--;
++ }
++ __cpt_context_put(old_ctx);
++ }
++ spin_unlock(&cpt_context_lock);
++ err = 0;
++ goto out_lock;
++ }
++
++ spin_lock(&cpt_context_lock);
++ ctx = (cpt_context_t*)file->private_data;
++ if (ctx)
++ ctx->refcount++;
++ spin_unlock(&cpt_context_lock);
++
++ if (!ctx) {
++ cpt_context_t *old_ctx;
++
++ err = -ENOMEM;
++ ctx = rst_context_open();
++ if (!ctx)
++ goto out_lock;
++
++ spin_lock(&cpt_context_lock);
++ old_ctx = (cpt_context_t*)file->private_data;
++ if (!old_ctx) {
++ ctx->refcount++;
++ file->private_data = ctx;
++ } else {
++ old_ctx->refcount++;
++ }
++ if (old_ctx) {
++ __cpt_context_put(ctx);
++ ctx = old_ctx;
++ }
++ spin_unlock(&cpt_context_lock);
++ }
++
++ if (cmd == CPT_GET_CONTEXT) {
++ unsigned int contextid = (unsigned int)arg;
++
++ err = -EINVAL;
++ if (ctx->contextid && ctx->contextid != contextid)
++ goto out_nosem;
++ if (!ctx->contextid) {
++ cpt_context_t *c1 = cpt_context_lookup(contextid);
++ if (c1) {
++ cpt_context_put(c1);
++ err = -EEXIST;
++ goto out_nosem;
++ }
++ ctx->contextid = contextid;
++ }
++ spin_lock(&cpt_context_lock);
++ if (!ctx->sticky) {
++ ctx->sticky = 1;
++ ctx->refcount++;
++ }
++ spin_unlock(&cpt_context_lock);
++ err = 0;
++ goto out_nosem;
++ }
++
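++	/* The remaining commands operate on the context under main_sem;
++	 * a context whose state has gone negative rejects them with -EBUSY. */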
++ down(&ctx->main_sem);
++
++ err = -EBUSY;
++ if (ctx->ctx_state < 0)
++ goto out;
++
++ err = 0;
++ switch (cmd) {
++ case CPT_SET_DUMPFD:
++ if (ctx->ctx_state > 0) {
++ err = -EBUSY;
++ break;
++ }
++ if (arg >= 0) {
++ dfile = fget(arg);
++ if (IS_ERR(dfile)) {
++ err = PTR_ERR(dfile);
++ break;
++ }
++ if (dfile->f_op == NULL ||
++ dfile->f_op->read == NULL) {
++ fput(dfile);
++ err = -EBADF;
++ break;
++ }
++ }
++ if (ctx->file)
++ fput(ctx->file);
++ ctx->file = dfile;
++ break;
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++ case CPT_SET_PAGEINFDIN:
++ if (ctx->ctx_state > 0) {
++ err = -EBUSY;
++ break;
++ }
++ if (arg >= 0) {
++ dfile = fget(arg);
++ if (IS_ERR(dfile)) {
++ err = PTR_ERR(dfile);
++ break;
++ }
++ }
++ if (ctx->pagein_file_in)
++ fput(ctx->pagein_file_in);
++ ctx->pagein_file_in = dfile;
++ break;
++ case CPT_SET_PAGEINFDOUT:
++ if (ctx->ctx_state > 0) {
++ err = -EBUSY;
++ break;
++ }
++ if (arg >= 0) {
++ dfile = fget(arg);
++ if (IS_ERR(dfile)) {
++ err = PTR_ERR(dfile);
++ break;
++ }
++ }
++ if (ctx->pagein_file_out)
++ fput(ctx->pagein_file_out);
++ ctx->pagein_file_out = dfile;
++ break;
++ case CPT_PAGEIND:
++ err = rst_pageind(ctx);
++ break;
++#endif
++ case CPT_SET_LOCKFD:
++ if (ctx->ctx_state > 0) {
++ err = -EBUSY;
++ break;
++ }
++ if (arg >= 0) {
++ dfile = fget(arg);
++ if (IS_ERR(dfile)) {
++ err = PTR_ERR(dfile);
++ break;
++ }
++ }
++ if (ctx->lockfile)
++ fput(ctx->lockfile);
++ ctx->lockfile = dfile;
++ break;
++ case CPT_SET_STATUSFD:
++ if (ctx->ctx_state > 0) {
++ err = -EBUSY;
++ break;
++ }
++ if (arg >= 0) {
++ dfile = fget(arg);
++ if (IS_ERR(dfile)) {
++ err = PTR_ERR(dfile);
++ break;
++ }
++ }
++ if (ctx->statusfile)
++ fput(ctx->statusfile);
++ ctx->statusfile = dfile;
++ break;
++ case CPT_SET_ERRORFD:
++ if (arg >= 0) {
++ dfile = fget(arg);
++ if (IS_ERR(dfile)) {
++ err = PTR_ERR(dfile);
++ break;
++ }
++ }
++ if (ctx->errorfile)
++ fput(ctx->errorfile);
++ ctx->errorfile = dfile;
++ break;
++ case CPT_SET_VEID:
++ if (ctx->ctx_state > 0) {
++ err = -EBUSY;
++ break;
++ }
++ ctx->ve_id = arg;
++ break;
++ case CPT_UNDUMP:
++ if (ctx->ctx_state > 0) {
++ err = -ENOENT;
++ break;
++ }
++ ctx->ctx_state = CPT_CTX_UNDUMPING;
++ err = vps_rst_undump(ctx);
++ if (err) {
++ rst_report_error(err, ctx);
++ if (rst_kill(ctx) == 0)
++ ctx->ctx_state = CPT_CTX_IDLE;
++ } else {
++ ctx->ctx_state = CPT_CTX_UNDUMPED;
++ }
++ break;
++ case CPT_RESUME:
++ if (!ctx->ctx_state) {
++ err = -ENOENT;
++ break;
++ }
++ err = rst_resume(ctx);
++ if (!err)
++ ctx->ctx_state = CPT_CTX_IDLE;
++ break;
++ case CPT_KILL:
++ if (!ctx->ctx_state) {
++ err = -ENOENT;
++ break;
++ }
++ err = rst_kill(ctx);
++ if (!err)
++ ctx->ctx_state = CPT_CTX_IDLE;
++ break;
++ default:
++ err = -EINVAL;
++ break;
++ }
++
++out:
++ cpt_flush_error(ctx);
++ up(&ctx->main_sem);
++out_nosem:
++ cpt_context_put(ctx);
++out_lock:
++ lock_kernel();
++ return err;
++}
++
++static int rst_open(struct inode * inode, struct file * file)
++{
++ if (!try_module_get(THIS_MODULE))
++ return -EBUSY;
++
++ return 0;
++}
++
++static int rst_release(struct inode * inode, struct file * file)
++{
++ cpt_context_t *ctx;
++
++ spin_lock(&cpt_context_lock);
++ ctx = (cpt_context_t*)file->private_data;
++ file->private_data = NULL;
++ if (ctx)
++ __cpt_context_put(ctx);
++ spin_unlock(&cpt_context_lock);
++
++
++ module_put(THIS_MODULE);
++ return 0;
++}
++
++static struct file_operations rst_fops =
++{
++ .owner = THIS_MODULE,
++ .ioctl = rst_ioctl,
++ .open = rst_open,
++ .release = rst_release,
++};
++
++
++static struct proc_dir_entry *proc_ent;
++extern void *schedule_tail_p;
++extern void schedule_tail_hook(void);
++
++static struct ctl_table_header *ctl_header;
++
++static ctl_table debug_table[] = {
++ {
++ .ctl_name = 9476,
++ .procname = "rst",
++ .data = &debug_level,
++ .maxlen = sizeof(debug_level),
++ .mode = 0644,
++ .proc_handler = &proc_dointvec,
++ },
++ { .ctl_name = 0 }
++};
++static ctl_table root_table[] = {
++ {
++ .ctl_name = CTL_DEBUG,
++ .procname = "debug",
++ .mode = 0555,
++ .child = debug_table,
++ },
++ { .ctl_name = 0 }
++};
++
++#ifdef CONFIG_X86_64
++
++static void *vzentry_forkret_get(void)
++{
++ unsigned char *p;
++
++ p = (unsigned char *)ret_from_fork;
++ return (void *)(*(u32 *)(p + 1) + p + 5);
++}
++
++static void vzentry_forkret_set(void *data)
++{
++ unsigned char *p;
++ long offset;
++
++ p = (unsigned char *)ret_from_fork;
++ offset = (unsigned long)data - (unsigned long)(p + 5);
++ if ((long)(s32)offset != offset) {
++ printk("vzentry_forkret_set: too long hook offset\n");
++ BUG();
++ }
++ *(u32 *)(p + 1) = offset;
++}
++#endif
++
++static int __init init_rst(void)
++{
++ int err;
++
++ err = -ENOMEM;
++ ctl_header = register_sysctl_table(root_table, 0);
++ if (!ctl_header)
++ goto err_mon;
++
++ spin_lock_init(&cpt_context_lock);
++ INIT_LIST_HEAD(&cpt_context_list);
++
++ err = -EINVAL;
++ proc_ent = create_proc_entry("rst", 0600, NULL);
++ if (!proc_ent)
++ goto err_out;
++
++ rst_fops.read = proc_ent->proc_fops->read;
++ rst_fops.write = proc_ent->proc_fops->write;
++ rst_fops.llseek = proc_ent->proc_fops->llseek;
++ proc_ent->proc_fops = &rst_fops;
++
++ proc_ent->read_proc = proc_read;
++ proc_ent->data = NULL;
++ proc_ent->owner = THIS_MODULE;
++#ifdef CONFIG_X86_64
++ schedule_tail_p = vzentry_forkret_get();
++ vzentry_forkret_set(&schedule_tail_hook);
++#endif
++ return 0;
++
++err_out:
++ unregister_sysctl_table(ctl_header);
++err_mon:
++ return err;
++}
++module_init(init_rst);
++
++static void __exit exit_rst(void)
++{
++#ifdef CONFIG_X86_64
++	/* This is wrong, of course, but still the best we can do. */
++ vzentry_forkret_set(schedule_tail_p);
++#endif
++
++ remove_proc_entry("rst", NULL);
++ unregister_sysctl_table(ctl_header);
++
++ spin_lock(&cpt_context_lock);
++ while (!list_empty(&cpt_context_list)) {
++ cpt_context_t *ctx;
++ ctx = list_entry(cpt_context_list.next, cpt_context_t, ctx_list);
++
++ if (!ctx->sticky)
++ ctx->refcount++;
++ ctx->sticky = 0;
++
++ BUG_ON(ctx->refcount != 1);
++
++ __cpt_context_put(ctx);
++ }
++ spin_unlock(&cpt_context_lock);
++}
++module_exit(exit_rst);
+diff -upr linux-2.6.16.orig/kernel/cpt/rst_process.c linux-2.6.16-026test015/kernel/cpt/rst_process.c
+--- linux-2.6.16.orig/kernel/cpt/rst_process.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/cpt/rst_process.c 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,1257 @@
++/*
++ *
++ * kernel/cpt/rst_process.c
++ *
++ * Copyright (C) 2000-2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/config.h>
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/virtinfo.h>
++#include <linux/kmem_cache.h>
++#include <linux/errno.h>
++#include <linux/pagemap.h>
++#include <linux/ptrace.h>
++#include <linux/tty.h>
++#include <asm/desc.h>
++#include <asm/unistd.h>
++
++#include <ub/beancounter.h>
++#include <ub/ub_misc.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_files.h"
++#include "cpt_mm.h"
++#include "cpt_ubc.h"
++#include "cpt_process.h"
++#include "cpt_kernel.h"
++
++#ifdef CONFIG_X86_64
++
++#define _TIF_RESUME (1<<22)
++
++#define SYSCALL_NR(regs) ((regs)->orig_rax)
++#define SYSCALL_RETVAL(regs) ((regs)->rax)
++#define SYSCALL_PC(regs) ((regs)->rip)
++
++#define ESP(tsk) (tsk)->thread.rsp
++
++#define __NR32_restart_syscall 0
++#define __NR32_rt_sigtimedwait 177
++#define __NR32_pause 29
++#define __NR32_futex 240
++
++#define syscall_is(tsk,regs,name) ((!((tsk)->thread_info->flags&_TIF_IA32) && \
++ SYSCALL_NR(regs) == __NR_##name) || \
++ (((tsk)->thread_info->flags&_TIF_IA32) && \
++ SYSCALL_NR(regs) == __NR32_##name))
++#else
++
++#define SYSCALL_NR(regs) ((regs)->orig_eax)
++#define SYSCALL_RETVAL(regs) ((regs)->eax)
++#define SYSCALL_PC(regs) ((regs)->eip)
++
++#define ESP(tsk) (tsk)->thread.esp
++
++#define syscall_is(tsk,regs,name) (SYSCALL_NR(regs) == __NR_##name)
++
++#undef task_pt_regs
++#define task_pt_regs(t) ((struct pt_regs *)((t)->thread.esp0) - 1)
++
++#endif
++
++static void decode_siginfo(siginfo_t *info, struct cpt_siginfo_image *si)
++{
++ memset(info, 0, sizeof(*info));
++ switch(si->cpt_code & __SI_MASK) {
++ case __SI_TIMER:
++ info->si_tid = si->cpt_pid;
++ info->si_overrun = si->cpt_uid;
++ info->_sifields._timer._sigval.sival_ptr = cpt_ptr_import(si->cpt_sigval);
++ info->si_sys_private = si->cpt_utime;
++ break;
++ case __SI_POLL:
++ info->si_band = si->cpt_pid;
++ info->si_fd = si->cpt_uid;
++ break;
++ case __SI_FAULT:
++ info->si_addr = cpt_ptr_import(si->cpt_sigval);
++#ifdef __ARCH_SI_TRAPNO
++ info->si_trapno = si->cpt_pid;
++#endif
++ break;
++ case __SI_CHLD:
++ info->si_pid = si->cpt_pid;
++ info->si_uid = si->cpt_uid;
++ info->si_status = si->cpt_sigval;
++ info->si_stime = si->cpt_stime;
++ info->si_utime = si->cpt_utime;
++ break;
++ case __SI_KILL:
++ case __SI_RT:
++ case __SI_MESGQ:
++ default:
++ info->si_pid = si->cpt_pid;
++ info->si_uid = si->cpt_uid;
++ info->si_ptr = cpt_ptr_import(si->cpt_sigval);
++ break;
++ }
++ info->si_signo = si->cpt_signo;
++ info->si_errno = si->cpt_errno;
++ info->si_code = si->cpt_code;
++}
++
++static int restore_sigqueue(task_t *tsk,
++ struct sigpending *queue, unsigned long start,
++ unsigned long end)
++{
++ while (start < end) {
++ struct cpt_siginfo_image *si = (struct cpt_siginfo_image *)start;
++ if (si->cpt_object == CPT_OBJ_SIGINFO) {
++ struct sigqueue *q = NULL;
++ struct user_struct *up;
++ up = alloc_uid(si->cpt_user);
++ if (!up)
++ return -ENOMEM;
++ q = kmem_cache_alloc(sigqueue_cachep, GFP_ATOMIC);
++ if (!q) {
++ free_uid(up);
++ return -ENOMEM;
++ }
++ if (ub_siginfo_charge(q, get_exec_ub())) {
++ kmem_cache_free(sigqueue_cachep, q);
++ free_uid(up);
++ return -ENOMEM;
++ }
++
++ INIT_LIST_HEAD(&q->list);
++			/* Preallocated elements (posix timers) are not
++			 * supported yet. It is safe to replace them with
++			 * private ones. */
++ q->flags = 0;
++ q->user = up;
++ atomic_inc(&q->user->sigpending);
++
++ decode_siginfo(&q->info, si);
++ list_add_tail(&q->list, &queue->list);
++ }
++ start += si->cpt_next;
++ }
++ return 0;
++}
++
++int rst_process_linkage(cpt_context_t *ctx)
++{
++ cpt_object_t *obj;
++
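++	/* Re-link restored tasks into their original process groups and
++	 * sessions, translating the saved virtual ids back to real pids. */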
++ for_each_object(obj, CPT_OBJ_TASK) {
++ task_t *tsk = obj->o_obj;
++ struct cpt_task_image *ti = obj->o_image;
++
++ if (tsk == NULL) {
++ eprintk_ctx("task %u(%s) is missing\n", ti->cpt_pid, ti->cpt_comm);
++ return -EINVAL;
++ }
++
++ if (virt_pgid(tsk) != ti->cpt_pgrp) {
++ int pid;
++
++ if ((pid = vpid_to_pid(ti->cpt_pgrp)) < 0) {
++ eprintk_ctx("illegal PGRP " CPT_FID "\n", CPT_TID(tsk));
++ return -EINVAL;
++ }
++
++ write_lock_irq(&tasklist_lock);
++ detach_pid(tsk, PIDTYPE_PGID);
++ tsk->signal->pgrp = pid;
++ set_virt_pgid(tsk, ti->cpt_pgrp);
++ if (thread_group_leader(tsk))
++ attach_pid(tsk, PIDTYPE_PGID, pid);
++ write_unlock_irq(&tasklist_lock);
++ }
++ if (virt_sid(tsk) != ti->cpt_session) {
++ int pid;
++
++ if ((pid = vpid_to_pid(ti->cpt_session)) < 0) {
++ eprintk_ctx("illegal SID " CPT_FID "\n", CPT_TID(tsk));
++ return -EINVAL;
++ }
++
++ write_lock_irq(&tasklist_lock);
++ detach_pid(tsk, PIDTYPE_SID);
++ tsk->signal->session = pid;
++ set_virt_sid(tsk, ti->cpt_session);
++ if (thread_group_leader(tsk))
++ attach_pid(tsk, PIDTYPE_SID, pid);
++ write_unlock_irq(&tasklist_lock);
++ }
++ if (ti->cpt_old_pgrp > 0 && tsk->signal->tty_old_pgrp == 0) {
++ int pid;
++
++ if ((pid = vpid_to_pid(ti->cpt_old_pgrp)) < 0) {
++ eprintk_ctx("illegal OLD_PGRP " CPT_FID "\n", CPT_TID(tsk));
++ return -EINVAL;
++ }
++
++ tsk->signal->tty_old_pgrp = pid;
++ }
++ }
++
++ return 0;
++}
++
++static int restore_one_signal_struct(struct cpt_task_image *ti, struct cpt_context *ctx)
++{
++ int err;
++ struct cpt_signal_image *si = cpt_get_buf(ctx);
++
++ current->signal->tty = NULL;
++
++ err = rst_get_object(CPT_OBJ_SIGNAL_STRUCT, ti->cpt_signal, si, ctx);
++ if (err) {
++ cpt_release_buf(ctx);
++ return err;
++ }
++
++ if (virt_pgid(current) != si->cpt_pgrp) {
++ int err;
++ int pid = 0;
++
++ if (si->cpt_pgrp_type == CPT_PGRP_ORPHAN) {
++ pid = alloc_pidmap();
++ if (pid < 0) {
++ cpt_release_buf(ctx);
++ return -EINVAL;
++ }
++ if ((err = alloc_vpid(pid, si->cpt_pgrp)) < 0) {
++ free_pidmap(pid);
++ pid = 0;
++ if (err != -EEXIST) {
++ cpt_release_buf(ctx);
++ return err;
++ }
++ }
++ }
++ if (pid ||
++ (pid = vpid_to_pid(si->cpt_pgrp)) > 0) {
++ write_lock_irq(&tasklist_lock);
++ detach_pid(current, PIDTYPE_PGID);
++ current->signal->pgrp = pid;
++ set_virt_pgid(current, si->cpt_pgrp);
++ if (thread_group_leader(current))
++ attach_pid(current, PIDTYPE_PGID, pid);
++ write_unlock_irq(&tasklist_lock);
++ }
++ }
++
++ current->signal->tty_old_pgrp = 0;
++ if ((int)si->cpt_old_pgrp > 0) {
++ if (si->cpt_old_pgrp_type == CPT_PGRP_STRAY) {
++ current->signal->tty_old_pgrp = alloc_pidmap();
++ if (current->signal->tty_old_pgrp < 0) {
++ eprintk_ctx("failed to allocate stray tty_old_pgrp\n");
++ cpt_release_buf(ctx);
++ return -EINVAL;
++ }
++ free_pidmap(current->signal->tty_old_pgrp);
++ } else {
++ current->signal->tty_old_pgrp = vpid_to_pid(si->cpt_old_pgrp);
++ if (current->signal->tty_old_pgrp < 0) {
++ dprintk_ctx("forward old tty PGID\n");
++ current->signal->tty_old_pgrp = 0;
++ }
++ }
++ }
++
++ if (virt_sid(current) != si->cpt_session) {
++ int err;
++ int pid = 0;
++
++ if (si->cpt_session_type == CPT_PGRP_ORPHAN) {
++ pid = alloc_pidmap();
++ if (pid < 0) {
++ cpt_release_buf(ctx);
++ return -EINVAL;
++ }
++ if ((err = alloc_vpid(pid, si->cpt_session)) < 0) {
++ free_pidmap(pid);
++ pid = 0;
++ if (err != -EEXIST) {
++ cpt_release_buf(ctx);
++ return err;
++ }
++ }
++ }
++ if (pid ||
++ (pid = vpid_to_pid(si->cpt_session)) > 0) {
++ write_lock_irq(&tasklist_lock);
++ detach_pid(current, PIDTYPE_SID);
++ set_virt_sid(current, si->cpt_session);
++ current->signal->session = pid;
++ if (thread_group_leader(current))
++ attach_pid(current, PIDTYPE_SID, pid);
++ write_unlock_irq(&tasklist_lock);
++ }
++ }
++
++ cpt_sigset_import(&current->signal->shared_pending.signal, si->cpt_sigpending);
++ current->signal->leader = si->cpt_leader;
++ if (si->cpt_ctty != CPT_NULL) {
++ cpt_object_t *obj = lookup_cpt_obj_bypos(CPT_OBJ_TTY, si->cpt_ctty, ctx);
++ if (obj) {
++ struct tty_struct *tty = obj->o_obj;
++ if (tty->session == 0 || tty->session == current->signal->session) {
++ tty->session = current->signal->session;
++ current->signal->tty = tty;
++ } else {
++ wprintk_ctx("tty session mismatch\n");
++ }
++ }
++ }
++
++ if (si->cpt_curr_target)
++ current->signal->curr_target = find_task_by_pid_ve(si->cpt_curr_target);
++ current->signal->flags = 0;
++ if (si->cpt_group_exit)
++ current->signal->flags |= SIGNAL_GROUP_EXIT;
++ current->signal->group_exit_code = si->cpt_group_exit_code;
++ if (si->cpt_group_exit_task) {
++ current->signal->group_exit_task = find_task_by_pid_ve(si->cpt_group_exit_task);
++ if (current->signal->group_exit_task == NULL) {
++ eprintk_ctx("oops, group_exit_task=NULL, pid=%u\n", si->cpt_group_exit_task);
++ cpt_release_buf(ctx);
++ return -EINVAL;
++ }
++ }
++ current->signal->notify_count = si->cpt_notify_count;
++ current->signal->group_stop_count = si->cpt_group_stop_count;
++
++ if (si->cpt_next > si->cpt_hdrlen) {
++ char *buf = kmalloc(si->cpt_next - si->cpt_hdrlen, GFP_KERNEL);
++ if (buf == NULL) {
++ cpt_release_buf(ctx);
++ return -ENOMEM;
++ }
++ err = ctx->pread(buf, si->cpt_next - si->cpt_hdrlen, ctx,
++ ti->cpt_signal + si->cpt_hdrlen);
++ if (err) {
++ kfree(buf);
++ cpt_release_buf(ctx);
++ return err;
++ }
++ restore_sigqueue(current,
++ &current->signal->shared_pending, (unsigned long)buf,
++ (unsigned long)buf + si->cpt_next - si->cpt_hdrlen);
++ kfree(buf);
++ }
++ cpt_release_buf(ctx);
++ return 0;
++}
++
++int restore_one_sighand_struct(struct cpt_task_image *ti, struct cpt_context *ctx)
++{
++ int err;
++ struct cpt_sighand_image si;
++ int i;
++ loff_t pos, endpos;
++
++ err = rst_get_object(CPT_OBJ_SIGHAND_STRUCT, ti->cpt_sighand, &si, ctx);
++ if (err)
++ return err;
++
++ for (i=0; i<_NSIG; i++) {
++ current->sighand->action[i].sa.sa_handler = SIG_DFL;
++ current->sighand->action[i].sa.sa_restorer = 0;
++ current->sighand->action[i].sa.sa_flags = SA_ONESHOT | SA_NOMASK;
++ memset(&current->sighand->action[i].sa.sa_mask, 0, sizeof(sigset_t));
++ }
++
++ pos = ti->cpt_sighand + si.cpt_hdrlen;
++ endpos = ti->cpt_sighand + si.cpt_next;
++ while (pos < endpos) {
++ struct cpt_sighandler_image shi;
++
++ err = rst_get_object(CPT_OBJ_SIGHANDLER, pos, &shi, ctx);
++ if (err)
++ return err;
++ current->sighand->action[shi.cpt_signo].sa.sa_handler = (void*)(unsigned long)shi.cpt_handler;
++ current->sighand->action[shi.cpt_signo].sa.sa_restorer = (void*)(unsigned long)shi.cpt_restorer;
++ current->sighand->action[shi.cpt_signo].sa.sa_flags = shi.cpt_flags;
++ cpt_sigset_import(&current->sighand->action[shi.cpt_signo].sa.sa_mask, shi.cpt_mask);
++ pos += shi.cpt_next;
++ }
++
++ return 0;
++}
++
++
++__u32 rst_signal_flag(struct cpt_task_image *ti, struct cpt_context *ctx)
++{
++ __u32 flag = 0;
++
++ if (lookup_cpt_obj_bypos(CPT_OBJ_SIGNAL_STRUCT, ti->cpt_signal, ctx))
++ flag |= CLONE_THREAD;
++ if (ti->cpt_sighand == CPT_NULL ||
++ lookup_cpt_obj_bypos(CPT_OBJ_SIGHAND_STRUCT, ti->cpt_sighand, ctx))
++ flag |= CLONE_SIGHAND;
++ return flag;
++}
++
++int rst_signal_complete(struct cpt_task_image *ti, struct cpt_context *ctx)
++{
++ int err;
++ cpt_object_t *obj;
++
++ if (ti->cpt_signal == CPT_NULL || ti->cpt_sighand == CPT_NULL) {
++ return -EINVAL;
++ }
++
++ obj = lookup_cpt_obj_bypos(CPT_OBJ_SIGHAND_STRUCT, ti->cpt_sighand, ctx);
++ if (obj) {
++ struct sighand_struct *sig = current->sighand;
++ if (obj->o_obj != sig) {
++ return -EINVAL;
++ }
++ } else {
++ obj = cpt_object_add(CPT_OBJ_SIGHAND_STRUCT, current->sighand, ctx);
++ if (obj == NULL)
++ return -ENOMEM;
++ cpt_obj_setpos(obj, ti->cpt_sighand, ctx);
++ err = restore_one_sighand_struct(ti, ctx);
++ if (err)
++ return err;
++ }
++
++
++ obj = lookup_cpt_obj_bypos(CPT_OBJ_SIGNAL_STRUCT, ti->cpt_signal, ctx);
++ if (obj) {
++ struct signal_struct *sig = current->signal;
++ if (obj->o_obj != sig) {
++ return -EINVAL;
++ }
++ if (current->signal) {
++ set_virt_pgid(current, pid_type_to_vpid(PIDTYPE_PGID, current->signal->pgrp));
++ set_virt_sid(current, pid_type_to_vpid(PIDTYPE_SID, current->signal->session));
++ }
++ } else {
++ obj = cpt_object_add(CPT_OBJ_SIGNAL_STRUCT, current->signal, ctx);
++ if (obj == NULL)
++ return -ENOMEM;
++ cpt_obj_setpos(obj, ti->cpt_signal, ctx);
++ err = restore_one_signal_struct(ti, ctx);
++ if (err)
++ return err;
++ }
++
++ return 0;
++}
++
++static u32 decode_segment(u32 segid)
++{
++ if (segid == CPT_SEG_ZERO)
++ return 0;
++
++ /* TLS descriptors */
++ if (segid <= CPT_SEG_TLS3)
++ return ((GDT_ENTRY_TLS_MIN + segid-CPT_SEG_TLS1)<<3) + 3;
++
++ /* LDT descriptor, it is just an index to LDT array */
++ if (segid >= CPT_SEG_LDT)
++ return ((segid - CPT_SEG_LDT) << 3) | 7;
++
++ /* Check for one of standard descriptors */
++#ifdef CONFIG_X86_64
++ if (segid == CPT_SEG_USER32_DS)
++ return __USER32_DS;
++ if (segid == CPT_SEG_USER32_CS)
++ return __USER32_CS;
++ if (segid == CPT_SEG_USER64_DS)
++ return __USER_DS;
++ if (segid == CPT_SEG_USER64_CS)
++ return __USER_CS;
++#else
++ if (segid == CPT_SEG_USER32_DS)
++ return __USER_DS;
++ if (segid == CPT_SEG_USER32_CS)
++ return __USER_CS;
++#endif
++ wprintk("Invalid segment reg %d\n", segid);
++ return 0;
++}
++
++unsigned long rct(unsigned long *child_tids)
++{
++ dprintk("rct: " CPT_FID "\n", CPT_TID(current));
++ current->clear_child_tid = (void*)child_tids[0];
++ current->set_child_tid = (void*)child_tids[1];
++ module_put(THIS_MODULE);
++ return (unsigned long)(child_tids+2);
++}
++
++unsigned long rlsi(void)
++{
++ int signr;
++ siginfo_t *info = current->last_siginfo;
++ struct pt_regs *regs = task_pt_regs(current);
++ struct k_sigaction *ka;
++ int ptrace_id;
++
++ dprintk("rlsi: " CPT_FID "\n", CPT_TID(current));
++
++ spin_lock_irq(&current->sighand->siglock);
++ current->last_siginfo = NULL;
++ recalc_sigpending();
++
++ ptrace_id = current->pn_state;
++ clear_pn_state(current);
++
++ switch (ptrace_id) {
++ case PN_STOP_TF:
++ case PN_STOP_TF_RT:
++ /* frame_*signal */
++ dprintk("SIGTRAP %u/%u(%s) %u/%u %u %ld %lu %lu\n",
++ virt_pid(current), current->pid, current->comm,
++ info->si_signo, info->si_code,
++ current->exit_code, SYSCALL_NR(regs),
++ current->ptrace, current->ptrace_message);
++ goto out;
++ case PN_STOP_ENTRY:
++ case PN_STOP_LEAVE:
++ /* do_syscall_trace */
++ spin_unlock_irq(&current->sighand->siglock);
++ dprintk("ptrace do_syscall_trace: %d %d\n", ptrace_id, current->exit_code);
++ if (current->exit_code) {
++ send_sig(current->exit_code, current, 1);
++ current->exit_code = 0;
++ }
++ if (ptrace_id == PN_STOP_ENTRY && SYSCALL_RETVAL(regs) == -ENOSYS) {
++ SYSCALL_RETVAL(regs) = SYSCALL_NR(regs);
++ SYSCALL_PC(regs) -= 2;
++ } else if (syscall_is(current, regs, rt_sigtimedwait)) {
++ if (SYSCALL_RETVAL(regs) == -EAGAIN || SYSCALL_RETVAL(regs) == -EINTR) {
++ SYSCALL_RETVAL(regs) = SYSCALL_NR(regs);
++ SYSCALL_PC(regs) -= 2;
++ }
++ }
++ goto out_nolock;
++ case PN_STOP_FORK:
++ /* fork */
++ SYSCALL_RETVAL(regs) = current->ptrace_message;
++ dprintk("ptrace fork returns pid %ld\n", SYSCALL_RETVAL(regs));
++ goto out;
++ case PN_STOP_VFORK:
++ /* after vfork */
++ SYSCALL_RETVAL(regs) = current->ptrace_message;
++ dprintk("ptrace after vfork returns pid %ld\n", SYSCALL_RETVAL(regs));
++ goto out;
++ case PN_STOP_SIGNAL:
++ /* normal case : dequeue signal */
++ break;
++ case PN_STOP_EXIT:
++ dprintk("ptrace exit caught\n");
++ current->ptrace &= ~PT_TRACE_EXIT;
++ spin_unlock_irq(&current->sighand->siglock);
++ module_put(THIS_MODULE);
++ complete_and_exit(NULL, current->ptrace_message);
++ BUG();
++ case PN_STOP_EXEC:
++ eprintk("ptrace after exec caught: must not happen\n");
++ BUG();
++ default:
++ eprintk("ptrace with unknown identity %d\n", ptrace_id);
++ BUG();
++ }
++
++ signr = current->exit_code;
++ if (signr == 0) {
++ dprintk("rlsi: canceled signal %d\n", info->si_signo);
++ goto out;
++ }
++ current->exit_code = 0;
++
++ if (signr != info->si_signo) {
++ info->si_signo = signr;
++ info->si_errno = 0;
++ info->si_code = SI_USER;
++ info->si_pid = virt_pid(current->parent);
++ info->si_uid = current->parent->uid;
++ }
++
++ /* If the (new) signal is now blocked, requeue it. */
++ if (sigismember(&current->blocked, signr)) {
++ dprintk("going to requeue signal %d\n", signr);
++ goto out_resend_sig;
++ }
++
++ ka = &current->sighand->action[signr-1];
++ if (ka->sa.sa_handler == SIG_IGN) {
++ dprintk("going to resend signal %d (ignored)\n", signr);
++ goto out;
++ }
++ if (ka->sa.sa_handler != SIG_DFL) {
++ dprintk("going to resend signal %d (not SIG_DFL)\n", signr);
++ goto out_resend_sig;
++ }
++ if (signr == SIGCONT ||
++ signr == SIGCHLD ||
++ signr == SIGWINCH ||
++ signr == SIGURG ||
++ current->pid == 1)
++ goto out;
++
++ /* All the rest, which we cannot handle are requeued. */
++ dprintk("going to resend signal %d (sigh)\n", signr);
++out_resend_sig:
++ spin_unlock_irq(&current->sighand->siglock);
++ send_sig_info(signr, info, current);
++ module_put(THIS_MODULE);
++ return (unsigned long)(info+1);
++
++out:
++ spin_unlock_irq(&current->sighand->siglock);
++out_nolock:
++ module_put(THIS_MODULE);
++ return (unsigned long)(info+1);
++}
++
++static void ret_finish_stop(void)
++{
++ /* ...
++ * do_signal() ->
++ * get_signal_to_deliver() ->
++ * do_signal_stop() ->
++ * finish_stop()
++ *
++	 * Normally after SIGCONT it will dequeue the next signal. If no signal
++	 * is found, do_signal restarts the syscall unconditionally.
++	 * Otherwise the signal handler is pushed onto the user stack.
++ */
++
++ dprintk("rfs: " CPT_FID "\n", CPT_TID(current));
++
++ clear_stop_state(current);
++ current->exit_code = 0;
++
++ module_put(THIS_MODULE);
++}
++
++static void ret_restart_sys(void)
++{
++ struct pt_regs *regs = task_pt_regs(current);
++
++ /* This hook is supposed to be executed, when we have
++ * to complete some interrupted syscall.
++ */
++ dprintk("rrs: " CPT_FID "\n", CPT_TID(current));
++
++ if (syscall_is(current,regs,pause)) {
++ if (SYSCALL_RETVAL(regs) == -ERESTARTNOHAND) {
++ current->state = TASK_INTERRUPTIBLE;
++ schedule();
++ }
++ } else if (syscall_is(current,regs,rt_sigtimedwait)) {
++ if (SYSCALL_RETVAL(regs) == -EAGAIN || SYSCALL_RETVAL(regs) == -EINTR) {
++ SYSCALL_RETVAL(regs) = SYSCALL_NR(regs);
++ SYSCALL_PC(regs) -= 2;
++ }
++ } else if (syscall_is(current,regs,futex)) {
++ if (SYSCALL_RETVAL(regs) == -EINTR) {
++ SYSCALL_RETVAL(regs) = SYSCALL_NR(regs);
++ SYSCALL_PC(regs) -= 2;
++ }
++ }
++
++ if (!signal_pending(current)) {
++ if (SYSCALL_RETVAL(regs) == -ERESTARTSYS ||
++ SYSCALL_RETVAL(regs) == -ERESTARTNOINTR ||
++ SYSCALL_RETVAL(regs) == -ERESTARTNOHAND) {
++ SYSCALL_RETVAL(regs) = SYSCALL_NR(regs);
++ SYSCALL_PC(regs) -= 2;
++ } else if (SYSCALL_RETVAL(regs) == -ERESTART_RESTARTBLOCK) {
++ SYSCALL_RETVAL(regs) = __NR_restart_syscall;
++#ifdef CONFIG_X86_64
++ if (current->thread_info->flags&_TIF_IA32)
++ SYSCALL_RETVAL(regs) = __NR32_restart_syscall;
++#endif
++ SYSCALL_PC(regs) -= 2;
++ }
++ }
++
++ module_put(THIS_MODULE);
++}
++
++extern void ret_last_siginfo(void);
++extern void ret_child_tid(void);
++extern void ret_from_rst(void);
++extern void pre_ret_from_fork(void);
++
++#ifndef CONFIG_X86_64
++
++/* tsk->thread.eip points to pre_ret_from_fork
++ * Stack layout:
++ * [eip of the last hook]
++ * [args of the last hook]
++ * [eip of previous hook]
++ * [args of previous hook]
++ * ...
++ * [eip of the first hook]
++ * [args of the first hook]
++ * [ret_from_rst]
++ */
++
++static void * add_hook(task_t *tsk, void (*hook)(void), int argsize, int *hooks)
++{
++ ESP(tsk) -= sizeof(unsigned long);
++ *(unsigned long*)ESP(tsk) = tsk->thread.eip;
++ ESP(tsk) -= argsize;
++ tsk->thread.eip = (unsigned long)hook;
++ if (!try_module_get(THIS_MODULE)) BUG();
++ (*hooks)++;
++ return (void*)ESP(tsk);
++}
++
++static int restore_registers(task_t *tsk, struct pt_regs *regs,
++ struct cpt_task_image *ti, struct cpt_x86_regs *b)
++{
++ if (b->cpt_object != CPT_OBJ_X86_REGS)
++ return -EINVAL;
++
++ tsk->thread.esp = (unsigned long) regs;
++ tsk->thread.esp0 = (unsigned long) (regs+1);
++ tsk->thread.eip = (unsigned long) ret_from_rst;
++
++ tsk->thread.fs = decode_segment(b->cpt_fs);
++ tsk->thread.gs = decode_segment(b->cpt_gs);
++ tsk->thread.debugreg[0] = b->cpt_debugreg[0];
++ tsk->thread.debugreg[1] = b->cpt_debugreg[1];
++ tsk->thread.debugreg[2] = b->cpt_debugreg[2];
++ tsk->thread.debugreg[3] = b->cpt_debugreg[3];
++ tsk->thread.debugreg[4] = b->cpt_debugreg[4];
++ tsk->thread.debugreg[5] = b->cpt_debugreg[5];
++ tsk->thread.debugreg[6] = b->cpt_debugreg[6];
++ tsk->thread.debugreg[7] = b->cpt_debugreg[7];
++
++ memcpy(regs, &b->cpt_ebx, sizeof(struct pt_regs));
++
++ regs->xcs = decode_segment(b->cpt_xcs);
++ regs->xss = decode_segment(b->cpt_xss);
++ regs->xds = decode_segment(b->cpt_xds);
++ regs->xes = decode_segment(b->cpt_xes);
++
++ return 0;
++}
++
++#else
++
++/* Stack layout:
++ *
++ * [eip of the last hook]
++ * [args of the last hook]
++ * ...
++ * [eip of the first hook]
++ * [args of the first hook]
++ * [ret_from_fork+5]
++ */
++
++static void * add_hook(task_t *tsk, void (*hook)(void), int argsize, int *hooks)
++{
++ if (!*hooks) {
++ extern void ret_from_fork2(void);
++ ESP(tsk) -= sizeof(unsigned long);
++ *(unsigned long*)ESP(tsk) = (unsigned long)ret_from_fork2;
++ tsk->thread_info->flags |= _TIF_RESUME;
++ }
++ ESP(tsk) -= argsize + sizeof(unsigned long);
++ *(unsigned long*)ESP(tsk) = (unsigned long)hook;
++ if (!try_module_get(THIS_MODULE)) BUG();
++ (*hooks)++;
++ return (void*)(ESP(tsk) + sizeof(unsigned long));
++}
++
++static void xlate_ptregs_32_to_64(struct pt_regs *d, struct cpt_x86_regs *s)
++{
++ memset(d, 0, sizeof(struct pt_regs));
++ d->rbp = s->cpt_ebp;
++ d->rbx = s->cpt_ebx;
++ d->rax = (s32)s->cpt_eax;
++ d->rcx = s->cpt_ecx;
++ d->rdx = s->cpt_edx;
++ d->rsi = s->cpt_esi;
++ d->rdi = s->cpt_edi;
++ d->orig_rax = (s32)s->cpt_orig_eax;
++ d->rip = s->cpt_eip;
++ d->cs = s->cpt_xcs;
++ d->eflags = s->cpt_eflags;
++ d->rsp = s->cpt_esp;
++ d->ss = s->cpt_xss;
++}
++
++static int restore_registers(task_t *tsk, struct pt_regs *regs,
++ struct cpt_task_image *ti, struct cpt_obj_bits *hdr)
++{
++ if (hdr->cpt_object == CPT_OBJ_X86_64_REGS) {
++ struct cpt_x86_64_regs *b = (void*)hdr;
++
++ tsk->thread.rsp = (unsigned long) regs;
++ tsk->thread.rsp0 = (unsigned long) (regs+1);
++
++ tsk->thread.fs = b->cpt_fsbase;
++ tsk->thread.gs = b->cpt_gsbase;
++ tsk->thread.fsindex = decode_segment(b->cpt_fsindex);
++ tsk->thread.gsindex = decode_segment(b->cpt_gsindex);
++ tsk->thread.ds = decode_segment(b->cpt_ds);
++ tsk->thread.es = decode_segment(b->cpt_es);
++ tsk->thread.debugreg0 = b->cpt_debugreg[0];
++ tsk->thread.debugreg1 = b->cpt_debugreg[1];
++ tsk->thread.debugreg2 = b->cpt_debugreg[2];
++ tsk->thread.debugreg3 = b->cpt_debugreg[3];
++ tsk->thread.debugreg6 = b->cpt_debugreg[6];
++ tsk->thread.debugreg7 = b->cpt_debugreg[7];
++
++ memcpy(regs, &b->cpt_r15, sizeof(struct pt_regs));
++
++ tsk->thread.userrsp = regs->rsp;
++ regs->cs = decode_segment(b->cpt_cs);
++ regs->ss = decode_segment(b->cpt_ss);
++ } else if (hdr->cpt_object == CPT_OBJ_X86_REGS) {
++ struct cpt_x86_regs *b = (void*)hdr;
++
++ tsk->thread.rsp = (unsigned long) regs;
++ tsk->thread.rsp0 = (unsigned long) (regs+1);
++
++ tsk->thread.fs = 0;
++ tsk->thread.gs = 0;
++ tsk->thread.fsindex = decode_segment(b->cpt_fs);
++ tsk->thread.gsindex = decode_segment(b->cpt_gs);
++ tsk->thread.debugreg0 = b->cpt_debugreg[0];
++ tsk->thread.debugreg1 = b->cpt_debugreg[1];
++ tsk->thread.debugreg2 = b->cpt_debugreg[2];
++ tsk->thread.debugreg3 = b->cpt_debugreg[3];
++ tsk->thread.debugreg6 = b->cpt_debugreg[6];
++ tsk->thread.debugreg7 = b->cpt_debugreg[7];
++
++ xlate_ptregs_32_to_64(regs, b);
++
++ tsk->thread.userrsp = regs->rsp;
++ regs->cs = decode_segment(b->cpt_xcs);
++ regs->ss = decode_segment(b->cpt_xss);
++ tsk->thread.ds = decode_segment(b->cpt_xds);
++ tsk->thread.es = decode_segment(b->cpt_xes);
++ } else {
++ return -EINVAL;
++ }
++ return 0;
++}
++
++#endif
++
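++/*
++ * rst_restore_process() walks every CPT_OBJ_TASK object and rebuilds the
++ * per-task state from its image: credentials, signal masks, accounting,
++ * rlimits, TLS descriptors, FPU state, registers, pending siginfo, the
++ * interval timer, and the restart/stop/tid hooks that fire when the
++ * restored task first runs.
++ */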
++int rst_restore_process(struct cpt_context *ctx)
++{
++ cpt_object_t *obj;
++
++ for_each_object(obj, CPT_OBJ_TASK) {
++ task_t *tsk = obj->o_obj;
++ struct cpt_task_image *ti = obj->o_image;
++ struct pt_regs * regs;
++ struct cpt_object_hdr *b;
++ struct cpt_siginfo_image *lsi = NULL;
++ struct group_info *gids, *ogids;
++ int hooks = 0;
++ int i;
++
++ if (tsk == NULL) {
++ eprintk_ctx("oops, task %d/%s is missing\n", ti->cpt_pid, ti->cpt_comm);
++ return -EFAULT;
++ }
++
++ wait_task_inactive(tsk);
++ regs = task_pt_regs(tsk);
++
++ if (!tsk->exit_state) {
++ tsk->lock_depth = -1;
++#ifdef CONFIG_PREEMPT
++ tsk->thread_info->preempt_count--;
++#endif
++ }
++
++ if (tsk->static_prio != ti->cpt_static_prio)
++ set_user_nice(tsk, PRIO_TO_NICE(ti->cpt_static_prio));
++
++ cpt_sigset_import(&tsk->blocked, ti->cpt_sigblocked);
++ cpt_sigset_import(&tsk->real_blocked, ti->cpt_sigrblocked);
++ cpt_sigset_import(&tsk->saved_sigmask, ti->cpt_sigsuspend_blocked);
++ cpt_sigset_import(&tsk->pending.signal, ti->cpt_sigpending);
++
++ tsk->uid = ti->cpt_uid;
++ tsk->euid = ti->cpt_euid;
++ tsk->suid = ti->cpt_suid;
++ tsk->fsuid = ti->cpt_fsuid;
++ tsk->gid = ti->cpt_gid;
++ tsk->egid = ti->cpt_egid;
++ tsk->sgid = ti->cpt_sgid;
++ tsk->fsgid = ti->cpt_fsgid;
++ memcpy(&tsk->cap_effective, &ti->cpt_ecap, sizeof(tsk->cap_effective));
++ memcpy(&tsk->cap_inheritable, &ti->cpt_icap, sizeof(tsk->cap_inheritable));
++ memcpy(&tsk->cap_permitted, &ti->cpt_pcap, sizeof(tsk->cap_permitted));
++ tsk->keep_capabilities = (ti->cpt_keepcap != 0);
++ tsk->did_exec = (ti->cpt_did_exec != 0);
++ gids = groups_alloc(ti->cpt_ngids);
++ ogids = tsk->group_info;
++ if (gids) {
++ int i;
++ for (i=0; i<32; i++)
++ gids->small_block[i] = ti->cpt_gids[i];
++ tsk->group_info = gids;
++ }
++ if (ogids)
++ put_group_info(ogids);
++ tsk->utime = ti->cpt_utime;
++ tsk->stime = ti->cpt_stime;
++ if (ctx->image_version == 0) {
++ tsk->start_time = _ns_to_timespec(ti->cpt_starttime*TICK_NSEC);
++ } else {
++ cpt_timespec_import(&tsk->start_time, ti->cpt_starttime);
++ }
++ _set_normalized_timespec(&tsk->start_time,
++ tsk->start_time.tv_sec -
++ get_exec_env()->init_entry->start_time.tv_sec,
++ tsk->start_time.tv_nsec -
++ get_exec_env()->init_entry->start_time.tv_nsec);
++
++ tsk->nvcsw = ti->cpt_nvcsw;
++ tsk->nivcsw = ti->cpt_nivcsw;
++ tsk->min_flt = ti->cpt_min_flt;
++ tsk->maj_flt = ti->cpt_maj_flt;
++
++#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,8)
++ tsk->cutime = ti->cpt_cutime;
++ tsk->cstime = ti->cpt_cstime;
++ tsk->cnvcsw = ti->cpt_cnvcsw;
++ tsk->cnivcsw = ti->cpt_cnivcsw;
++ tsk->cmin_flt = ti->cpt_cmin_flt;
++ tsk->cmaj_flt = ti->cpt_cmaj_flt;
++
++ if (RLIM_NLIMITS > CPT_RLIM_NLIMITS)
++ __asm__("undefined\n");
++
++ for (i=0; i<RLIM_NLIMITS; i++) {
++ tsk->rlim[i].rlim_cur = ti->cpt_rlim_cur[i];
++ tsk->rlim[i].rlim_max = ti->cpt_rlim_max[i];
++ }
++#else
++ if (thread_group_leader(tsk) && tsk->signal) {
++ tsk->signal->utime = ti->cpt_utime;
++ tsk->signal->stime = ti->cpt_stime;
++ tsk->signal->cutime = ti->cpt_cutime;
++ tsk->signal->cstime = ti->cpt_cstime;
++ tsk->signal->nvcsw = ti->cpt_nvcsw;
++ tsk->signal->nivcsw = ti->cpt_nivcsw;
++ tsk->signal->cnvcsw = ti->cpt_cnvcsw;
++ tsk->signal->cnivcsw = ti->cpt_cnivcsw;
++ tsk->signal->min_flt = ti->cpt_min_flt;
++ tsk->signal->maj_flt = ti->cpt_maj_flt;
++ tsk->signal->cmin_flt = ti->cpt_cmin_flt;
++ tsk->signal->cmaj_flt = ti->cpt_cmaj_flt;
++
++ if (RLIM_NLIMITS > CPT_RLIM_NLIMITS)
++ __asm__("undefined\n");
++
++ for (i=0; i<RLIM_NLIMITS; i++) {
++ tsk->signal->rlim[i].rlim_cur = ti->cpt_rlim_cur[i];
++ tsk->signal->rlim[i].rlim_max = ti->cpt_rlim_max[i];
++ }
++ }
++#endif
++
++ for (i=0; i<3; i++) {
++ if (i >= GDT_ENTRY_TLS_ENTRIES) {
++ eprintk_ctx("too many tls descs\n");
++ } else {
++#ifndef CONFIG_X86_64
++ tsk->thread.tls_array[i].a = ti->cpt_tls[i]&0xFFFFFFFF;
++ tsk->thread.tls_array[i].b = ti->cpt_tls[i]>>32;
++#else
++ tsk->thread.tls_array[i] = ti->cpt_tls[i];
++#endif
++ }
++ }
++
++ clear_stopped_child_used_math(tsk);
++
++ b = (void *)(ti+1);
++ while ((void*)b < ((void*)ti) + ti->cpt_next) {
++ /* Siginfo objects are at the end of obj array */
++ if (b->cpt_object == CPT_OBJ_SIGINFO) {
++ struct ve_struct *env = set_exec_env(VE_TASK_INFO(tsk)->owner_env);
++ restore_sigqueue(tsk, &tsk->pending, (unsigned long)b, (unsigned long)ti + ti->cpt_next);
++ set_exec_env(env);
++ break;
++ }
++
++ switch (b->cpt_object) {
++ case CPT_OBJ_BITS:
++ if (b->cpt_content == CPT_CONTENT_X86_FPUSTATE &&
++ cpu_has_fxsr) {
++ memcpy(&tsk->thread.i387,
++ (void*)b + b->cpt_hdrlen,
++ sizeof(struct i387_fxsave_struct));
++ if (ti->cpt_used_math)
++ set_stopped_child_used_math(tsk);
++ }
++#ifdef CONFIG_X86_32
++ else if (b->cpt_content == CPT_CONTENT_X86_FPUSTATE_OLD &&
++ !cpu_has_fxsr) {
++ memcpy(&tsk->thread.i387,
++ (void*)b + b->cpt_hdrlen,
++ sizeof(struct i387_fsave_struct));
++ if (ti->cpt_used_math)
++ set_stopped_child_used_math(tsk);
++ }
++#endif
++ break;
++ case CPT_OBJ_LASTSIGINFO:
++ lsi = (void*)b;
++ break;
++ case CPT_OBJ_X86_REGS:
++ case CPT_OBJ_X86_64_REGS:
++ if (restore_registers(tsk, regs, ti, (void*)b)) {
++ eprintk_ctx("cannot restore registers: image is corrupted\n");
++ return -EINVAL;
++ }
++ break;
++ case CPT_OBJ_SIGALTSTACK: {
++ struct cpt_sigaltstack_image *sas;
++ sas = (struct cpt_sigaltstack_image *)b;
++ tsk->sas_ss_sp = sas->cpt_stack;
++ tsk->sas_ss_size = sas->cpt_stacksize;
++ break;
++ }
++ }
++ b = ((void*)b) + b->cpt_next;
++ }
++
++ if (ti->cpt_ppid != ti->cpt_rppid) {
++ task_t *parent;
++ struct ve_struct *env = set_exec_env(VE_TASK_INFO(tsk)->owner_env);
++ write_lock_irq(&tasklist_lock);
++ parent = find_task_by_pid_ve(ti->cpt_ppid);
++ if (parent && parent != tsk->parent) {
++ list_add(&tsk->ptrace_list, &tsk->parent->ptrace_children);
++ REMOVE_LINKS(tsk);
++ tsk->parent = parent;
++ SET_LINKS(tsk);
++ }
++ write_unlock_irq(&tasklist_lock);
++ set_exec_env(env);
++ }
++
++ tsk->ptrace_message = ti->cpt_ptrace_message;
++ tsk->pn_state = ti->cpt_pn_state;
++ tsk->stopped_state = ti->cpt_stopped_state;
++ tsk->thread_info->flags = ti->cpt_thrflags;
++
++		/* The image was created with a kernel < 2.6.16 while
++		 * the task was blocked in sigsuspend -> do_signal.
++		 *
++		 * FIXME! This needs more thought...
++		 */
++ if (ti->cpt_sigsuspend_state) {
++ tsk->thread_info->flags |= _TIF_RESTORE_SIGMASK;
++ }
++
++#ifdef CONFIG_X86_64
++ tsk->thread_info->flags |= _TIF_FORK;
++ if (!ti->cpt_64bit)
++ tsk->thread_info->flags |= _TIF_IA32;
++#endif
++
++#ifndef CONFIG_X86_64
++ do {
++ if (regs->orig_eax == __NR__newselect && regs->edi) {
++ struct timeval tv;
++ if (access_process_vm(tsk, regs->edi, &tv,
++ sizeof(tv), 0) != sizeof(tv)) {
++ wprintk_ctx("task %d/%d(%s): Error 1 in access_process_vm: edi %ld\n",
++ virt_pid(tsk), tsk->pid, tsk->comm,
++ regs->edi);
++ break;
++ }
++ dprintk_ctx("task %d/%d(%s): Old timeval in newselect: %ld.%ld\n",
++ virt_pid(tsk), tsk->pid, tsk->comm,
++ tv.tv_sec, tv.tv_usec);
++ tv.tv_sec -= ctx->delta_time.tv_sec;
++ if (tv.tv_usec < ctx->delta_time.tv_nsec / 1000) {
++ tv.tv_usec += 1000000 - ctx->delta_time.tv_nsec / 1000;
++ tv.tv_sec--;
++ } else {
++ tv.tv_usec -= ctx->delta_time.tv_nsec / 1000;
++ }
++ if (tv.tv_sec < 0) {
++ tv.tv_sec = 0;
++ tv.tv_usec = 0;
++ }
++ dprintk_ctx("task %d/%d(%s): New timeval in newselect: %ld.%ld\n",
++ virt_pid(tsk), tsk->pid, tsk->comm,
++ tv.tv_sec, tv.tv_usec);
++ if (access_process_vm(tsk, regs->edi, &tv,
++ sizeof(tv), 1) != sizeof(tv)) {
++ wprintk_ctx("task %d/%d(%s): Error 1 in access_process_vm write: edi %ld\n",
++ virt_pid(tsk), tsk->pid, tsk->comm, regs->edi);
++ }
++
++ } else if (regs->orig_eax == __NR_select && regs->edi) {
++ struct {
++ unsigned long n;
++ fd_set __user *inp, *outp, *exp;
++ struct timeval __user *tvp;
++ } a;
++ struct timeval tv;
++ if (access_process_vm(tsk, regs->ebx, &a,
++ sizeof(a), 0) != sizeof(a)) {
++ wprintk_ctx("task %d: Error 2 in access_process_vm\n", tsk->pid);
++ break;
++ }
++ if (access_process_vm(tsk, (unsigned long)a.tvp,
++ &tv, sizeof(tv), 0) != sizeof(tv)) {
++ wprintk_ctx("task %d: Error 3 in access_process_vm\n", tsk->pid);
++ break;
++ }
++ dprintk_ctx("task %d: Old timeval in select: %ld.%ld\n",
++ tsk->pid, tv.tv_sec, tv.tv_usec);
++ tv.tv_sec -= ctx->delta_time.tv_sec;
++ if (tv.tv_usec < ctx->delta_time.tv_nsec / 1000) {
++ tv.tv_usec += 1000000 - ctx->delta_time.tv_nsec / 1000;
++ tv.tv_sec--;
++ } else {
++ tv.tv_usec -= ctx->delta_time.tv_nsec / 1000;
++ }
++ if (tv.tv_sec < 0) {
++ tv.tv_sec = 0;
++ tv.tv_usec = 0;
++ }
++ dprintk_ctx("task %d: New timeval in select: %ld.%ld\n",
++ tsk->pid, tv.tv_sec, tv.tv_usec);
++ if (access_process_vm(tsk, (unsigned long)a.tvp,
++ &tv, sizeof(tv), 1) != sizeof(tv)) {
++ wprintk_ctx("task %d: Error 3 in access_process_vm write\n", tsk->pid);
++ }
++ }
++ } while (0);
++#endif
++
++ if (!tsk->exit_state && (long)SYSCALL_NR(regs) >= 0) {
++ if (SYSCALL_RETVAL(regs) == -ERESTARTSYS ||
++ SYSCALL_RETVAL(regs) == -ERESTARTNOINTR ||
++ SYSCALL_RETVAL(regs) == -ERESTARTNOHAND ||
++ SYSCALL_RETVAL(regs) == -ERESTART_RESTARTBLOCK ||
++ syscall_is(tsk,regs,pause) ||
++ (syscall_is(tsk,regs,rt_sigtimedwait) &&
++ (SYSCALL_RETVAL(regs) == -EAGAIN || SYSCALL_RETVAL(regs) == -EINTR)) ||
++ (syscall_is(tsk,regs,futex) &&
++ (SYSCALL_RETVAL(regs) == -EINTR)))
++ add_hook(tsk, ret_restart_sys, 0, &hooks);
++ }
++
++ if (lsi || tsk->pn_state) {
++ /* ... -> ptrace_notify()
++ * or
++ * ... -> do_signal() -> get_signal_to_deliver() ->
++ * ptrace stop
++ */
++ tsk->last_siginfo = add_hook(tsk, ret_last_siginfo, sizeof(siginfo_t), &hooks);
++ memset(tsk->last_siginfo, 0, sizeof(siginfo_t));
++ if (lsi)
++ decode_siginfo(tsk->last_siginfo, lsi);
++ }
++
++ tsk->ptrace = ti->cpt_ptrace;
++ tsk->flags = ti->cpt_flags & ~PF_FROZEN;
++ clear_tsk_thread_flag(tsk, TIF_FREEZE);
++ tsk->exit_signal = ti->cpt_exit_signal;
++
++ if (tsk->stopped_state) {
++ dprintk_ctx("finish_stop\n");
++ if (ti->cpt_state != TASK_STOPPED)
++ eprintk_ctx("Hellooo, state is %u\n", (unsigned)ti->cpt_state);
++ add_hook(tsk, ret_finish_stop, 0, &hooks);
++ }
++
++ if (!tsk->exit_state &&
++ (ti->cpt_set_tid || ti->cpt_clear_tid)) {
++ unsigned long *ptr = add_hook(tsk, ret_child_tid, sizeof(unsigned long)*2, &hooks);
++ ptr[0] = ti->cpt_clear_tid;
++ ptr[1] = ti->cpt_set_tid;
++ dprintk_ctx("settids\n");
++ }
++
++#ifdef CONFIG_X86_64
++ if (!hooks && (long)SYSCALL_NR(regs) < 0) {
++ extern void ret_from_fork2(void);
++ ESP(tsk) -= sizeof(unsigned long);
++ *(unsigned long*)ESP(tsk) = (unsigned long)ret_from_fork2;
++ tsk->thread_info->flags |= _TIF_RESUME;
++ }
++#else
++ tsk->thread.esp -= 4;
++ *(__u32*)tsk->thread.esp = tsk->thread.eip;
++ tsk->thread.eip = (unsigned long)pre_ret_from_fork;
++#endif
++
++ if (ti->cpt_state == TASK_TRACED)
++ tsk->state = TASK_TRACED;
++ else if (ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD)) {
++ tsk->signal->it_virt_expires = 0;
++ tsk->signal->it_prof_expires = 0;
++ if (tsk->state != EXIT_DEAD)
++ eprintk_ctx("oops, schedule() did not make us dead\n");
++ }
++
++ if (thread_group_leader(tsk) &&
++ ti->cpt_it_real_value &&
++ !(ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD))) {
++ DEFINE_KTIME(val);
++
++ if (ctx->image_version != 0) {
++ ktime_t delta;
++
++ val = ktime_add_ns(val, ti->cpt_it_real_value);
++ delta = timespec_to_ktime(ctx->delta_time);
++ val = ktime_sub(val, delta);
++ if (val.tv64 <= 0)
++ val.tv64 = NSEC_PER_USEC;
++ dprintk("rst itimer " CPT_FID " +%Ld %Ld %Lu\n", CPT_TID(tsk), val.tv64, delta.tv64, ti->cpt_it_real_value);
++ } else {
++ unsigned long jif = ti->cpt_it_real_value -
++ timespec_to_jiffies(&ctx->delta_time);
++ if ((long)jif <= 0)
++ jif = 1;
++ val = ktime_add_ns(val, (u64)jif*TICK_NSEC);
++ }
++ spin_lock_irq(&tsk->sighand->siglock);
++ if (hrtimer_try_to_cancel(&tsk->signal->real_timer) >= 0) {
++ /* FIXME. Check!!!! */
++ hrtimer_start(&tsk->signal->real_timer, val, HRTIMER_REL);
++ } else {
++ wprintk_ctx("Timer clash. Impossible?\n");
++ }
++ spin_unlock_irq(&tsk->sighand->siglock);
++
++ dprintk_ctx("itimer " CPT_FID " +%Lu\n", CPT_TID(tsk), val.tv64);
++ }
++
++ module_put(THIS_MODULE);
++ }
++ return 0;
++}
+diff -upr linux-2.6.16.orig/kernel/cpt/rst_socket.c linux-2.6.16-026test015/kernel/cpt/rst_socket.c
+--- linux-2.6.16.orig/kernel/cpt/rst_socket.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/cpt/rst_socket.c 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,876 @@
++/*
++ *
++ * kernel/cpt/rst_socket.c
++ *
++ * Copyright (C) 2000-2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/config.h>
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/errno.h>
++#include <linux/fs.h>
++#include <linux/namei.h>
++#include <linux/socket.h>
++#include <linux/un.h>
++#include <net/tcp.h>
++#include <net/sock.h>
++#include <net/scm.h>
++#include <net/af_unix.h>
++
++#include <ub/ub_mem.h>
++#include <ub/ub_orphan.h>
++#include <ub/ub_net.h>
++#include <ub/ub_tcp.h>
++
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_mm.h"
++#include "cpt_files.h"
++#include "cpt_socket.h"
++#include "cpt_kernel.h"
++
++#include "cpt_syscalls.h"
++
++
++static int setup_sock_common(struct sock *sk, struct cpt_sock_image *si,
++ loff_t pos, struct cpt_context *ctx)
++{
++ if (sk->sk_socket) {
++ sk->sk_socket->flags = si->cpt_ssflags;
++ sk->sk_socket->state = si->cpt_sstate;
++ }
++ sk->sk_reuse = si->cpt_reuse;
++ sk->sk_shutdown = si->cpt_shutdown;
++ sk->sk_userlocks = si->cpt_userlocks;
++ sk->sk_no_check = si->cpt_no_check;
++ sock_reset_flag(sk, SOCK_DBG);
++ if (si->cpt_debug)
++ sock_set_flag(sk, SOCK_DBG);
++ sock_reset_flag(sk, SOCK_RCVTSTAMP);
++ if (si->cpt_rcvtstamp)
++ sock_set_flag(sk, SOCK_RCVTSTAMP);
++ sock_reset_flag(sk, SOCK_LOCALROUTE);
++ if (si->cpt_localroute)
++ sock_set_flag(sk, SOCK_LOCALROUTE);
++ sk->sk_protocol = si->cpt_protocol;
++ sk->sk_err = si->cpt_err;
++ sk->sk_err_soft = si->cpt_err_soft;
++ sk->sk_priority = si->cpt_priority;
++ sk->sk_rcvlowat = si->cpt_rcvlowat;
++ sk->sk_rcvtimeo = si->cpt_rcvtimeo;
++ if (si->cpt_rcvtimeo == CPT_NULL)
++ sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
++ sk->sk_sndtimeo = si->cpt_sndtimeo;
++ if (si->cpt_sndtimeo == CPT_NULL)
++ sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
++ sk->sk_rcvbuf = si->cpt_rcvbuf;
++ sk->sk_sndbuf = si->cpt_sndbuf;
++ sk->sk_bound_dev_if = si->cpt_bound_dev_if;
++ sk->sk_flags = si->cpt_flags;
++ sk->sk_lingertime = si->cpt_lingertime;
++ if (si->cpt_lingertime == CPT_NULL)
++ sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
++ sk->sk_peercred.pid = si->cpt_peer_pid;
++ sk->sk_peercred.uid = si->cpt_peer_uid;
++ sk->sk_peercred.gid = si->cpt_peer_gid;
++ cpt_timeval_import(&sk->sk_stamp, si->cpt_stamp);
++ return 0;
++}
++
++static struct file *sock_mapfile(struct socket *sock)
++{
++ int fd = sock_map_fd(sock);
++
++ if (fd >= 0) {
++ struct file *file = sock->file;
++ get_file(file);
++ sc_close(fd);
++ return file;
++ }
++ return ERR_PTR(fd);
++}
++
++/* The assumption is that /tmp exists and is writable.
++ * Previous versions assumed that listen() would autobind the
++ * socket.  It does not do so for AF_UNIX, for an obvious reason:
++ * a socket in the abstract namespace is accessible, unlike a
++ * socket bound to a deleted FS object.
++ */
++
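++/*
++ * Probe up to 100 random /tmp/SOCK.%08x names and return the first one
++ * that does not resolve; used to re-bind a socket whose original path
++ * was deleted.
++ */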
++static int
++select_deleted_name(char * name, cpt_context_t *ctx)
++{
++ int i;
++
++ for (i=0; i<100; i++) {
++ struct nameidata nd;
++ unsigned int rnd = net_random();
++
++ sprintf(name, "/tmp/SOCK.%08x", rnd);
++
++ if (path_lookup(name, 0, &nd) != 0)
++ return 0;
++
++ path_release(&nd);
++ }
++
++ eprintk_ctx("failed to allocate deleted socket inode\n");
++ return -ELOOP;
++}
++
++static int
++bind_unix_socket(struct socket *sock, struct cpt_sock_image *si,
++ cpt_context_t *ctx)
++{
++ int err;
++ char *name;
++ struct sockaddr* addr;
++ int addrlen;
++ struct sockaddr_un sun;
++ struct nameidata nd;
++
++ if ((addrlen = si->cpt_laddrlen) <= 2)
++ return 0;
++
++ nd.dentry = NULL;
++ name = ((char*)si->cpt_laddr) + 2;
++ addr = (struct sockaddr *)si->cpt_laddr;
++
++ if (name[0]) {
++ err = path_lookup(name, 0, &nd);
++ if (err) {
++ nd.dentry = NULL;
++ } else {
++ if (si->cpt_deleted) {
++ path_release(&nd);
++ nd.dentry = NULL;
++ addr = (struct sockaddr*)&sun;
++ addr->sa_family = AF_UNIX;
++ name = ((char*)addr) + 2;
++ err = select_deleted_name(name, ctx);
++ if (err)
++ return err;
++ addrlen = 2 + strlen(name);
++ } else if (!S_ISSOCK(nd.dentry->d_inode->i_mode)) {
++ eprintk_ctx("bind_unix_socket: not a socket dentry\n");
++ path_release(&nd);
++ return -EINVAL;
++ }
++ }
++ if (nd.dentry)
++ sc_unlink(name);
++ }
++
++ err = sock->ops->bind(sock, addr, addrlen);
++
++ if (!err) {
++ if (nd.dentry) {
++ sc_chown(name, nd.dentry->d_inode->i_uid,
++ nd.dentry->d_inode->i_gid);
++ sc_chmod(name, nd.dentry->d_inode->i_mode);
++ }
++ if (si->cpt_deleted && name[0])
++ sc_unlink(name);
++ }
++ if (nd.dentry)
++ path_release(&nd);
++ return err;
++}
++
++static int fixup_unix_address(struct socket *sock, struct cpt_sock_image *si,
++ struct cpt_context *ctx)
++{
++ struct sock *sk = sock->sk;
++ cpt_object_t *obj;
++ struct sock *parent;
++
++ if (sk->sk_family != AF_UNIX || sk->sk_state == TCP_LISTEN)
++ return 0;
++
++ if (si->cpt_parent == -1)
++ return bind_unix_socket(sock, si, ctx);
++
++ obj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, si->cpt_parent, ctx);
++ if (!obj)
++ return 0;
++
++ parent = obj->o_obj;
++ if (unix_sk(parent)->addr) {
++ if (unix_sk(sk)->addr &&
++ atomic_dec_and_test(&unix_sk(sk)->addr->refcnt))
++ kfree(unix_sk(sk)->addr);
++ atomic_inc(&unix_sk(parent)->addr->refcnt);
++ unix_sk(sk)->addr = unix_sk(parent)->addr;
++ }
++ return 0;
++}
++
++
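++/*
++ * open_socket() recreates one non-listening socket: it creates the
++ * kernel socket (and its socketpair peer, if any), attaches struct file
++ * objects and registers them in the object table, restores the common
++ * sock fields, binds/connects according to the address family, attaches
++ * accepted children to their parent listener, and finally orphans
++ * AF_INET sockets that have no associated file.
++ */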
++static int open_socket(cpt_object_t *obj, struct cpt_sock_image *si,
++ struct cpt_context *ctx)
++{
++ int err;
++ struct socket *sock;
++ struct socket *sock2 = NULL;
++ struct file *file;
++ cpt_object_t *fobj;
++ cpt_object_t *pobj = NULL;
++
++ err = sock_create_kern(si->cpt_family, si->cpt_type, si->cpt_protocol,
++ &sock);
++ if (err)
++ return err;
++
++ if (si->cpt_socketpair) {
++ err = sock_create_kern(si->cpt_family, si->cpt_type,
++ si->cpt_protocol, &sock2);
++ if (err)
++ goto err_out;
++
++ err = sock->ops->socketpair(sock, sock2);
++ if (err < 0)
++ goto err_out;
++
++		/* The socketpair's peer is outside our environment,
++		 * so we create a real half-open pipe and stop worrying
++		 * about the dead end. */
++ if (si->cpt_peer == -1) {
++ sock_release(sock2);
++ sock2 = NULL;
++ }
++ }
++
++ cpt_obj_setobj(obj, sock->sk, ctx);
++
++ if (si->cpt_file != CPT_NULL) {
++ file = sock_mapfile(sock);
++ err = PTR_ERR(file);
++ if (IS_ERR(file))
++ goto err_out;
++
++ err = -ENOMEM;
++
++ obj->o_parent = file;
++
++ if ((fobj = cpt_object_add(CPT_OBJ_FILE, file, ctx)) == NULL)
++ goto err_out;
++ cpt_obj_setpos(fobj, si->cpt_file, ctx);
++ cpt_obj_setindex(fobj, si->cpt_index, ctx);
++ }
++
++ if (sock2) {
++ struct file *file2;
++
++ pobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, si->cpt_peer, ctx);
++ if (!pobj) BUG();
++ if (pobj->o_obj) BUG();
++ cpt_obj_setobj(pobj, sock2->sk, ctx);
++
++ if (pobj->o_ppos != CPT_NULL) {
++ file2 = sock_mapfile(sock2);
++ err = PTR_ERR(file2);
++ if (IS_ERR(file2))
++ goto err_out;
++
++ err = -ENOMEM;
++ if ((fobj = cpt_object_add(CPT_OBJ_FILE, file2, ctx)) == NULL)
++ goto err_out;
++ cpt_obj_setpos(fobj, pobj->o_ppos, ctx);
++ cpt_obj_setindex(fobj, si->cpt_peer, ctx);
++
++ pobj->o_parent = file2;
++ }
++ }
++
++ setup_sock_common(sock->sk, si, obj->o_pos, ctx);
++ if (sock->sk->sk_family == AF_INET || sock->sk->sk_family == AF_INET6) {
++ inet_sk(sock->sk)->freebind = 1;
++ if (si->cpt_laddrlen) {
++ err = sock->ops->bind(sock, (struct sockaddr *)&si->cpt_laddr, si->cpt_laddrlen);
++ if (err) {
++ dprintk_ctx("binding failed: %d, do not worry\n", err);
++ }
++ }
++ rst_socket_in(si, obj->o_pos, sock->sk, ctx);
++ } else if (sock->sk->sk_family == AF_NETLINK) {
++ err = sock->ops->bind(sock, (struct sockaddr *)&si->cpt_laddr, si->cpt_laddrlen);
++ if (err) {
++ eprintk_ctx("AF_NETLINK binding failed: %d\n", err);
++ }
++ if (si->cpt_raddrlen) {
++ err = sock->ops->connect(sock, (struct sockaddr *)&si->cpt_raddr, si->cpt_raddrlen, O_NONBLOCK);
++ if (err) {
++ eprintk_ctx("oops, AF_NETLINK connect failed: %d\n", err);
++ }
++ }
++ }
++ fixup_unix_address(sock, si, ctx);
++
++ if (sock2) {
++ err = rst_get_object(CPT_OBJ_SOCKET, pobj->o_pos, si, ctx);
++ if (err)
++ return err;
++ setup_sock_common(sock2->sk, si, pobj->o_pos, ctx);
++ fixup_unix_address(sock2, si, ctx);
++ }
++
++ if ((sock->sk->sk_family == AF_INET || sock->sk->sk_family == AF_INET6)
++ && (int)si->cpt_parent != -1) {
++ cpt_object_t *lobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, si->cpt_parent, ctx);
++ if (lobj && cpt_attach_accept(lobj->o_obj, sock->sk, ctx) == 0)
++ sock->sk = NULL;
++ }
++
++
++ if (si->cpt_file == CPT_NULL && sock->sk &&
++ sock->sk->sk_family == AF_INET) {
++ struct sock *sk = sock->sk;
++
++ if (sk) {
++ sock->sk = NULL;
++
++ local_bh_disable();
++ bh_lock_sock(sk);
++ if (sock_owned_by_user(sk))
++ eprintk_ctx("oops, sock is locked by user\n");
++
++ sock_hold(sk);
++ sock_orphan(sk);
++ ub_inc_orphan_count(sk);
++ bh_unlock_sock(sk);
++ local_bh_enable();
++ sock_put(sk);
++ dprintk_ctx("orphaning socket %p\n", sk);
++ }
++ }
++
++ if (si->cpt_file == CPT_NULL && sock->sk == NULL)
++ sock_release(sock);
++
++ return 0;
++
++err_out:
++ if (sock2)
++ sock_release(sock2);
++ sock_release(sock);
++ return err;
++}
++
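++/*
++ * open_listening_socket() recreates a TCP_LISTEN socket during the first
++ * pass: bind (via bind_unix_socket() for AF_UNIX, otherwise with freebind
++ * set for AF_INET/AF_INET6), listen with the saved backlog, map the socket
++ * to a file, register both objects, then restore the common fields and the
++ * SYN-wait queue.
++ */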
++static int open_listening_socket(loff_t pos, struct cpt_sock_image *si,
++ struct cpt_context *ctx)
++{
++ int err;
++ struct socket *sock;
++ struct file *file;
++ cpt_object_t *obj, *fobj;
++
++ err = sock_create_kern(si->cpt_family, si->cpt_type, si->cpt_protocol,
++ &sock);
++ if (err) {
++ eprintk_ctx("open_listening_socket: sock_create_kern: %d\n", err);
++ return err;
++ }
++
++ sock->sk->sk_reuse = 2;
++ sock->sk->sk_bound_dev_if = si->cpt_bound_dev_if;
++
++ if (sock->sk->sk_family == AF_UNIX) {
++ err = bind_unix_socket(sock, si, ctx);
++ } else if (si->cpt_laddrlen) {
++ if (sock->sk->sk_family == AF_INET || sock->sk->sk_family == AF_INET6)
++ inet_sk(sock->sk)->freebind = 1;
++
++ err = sock->ops->bind(sock, (struct sockaddr *)&si->cpt_laddr, si->cpt_laddrlen);
++
++ if (err) {
++ eprintk_ctx("open_listening_socket: bind: %d\n", err);
++ goto err_out;
++ }
++ }
++
++ err = sock->ops->listen(sock, si->cpt_max_ack_backlog);
++ if (err) {
++ eprintk_ctx("open_listening_socket: listen: %d, %Ld, %d\n", err, pos, si->cpt_deleted);
++ goto err_out;
++ }
++
++	/* Now we may access the socket body directly and fix everything up. */
++
++ file = sock_mapfile(sock);
++ err = PTR_ERR(file);
++ if (IS_ERR(file)) {
++ eprintk_ctx("open_listening_socket: map: %d\n", err);
++ goto err_out;
++ }
++
++ err = -ENOMEM;
++ if ((fobj = cpt_object_add(CPT_OBJ_FILE, file, ctx)) == NULL)
++ goto err_out;
++ if ((obj = cpt_object_add(CPT_OBJ_SOCKET, sock->sk, ctx)) == NULL)
++ goto err_out;
++ cpt_obj_setpos(obj, pos, ctx);
++ cpt_obj_setindex(obj, si->cpt_index, ctx);
++ obj->o_parent = file;
++ cpt_obj_setpos(fobj, si->cpt_file, ctx);
++ cpt_obj_setindex(fobj, si->cpt_index, ctx);
++
++ setup_sock_common(sock->sk, si, pos, ctx);
++
++ if (si->cpt_family == AF_INET || si->cpt_family == AF_INET6)
++ rst_restore_synwait_queue(sock->sk, si, pos, ctx);
++
++ return 0;
++
++err_out:
++ sock_release(sock);
++ return err;
++}
++
++static int
++rst_sock_attr_mcfilter(loff_t *pos_p, struct sock *sk, cpt_context_t *ctx)
++{
++ int err;
++ loff_t pos = *pos_p;
++ struct cpt_sockmc_image v;
++
++ err = rst_get_object(CPT_OBJ_SOCK_MCADDR, pos, &v, ctx);
++ if (err)
++ return err;
++
++ *pos_p += v.cpt_next;
++
++ if (v.cpt_family == AF_INET)
++ return rst_sk_mcfilter_in(sk, &v, pos, ctx);
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ else if (v.cpt_family == AF_INET6)
++ return rst_sk_mcfilter_in6(sk, &v, pos, ctx);
++#endif
++ else
++ return -EAFNOSUPPORT;
++}
++
++
++static int
++rst_sock_attr_skfilter(loff_t *pos_p, struct sock *sk, cpt_context_t *ctx)
++{
++ int err;
++ struct sk_filter *fp, *old_fp;
++ loff_t pos = *pos_p;
++ struct cpt_obj_bits v;
++
++ err = rst_get_object(CPT_OBJ_SKFILTER, pos, &v, ctx);
++ if (err)
++ return err;
++
++ *pos_p += v.cpt_next;
++
++ if (v.cpt_size % sizeof(struct sock_filter))
++ return -EINVAL;
++
++ fp = sock_kmalloc(sk, v.cpt_size+sizeof(*fp), GFP_KERNEL_UBC);
++ if (fp == NULL)
++ return -ENOMEM;
++ atomic_set(&fp->refcnt, 1);
++ fp->len = v.cpt_size/sizeof(struct sock_filter);
++
++ err = ctx->pread(fp->insns, v.cpt_size, ctx, pos+v.cpt_hdrlen);
++ if (err) {
++ sk_filter_release(sk, fp);
++ return err;
++ }
++
++ old_fp = sk->sk_filter;
++ sk->sk_filter = fp;
++ if (old_fp)
++ sk_filter_release(sk, old_fp);
++ return 0;
++}
++
++
++int rst_sock_attr(loff_t *pos_p, struct sock *sk, cpt_context_t *ctx)
++{
++ int err;
++ loff_t pos = *pos_p;
++
++ err = rst_sock_attr_skfilter(pos_p, sk, ctx);
++ if (err && pos == *pos_p)
++ err = rst_sock_attr_mcfilter(pos_p, sk, ctx);
++ return err;
++}
++
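++/*
++ * rst_skb() rebuilds one sk_buff from a CPT_OBJ_SKB image: it allocates a
++ * buffer with the saved head/tail room, restores the header offsets,
++ * control block and metadata, reads the payload from a trailing
++ * CPT_OBJ_BITS object and collects CPT_OBJ_FILEDESC objects into an
++ * scm_fp_list for in-flight AF_UNIX descriptors.
++ */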
++struct sk_buff * rst_skb(loff_t *pos_p, __u32 *owner, __u32 *queue, struct cpt_context *ctx)
++{
++ int err;
++ struct sk_buff *skb;
++ struct cpt_skb_image v;
++ loff_t pos = *pos_p;
++ struct scm_fp_list *fpl = NULL;
++ struct timeval tmptv;
++
++ err = rst_get_object(CPT_OBJ_SKB, pos, &v, ctx);
++ if (err)
++ return ERR_PTR(err);
++ *pos_p = pos + v.cpt_next;
++
++ if (owner)
++ *owner = v.cpt_owner;
++ if (queue)
++ *queue = v.cpt_queue;
++
++ skb = alloc_skb(v.cpt_len + v.cpt_hspace + v.cpt_tspace, GFP_KERNEL);
++ if (skb == NULL)
++ return ERR_PTR(-ENOMEM);
++ skb_reserve(skb, v.cpt_hspace);
++ skb_put(skb, v.cpt_len);
++ skb->h.raw = skb->head + v.cpt_h;
++ skb->nh.raw = skb->head + v.cpt_nh;
++ skb->mac.raw = skb->head + v.cpt_mac;
++ if (sizeof(skb->cb) < sizeof(v.cpt_cb)) BUG();
++ memcpy(skb->cb, v.cpt_cb, sizeof(v.cpt_cb));
++ skb->mac_len = v.cpt_mac_len;
++
++ skb->csum = v.cpt_csum;
++ skb->local_df = v.cpt_local_df;
++ skb->pkt_type = v.cpt_pkt_type;
++ skb->ip_summed = v.cpt_ip_summed;
++ skb->priority = v.cpt_priority;
++ skb->protocol = v.cpt_protocol;
++ cpt_timeval_import(&tmptv, v.cpt_stamp);
++ skb_set_timestamp(skb, &tmptv);
++
++ skb_shinfo(skb)->tso_segs = v.cpt_tso_segs;
++ skb_shinfo(skb)->tso_size = v.cpt_tso_size;
++ if (ctx->image_version == 0) {
++ skb_shinfo(skb)->tso_segs = 1;
++ skb_shinfo(skb)->tso_size = 0;
++ }
++
++ if (v.cpt_next > v.cpt_hdrlen) {
++ pos = pos + v.cpt_hdrlen;
++ while (pos < *pos_p) {
++ union {
++ struct cpt_obj_bits b;
++ struct cpt_fd_image f;
++ } u;
++
++ err = rst_get_object(-1, pos, &u, ctx);
++ if (err) {
++ kfree_skb(skb);
++ return ERR_PTR(err);
++ }
++ if (u.b.cpt_object == CPT_OBJ_BITS) {
++ if (u.b.cpt_size != v.cpt_hspace + skb->len) {
++ eprintk_ctx("invalid skb image %u != %u + %u\n", u.b.cpt_size, v.cpt_hspace, skb->len);
++ kfree_skb(skb);
++ return ERR_PTR(-EINVAL);
++ }
++
++ err = ctx->pread(skb->head, u.b.cpt_size, ctx, pos+u.b.cpt_hdrlen);
++ if (err) {
++ kfree_skb(skb);
++ return ERR_PTR(err);
++ }
++ } else if (u.f.cpt_object == CPT_OBJ_FILEDESC) {
++ if (!fpl) {
++ fpl = ub_kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL);
++ if (!fpl) {
++ kfree_skb(skb);
++ return ERR_PTR(-ENOMEM);
++ }
++ fpl->count = 0;
++ UNIXCB(skb).fp = fpl;
++ }
++ fpl->fp[fpl->count] = rst_file(u.f.cpt_file, -1, ctx);
++ if (!IS_ERR(fpl->fp[fpl->count]))
++ fpl->count++;
++ }
++ pos += u.b.cpt_next;
++ }
++ }
++
++ return skb;
++}
++
++static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
++{
++ int i;
++ scm->fp = UNIXCB(skb).fp;
++ skb->destructor = sock_wfree;
++ UNIXCB(skb).fp = NULL;
++
++ for (i=scm->fp->count-1; i>=0; i--)
++ unix_notinflight(scm->fp->fp[i]);
++}
++
++static void unix_destruct_fds(struct sk_buff *skb)
++{
++ struct scm_cookie scm;
++ memset(&scm, 0, sizeof(scm));
++ unix_detach_fds(&scm, skb);
++ scm_destroy(&scm);
++ sock_wfree(skb);
++ module_put(THIS_MODULE);
++}
++
++
++static int restore_unix_rqueue(struct sock *sk, struct cpt_sock_image *si,
++ loff_t pos, struct cpt_context *ctx)
++{
++ loff_t endpos;
++
++ pos = pos + si->cpt_hdrlen;
++ endpos = pos + si->cpt_next;
++ while (pos < endpos) {
++ struct sk_buff *skb;
++ struct sock *owner_sk;
++ __u32 owner;
++
++ skb = rst_skb(&pos, &owner, NULL, ctx);
++ if (IS_ERR(skb)) {
++ if (PTR_ERR(skb) == -EINVAL) {
++ int err;
++
++ err = rst_sock_attr(&pos, sk, ctx);
++ if (err)
++ return err;
++ }
++ return PTR_ERR(skb);
++ }
++
++ owner_sk = unix_peer(sk);
++ if (owner != -1) {
++ cpt_object_t *pobj;
++ pobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, owner, ctx);
++ if (pobj == NULL) {
++ eprintk_ctx("orphan af_unix skb?\n");
++ kfree_skb(skb);
++ continue;
++ }
++ owner_sk = pobj->o_obj;
++ }
++ if (owner_sk == NULL) {
++ dprintk_ctx("orphan af_unix skb 2?\n");
++ kfree_skb(skb);
++ continue;
++ }
++ skb_set_owner_w(skb, owner_sk);
++ if (UNIXCB(skb).fp) {
++ skb->destructor = unix_destruct_fds;
++ if (!try_module_get(THIS_MODULE)) BUG();
++ }
++ skb_queue_tail(&sk->sk_receive_queue, skb);
++ if (sk->sk_state == TCP_LISTEN) {
++ struct socket *sock = skb->sk->sk_socket;
++ if (sock == NULL) BUG();
++ if (sock->file) BUG();
++ skb->sk->sk_socket = NULL;
++ skb->sk->sk_sleep = NULL;
++ sock->sk = NULL;
++ sock_release(sock);
++ }
++ }
++ return 0;
++}
++
++
++/* All the sockets are created before we start to open files */
++
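++/*
++ * rst_sockets() reads the CPT_SECT_SOCKET section in two passes: pass 1
++ * builds the socket index and opens listening sockets, pass 2 restores
++ * the remaining sockets via open_socket().  AF_UNIX peers and queues are
++ * linked later, in rst_sockets_complete().
++ */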
++int rst_sockets(struct cpt_context *ctx)
++{
++ int err;
++ loff_t sec = ctx->sections[CPT_SECT_SOCKET];
++ loff_t endsec;
++ cpt_object_t *obj;
++ struct cpt_section_hdr h;
++
++ if (sec == CPT_NULL)
++ return 0;
++
++ err = ctx->pread(&h, sizeof(h), ctx, sec);
++ if (err) {
++ eprintk_ctx("rst_sockets: ctx->pread: %d\n", err);
++ return err;
++ }
++ if (h.cpt_section != CPT_SECT_SOCKET || h.cpt_hdrlen < sizeof(h)) {
++ eprintk_ctx("rst_sockets: hdr err\n");
++ return -EINVAL;
++ }
++
++ /* The first pass: we create socket index and open listening sockets. */
++ endsec = sec + h.cpt_next;
++ sec += h.cpt_hdrlen;
++ while (sec < endsec) {
++ struct cpt_sock_image *sbuf = cpt_get_buf(ctx);
++ err = rst_get_object(CPT_OBJ_SOCKET, sec, sbuf, ctx);
++ if (err) {
++ eprintk_ctx("rst_sockets: rst_get_object: %d\n", err);
++ cpt_release_buf(ctx);
++ return err;
++ }
++ if (sbuf->cpt_state == TCP_LISTEN) {
++ err = open_listening_socket(sec, sbuf, ctx);
++ cpt_release_buf(ctx);
++ if (err) {
++ eprintk_ctx("rst_sockets: open_listening_socket: %d\n", err);
++ return err;
++ }
++ } else {
++ cpt_release_buf(ctx);
++ obj = alloc_cpt_object(GFP_KERNEL, ctx);
++ if (obj == NULL)
++ return -ENOMEM;
++ cpt_obj_setindex(obj, sbuf->cpt_index, ctx);
++ cpt_obj_setpos(obj, sec, ctx);
++ obj->o_ppos = sbuf->cpt_file;
++ intern_cpt_object(CPT_OBJ_SOCKET, obj, ctx);
++ }
++ sec += sbuf->cpt_next;
++ }
++
++ /* Pass 2: really restore sockets */
++ for_each_object(obj, CPT_OBJ_SOCKET) {
++ struct cpt_sock_image *sbuf;
++ if (obj->o_obj != NULL)
++ continue;
++ sbuf = cpt_get_buf(ctx);
++ err = rst_get_object(CPT_OBJ_SOCKET, obj->o_pos, sbuf, ctx);
++ if (err) {
++ eprintk_ctx("rst_sockets: rst_get_object: %d\n", err);
++ cpt_release_buf(ctx);
++ return err;
++ }
++ if (sbuf->cpt_state == TCP_LISTEN) BUG();
++ err = open_socket(obj, sbuf, ctx);
++ cpt_release_buf(ctx);
++ if (err) {
++ eprintk_ctx("rst_sockets: open_socket: %d\n", err);
++ return err;
++ }
++ }
++
++ return 0;
++}
++
++int rst_orphans(struct cpt_context *ctx)
++{
++ int err;
++ loff_t sec = ctx->sections[CPT_SECT_ORPHANS];
++ loff_t endsec;
++ cpt_object_t *obj;
++ struct cpt_section_hdr h;
++
++ if (sec == CPT_NULL)
++ return 0;
++
++ err = ctx->pread(&h, sizeof(h), ctx, sec);
++ if (err)
++ return err;
++ if (h.cpt_section != CPT_SECT_ORPHANS || h.cpt_hdrlen < sizeof(h))
++ return -EINVAL;
++
++ endsec = sec + h.cpt_next;
++ sec += h.cpt_hdrlen;
++ while (sec < endsec) {
++ struct cpt_sock_image *sbuf = cpt_get_buf(ctx);
++ err = rst_get_object(CPT_OBJ_SOCKET, sec, sbuf, ctx);
++ if (err) {
++ cpt_release_buf(ctx);
++ return err;
++ }
++ obj = alloc_cpt_object(GFP_KERNEL, ctx);
++ if (obj == NULL) {
++ cpt_release_buf(ctx);
++ return -ENOMEM;
++ }
++ obj->o_pos = sec;
++ obj->o_ppos = sbuf->cpt_file;
++ err = open_socket(obj, sbuf, ctx);
++ dprintk_ctx("Restoring orphan: %d\n", err);
++ free_cpt_object(obj, ctx);
++ cpt_release_buf(ctx);
++ if (err)
++ return err;
++ sec += sbuf->cpt_next;
++ }
++
++ return 0;
++}
++
++
++/* Pass 3: admittedly this has stopped being funny :-),
++ * but we have to do yet another pass to establish links between
++ * unpaired AF_UNIX SOCK_DGRAM sockets and to restore AF_UNIX
++ * skb queues with proper skb->sk links.
++ *
++ * This could be done at the end of rst_sockets(), but we defer
++ * restoring af_unix queues until the end of file restore to make
++ * restoring passed FDs cleaner.
++ */
++
++int rst_sockets_complete(struct cpt_context *ctx)
++{
++ int err;
++ cpt_object_t *obj;
++
++ for_each_object(obj, CPT_OBJ_SOCKET) {
++ struct cpt_sock_image *sbuf;
++ struct sock *sk = obj->o_obj;
++ struct sock *peer;
++
++ if (!sk) BUG();
++
++ if (sk->sk_family != AF_UNIX)
++ continue;
++
++ sbuf = cpt_get_buf(ctx);
++ err = rst_get_object(CPT_OBJ_SOCKET, obj->o_pos, sbuf, ctx);
++ if (err) {
++ cpt_release_buf(ctx);
++ return err;
++ }
++
++ if (sbuf->cpt_next > sbuf->cpt_hdrlen)
++ restore_unix_rqueue(sk, sbuf, obj->o_pos, ctx);
++
++ cpt_release_buf(ctx);
++
++ if (sk->sk_type == SOCK_DGRAM && unix_peer(sk) == NULL) {
++ cpt_object_t *pobj;
++
++ sbuf = cpt_get_buf(ctx);
++ err = rst_get_object(CPT_OBJ_SOCKET, obj->o_pos, sbuf, ctx);
++ if (err) {
++ cpt_release_buf(ctx);
++ return err;
++ }
++
++ if (sbuf->cpt_peer != -1) {
++ pobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, sbuf->cpt_peer, ctx);
++ if (pobj) {
++ peer = pobj->o_obj;
++ sock_hold(peer);
++ unix_peer(sk) = peer;
++ }
++ }
++ cpt_release_buf(ctx);
++ }
++ }
++
++ rst_orphans(ctx);
++
++ return 0;
++}
++
+diff -upr linux-2.6.16.orig/kernel/cpt/rst_socket_in.c linux-2.6.16-026test015/kernel/cpt/rst_socket_in.c
+--- linux-2.6.16.orig/kernel/cpt/rst_socket_in.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/cpt/rst_socket_in.c 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,494 @@
++/*
++ *
++ * kernel/cpt/rst_socket_in.c
++ *
++ * Copyright (C) 2000-2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/config.h>
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/errno.h>
++#include <linux/fs.h>
++#include <linux/socket.h>
++#include <linux/tcp.h>
++#include <linux/jhash.h>
++#include <net/sock.h>
++#include <net/tcp.h>
++#include <linux/ipv6.h>
++#include <linux/igmp.h>
++#include <net/addrconf.h>
++#include <net/inet6_connection_sock.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_mm.h"
++#include "cpt_socket.h"
++#include "cpt_kernel.h"
++
++static inline unsigned long jiffies_import(__u32 tmo)
++{
++ __s32 delta = tmo;
++ return jiffies + (long)delta;
++}
++
++static inline __u32 tcp_jiffies_import(__u32 tmo)
++{
++ return ((__u32)jiffies) + tmo;
++}
++
++
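++/*
++ * restore_queues() replays the saved skbs into the proper queues: for
++ * stream sockets CPT_SKB_RQ goes to the receive queue, CPT_SKB_OFOQ to
++ * the TCP out-of-order queue and CPT_SKB_WQ to the write queue, each
++ * charged to the beancounter; for datagram sockets write-queue skbs are
++ * kept only while a cork is in progress.
++ */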
++static int restore_queues(struct sock *sk, struct cpt_sock_image *si,
++ loff_t pos, struct cpt_context *ctx)
++{
++ loff_t endpos;
++
++ pos = pos + si->cpt_hdrlen;
++ endpos = pos + si->cpt_next;
++ while (pos < endpos) {
++ struct sk_buff *skb;
++ __u32 type;
++
++ skb = rst_skb(&pos, NULL, &type, ctx);
++ if (IS_ERR(skb)) {
++ if (PTR_ERR(skb) == -EINVAL) {
++ int err;
++
++ err = rst_sock_attr(&pos, sk, ctx);
++ if (err)
++ return err;
++ }
++ return PTR_ERR(skb);
++ }
++
++ if (sk->sk_type == SOCK_STREAM) {
++ if (type == CPT_SKB_RQ) {
++ sk_stream_set_owner_r(skb, sk);
++ ub_tcprcvbuf_charge_forced(sk, skb);
++ skb_queue_tail(&sk->sk_receive_queue, skb);
++ } else if (type == CPT_SKB_OFOQ) {
++ struct tcp_sock *tp = tcp_sk(sk);
++ sk_stream_set_owner_r(skb, sk);
++ ub_tcprcvbuf_charge_forced(sk, skb);
++ skb_queue_tail(&tp->out_of_order_queue, skb);
++ } else if (type == CPT_SKB_WQ) {
++ sk->sk_wmem_queued += skb->truesize;
++ sk->sk_forward_alloc -= skb->truesize;
++ ub_tcpsndbuf_charge_forced(sk, skb);
++ skb_queue_tail(&sk->sk_write_queue, skb);
++ } else {
++ wprintk_ctx("strange stream queue type %u\n", type);
++ kfree_skb(skb);
++ }
++ } else {
++ if (type == CPT_SKB_RQ) {
++ skb_set_owner_r(skb, sk);
++ skb_queue_tail(&sk->sk_receive_queue, skb);
++ } else if (type == CPT_SKB_WQ) {
++ struct inet_sock *inet = inet_sk(sk);
++ if (inet->cork.fragsize) {
++ skb_set_owner_w(skb, sk);
++ skb_queue_tail(&sk->sk_write_queue, skb);
++ } else {
++ eprintk_ctx("cork skb is dropped\n");
++ kfree_skb(skb);
++ }
++ } else {
++ wprintk_ctx("strange dgram queue type %u\n", type);
++ kfree_skb(skb);
++ }
++ }
++ }
++ return 0;
++}
++
++static struct sock *find_parent(__u16 sport, cpt_context_t *ctx)
++{
++ cpt_object_t *obj;
++ for_each_object(obj, CPT_OBJ_SOCKET) {
++ struct sock *sk = obj->o_obj;
++ if (sk &&
++ sk->sk_state == TCP_LISTEN &&
++ (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) &&
++ inet_sk(sk)->sport == sport)
++ return sk;
++ }
++ return NULL;
++}
++
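++/*
++ * rst_socket_tcp() copies the saved TCP state back into tcp_sock and
++ * inet_connection_sock (sequence numbers, window and congestion
++ * parameters, SACK blocks, keepalive settings), recomputes sk_send_head
++ * from snd_nxt and, for sockets that are neither closed nor listening,
++ * rebinds the local port (inherited from the parent or a matching
++ * listener), rehashes the socket and re-arms the delayed-ACK,
++ * retransmit and keepalive timers.
++ */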
++static int rst_socket_tcp(struct cpt_sock_image *si, loff_t pos, struct sock *sk,
++ struct cpt_context *ctx)
++{
++ struct tcp_sock *tp = tcp_sk(sk);
++ struct sk_buff *skb;
++ tp->pred_flags = si->cpt_pred_flags;
++ tp->rcv_nxt = si->cpt_rcv_nxt;
++ tp->snd_nxt = si->cpt_snd_nxt;
++ tp->snd_una = si->cpt_snd_una;
++ tp->snd_sml = si->cpt_snd_sml;
++ tp->rcv_tstamp = tcp_jiffies_import(si->cpt_rcv_tstamp);
++ tp->lsndtime = tcp_jiffies_import(si->cpt_lsndtime);
++ tp->tcp_header_len = si->cpt_tcp_header_len;
++ inet_csk(sk)->icsk_ack.pending = si->cpt_ack_pending;
++ inet_csk(sk)->icsk_ack.quick = si->cpt_quick;
++ inet_csk(sk)->icsk_ack.pingpong = si->cpt_pingpong;
++ inet_csk(sk)->icsk_ack.blocked = si->cpt_blocked;
++ inet_csk(sk)->icsk_ack.ato = si->cpt_ato;
++ inet_csk(sk)->icsk_ack.timeout = jiffies_import(si->cpt_ack_timeout);
++ inet_csk(sk)->icsk_ack.lrcvtime = tcp_jiffies_import(si->cpt_lrcvtime);
++ inet_csk(sk)->icsk_ack.last_seg_size = si->cpt_last_seg_size;
++ inet_csk(sk)->icsk_ack.rcv_mss = si->cpt_rcv_mss;
++ tp->snd_wl1 = si->cpt_snd_wl1;
++ tp->snd_wnd = si->cpt_snd_wnd;
++ tp->max_window = si->cpt_max_window;
++ inet_csk(sk)->icsk_pmtu_cookie = si->cpt_pmtu_cookie;
++ tp->mss_cache = si->cpt_mss_cache;
++ tp->rx_opt.mss_clamp = si->cpt_mss_clamp;
++ inet_csk(sk)->icsk_ext_hdr_len = si->cpt_ext_header_len;
++ inet_csk(sk)->icsk_ca_state = si->cpt_ca_state;
++ inet_csk(sk)->icsk_retransmits = si->cpt_retransmits;
++ tp->reordering = si->cpt_reordering;
++ tp->frto_counter = si->cpt_frto_counter;
++ tp->frto_highmark = si->cpt_frto_highmark;
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10)
++ // // tp->adv_cong = si->cpt_adv_cong;
++#endif
++ inet_csk(sk)->icsk_accept_queue.rskq_defer_accept = si->cpt_defer_accept;
++ inet_csk(sk)->icsk_backoff = si->cpt_backoff;
++ tp->srtt = si->cpt_srtt;
++ tp->mdev = si->cpt_mdev;
++ tp->mdev_max = si->cpt_mdev_max;
++ tp->rttvar = si->cpt_rttvar;
++ tp->rtt_seq = si->cpt_rtt_seq;
++ inet_csk(sk)->icsk_rto = si->cpt_rto;
++ tp->packets_out = si->cpt_packets_out;
++ tp->left_out = si->cpt_left_out;
++ tp->retrans_out = si->cpt_retrans_out;
++ tp->lost_out = si->cpt_lost_out;
++ tp->sacked_out = si->cpt_sacked_out;
++ tp->fackets_out = si->cpt_fackets_out;
++ tp->snd_ssthresh = si->cpt_snd_ssthresh;
++ tp->snd_cwnd = si->cpt_snd_cwnd;
++ tp->snd_cwnd_cnt = si->cpt_snd_cwnd_cnt;
++ tp->snd_cwnd_clamp = si->cpt_snd_cwnd_clamp;
++ tp->snd_cwnd_used = si->cpt_snd_cwnd_used;
++ tp->snd_cwnd_stamp = tcp_jiffies_import(si->cpt_snd_cwnd_stamp);
++ inet_csk(sk)->icsk_timeout = tcp_jiffies_import(si->cpt_timeout);
++ tp->rcv_wnd = si->cpt_rcv_wnd;
++ tp->rcv_wup = si->cpt_rcv_wup;
++ tp->write_seq = si->cpt_write_seq;
++ tp->pushed_seq = si->cpt_pushed_seq;
++ tp->copied_seq = si->cpt_copied_seq;
++ tp->rx_opt.tstamp_ok = si->cpt_tstamp_ok;
++ tp->rx_opt.wscale_ok = si->cpt_wscale_ok;
++ tp->rx_opt.sack_ok = si->cpt_sack_ok;
++ tp->rx_opt.saw_tstamp = si->cpt_saw_tstamp;
++ tp->rx_opt.snd_wscale = si->cpt_snd_wscale;
++ tp->rx_opt.rcv_wscale = si->cpt_rcv_wscale;
++ tp->nonagle = si->cpt_nonagle;
++ tp->keepalive_probes = si->cpt_keepalive_probes;
++ tp->rx_opt.rcv_tsval = si->cpt_rcv_tsval;
++ tp->rx_opt.rcv_tsecr = si->cpt_rcv_tsecr;
++ tp->rx_opt.ts_recent = si->cpt_ts_recent;
++ tp->rx_opt.ts_recent_stamp = si->cpt_ts_recent_stamp;
++ tp->rx_opt.user_mss = si->cpt_user_mss;
++ tp->rx_opt.dsack = si->cpt_dsack;
++ tp->rx_opt.eff_sacks = si->cpt_num_sacks;
++ tp->duplicate_sack[0].start_seq = si->cpt_sack_array[0];
++ tp->duplicate_sack[0].end_seq = si->cpt_sack_array[1];
++ tp->selective_acks[0].start_seq = si->cpt_sack_array[2];
++ tp->selective_acks[0].end_seq = si->cpt_sack_array[3];
++ tp->selective_acks[1].start_seq = si->cpt_sack_array[4];
++ tp->selective_acks[1].end_seq = si->cpt_sack_array[5];
++ tp->selective_acks[2].start_seq = si->cpt_sack_array[6];
++ tp->selective_acks[2].end_seq = si->cpt_sack_array[7];
++ tp->selective_acks[3].start_seq = si->cpt_sack_array[8];
++ tp->selective_acks[3].end_seq = si->cpt_sack_array[9];
++
++ tp->window_clamp = si->cpt_window_clamp;
++ tp->rcv_ssthresh = si->cpt_rcv_ssthresh;
++ inet_csk(sk)->icsk_probes_out = si->cpt_probes_out;
++ tp->rx_opt.num_sacks = si->cpt_num_sacks;
++ tp->advmss = si->cpt_advmss;
++ inet_csk(sk)->icsk_syn_retries = si->cpt_syn_retries;
++ tp->ecn_flags = si->cpt_ecn_flags;
++ tp->prior_ssthresh = si->cpt_prior_ssthresh;
++ tp->high_seq = si->cpt_high_seq;
++ tp->retrans_stamp = si->cpt_retrans_stamp;
++ tp->undo_marker = si->cpt_undo_marker;
++ tp->undo_retrans = si->cpt_undo_retrans;
++ tp->urg_seq = si->cpt_urg_seq;
++ tp->urg_data = si->cpt_urg_data;
++ inet_csk(sk)->icsk_pending = si->cpt_pending;
++ tp->urg_mode = si->cpt_urg_mode;
++ tp->snd_up = si->cpt_snd_up;
++ tp->keepalive_time = si->cpt_keepalive_time;
++ tp->keepalive_intvl = si->cpt_keepalive_intvl;
++ tp->linger2 = si->cpt_linger2;
++
++ sk->sk_send_head = NULL;
++ for (skb = skb_peek(&sk->sk_write_queue);
++ skb && skb != (struct sk_buff*)&sk->sk_write_queue;
++ skb = skb->next) {
++ if (!after(tp->snd_nxt, TCP_SKB_CB(skb)->seq)) {
++ sk->sk_send_head = skb;
++ break;
++ }
++ }
++
++ if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN) {
++ struct inet_sock *inet = inet_sk(sk);
++ if (inet->num == 0) {
++ cpt_object_t *lobj = NULL;
++
++ if ((int)si->cpt_parent != -1)
++ lobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, si->cpt_parent, ctx);
++
++ if (lobj && lobj->o_obj) {
++ inet->num = ntohs(inet->sport);
++ local_bh_disable();
++ __inet_inherit_port(&tcp_hashinfo, lobj->o_obj, sk);
++ local_bh_enable();
++ dprintk_ctx("port inherited from parent\n");
++ } else {
++ struct sock *lsk = find_parent(inet->sport, ctx);
++ if (lsk) {
++ inet->num = ntohs(inet->sport);
++ local_bh_disable();
++ __inet_inherit_port(&tcp_hashinfo, lsk, sk);
++ local_bh_enable();
++ dprintk_ctx("port inherited\n");
++ } else {
++ eprintk_ctx("we are kinda lost...\n");
++ }
++ }
++ }
++
++ sk->sk_prot->hash(sk);
++
++ if (inet_csk(sk)->icsk_ack.pending&ICSK_ACK_TIMER)
++ sk_reset_timer(sk, &inet_csk(sk)->icsk_delack_timer, inet_csk(sk)->icsk_ack.timeout);
++ if (inet_csk(sk)->icsk_pending)
++ sk_reset_timer(sk, &inet_csk(sk)->icsk_retransmit_timer,
++ inet_csk(sk)->icsk_timeout);
++ if (sock_flag(sk, SOCK_KEEPOPEN)) {
++ unsigned long expires = jiffies_import(si->cpt_ka_timeout);
++ if (time_after(jiffies, expires))
++ expires = jiffies + HZ;
++ sk_reset_timer(sk, &sk->sk_timer, expires);
++ }
++ }
++
++ return 0;
++}
++
++
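++/*
++ * rst_socket_in() is the AF_INET/AF_INET6 entry point: under lock_sock()
++ * it restores the inet options, the cork state (re-resolving the cork
++ * route), UDP and IPv6 specifics, replays the queues and, for TCP
++ * streams, calls rst_socket_tcp().
++ */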
++int rst_socket_in(struct cpt_sock_image *si, loff_t pos, struct sock *sk,
++ struct cpt_context *ctx)
++{
++ struct inet_sock *inet = inet_sk(sk);
++
++ lock_sock(sk);
++
++ sk->sk_state = si->cpt_state;
++
++ inet->daddr = si->cpt_daddr;
++ inet->dport = si->cpt_dport;
++ inet->saddr = si->cpt_saddr;
++ inet->rcv_saddr = si->cpt_rcv_saddr;
++ inet->sport = si->cpt_sport;
++ inet->uc_ttl = si->cpt_uc_ttl;
++ inet->tos = si->cpt_tos;
++ inet->cmsg_flags = si->cpt_cmsg_flags;
++ inet->mc_index = si->cpt_mc_index;
++ inet->mc_addr = si->cpt_mc_addr;
++ inet->hdrincl = si->cpt_hdrincl;
++ inet->mc_ttl = si->cpt_mc_ttl;
++ inet->mc_loop = si->cpt_mc_loop;
++ inet->pmtudisc = si->cpt_pmtudisc;
++ inet->recverr = si->cpt_recverr;
++ inet->freebind = si->cpt_freebind;
++ inet->id = si->cpt_idcounter;
++
++ inet->cork.flags = si->cpt_cork_flags;
++ inet->cork.fragsize = si->cpt_cork_fragsize;
++ inet->cork.length = si->cpt_cork_length;
++ inet->cork.addr = si->cpt_cork_addr;
++ inet->cork.fl.fl4_src = si->cpt_cork_saddr;
++ inet->cork.fl.fl4_dst = si->cpt_cork_daddr;
++ inet->cork.fl.oif = si->cpt_cork_oif;
++ if (inet->cork.fragsize) {
++ if (ip_route_output_key(&inet->cork.rt, &inet->cork.fl)) {
++ eprintk_ctx("failed to restore cork route\n");
++ inet->cork.fragsize = 0;
++ }
++ }
++
++ if (sk->sk_type == SOCK_DGRAM && sk->sk_protocol == IPPROTO_UDP) {
++ struct udp_sock *up = udp_sk(sk);
++ up->pending = si->cpt_udp_pending;
++ up->corkflag = si->cpt_udp_corkflag;
++ up->encap_type = si->cpt_udp_encap;
++ up->len = si->cpt_udp_len;
++ }
++
++ if (sk->sk_family == AF_INET6) {
++ struct ipv6_pinfo *np = inet6_sk(sk);
++
++ memcpy(&np->saddr, si->cpt_saddr6, 16);
++ memcpy(&np->rcv_saddr, si->cpt_rcv_saddr6, 16);
++ memcpy(&np->daddr, si->cpt_daddr6, 16);
++ np->flow_label = si->cpt_flow_label6;
++ np->frag_size = si->cpt_frag_size6;
++ np->hop_limit = si->cpt_hop_limit6;
++ np->mcast_hops = si->cpt_mcast_hops6;
++ np->mcast_oif = si->cpt_mcast_oif6;
++ np->rxopt.all = si->cpt_rxopt6;
++ np->mc_loop = si->cpt_mc_loop6;
++ np->recverr = si->cpt_recverr6;
++ np->sndflow = si->cpt_sndflow6;
++ np->pmtudisc = si->cpt_pmtudisc6;
++ np->ipv6only = si->cpt_ipv6only6;
++
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ if (si->cpt_mapped) {
++ extern struct inet_connection_sock_af_ops ipv6_mapped;
++ if (sk->sk_type == SOCK_STREAM &&
++ sk->sk_protocol == IPPROTO_TCP) {
++ inet_csk(sk)->icsk_af_ops = &ipv6_mapped;
++ sk->sk_backlog_rcv = tcp_v4_do_rcv;
++ }
++ }
++#endif
++ }
++
++ restore_queues(sk, si, pos, ctx);
++
++ if (sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP)
++ rst_socket_tcp(si, pos, sk, ctx);
++
++ release_sock(sk);
++ return 0;
++}
++
++int cpt_attach_accept(struct sock *lsk, struct sock *sk, cpt_context_t *ctx)
++{
++ struct request_sock *req;
++
++ if (lsk->sk_state != TCP_LISTEN)
++ return -EINVAL;
++
++ req = reqsk_alloc(&tcp_request_sock_ops);
++ if (!req)
++ return -ENOMEM;
++
++ sk->sk_socket = NULL;
++ sk->sk_sleep = NULL;
++ inet_csk_reqsk_queue_add(lsk, req, sk);
++ return 0;
++}
++
++static __inline__ u32 __tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
++{
++ return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
++}
++
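++/*
++ * Recreate embryonic connections on a restored listener: each
++ * CPT_OBJ_OPENREQ image becomes a request_sock hashed back into the
++ * SYN queue (IPv4 or IPv6) with TCP_TIMEOUT_INIT.
++ */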
++int rst_restore_synwait_queue(struct sock *sk, struct cpt_sock_image *si,
++ loff_t pos, struct cpt_context *ctx)
++{
++ int err;
++ loff_t end = si->cpt_next;
++
++ pos += si->cpt_hdrlen;
++ while (pos < end) {
++ struct cpt_openreq_image oi;
++
++ err = rst_get_object(CPT_OBJ_OPENREQ, pos, &oi, ctx);
++ if (err) {
++ err = rst_sock_attr(&pos, sk, ctx);
++ if (err)
++ return err;
++ continue;
++ }
++
++ if (oi.cpt_object == CPT_OBJ_OPENREQ) {
++ struct request_sock *req = reqsk_alloc(&tcp_request_sock_ops);
++ if (req == NULL)
++ return -ENOMEM;
++
++ memset(req, 0, sizeof(*req));
++ tcp_rsk(req)->rcv_isn = oi.cpt_rcv_isn;
++ tcp_rsk(req)->snt_isn = oi.cpt_snt_isn;
++ inet_rsk(req)->rmt_port = oi.cpt_rmt_port;
++ req->mss = oi.cpt_mss;
++ req->retrans = oi.cpt_retrans;
++ inet_rsk(req)->snd_wscale = oi.cpt_snd_wscale;
++ inet_rsk(req)->rcv_wscale = oi.cpt_rcv_wscale;
++ inet_rsk(req)->tstamp_ok = oi.cpt_tstamp_ok;
++ inet_rsk(req)->sack_ok = oi.cpt_sack_ok;
++ inet_rsk(req)->wscale_ok = oi.cpt_wscale_ok;
++ inet_rsk(req)->ecn_ok = oi.cpt_ecn_ok;
++ inet_rsk(req)->acked = oi.cpt_acked;
++ req->window_clamp = oi.cpt_window_clamp;
++ req->rcv_wnd = oi.cpt_rcv_wnd;
++ req->ts_recent = oi.cpt_ts_recent;
++ req->expires = jiffies_import(oi.cpt_expires);
++
++ if (oi.cpt_family == AF_INET) {
++ memcpy(&inet_rsk(req)->loc_addr, oi.cpt_loc_addr, 4);
++ memcpy(&inet_rsk(req)->rmt_addr, oi.cpt_rmt_addr, 4);
++ inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
++ } else {
++#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
++ memcpy(&inet6_rsk(req)->loc_addr, oi.cpt_loc_addr, 16);
++ memcpy(&inet6_rsk(req)->rmt_addr, oi.cpt_rmt_addr, 16);
++ inet6_rsk(req)->iif = oi.cpt_iif;
++ inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
++#endif
++ }
++ }
++ pos += oi.cpt_next;
++ }
++ return 0;
++}
++
++int rst_sk_mcfilter_in(struct sock *sk, struct cpt_sockmc_image *v,
++ loff_t pos, cpt_context_t *ctx)
++{
++ struct ip_mreqn imr;
++
++ if (v->cpt_mode || v->cpt_next != v->cpt_hdrlen) {
++ eprintk_ctx("IGMPv3 is still not supported\n");
++ return -EINVAL;
++ }
++
++ memset(&imr, 0, sizeof(imr));
++ imr.imr_ifindex = v->cpt_ifindex;
++ imr.imr_multiaddr.s_addr = v->cpt_mcaddr[0];
++ return ip_mc_join_group(sk, &imr);
++}
++
++#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
++int rst_sk_mcfilter_in6(struct sock *sk, struct cpt_sockmc_image *v,
++ loff_t pos, cpt_context_t *ctx)
++{
++
++ if (v->cpt_mode || v->cpt_next != v->cpt_hdrlen) {
++ eprintk_ctx("IGMPv3 is still not supported\n");
++ return -EINVAL;
++ }
++
++ return ipv6_sock_mc_join(sk, v->cpt_ifindex,
++ (struct in6_addr*)v->cpt_mcaddr);
++}
++#endif
+diff -upr linux-2.6.16.orig/kernel/cpt/rst_sysvipc.c linux-2.6.16-026test015/kernel/cpt/rst_sysvipc.c
+--- linux-2.6.16.orig/kernel/cpt/rst_sysvipc.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/cpt/rst_sysvipc.c 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,409 @@
++/*
++ *
++ * kernel/cpt/rst_sysvipc.c
++ *
++ * Copyright (C) 2000-2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/config.h>
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/errno.h>
++#include <linux/major.h>
++#include <linux/pipe_fs_i.h>
++#include <linux/mman.h>
++#include <linux/shm.h>
++/* FIXME: is asm/ipc.h missing on x86_64? */
++#include <asm-generic/ipc.h>
++#include <asm/uaccess.h>
++#include <asm/unistd.h>
++#include <ub/ub_mem.h>
++#include <linux/cpt_image.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_kernel.h"
++
++struct _warg {
++ struct file *file;
++ struct cpt_sysvshm_image *v;
++};
++
++static int fixup_one_shm(struct shmid_kernel *shp, void *arg)
++{
++ struct _warg *warg = arg;
++
++ if (shp->shm_file != warg->file)
++ return 0;
++ if (shp->shm_nattch)
++ return -EEXIST;
++
++ shp->shm_perm.uid = warg->v->cpt_uid;
++ shp->shm_perm.gid = warg->v->cpt_gid;
++ shp->shm_perm.cuid = warg->v->cpt_cuid;
++ shp->shm_perm.cgid = warg->v->cpt_cgid;
++ shp->shm_perm.mode = warg->v->cpt_mode;
++
++ shp->shm_atim = warg->v->cpt_atime;
++ shp->shm_dtim = warg->v->cpt_dtime;
++ shp->shm_ctim = warg->v->cpt_ctime;
++ shp->shm_cprid = warg->v->cpt_creator;
++ shp->shm_lprid = warg->v->cpt_last;
++
++ /* TODO: fix shp->mlock_user? */
++ return 1;
++}
++
++static int fixup_shm(struct file *file, struct cpt_sysvshm_image *v)
++{
++ struct _warg warg;
++
++ warg.file = file;
++ warg.v = v;
++
++ return sysvipc_walk_shm(fixup_one_shm, &warg);
++}
++
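++/*
++ * Stream the saved segment contents back into the tmpfs file backing
++ * the SYSV SHM segment, one CPT_OBJ_PAGES block at a time, writing in
++ * page-sized chunks through the file's write op under KERNEL_DS.
++ */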
++static int fixup_shm_data(struct file *file, loff_t pos, loff_t end,
++ struct cpt_context *ctx)
++{
++ struct cpt_page_block pgb;
++ ssize_t (*do_write)(struct file *, const char __user *, size_t, loff_t *ppos);
++
++ do_write = file->f_dentry->d_inode->i_fop->write;
++ if (do_write == NULL) {
++ eprintk_ctx("No TMPFS? Cannot restore content of SYSV SHM\n");
++ return -EINVAL;
++ }
++
++ while (pos < end) {
++ loff_t opos;
++ loff_t ipos;
++ int count;
++ int err;
++
++ err = rst_get_object(CPT_OBJ_PAGES, pos, &pgb, ctx);
++ if (err)
++ return err;
++ dprintk_ctx("restoring SHM block: %08x-%08x\n",
++ (__u32)pgb.cpt_start, (__u32)pgb.cpt_end);
++ ipos = pos + pgb.cpt_hdrlen;
++ opos = pgb.cpt_start;
++ count = pgb.cpt_end-pgb.cpt_start;
++ while (count > 0) {
++ mm_segment_t oldfs;
++ int copy = count;
++
++ if (copy > PAGE_SIZE)
++ copy = PAGE_SIZE;
++ (void)cpt_get_buf(ctx);
++ oldfs = get_fs(); set_fs(KERNEL_DS);
++ err = ctx->pread(ctx->tmpbuf, copy, ctx, ipos);
++ set_fs(oldfs);
++ if (err) {
++ __cpt_release_buf(ctx);
++ return err;
++ }
++ oldfs = get_fs(); set_fs(KERNEL_DS);
++ ipos += copy;
++ err = do_write(file, ctx->tmpbuf, copy, &opos);
++ set_fs(oldfs);
++ __cpt_release_buf(ctx);
++ if (err != copy) {
++ eprintk_ctx("write() failure\n");
++ if (err >= 0)
++ err = -EIO;
++ return err;
++ }
++ count -= copy;
++ }
++ pos += pgb.cpt_next;
++ }
++ return 0;
++}
++
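++/*
++ * rst_sysv_shm() follows the file image to its inode and CPT_OBJ_SYSV_SHM
++ * record, recreates the segment with sysvipc_setup_shm(), restores its
++ * ownership, mode and times via fixup_shm() and, unless the segment is
++ * already attached, reloads its contents with fixup_shm_data().
++ */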
++struct file * rst_sysv_shm(loff_t pos, struct cpt_context *ctx)
++{
++ struct file *file;
++ int err;
++ loff_t dpos, epos;
++ union {
++ struct cpt_file_image fi;
++ struct cpt_sysvshm_image shmi;
++ struct cpt_inode_image ii;
++ } u;
++
++ err = rst_get_object(CPT_OBJ_FILE, pos, &u.fi, ctx);
++ if (err < 0)
++ goto err_out;
++ pos = u.fi.cpt_inode;
++ err = rst_get_object(CPT_OBJ_INODE, pos, &u.ii, ctx);
++ if (err < 0)
++ goto err_out;
++ dpos = pos + u.ii.cpt_hdrlen;
++ epos = pos + u.ii.cpt_next;
++ err = rst_get_object(CPT_OBJ_SYSV_SHM, pos + u.ii.cpt_hdrlen, &u.shmi, ctx);
++ if (err < 0)
++ goto err_out;
++ dpos += u.shmi.cpt_next;
++
++ file = sysvipc_setup_shm(u.shmi.cpt_key, u.shmi.cpt_id,
++ u.shmi.cpt_segsz, u.shmi.cpt_mode);
++ if (!IS_ERR(file)) {
++ err = fixup_shm(file, &u.shmi);
++ if (err != -EEXIST && dpos < epos)
++ err = fixup_shm_data(file, dpos, epos, ctx);
++ }
++
++ return file;
++
++err_out:
++ return ERR_PTR(err);
++}
++
++static int attach_one_undo(int semid, struct sem_array *sma, void *arg)
++{
++ struct sem_undo *su = arg;
++ struct sem_undo_list *undo_list = current->sysvsem.undo_list;
++
++ if (semid != su->semid)
++ return 0;
++
++ su->proc_next = undo_list->proc_list;
++ undo_list->proc_list = su;
++
++ su->id_next = sma->undo;
++ sma->undo = su;
++
++ return 1;
++}
++
++static int attach_undo(struct sem_undo *su)
++{
++ return sysvipc_walk_sem(attach_one_undo, su);
++}
++
++static int do_rst_semundo(struct cpt_object_hdr *sui, loff_t pos, struct cpt_context *ctx)
++{
++ int err;
++ struct sem_undo_list *undo_list;
++
++ if (current->sysvsem.undo_list) {
++ eprintk_ctx("Funny undo_list\n");
++ return 0;
++ }
++
++ undo_list = ub_kmalloc(sizeof(struct sem_undo_list), GFP_KERNEL);
++ if (undo_list == NULL)
++ return -ENOMEM;
++ memset(undo_list, 0, sizeof(struct sem_undo_list));
++ atomic_set(&undo_list->refcnt, 1);
++ spin_lock_init(&undo_list->lock);
++ current->sysvsem.undo_list = undo_list;
++
++ if (sui->cpt_next > sui->cpt_hdrlen) {
++ loff_t offset = pos + sui->cpt_hdrlen;
++ do {
++ struct sem_undo *new;
++ struct cpt_sysvsem_undo_image spi;
++ err = rst_get_object(CPT_OBJ_SYSVSEM_UNDO_REC, offset, &spi, ctx);
++ if (err)
++ goto out;
++ new = ub_kmalloc(sizeof(struct sem_undo) +
++ sizeof(short)*spi.cpt_nsem, GFP_KERNEL);
++ if (!new) {
++ err = -ENOMEM;
++ goto out;
++ }
++
++ memset(new, 0, sizeof(struct sem_undo) + sizeof(short)*spi.cpt_nsem);
++ new->semadj = (short *) &new[1];
++ new->semid = spi.cpt_id;
++ err = ctx->pread(new->semadj, spi.cpt_nsem*sizeof(short), ctx, offset + spi.cpt_hdrlen);
++ if (err) {
++ kfree(new);
++ goto out;
++ }
++ err = attach_undo(new);
++ if (err <= 0) {
++ if (err == 0)
++ err = -ENOENT;
++ kfree(new);
++ goto out;
++ }
++ offset += spi.cpt_next;
++ } while (offset < pos + sui->cpt_next);
++ }
++ err = 0;
++
++out:
++ return err;
++}
++
++__u32 rst_semundo_flag(struct cpt_task_image *ti, struct cpt_context *ctx)
++{
++ __u32 flag = 0;
++
++#if 0
++ if (ti->cpt_sysvsem_undo == CPT_NULL ||
++ lookup_cpt_obj_bypos(CPT_OBJ_SYSVSEM_UNDO, ti->cpt_sysvsem_undo))
++ flag |= CLONE_SYSVSEM;
++#endif
++ return flag;
++}
++
++int rst_semundo_complete(struct cpt_task_image *ti, struct cpt_context *ctx)
++{
++ int err;
++ struct sem_undo_list *f = current->sysvsem.undo_list;
++ cpt_object_t *obj;
++ struct cpt_object_hdr sui;
++
++ if (ti->cpt_sysvsem_undo == CPT_NULL) {
++ exit_sem(current);
++ return 0;
++ }
++
++ obj = lookup_cpt_obj_bypos(CPT_OBJ_SYSVSEM_UNDO, ti->cpt_sysvsem_undo, ctx);
++ if (obj) {
++ if (obj->o_obj != f) {
++ exit_sem(current);
++ f = obj->o_obj;
++ atomic_inc(&f->refcnt);
++ current->sysvsem.undo_list = f;
++ }
++ return 0;
++ }
++
++ if ((err = rst_get_object(CPT_OBJ_SYSVSEM_UNDO, ti->cpt_sysvsem_undo, &sui, ctx)) != 0)
++ goto out;
++
++ if ((err = do_rst_semundo(&sui, ti->cpt_sysvsem_undo, ctx)) != 0)
++ goto out;
++
++ err = -ENOMEM;
++ obj = cpt_object_add(CPT_OBJ_SYSVSEM_UNDO, f, ctx);
++ if (obj) {
++ err = 0;
++ cpt_obj_setpos(obj, ti->cpt_sysvsem_undo, ctx);
++ }
++
++	return err;
++
++out:
++ return err;
++}
++
++struct _sarg {
++ int semid;
++ struct cpt_sysvsem_image *v;
++ __u32 *arr;
++};
++
++static int fixup_one_sem(int semid, struct sem_array *sma, void *arg)
++{
++ struct _sarg *warg = arg;
++
++ if (semid != warg->semid)
++ return 0;
++
++ sma->sem_perm.uid = warg->v->cpt_uid;
++ sma->sem_perm.gid = warg->v->cpt_gid;
++ sma->sem_perm.cuid = warg->v->cpt_cuid;
++ sma->sem_perm.cgid = warg->v->cpt_cgid;
++ sma->sem_perm.mode = warg->v->cpt_mode;
++ sma->sem_perm.seq = warg->v->cpt_seq;
++
++ sma->sem_ctime = warg->v->cpt_ctime;
++ sma->sem_otime = warg->v->cpt_otime;
++ memcpy(sma->sem_base, warg->arr, sma->sem_nsems*8);
++ return 1;
++}
++
++static int fixup_sem(int semid, struct cpt_sysvsem_image *v, __u32 *arr)
++{
++ struct _sarg warg;
++
++ warg.semid = semid;
++ warg.v = v;
++ warg.arr = arr;
++
++ return sysvipc_walk_sem(fixup_one_sem, &warg);
++}
++
++
++static int restore_sem(loff_t pos, struct cpt_sysvsem_image *si,
++ struct cpt_context *ctx)
++{
++ int err;
++ __u32 *arr;
++ int nsems = (si->cpt_next - si->cpt_hdrlen)/8;
++
++ arr = kmalloc(nsems*8, GFP_KERNEL);
++ if (!arr)
++ return -ENOMEM;
++
++ err = ctx->pread(arr, nsems*8, ctx, pos+si->cpt_hdrlen);
++ if (err)
++ goto out;
++ err = sysvipc_setup_sem(si->cpt_key, si->cpt_id, nsems, si->cpt_mode);
++ if (err < 0) {
++ eprintk_ctx("SEM 3\n");
++ goto out;
++ }
++ err = fixup_sem(si->cpt_id, si, arr);
++ if (err == 0)
++ err = -ESRCH;
++ if (err > 0)
++ err = 0;
++out:
++ kfree(arr);
++ return err;
++}
++
++static int rst_sysv_sem(struct cpt_context *ctx)
++{
++ int err;
++ loff_t sec = ctx->sections[CPT_SECT_SYSV_SEM];
++ loff_t endsec;
++ struct cpt_section_hdr h;
++ struct cpt_sysvsem_image sbuf;
++
++ if (sec == CPT_NULL)
++ return 0;
++
++ err = ctx->pread(&h, sizeof(h), ctx, sec);
++ if (err)
++ return err;
++ if (h.cpt_section != CPT_SECT_SYSV_SEM || h.cpt_hdrlen < sizeof(h))
++ return -EINVAL;
++
++ endsec = sec + h.cpt_next;
++ sec += h.cpt_hdrlen;
++ while (sec < endsec) {
++ int err;
++ err = rst_get_object(CPT_OBJ_SYSV_SEM, sec, &sbuf, ctx);
++ if (err)
++ return err;
++ err = restore_sem(sec, &sbuf, ctx);
++ if (err)
++ return err;
++ sec += sbuf.cpt_next;
++ }
++ return 0;
++}
++
++int rst_sysv_ipc(struct cpt_context *ctx)
++{
++ return rst_sysv_sem(ctx);
++}
+diff -upr linux-2.6.16.orig/kernel/cpt/rst_tty.c linux-2.6.16-026test015/kernel/cpt/rst_tty.c
+--- linux-2.6.16.orig/kernel/cpt/rst_tty.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/cpt/rst_tty.c 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,380 @@
++/*
++ *
++ * kernel/cpt/rst_tty.c
++ *
++ * Copyright (C) 2000-2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/config.h>
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/errno.h>
++#include <linux/major.h>
++#include <linux/pipe_fs_i.h>
++#include <linux/mman.h>
++#include <linux/mount.h>
++#include <linux/tty.h>
++#include <linux/vmalloc.h>
++#include <asm/unistd.h>
++#include <asm/uaccess.h>
++#include <linux/cpt_image.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_mm.h"
++#include "cpt_files.h"
++#include "cpt_kernel.h"
++
++static int pty_setup(struct tty_struct *stty, loff_t pos,
++ struct cpt_tty_image *pi, struct cpt_context *ctx)
++{
++ unsigned long flags;
++
++ stty->pgrp = -1;
++ stty->session = 0;
++ stty->packet = pi->cpt_packet;
++ stty->stopped = pi->cpt_stopped;
++ stty->hw_stopped = pi->cpt_hw_stopped;
++ stty->flow_stopped = pi->cpt_flow_stopped;
++#define DONOT_CHANGE ((1<<TTY_CHARGED)|(1<<TTY_CLOSING)|(1<<TTY_LDISC))
++ flags = stty->flags & DONOT_CHANGE;
++ stty->flags = flags | (pi->cpt_flags & ~DONOT_CHANGE);
++ stty->ctrl_status = pi->cpt_ctrl_status;
++ stty->winsize.ws_row = pi->cpt_ws_row;
++ stty->winsize.ws_col = pi->cpt_ws_col;
++ stty->winsize.ws_ypixel = pi->cpt_ws_prow;
++ stty->winsize.ws_xpixel = pi->cpt_ws_pcol;
++ stty->canon_column = pi->cpt_canon_column;
++ stty->column = pi->cpt_column;
++ stty->raw = pi->cpt_raw;
++ stty->real_raw = pi->cpt_real_raw;
++ stty->erasing = pi->cpt_erasing;
++ stty->lnext = pi->cpt_lnext;
++ stty->icanon = pi->cpt_icanon;
++ stty->closing = pi->cpt_closing;
++ stty->minimum_to_wake = pi->cpt_minimum_to_wake;
++
++ stty->termios->c_iflag = pi->cpt_c_iflag;
++ stty->termios->c_oflag = pi->cpt_c_oflag;
++ stty->termios->c_lflag = pi->cpt_c_lflag;
++ stty->termios->c_cflag = pi->cpt_c_cflag;
++ memcpy(&stty->termios->c_cc, &pi->cpt_c_cc, NCCS);
++ memcpy(stty->read_flags, pi->cpt_read_flags, sizeof(stty->read_flags));
++
++ if (pi->cpt_next > pi->cpt_hdrlen) {
++ int err;
++ struct cpt_obj_bits b;
++ err = rst_get_object(CPT_OBJ_BITS, pos + pi->cpt_hdrlen, &b, ctx);
++ if (err)
++ return err;
++ if (b.cpt_size == 0)
++ return 0;
++ err = ctx->pread(stty->read_buf, b.cpt_size, ctx, pos + pi->cpt_hdrlen + b.cpt_hdrlen);
++ if (err)
++ return err;
++
++ spin_lock_irq(&stty->read_lock);
++ stty->read_tail = 0;
++ stty->read_cnt = b.cpt_size;
++ stty->read_head = b.cpt_size;
++ stty->canon_head = stty->read_tail + pi->cpt_canon_head;
++ stty->canon_data = pi->cpt_canon_data;
++ spin_unlock_irq(&stty->read_lock);
++ }
++
++ return 0;
++}
++
++/* Find the slave/master tty in the image when we already know the master/slave.
++ * This could be optimized, of course. */
++static loff_t find_pty_pair(struct tty_struct *stty, loff_t pos, struct cpt_tty_image *pi, struct cpt_context *ctx)
++{
++ int err;
++ loff_t sec = ctx->sections[CPT_SECT_TTY];
++ loff_t endsec;
++ struct cpt_section_hdr h;
++ struct cpt_tty_image *pibuf;
++
++ err = ctx->pread(&h, sizeof(h), ctx, sec);
++ if (err)
++ return CPT_NULL;
++ if (h.cpt_section != CPT_SECT_TTY || h.cpt_hdrlen < sizeof(h))
++ return CPT_NULL;
++ pibuf = kmalloc(sizeof(*pibuf), GFP_KERNEL);
++ if (pibuf == NULL) {
++ eprintk_ctx("cannot allocate buffer\n");
++ return CPT_NULL;
++ }
++ endsec = sec + h.cpt_next;
++ sec += h.cpt_hdrlen;
++ while (sec < endsec) {
++		if (rst_get_object(CPT_OBJ_TTY, sec, pibuf, ctx))
++			break;
++		if (pibuf->cpt_index == pi->cpt_index &&
++		    !((pi->cpt_drv_flags^pibuf->cpt_drv_flags)&TTY_DRIVER_DEVPTS_MEM) &&
++		    pos != sec) {
++			pty_setup(stty, sec, pibuf, ctx);
++			kfree(pibuf);
++			return sec;
++		}
++		sec += pibuf->cpt_next;
++ }
++ kfree(pibuf);
++ return CPT_NULL;
++}
++
++static int fixup_tty_attrs(struct cpt_inode_image *ii, struct file *master,
++ struct cpt_context *ctx)
++{
++ int err;
++ struct iattr newattrs;
++ struct dentry *d = master->f_dentry;
++
++ newattrs.ia_valid = ATTR_UID|ATTR_GID|ATTR_MODE;
++ newattrs.ia_uid = ii->cpt_uid;
++ newattrs.ia_gid = ii->cpt_gid;
++ newattrs.ia_mode = ii->cpt_mode;
++
++ mutex_lock(&d->d_inode->i_mutex);
++ err = notify_change(d, &newattrs);
++ mutex_unlock(&d->d_inode->i_mutex);
++
++ return err;
++}
++
++/* NOTE: a "portable" but ugly approach. To allocate /dev/pts/N, we keep
++ * opening /dev/ptmx until we get a pty with the desired index.
++ */
++
++struct file *ptmx_open(int index, unsigned int flags)
++{
++ struct file *file;
++ struct file **stack = NULL;
++ int depth = 0;
++
++ for (;;) {
++ struct tty_struct *tty;
++
++ file = filp_open("/dev/ptmx", flags|O_NONBLOCK|O_NOCTTY|O_RDWR, 0);
++ if (IS_ERR(file))
++ break;
++ tty = file->private_data;
++ if (tty->index == index)
++ break;
++
++ if (depth == PAGE_SIZE/sizeof(struct file *)) {
++ fput(file);
++ file = ERR_PTR(-EBUSY);
++ break;
++ }
++ if (stack == NULL) {
++ stack = (struct file **)__get_free_page(GFP_KERNEL);
++ if (!stack) {
++ fput(file);
++ file = ERR_PTR(-ENOMEM);
++ break;
++ }
++ }
++ stack[depth] = file;
++ depth++;
++ }
++ while (depth > 0) {
++ depth--;
++ fput(stack[depth]);
++ }
++ if (stack)
++ free_page((unsigned long)stack);
++ return file;
++}
++
++
++struct file * rst_open_tty(struct cpt_file_image *fi, struct cpt_inode_image *ii,
++ unsigned flags, struct cpt_context *ctx)
++{
++ int err;
++ cpt_object_t *obj;
++ struct file *master, *slave;
++ struct tty_struct *stty;
++ struct cpt_tty_image *pi;
++ static char *a = "pqrstuvwxyzabcde";
++ static char *b = "0123456789abcdef";
++ char pairname[16];
++ unsigned master_flags, slave_flags;
++
++ if (fi->cpt_priv == CPT_NULL)
++ return ERR_PTR(-EINVAL);
++
++ obj = lookup_cpt_obj_bypos(CPT_OBJ_TTY, fi->cpt_priv, ctx);
++ if (obj && obj->o_parent) {
++ dprintk_ctx("obtained pty as pair to existing\n");
++ master = obj->o_parent;
++ stty = master->private_data;
++
++ if (stty->driver->subtype == PTY_TYPE_MASTER &&
++ (stty->driver->flags&TTY_DRIVER_DEVPTS_MEM)) {
++ wprintk_ctx("cloning ptmx\n");
++ get_file(master);
++ return master;
++ }
++
++ master = dentry_open(dget(master->f_dentry),
++ mntget(master->f_vfsmnt), flags);
++ if (!IS_ERR(master)) {
++ stty = master->private_data;
++ if (stty->driver->subtype != PTY_TYPE_MASTER)
++ fixup_tty_attrs(ii, master, ctx);
++ }
++ return master;
++ }
++
++ pi = cpt_get_buf(ctx);
++ err = rst_get_object(CPT_OBJ_TTY, fi->cpt_priv, pi, ctx);
++ if (err) {
++ cpt_release_buf(ctx);
++ return ERR_PTR(err);
++ }
++
++ master_flags = slave_flags = 0;
++ if (pi->cpt_drv_subtype == PTY_TYPE_MASTER)
++ master_flags = flags;
++ else
++ slave_flags = flags;
++
++ /*
++	 * Open the master/slave pair.
++ */
++ if (pi->cpt_drv_flags&TTY_DRIVER_DEVPTS_MEM) {
++ master = ptmx_open(pi->cpt_index, master_flags);
++ } else {
++ sprintf(pairname, "/dev/pty%c%c", a[pi->cpt_index/16], b[pi->cpt_index%16]);
++ master = filp_open(pairname, master_flags|O_NONBLOCK|O_NOCTTY|O_RDWR, 0);
++ }
++ if (IS_ERR(master)) {
++ eprintk_ctx("filp_open master: %Ld %ld\n", fi->cpt_priv, PTR_ERR(master));
++ cpt_release_buf(ctx);
++ return master;
++ }
++ stty = master->private_data;
++ clear_bit(TTY_PTY_LOCK, &stty->flags);
++ if (pi->cpt_drv_flags&TTY_DRIVER_DEVPTS_MEM)
++ sprintf(pairname, "/dev/pts/%d", stty->index);
++ else
++ sprintf(pairname, "/dev/tty%c%c", a[stty->index/16], b[stty->index%16]);
++ slave = filp_open(pairname, slave_flags|O_NONBLOCK|O_NOCTTY|O_RDWR, 0);
++ if (IS_ERR(slave)) {
++ eprintk_ctx("filp_open slave %s: %ld\n", pairname, PTR_ERR(slave));
++ fput(master);
++ cpt_release_buf(ctx);
++ return slave;
++ }
++
++ if (pi->cpt_drv_subtype != PTY_TYPE_MASTER)
++ fixup_tty_attrs(ii, slave, ctx);
++
++ cpt_object_add(CPT_OBJ_TTY, master->private_data, ctx);
++ cpt_object_add(CPT_OBJ_TTY, slave->private_data, ctx);
++ cpt_object_add(CPT_OBJ_FILE, master, ctx);
++ cpt_object_add(CPT_OBJ_FILE, slave, ctx);
++
++ if (pi->cpt_drv_subtype == PTY_TYPE_MASTER) {
++ loff_t pos;
++ obj = lookup_cpt_object(CPT_OBJ_TTY, master->private_data, ctx);
++ obj->o_parent = master;
++ cpt_obj_setpos(obj, fi->cpt_priv, ctx);
++ pty_setup(stty, fi->cpt_priv, pi, ctx);
++
++ obj = lookup_cpt_object(CPT_OBJ_TTY, slave->private_data, ctx);
++ obj->o_parent = slave;
++ pos = find_pty_pair(stty->link, fi->cpt_priv, pi, ctx);
++ cpt_obj_setpos(obj, pos, ctx);
++
++ obj = lookup_cpt_object(CPT_OBJ_FILE, slave, ctx);
++ cpt_obj_setpos(obj, CPT_NULL, ctx);
++ get_file(master);
++ cpt_release_buf(ctx);
++ return master;
++ } else {
++ loff_t pos;
++ obj = lookup_cpt_object(CPT_OBJ_TTY, slave->private_data, ctx);
++ obj->o_parent = slave;
++ cpt_obj_setpos(obj, fi->cpt_priv, ctx);
++ pty_setup(stty->link, fi->cpt_priv, pi, ctx);
++
++ obj = lookup_cpt_object(CPT_OBJ_TTY, master->private_data, ctx);
++ obj->o_parent = master;
++ pos = find_pty_pair(stty, fi->cpt_priv, pi, ctx);
++ cpt_obj_setpos(obj, pos, ctx);
++
++ obj = lookup_cpt_object(CPT_OBJ_FILE, master, ctx);
++ cpt_obj_setpos(obj, CPT_NULL, ctx);
++ get_file(slave);
++ cpt_release_buf(ctx);
++ return slave;
++ }
++}
++
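++/* Second pass over the TTY section: reattach process group and session ids
++ * to the ttys that were recreated earlier. */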
++int rst_tty_jobcontrol(struct cpt_context *ctx)
++{
++ int err;
++ loff_t sec = ctx->sections[CPT_SECT_TTY];
++ loff_t endsec;
++ struct cpt_section_hdr h;
++
++ err = ctx->pread(&h, sizeof(h), ctx, sec);
++ if (err)
++ return err;
++ if (h.cpt_section != CPT_SECT_TTY || h.cpt_hdrlen < sizeof(h))
++ return -EINVAL;
++ endsec = sec + h.cpt_next;
++ sec += h.cpt_hdrlen;
++ while (sec < endsec) {
++ cpt_object_t *obj;
++ struct cpt_tty_image *pibuf = cpt_get_buf(ctx);
++
++ if (rst_get_object(CPT_OBJ_TTY, sec, pibuf, ctx)) {
++ cpt_release_buf(ctx);
++ return -EINVAL;
++ }
++
++ obj = lookup_cpt_obj_bypos(CPT_OBJ_TTY, sec, ctx);
++ if (obj) {
++ struct tty_struct *stty = obj->o_obj;
++ if ((int)pibuf->cpt_pgrp > 0) {
++ stty->pgrp = vpid_to_pid(pibuf->cpt_pgrp);
++ if (stty->pgrp == -1)
++ dprintk_ctx("unknown tty pgrp %d\n", pibuf->cpt_pgrp);
++ } else if (pibuf->cpt_pgrp) {
++ stty->pgrp = alloc_pidmap();
++ if (stty->pgrp < 0) {
++				eprintk_ctx("cannot allocate stray tty->pgrp\n");
++ cpt_release_buf(ctx);
++ return -EINVAL;
++ }
++ free_pidmap(stty->pgrp);
++ }
++ if ((int)pibuf->cpt_session > 0) {
++ int sess;
++ sess = vpid_to_pid(pibuf->cpt_session);
++ if (sess == -1) {
++ dprintk_ctx("unknown tty session %d\n", pibuf->cpt_session);
++ } else if (stty->session <= 0) {
++ stty->session = sess;
++ } else if (stty->session != sess) {
++ wprintk_ctx("tty session mismatch 2\n");
++ }
++ }
++ }
++ sec += pibuf->cpt_next;
++ cpt_release_buf(ctx);
++ }
++ return 0;
++}
+diff -upr linux-2.6.16.orig/kernel/cpt/rst_ubc.c linux-2.6.16-026test015/kernel/cpt/rst_ubc.c
+--- linux-2.6.16.orig/kernel/cpt/rst_ubc.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/cpt/rst_ubc.c 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,108 @@
++/*
++ *
++ * kernel/cpt/rst_ubc.c
++ *
++ * Copyright (C) 2000-2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/types.h>
++#include <ub/beancounter.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++
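++/* Translate a beancounter position in the dump into the restored
++ * user_beancounter; unknown positions fall back to the exec beancounter. */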
++struct user_beancounter *rst_lookup_ubc(__u64 pos, struct cpt_context *ctx)
++{
++ cpt_object_t *obj;
++
++ obj = lookup_cpt_obj_bypos(CPT_OBJ_UBC, pos, ctx);
++ if (obj == NULL) {
++ printk(KERN_ERR "RST: unknown ub @%Lu\n", pos);
++ return get_beancounter(get_exec_ub());
++ }
++ return get_beancounter(obj->o_obj);
++}
++
++static void restore_one_bc_parm(__u64 *dmp, struct ubparm *prm, int held)
++{
++ prm->barrier = (dmp[0] == CPT_NULL ? UB_MAXVALUE : dmp[0]);
++ prm->limit = (dmp[1] == CPT_NULL ? UB_MAXVALUE : dmp[1]);
++ if (held)
++ prm->held = dmp[2];
++ prm->maxheld = dmp[3];
++ prm->minheld = dmp[4];
++ prm->failcnt = dmp[5];
++}
++
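++/* Recreate one user_beancounter from its image: find (or create) it under
++ * its parent and restore the per-resource barriers, limits and statistics. */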
++static int restore_one_bc(struct cpt_beancounter_image *v,
++ cpt_object_t *obj, struct cpt_context *ctx)
++{
++ struct user_beancounter *bc;
++ cpt_object_t *pobj;
++ int i;
++
++ if (v->cpt_parent != CPT_NULL) {
++ pobj = lookup_cpt_obj_bypos(CPT_OBJ_UBC, v->cpt_parent, ctx);
++ if (pobj == NULL)
++ return -ESRCH;
++ bc = get_subbeancounter_byid(pobj->o_obj, v->cpt_id, 1);
++ } else {
++ bc = get_exec_ub();
++ while (bc->parent)
++ bc = bc->parent;
++ get_beancounter(bc);
++ }
++ if (bc == NULL)
++ return -ENOMEM;
++ obj->o_obj = bc;
++
++ for (i = 0; i < UB_RESOURCES; i++)
++		restore_one_bc_parm(v->cpt_parms + i * 6, bc->ub_parms + i, 0);
++	for (i = 0; i < UB_RESOURCES; i++)
++		restore_one_bc_parm(v->cpt_parms + UB_RESOURCES * 6 + i * 6,
++			bc->ub_store + i, 1);
++ return 0;
++}
++
++int rst_undump_ubc(struct cpt_context *ctx)
++{
++ loff_t start, end;
++ struct cpt_beancounter_image *v;
++ cpt_object_t *obj;
++ int err;
++
++ err = rst_get_section(CPT_SECT_UBC, ctx, &start, &end);
++ if (err)
++ return err;
++
++ while (start < end) {
++ v = cpt_get_buf(ctx);
++ err = rst_get_object(CPT_OBJ_UBC, start, v, ctx);
++ if (err) {
++ cpt_release_buf(ctx);
++ return err;
++ }
++
++ obj = alloc_cpt_object(GFP_KERNEL, ctx);
++ cpt_obj_setpos(obj, start, ctx);
++ intern_cpt_object(CPT_OBJ_UBC, obj, ctx);
++
++ restore_one_bc(v, obj, ctx);
++
++ cpt_release_buf(ctx);
++ start += v->cpt_next;
++ }
++ return 0;
++}
++
++void rst_finish_ubc(struct cpt_context *ctx)
++{
++ cpt_object_t *obj;
++
++ for_each_object(obj, CPT_OBJ_UBC)
++ put_beancounter(obj->o_obj);
++}
+diff -upr linux-2.6.16.orig/kernel/cpt/rst_undump.c linux-2.6.16-026test015/kernel/cpt/rst_undump.c
+--- linux-2.6.16.orig/kernel/cpt/rst_undump.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/cpt/rst_undump.c 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,819 @@
++/*
++ *
++ * kernel/cpt/rst_undump.c
++ *
++ * Copyright (C) 2000-2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/config.h>
++#include <linux/version.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/errno.h>
++#include <linux/pagemap.h>
++#include <linux/namespace.h>
++#include <linux/personality.h>
++#include <linux/binfmts.h>
++#include <linux/smp_lock.h>
++#include <linux/ve_proto.h>
++#include <linux/virtinfo.h>
++#include <linux/compat.h>
++#include <linux/vzcalluser.h>
++#include <ub/beancounter.h>
++#include <asm/desc.h>
++#include <asm/unistd.h>
++
++#include "cpt_obj.h"
++#include "cpt_context.h"
++#include "cpt_files.h"
++#include "cpt_mm.h"
++#include "cpt_process.h"
++#include "cpt_socket.h"
++#include "cpt_net.h"
++#include "cpt_ubc.h"
++#include "cpt_kernel.h"
++
++static int rst_utsname(cpt_context_t *ctx);
++
++
++struct thr_context {
++ struct completion init_complete;
++ struct completion task_done;
++ int error;
++ struct cpt_context *ctx;
++ cpt_object_t *tobj;
++};
++
++static int rst_clone_children(cpt_object_t *obj, struct cpt_context *ctx);
++
++static int vps_rst_veinfo(struct cpt_context *ctx)
++{
++ int err;
++ struct cpt_veinfo_image *i;
++ struct ve_struct *ve;
++ struct timespec delta;
++ loff_t start, end;
++
++ err = rst_get_section(CPT_SECT_VEINFO, ctx, &start, &end);
++ if (err)
++ goto out;
++
++ i = cpt_get_buf(ctx);
++ err = rst_get_object(CPT_OBJ_VEINFO, start, i, ctx);
++ if (err)
++ goto out_rel;
++
++ ve = get_exec_env();
++ ve->_shm_ctlall = i->shm_ctl_all;
++ ve->_shm_ctlmax = i->shm_ctl_max;
++ ve->_shm_ctlmni = i->shm_ctl_mni;
++
++ ve->_msg_ctlmax = i->msg_ctl_max;
++ ve->_msg_ctlmni = i->msg_ctl_mni;
++ ve->_msg_ctlmnb = i->msg_ctl_mnb;
++
++ BUG_ON(sizeof(ve->_sem_ctls) != sizeof(i->sem_ctl_arr));
++ ve->_sem_ctls[0] = i->sem_ctl_arr[0];
++ ve->_sem_ctls[1] = i->sem_ctl_arr[1];
++ ve->_sem_ctls[2] = i->sem_ctl_arr[2];
++ ve->_sem_ctls[3] = i->sem_ctl_arr[3];
++
++ cpt_timespec_import(&delta, i->start_timespec_delta);
++ _set_normalized_timespec(&ve->start_timespec,
++ ve->start_timespec.tv_sec - delta.tv_sec,
++ ve->start_timespec.tv_nsec - delta.tv_nsec);
++ ve->start_jiffies -= i->start_jiffies_delta;
++ // // FIXME: what???
++ // // ve->start_cycles -= i->start_jiffies_delta * cycles_per_jiffy;
++
++ err = 0;
++out_rel:
++ cpt_release_buf(ctx);
++out:
++ return err;
++}
++
++static int vps_rst_reparent_root(cpt_object_t *obj, struct cpt_context *ctx)
++{
++ int err;
++ struct env_create_param2 param;
++
++ ctx->cpt_jiffies64 = get_jiffies_64();
++ do_gettimespec(&ctx->delta_time);
++
++ ctx->delta_time.tv_sec -= ctx->start_time.tv_sec;
++ if (ctx->start_time.tv_nsec > ctx->delta_time.tv_nsec) {
++ ctx->delta_time.tv_sec--;
++ ctx->delta_time.tv_nsec = 1000000000 - (ctx->start_time.tv_nsec - ctx->delta_time.tv_nsec);
++ } else {
++ ctx->delta_time.tv_nsec -= ctx->start_time.tv_nsec;
++ }
++
++ memset(&param, 0, sizeof(param));
++ param.iptables_mask = ctx->iptables_mask;
++
++ err = real_env_create(ctx->ve_id, VE_CREATE|VE_LOCK, 2, &param, sizeof(param));
++ if (err < 0)
++ eprintk_ctx("real_env_create: %d\n", err);
++ get_exec_env()->jiffies_fixup = ((ctx->delta_time.tv_sec < 0) ?
++ 0 : timespec_to_jiffies(&ctx->delta_time)) -
++ (unsigned long)(ctx->cpt_jiffies64 - ctx->virt_jiffies64);
++ return err < 0 ? err : 0;
++}
++
++
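++/* Body of every restored task. Started as a kernel thread, it waits to be
++ * woken by the restore code, rebuilds its own state (mm, files, fs, signals,
++ * semundo) and clones its children; the task with cpt_pid == 1 also restores
++ * the VE-wide state first. */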
++static int hook(void *arg)
++{
++ struct thr_context *thr_ctx = arg;
++ struct cpt_context *ctx;
++ cpt_object_t *tobj;
++ struct cpt_task_image *ti;
++ int err = 0;
++
++ current->state = TASK_UNINTERRUPTIBLE;
++ complete(&thr_ctx->init_complete);
++ schedule();
++
++ ctx = thr_ctx->ctx;
++ tobj = thr_ctx->tobj;
++ ti = tobj->o_image;
++
++ current->fs->umask = 0;
++
++ if (ti->cpt_pid == 1) {
++ err = vps_rst_reparent_root(tobj, ctx);
++
++ if (err) {
++ rst_report_error(err, ctx);
++ goto out;
++ }
++
++ memcpy(&get_exec_env()->cap_default, &ti->cpt_ecap, sizeof(kernel_cap_t));
++
++ if (ctx->statusfile) {
++ fput(ctx->statusfile);
++ ctx->statusfile = NULL;
++ }
++
++ if (ctx->lockfile) {
++ mm_segment_t oldfs;
++ ssize_t err = -EINVAL;
++ char b;
++
++ oldfs = get_fs(); set_fs(KERNEL_DS);
++ if (ctx->lockfile->f_op && ctx->lockfile->f_op->read)
++ err = ctx->lockfile->f_op->read(ctx->lockfile, &b, 1, &ctx->lockfile->f_pos);
++ set_fs(oldfs);
++ fput(ctx->lockfile);
++ ctx->lockfile = NULL;
++ }
++
++ err = vps_rst_veinfo(ctx);
++ if (err) {
++ eprintk_ctx("rst_veinfo: %d\n", err);
++ goto out;
++ }
++
++ err = rst_utsname(ctx);
++ if (err) {
++ eprintk_ctx("rst_utsname: %d\n", err);
++ goto out;
++ }
++
++ err = rst_root_namespace(ctx);
++ if (err) {
++ eprintk_ctx("rst_namespace: %d\n", err);
++ goto out;
++ }
++
++ if ((err = rst_restore_net(ctx)) != 0) {
++ eprintk_ctx("rst_restore_net: %d\n", err);
++ goto out;
++ }
++
++ err = rst_sockets(ctx);
++ if (err) {
++ eprintk_ctx("rst_sockets: %d\n", err);
++ goto out;
++ }
++ err = rst_sysv_ipc(ctx);
++ if (err) {
++ eprintk_ctx("rst_sysv_ipc: %d\n", err);
++ goto out;
++ }
++ }
++
++ do {
++ if (current->user->uid != ti->cpt_user) {
++ struct user_struct *u = alloc_uid(ti->cpt_user);
++ if (!u) {
++ eprintk_ctx("alloc_user\n");
++ } else {
++ switch_uid(u);
++ }
++ }
++ } while (0);
++
++ if ((err = rst_mm_complete(ti, ctx)) != 0) {
++ eprintk_ctx("rst_mm: %d\n", err);
++ goto out;
++ }
++
++ if ((err = rst_files_complete(ti, ctx)) != 0) {
++ eprintk_ctx("rst_files: %d\n", err);
++ goto out;
++ }
++
++ if ((err = rst_fs_complete(ti, ctx)) != 0) {
++ eprintk_ctx("rst_fs: %d\n", err);
++ goto out;
++ }
++
++ if ((err = rst_semundo_complete(ti, ctx)) != 0) {
++ eprintk_ctx("rst_semundo: %d\n", err);
++ goto out;
++ }
++
++ if ((err = rst_signal_complete(ti, ctx)) != 0) {
++ eprintk_ctx("rst_signal: %d\n", err);
++ goto out;
++ }
++
++ if (ti->cpt_namespace == CPT_NULL)
++ exit_namespace(current);
++
++ if (ti->cpt_personality != 0)
++ __set_personality(ti->cpt_personality);
++
++ current->set_child_tid = NULL;
++ current->clear_child_tid = NULL;
++ current->flags &= ~(PF_FORKNOEXEC|PF_SUPERPRIV);
++ current->flags |= ti->cpt_flags&(PF_FORKNOEXEC|PF_SUPERPRIV);
++ current->exit_code = ti->cpt_exit_code;
++ current->pdeath_signal = ti->cpt_pdeath_signal;
++
++ if (ti->cpt_restart.fn != CPT_RBL_0) {
++ if (ti->cpt_restart.fn != CPT_RBL_NANOSLEEP
++ && ti->cpt_restart.fn != CPT_RBL_COMPAT_NANOSLEEP
++ ) {
++ eprintk_ctx("unknown restart block\n");
++ } else {
++ current->thread_info->restart_block.fn = nanosleep_restart;
++#ifdef CONFIG_X86_64
++ if (!ti->cpt_64bit)
++ current->thread_info->restart_block.fn = compat_nanosleep_restart;
++#endif
++ if (ctx->image_version != 0) {
++ current->thread_info->restart_block.arg0 = ti->cpt_restart.arg0;
++ current->thread_info->restart_block.arg1 = ti->cpt_restart.arg1;
++ current->thread_info->restart_block.arg2 = ti->cpt_restart.arg2;
++ current->thread_info->restart_block.arg3 = ti->cpt_restart.arg3;
++ if (debug_level > 2) {
++ ktime_t e, e1;
++ struct timespec now;
++
++ do_posix_clock_monotonic_gettime(&now);
++ e = timespec_to_ktime(now);
++ e1.tv64 = ((u64)current->thread_info->restart_block.arg1 << 32) | (u64) current->thread_info->restart_block.arg0;
++ e = ktime_sub(e1, e);
++ dprintk("rst " CPT_FID " RBL %ld/%ld %Ld\n", CPT_TID(current),
++ current->thread_info->restart_block.arg1,
++ current->thread_info->restart_block.arg0, e.tv64);
++ }
++ } else {
++ struct timespec now;
++ ktime_t expire;
++ unsigned long val = ti->cpt_restart.arg0 -
++ timespec_to_jiffies(&ctx->delta_time);
++ if ((long)val <= 0)
++ val = 1;
++ do_posix_clock_monotonic_gettime(&now);
++ expire = ktime_add_ns(timespec_to_ktime(now), (u64)val*TICK_NSEC);
++ current->thread_info->restart_block.arg0 = expire.tv64 & 0xFFFFFFFF;
++ current->thread_info->restart_block.arg1 = expire.tv64 >> 32;
++ current->thread_info->restart_block.arg2 = ti->cpt_restart.arg1;
++ current->thread_info->restart_block.arg3 = CLOCK_MONOTONIC;
++ }
++ }
++ }
++
++ if (thread_group_leader(current)) {
++ current->signal->it_real_incr.tv64 = 0;
++		if (ctx->image_version != 0) {
++			current->signal->it_real_incr =
++				ktime_add_ns(current->signal->it_real_incr, ti->cpt_it_real_incr);
++		} else {
++			current->signal->it_real_incr =
++				ktime_add_ns(current->signal->it_real_incr, ti->cpt_it_real_incr*TICK_NSEC);
++		}
++ current->signal->it_prof_incr = ti->cpt_it_prof_incr;
++ current->signal->it_virt_incr = ti->cpt_it_virt_incr;
++ current->signal->it_prof_expires = ti->cpt_it_prof_value;
++ current->signal->it_virt_expires = ti->cpt_it_virt_value;
++ }
++
++ err = rst_clone_children(tobj, ctx);
++ if (err) {
++ eprintk_ctx("rst_clone_children\n");
++ goto out;
++ }
++
++ if (ti->cpt_pid == 1) {
++ if ((err = rst_process_linkage(ctx)) != 0) {
++ eprintk_ctx("rst_process_linkage: %d\n", err);
++ goto out;
++ }
++ if ((err = rst_do_filejobs(ctx)) != 0) {
++ eprintk_ctx("rst_do_filejobs: %d\n", err);
++ goto out;
++ }
++ if ((err = rst_eventpoll(ctx)) != 0) {
++ eprintk_ctx("rst_eventpoll: %d\n", err);
++ goto out;
++ }
++ if ((err = rst_sockets_complete(ctx)) != 0) {
++ eprintk_ctx("rst_sockets_complete: %d\n", err);
++ goto out;
++ }
++ if ((err = rst_stray_files(ctx)) != 0) {
++ eprintk_ctx("rst_stray_files: %d\n", err);
++ goto out;
++ }
++ if ((err = rst_posix_locks(ctx)) != 0) {
++ eprintk_ctx("rst_posix_locks: %d\n", err);
++ goto out;
++ }
++ if ((err = rst_tty_jobcontrol(ctx)) != 0) {
++ eprintk_ctx("rst_tty_jobcontrol: %d\n", err);
++ goto out;
++ }
++ if ((err = rst_restore_fs(ctx)) != 0) {
++ eprintk_ctx("rst_restore_fs: %d\n", err);
++ goto out;
++ }
++ }
++
++out:
++ thr_ctx->error = err;
++ lock_kernel();
++ complete(&thr_ctx->task_done);
++
++ if (!err && (ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD))) {
++ preempt_disable();
++ current->exit_state = EXIT_ZOMBIE;
++ write_lock_irq(&tasklist_lock);
++ nr_zombie++;
++ write_unlock_irq(&tasklist_lock);
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9)
++ atomic_dec(&current->signal->live);
++#endif
++ current->flags |= PF_DEAD;
++ if (!(ti->cpt_flags&PF_DEAD))
++ wprintk_ctx("zombie %d,%d(%s) is not pf_dead\n", current->pid, virt_pid(current), current->comm);
++ module_put(current->thread_info->exec_domain->module);
++ if (current->binfmt)
++ module_put(current->binfmt->module);
++ } else {
++ __set_current_state(TASK_UNINTERRUPTIBLE);
++ }
++
++ schedule();
++
++ dprintk_ctx("leaked through %d/%d %p\n", current->pid, virt_pid(current), current->mm);
++
++ module_put(THIS_MODULE);
++ complete_and_exit(NULL, 0);
++ return 0;
++}
++
++#if 0
++static void set_task_ubs(struct cpt_task_image *ti, struct cpt_context *ctx)
++{
++ struct task_beancounter *tbc;
++
++ tbc = task_bc(current);
++
++ put_beancounter(tbc->fork_sub);
++ tbc->fork_sub = rst_lookup_ubc(ti->cpt_task_ub, ctx);
++ if (ti->cpt_mm_ub != CPT_NULL) {
++ put_beancounter(tbc->exec_ub);
++ tbc->exec_ub = rst_lookup_ubc(ti->cpt_mm_ub, ctx);
++ }
++}
++#endif
++
++static int create_root_task(cpt_object_t *obj, struct cpt_context *ctx,
++ struct thr_context *thr_ctx)
++{
++ task_t *tsk;
++ int pid;
++
++ thr_ctx->ctx = ctx;
++ thr_ctx->error = 0;
++ init_completion(&thr_ctx->init_complete);
++ init_completion(&thr_ctx->task_done);
++#if 0
++ set_task_ubs(obj->o_image, ctx);
++#endif
++
++ pid = local_kernel_thread(hook, thr_ctx, 0, 0);
++ if (pid < 0)
++ return pid;
++ read_lock(&tasklist_lock);
++ tsk = find_task_by_pid_ve(pid);
++ if (tsk)
++ get_task_struct(tsk);
++ read_unlock(&tasklist_lock);
++ if (tsk == NULL)
++ return -ESRCH;
++ cpt_obj_setobj(obj, tsk, ctx);
++ thr_ctx->tobj = obj;
++ return 0;
++}
++
++static int rst_basic_init_task(cpt_object_t *obj, struct cpt_context *ctx)
++{
++ task_t *tsk = obj->o_obj;
++ struct cpt_task_image *ti = obj->o_image;
++
++ memcpy(tsk->comm, ti->cpt_comm, sizeof(tsk->comm));
++ rst_mm_basic(obj, ti, ctx);
++ return 0;
++}
++
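++/* Fork one child or thread of an already restored task with the proper
++ * CLONE_* flags and wait until it has restored its own state in hook(). */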
++static int make_baby(cpt_object_t *cobj,
++ struct cpt_task_image *pi,
++ struct cpt_context *ctx)
++{
++ unsigned long flags;
++ struct cpt_task_image *ci = cobj->o_image;
++ struct thr_context thr_ctx;
++ task_t *tsk;
++ pid_t pid;
++
++ flags = rst_mm_flag(ci, ctx) | rst_files_flag(ci, ctx)
++ | rst_signal_flag(ci, ctx) | rst_semundo_flag(ci, ctx);
++ if (ci->cpt_rppid != pi->cpt_pid) {
++ flags |= CLONE_THREAD|CLONE_PARENT;
++ if (ci->cpt_signal != pi->cpt_signal ||
++ !(flags&CLONE_SIGHAND) ||
++ (!(flags&CLONE_VM) && pi->cpt_mm != CPT_NULL)) {
++ eprintk_ctx("something is wrong with threads: %d %d %d %Ld %Ld %08lx\n",
++ (int)ci->cpt_pid, (int)ci->cpt_rppid, (int)pi->cpt_pid,
++ ci->cpt_signal, pi->cpt_signal, flags
++ );
++ return -EINVAL;
++ }
++ }
++
++ thr_ctx.ctx = ctx;
++ thr_ctx.error = 0;
++ init_completion(&thr_ctx.init_complete);
++ init_completion(&thr_ctx.task_done);
++ thr_ctx.tobj = cobj;
++
++#if 0
++ set_task_ubs(ci, ctx);
++#endif
++
++ pid = local_kernel_thread(hook, &thr_ctx, flags, ci->cpt_pid);
++ if (pid < 0)
++ return pid;
++
++ read_lock(&tasklist_lock);
++ tsk = find_task_by_pid_ve(pid);
++ if (tsk)
++ get_task_struct(tsk);
++ read_unlock(&tasklist_lock);
++ if (tsk == NULL)
++ return -ESRCH;
++ cpt_obj_setobj(cobj, tsk, ctx);
++ thr_ctx.tobj = cobj;
++ wait_for_completion(&thr_ctx.init_complete);
++#ifdef CONFIG_SMP
++ wait_task_inactive(cobj->o_obj);
++#endif
++ rst_basic_init_task(cobj, ctx);
++
++ /* clone() increases group_stop_count if it was not zero and
++	 * CLONE_THREAD was requested. Undo.
++ */
++ if (current->signal->group_stop_count && (flags & CLONE_THREAD)) {
++ if (tsk->signal != current->signal) BUG();
++ current->signal->group_stop_count--;
++ }
++
++ wake_up_process(tsk);
++ wait_for_completion(&thr_ctx.task_done);
++ wait_task_inactive(tsk);
++
++ return thr_ctx.error;
++}
++
++static int rst_clone_children(cpt_object_t *obj, struct cpt_context *ctx)
++{
++ int err = 0;
++ struct cpt_task_image *ti = obj->o_image;
++ cpt_object_t *cobj;
++
++ for_each_object(cobj, CPT_OBJ_TASK) {
++ struct cpt_task_image *ci = cobj->o_image;
++ if (cobj == obj)
++ continue;
++ if ((ci->cpt_rppid == ti->cpt_pid && ci->cpt_tgid == ci->cpt_pid) ||
++ (ci->cpt_leader == ti->cpt_pid &&
++ ci->cpt_tgid != ci->cpt_pid && ci->cpt_pid != 1)) {
++ err = make_baby(cobj, ti, ctx);
++ if (err) {
++ eprintk_ctx("make_baby: %d\n", err);
++ return err;
++ }
++ }
++ }
++ return 0;
++}
++
++static int read_task_images(struct cpt_context *ctx)
++{
++ int err;
++ loff_t start, end;
++
++ err = rst_get_section(CPT_SECT_TASKS, ctx, &start, &end);
++ if (err)
++ return err;
++
++ while (start < end) {
++ cpt_object_t *obj;
++ struct cpt_task_image *ti = cpt_get_buf(ctx);
++
++ err = rst_get_object(CPT_OBJ_TASK, start, ti, ctx);
++ if (err) {
++ cpt_release_buf(ctx);
++ return err;
++ }
++ if (ti->cpt_pid != 1 && !__is_virtual_pid(ti->cpt_pid)) {
++ eprintk_ctx("BUG: pid %d is not virtual\n", ti->cpt_pid);
++ cpt_release_buf(ctx);
++ return -EINVAL;
++ }
++ obj = alloc_cpt_object(GFP_KERNEL, ctx);
++ cpt_obj_setpos(obj, start, ctx);
++ intern_cpt_object(CPT_OBJ_TASK, obj, ctx);
++ obj->o_image = kmalloc(ti->cpt_next, GFP_KERNEL);
++ if (obj->o_image == NULL) {
++ cpt_release_buf(ctx);
++ return -ENOMEM;
++ }
++ memcpy(obj->o_image, ti, sizeof(*ti));
++ err = ctx->pread(obj->o_image + sizeof(*ti),
++ ti->cpt_next - sizeof(*ti), ctx, start + sizeof(*ti));
++ cpt_release_buf(ctx);
++ if (err)
++ return err;
++ start += ti->cpt_next;
++ }
++ return 0;
++}
++
++
++static int vps_rst_restore_tree(struct cpt_context *ctx)
++{
++ int err;
++ cpt_object_t *obj;
++ struct thr_context thr_ctx_root;
++
++ err = read_task_images(ctx);
++ if (err)
++ return err;
++
++ err = rst_undump_ubc(ctx);
++ if (err)
++ return err;
++
++ for_each_object(obj, CPT_OBJ_TASK) {
++ err = create_root_task(obj, ctx, &thr_ctx_root);
++ if (err)
++ return err;
++
++ wait_for_completion(&thr_ctx_root.init_complete);
++#ifdef CONFIG_SMP
++ wait_task_inactive(obj->o_obj);
++#endif
++ rst_basic_init_task(obj, ctx);
++
++ wake_up_process(obj->o_obj);
++ wait_for_completion(&thr_ctx_root.task_done);
++ wait_task_inactive(obj->o_obj);
++ err = thr_ctx_root.error;
++ if (err)
++ return err;
++ break;
++ }
++
++ return err;
++}
++
++
++int vps_rst_undump(struct cpt_context *ctx)
++{
++ int err;
++ unsigned long umask;
++
++ err = rst_open_dumpfile(ctx);
++ if (err)
++ return err;
++
++#ifndef CONFIG_X86_64
++ if (ctx->tasks64) {
++ eprintk_ctx("Cannot restore 64 bit VE on this architecture\n");
++ return -EINVAL;
++ }
++#endif
++
++ umask = current->fs->umask;
++ current->fs->umask = 0;
++
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++ err = rst_setup_pagein(ctx);
++#endif
++
++ if (err == 0)
++ err = vps_rst_restore_tree(ctx);
++
++ if (err == 0)
++ err = rst_restore_process(ctx);
++
++ current->fs->umask = umask;
++
++ return err;
++}
++
++static int rst_unlock_ve(struct cpt_context *ctx)
++{
++ struct ve_struct *env;
++
++ env = get_ve_by_id(ctx->ve_id);
++ if (!env)
++ return -ESRCH;
++ down_write(&env->op_sem);
++ env->is_locked = 0;
++ up_write(&env->op_sem);
++ put_ve(env);
++ return 0;
++}
++
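++/* Resume a restored VE: drop the extra file references taken during restore,
++ * wake up the restored tasks and unlock the VE. */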
++int rst_resume(struct cpt_context *ctx)
++{
++ cpt_object_t *obj;
++ int err = 0;
++
++ for_each_object(obj, CPT_OBJ_FILE) {
++ struct file *file = obj->o_obj;
++
++ fput(file);
++ }
++
++ rst_resume_network(ctx);
++
++ for_each_object(obj, CPT_OBJ_TASK) {
++ task_t *tsk = obj->o_obj;
++ struct cpt_task_image *ti = obj->o_image;
++
++ if (!tsk)
++ continue;
++
++ if (ti->cpt_state == TASK_UNINTERRUPTIBLE) {
++ dprintk_ctx("task %d/%d(%s) is started\n", virt_pid(tsk), tsk->pid, tsk->comm);
++
++			/* Weird... If a signal is sent to a stopped task,
++			 * nobody calls recalc_sigpending(). We have to do
++			 * this by hand after wake_up_process().
++			 * If we did it earlier, a signal could arrive before
++			 * wake_up_process() and the task would stall.
++			 */
++ spin_lock_irq(&tsk->sighand->siglock);
++ if (!signal_pending(tsk))
++ recalc_sigpending_tsk(tsk);
++ spin_unlock_irq(&tsk->sighand->siglock);
++
++ wake_up_process(tsk);
++ } else {
++ if (ti->cpt_state == TASK_STOPPED ||
++ ti->cpt_state == TASK_TRACED) {
++ set_task_state(tsk, ti->cpt_state);
++ }
++ }
++ put_task_struct(tsk);
++ }
++
++ rst_unlock_ve(ctx);
++
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++ rst_complete_pagein(ctx, 0);
++#endif
++
++ rst_finish_ubc(ctx);
++ cpt_object_destroy(ctx);
++
++ return err;
++}
++
++int rst_kill(struct cpt_context *ctx)
++{
++ cpt_object_t *obj;
++ int err = 0;
++
++ for_each_object(obj, CPT_OBJ_FILE) {
++ struct file *file = obj->o_obj;
++
++ fput(file);
++ }
++
++ for_each_object(obj, CPT_OBJ_TASK) {
++ task_t *tsk = obj->o_obj;
++
++ if (tsk == NULL)
++ continue;
++
++ if (tsk->exit_state == 0) {
++ send_sig(SIGKILL, tsk, 1);
++
++ spin_lock_irq(&tsk->sighand->siglock);
++ sigfillset(&tsk->blocked);
++ sigdelsetmask(&tsk->blocked, sigmask(SIGKILL));
++ set_tsk_thread_flag(tsk, TIF_SIGPENDING);
++ clear_tsk_thread_flag(tsk, TIF_FREEZE);
++ if (tsk->flags & PF_FROZEN)
++ tsk->flags &= ~PF_FROZEN;
++ spin_unlock_irq(&tsk->sighand->siglock);
++
++ wake_up_process(tsk);
++ }
++
++ put_task_struct(tsk);
++ }
++
++#ifdef CONFIG_VZ_CHECKPOINT_LAZY
++ rst_complete_pagein(ctx, 1);
++#endif
++
++ rst_finish_ubc(ctx);
++ cpt_object_destroy(ctx);
++
++ return err;
++}
++
++static int rst_utsname(cpt_context_t *ctx)
++{
++ int err;
++ loff_t sec = ctx->sections[CPT_SECT_UTSNAME];
++ loff_t endsec;
++ struct cpt_section_hdr h;
++ struct cpt_object_hdr o;
++ int i;
++
++ if (sec == CPT_NULL)
++ return 0;
++
++ err = ctx->pread(&h, sizeof(h), ctx, sec);
++ if (err)
++ return err;
++ if (h.cpt_section != CPT_SECT_UTSNAME || h.cpt_hdrlen < sizeof(h))
++ return -EINVAL;
++
++ i = 0;
++ endsec = sec + h.cpt_next;
++ sec += h.cpt_hdrlen;
++ while (sec < endsec) {
++ int len;
++ char *ptr;
++ err = rst_get_object(CPT_OBJ_NAME, sec, &o, ctx);
++ if (err)
++ return err;
++ len = o.cpt_next - o.cpt_hdrlen;
++ if (len > __NEW_UTS_LEN+1)
++ return -ENAMETOOLONG;
++ switch (i) {
++ case 0:
++ ptr = ve_utsname.nodename; break;
++ case 1:
++ ptr = ve_utsname.domainname; break;
++ default:
++ return -EINVAL;
++ }
++ err = ctx->pread(ptr, len, ctx, sec+o.cpt_hdrlen);
++ if (err)
++ return err;
++ i++;
++ sec += o.cpt_next;
++ }
++
++ return 0;
++}
+diff -upr linux-2.6.16.orig/kernel/cpt/rst_x8664.S linux-2.6.16-026test015/kernel/cpt/rst_x8664.S
+--- linux-2.6.16.orig/kernel/cpt/rst_x8664.S 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/cpt/rst_x8664.S 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,61 @@
++#define ASSEMBLY 1
++
++#include <linux/config.h>
++#include <linux/linkage.h>
++#include <asm/segment.h>
++#include <asm/smp.h>
++#include <asm/cache.h>
++#include <asm/errno.h>
++#include <asm/dwarf2.h>
++#include <asm/calling.h>
++#include <asm/msr.h>
++#include <asm/unistd.h>
++#include <asm/thread_info.h>
++#include <asm/hw_irq.h>
++#include <asm/errno.h>
++#include <asm/asm-offsets.h>
++
++ .code64
++ .global schedule_tail_hook, schedule_tail_p
++ .align 8
++schedule_tail_hook:
++ movq schedule_tail_p(%rip),%r11
++ call *%r11
++ GET_THREAD_INFO(%rcx)
++ btr $22,threadinfo_flags(%rcx) /* TIF_RESUME */
++ jc 1f
++ retq
++
++	/* If TIF_RESUME is set, (%rsp) is a pointer to the hook function;
++	 * the hook will do the work and jump to the next hook.
++	 * Everything should end at ret_from_fork+5.
++	 */
++1: addq $8,%rsp
++ retq
++
++ .align 8
++ .global ret_from_fork2
++ret_from_fork2:
++ cmpq $0,ORIG_RAX(%rsp)
++ jge ret_from_fork+5
++ RESTORE_REST
++ jmp int_ret_from_sys_call
++
++ .align 8
++ .global ret_last_siginfo
++ret_last_siginfo:
++ call rlsi
++ movq %rax,%rsp
++ retq
++
++ .align 8
++ .global ret_child_tid
++ret_child_tid:
++ movq %rsp,%rdi
++ call rct
++ movq %rax,%rsp
++ retq
++
++ .data
++schedule_tail_p:
++ .quad 0
+diff -upr linux-2.6.16.orig/kernel/cpu.c linux-2.6.16-026test015/kernel/cpu.c
+--- linux-2.6.16.orig/kernel/cpu.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/kernel/cpu.c 2006-07-04 14:41:39.000000000 +0400
+@@ -21,6 +21,11 @@ static DECLARE_MUTEX(cpucontrol);
+ static struct notifier_block *cpu_chain;
+
+ #ifdef CONFIG_HOTPLUG_CPU
++
++#ifdef CONFIG_SCHED_VCPU
++#error "CONFIG_HOTPLUG_CPU isn't supported with CONFIG_SCHED_VCPU"
++#endif
++
+ static struct task_struct *lock_cpu_hotplug_owner;
+ static int lock_cpu_hotplug_depth;
+
+@@ -95,8 +100,8 @@ static inline void check_for_tasks(int c
+ struct task_struct *p;
+
+ write_lock_irq(&tasklist_lock);
+- for_each_process(p) {
+- if (task_cpu(p) == cpu &&
++ for_each_process_all(p) {
++ if (task_pcpu(p) == cpu &&
+ (!cputime_eq(p->utime, cputime_zero) ||
+ !cputime_eq(p->stime, cputime_zero)))
+ printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\
+@@ -106,6 +111,13 @@ static inline void check_for_tasks(int c
+ write_unlock_irq(&tasklist_lock);
+ }
+
++#ifdef CONFIG_SCHED_VCPU
++#error VCPU vs. HOTPLUG: fix hotplug code below
++/*
++ * What should be fixed:
++ * - check for if (idle_cpu()) yield()
++ */
++#endif
+ /* Take this CPU down. */
+ static int take_cpu_down(void *unused)
+ {
+diff -upr linux-2.6.16.orig/kernel/cpuset.c linux-2.6.16-026test015/kernel/cpuset.c
+--- linux-2.6.16.orig/kernel/cpuset.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/kernel/cpuset.c 2006-07-04 14:41:38.000000000 +0400
+@@ -897,7 +897,7 @@ static int update_nodemask(struct cpuset
+ n = 0;
+
+ /* Load up mmarray[] with mm reference for each task in cpuset. */
+- do_each_thread(g, p) {
++ do_each_thread_all(g, p) {
+ struct mm_struct *mm;
+
+ if (n >= ntasks) {
+@@ -911,7 +911,7 @@ static int update_nodemask(struct cpuset
+ if (!mm)
+ continue;
+ mmarray[n++] = mm;
+- } while_each_thread(g, p);
++ } while_each_thread_all(g, p);
+ write_unlock_irq(&tasklist_lock);
+
+ /*
+@@ -1125,7 +1125,7 @@ static int attach_task(struct cpuset *cs
+ if (pid) {
+ read_lock(&tasklist_lock);
+
+- tsk = find_task_by_pid(pid);
++ tsk = find_task_by_pid_all(pid);
+ if (!tsk || tsk->flags & PF_EXITING) {
+ read_unlock(&tasklist_lock);
+ return -ESRCH;
+@@ -1561,13 +1561,13 @@ static int pid_array_load(pid_t *pidarra
+
+ read_lock(&tasklist_lock);
+
+- do_each_thread(g, p) {
++ do_each_thread_all(g, p) {
+ if (p->cpuset == cs) {
+ pidarray[n++] = p->pid;
+ if (unlikely(n == npids))
+ goto array_full;
+ }
+- } while_each_thread(g, p);
++ } while_each_thread_all(g, p);
+
+ array_full:
+ read_unlock(&tasklist_lock);
+diff -upr linux-2.6.16.orig/kernel/exec_domain.c linux-2.6.16-026test015/kernel/exec_domain.c
+--- linux-2.6.16.orig/kernel/exec_domain.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/kernel/exec_domain.c 2006-07-04 14:41:36.000000000 +0400
+@@ -140,6 +140,7 @@ __set_personality(u_long personality)
+ ep = lookup_exec_domain(personality);
+ if (ep == current_thread_info()->exec_domain) {
+ current->personality = personality;
++ module_put(ep->module);
+ return 0;
+ }
+
+diff -upr linux-2.6.16.orig/kernel/exit.c linux-2.6.16-026test015/kernel/exit.c
+--- linux-2.6.16.orig/kernel/exit.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/kernel/exit.c 2006-07-04 14:41:39.000000000 +0400
+@@ -42,7 +42,7 @@ extern struct task_struct *child_reaper;
+
+ int getrusage(struct task_struct *, int, struct rusage __user *);
+
+-static void exit_mm(struct task_struct * tsk);
++void exit_mm(struct task_struct * tsk);
+
+ static void __unhash_process(struct task_struct *p)
+ {
+@@ -57,18 +57,19 @@ static void __unhash_process(struct task
+ }
+
+ REMOVE_LINKS(p);
++ REMOVE_VE_LINKS(p);
+ }
+
+ void release_task(struct task_struct * p)
+ {
+ int zap_leader;
+ task_t *leader;
+- struct dentry *proc_dentry;
++ struct dentry *proc_dentry[2];
+
+ repeat:
+ atomic_dec(&p->user->processes);
+ spin_lock(&p->proc_lock);
+- proc_dentry = proc_pid_unhash(p);
++ proc_pid_unhash(p, proc_dentry);
+ write_lock_irq(&tasklist_lock);
+ if (unlikely(p->ptrace))
+ __ptrace_unlink(p);
+@@ -80,6 +81,8 @@ repeat:
+ * the process by __unhash_process.
+ */
+ __unhash_process(p);
++ nr_zombie--;
++ atomic_inc(&nr_dead);
+
+ /*
+ * If we are the last non-leader member of the thread
+@@ -107,6 +110,10 @@ repeat:
+ spin_unlock(&p->proc_lock);
+ proc_pid_flush(proc_dentry);
+ release_thread(p);
++#ifdef CONFIG_VE
++ if (atomic_dec_and_test(&VE_TASK_INFO(p)->owner_env->pcounter))
++ do_env_cleanup(VE_TASK_INFO(p)->owner_env);
++#endif
+ put_task_struct(p);
+
+ p = leader;
+@@ -118,10 +125,10 @@ repeat:
+
+ void unhash_process(struct task_struct *p)
+ {
+- struct dentry *proc_dentry;
++ struct dentry *proc_dentry[2];
+
+ spin_lock(&p->proc_lock);
+- proc_dentry = proc_pid_unhash(p);
++ proc_pid_unhash(p, proc_dentry);
+ write_lock_irq(&tasklist_lock);
+ __unhash_process(p);
+ write_unlock_irq(&tasklist_lock);
+@@ -139,14 +146,16 @@ int session_of_pgrp(int pgrp)
+ struct task_struct *p;
+ int sid = -1;
+
++ WARN_ON(is_virtual_pid(pgrp));
++
+ read_lock(&tasklist_lock);
+- do_each_task_pid(pgrp, PIDTYPE_PGID, p) {
++ do_each_task_pid_ve(pgrp, PIDTYPE_PGID, p) {
+ if (p->signal->session > 0) {
+ sid = p->signal->session;
+ goto out;
+ }
+- } while_each_task_pid(pgrp, PIDTYPE_PGID, p);
+- p = find_task_by_pid(pgrp);
++ } while_each_task_pid_ve(pgrp, PIDTYPE_PGID, p);
++ p = find_task_by_pid_ve(pgrp);
+ if (p)
+ sid = p->signal->session;
+ out:
+@@ -168,17 +177,19 @@ static int will_become_orphaned_pgrp(int
+ struct task_struct *p;
+ int ret = 1;
+
+- do_each_task_pid(pgrp, PIDTYPE_PGID, p) {
++ WARN_ON(is_virtual_pid(pgrp));
++
++ do_each_task_pid_ve(pgrp, PIDTYPE_PGID, p) {
+ if (p == ignored_task
+ || p->exit_state
+- || p->real_parent->pid == 1)
++ || virt_pid(p->real_parent) == 1)
+ continue;
+ if (process_group(p->real_parent) != pgrp
+ && p->real_parent->signal->session == p->signal->session) {
+ ret = 0;
+ break;
+ }
+- } while_each_task_pid(pgrp, PIDTYPE_PGID, p);
++ } while_each_task_pid_ve(pgrp, PIDTYPE_PGID, p);
+ return ret; /* (sighing) "Often!" */
+ }
+
+@@ -186,6 +197,8 @@ int is_orphaned_pgrp(int pgrp)
+ {
+ int retval;
+
++ WARN_ON(is_virtual_pid(pgrp));
++
+ read_lock(&tasklist_lock);
+ retval = will_become_orphaned_pgrp(pgrp, NULL);
+ read_unlock(&tasklist_lock);
+@@ -198,7 +211,7 @@ static int has_stopped_jobs(int pgrp)
+ int retval = 0;
+ struct task_struct *p;
+
+- do_each_task_pid(pgrp, PIDTYPE_PGID, p) {
++ do_each_task_pid_ve(pgrp, PIDTYPE_PGID, p) {
+ if (p->state != TASK_STOPPED)
+ continue;
+
+@@ -214,7 +227,7 @@ static int has_stopped_jobs(int pgrp)
+
+ retval = 1;
+ break;
+- } while_each_task_pid(pgrp, PIDTYPE_PGID, p);
++ } while_each_task_pid_ve(pgrp, PIDTYPE_PGID, p);
+ return retval;
+ }
+
+@@ -263,6 +276,9 @@ void __set_special_pids(pid_t session, p
+ {
+ struct task_struct *curr = current->group_leader;
+
++ WARN_ON(is_virtual_pid(pgrp));
++ WARN_ON(is_virtual_pid(session));
++
+ if (curr->signal->session != session) {
+ detach_pid(curr, PIDTYPE_SID);
+ curr->signal->session = session;
+@@ -281,6 +297,7 @@ void set_special_pids(pid_t session, pid
+ __set_special_pids(session, pgrp);
+ write_unlock_irq(&tasklist_lock);
+ }
++EXPORT_SYMBOL(set_special_pids);
+
+ /*
+ * Let kernel threads use this to say that they
+@@ -500,7 +517,7 @@ EXPORT_SYMBOL_GPL(exit_fs);
+ * Turn us into a lazy TLB process if we
+ * aren't already..
+ */
+-static void exit_mm(struct task_struct * tsk)
++void exit_mm(struct task_struct * tsk)
+ {
+ struct mm_struct *mm = tsk->mm;
+
+@@ -535,6 +552,7 @@ static void exit_mm(struct task_struct *
+ task_unlock(tsk);
+ mmput(mm);
+ }
++EXPORT_SYMBOL_GPL(exit_mm);
+
+ static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_reaper)
+ {
+@@ -613,13 +631,12 @@ static void reparent_thread(task_t *p, t
+ static void forget_original_parent(struct task_struct * father,
+ struct list_head *to_release)
+ {
+- struct task_struct *p, *reaper = father;
++ struct task_struct *p, *tsk_reaper, *reaper = father;
+ struct list_head *_p, *_n;
+
+ do {
+ reaper = next_thread(reaper);
+ if (reaper == father) {
+- reaper = child_reaper;
+ break;
+ }
+ } while (reaper->exit_state);
+@@ -641,9 +658,16 @@ static void forget_original_parent(struc
+ /* if father isn't the real parent, then ptrace must be enabled */
+ BUG_ON(father != p->real_parent && !ptrace);
+
++ tsk_reaper = reaper;
++ if (tsk_reaper == father)
++#ifdef CONFIG_VE
++ tsk_reaper = VE_TASK_INFO(p)->owner_env->init_entry;
++ if (tsk_reaper == p)
++#endif
++ tsk_reaper = child_reaper;
+ if (father == p->real_parent) {
+- /* reparent with a reaper, real father it's us */
+- choose_new_parent(p, reaper, child_reaper);
++ /* reparent with a tsk_reaper, real father it's us */
++ choose_new_parent(p, tsk_reaper, child_reaper);
+ reparent_thread(p, father, 0);
+ } else {
+ /* reparent ptraced task to its real parent */
+@@ -664,7 +688,15 @@ static void forget_original_parent(struc
+ }
+ list_for_each_safe(_p, _n, &father->ptrace_children) {
+ p = list_entry(_p,struct task_struct,ptrace_list);
+- choose_new_parent(p, reaper, child_reaper);
++
++ tsk_reaper = reaper;
++ if (tsk_reaper == father)
++#ifdef CONFIG_VE
++ tsk_reaper = VE_TASK_INFO(p)->owner_env->init_entry;
++ if (tsk_reaper == p)
++#endif
++ tsk_reaper = child_reaper;
++ choose_new_parent(p, tsk_reaper, child_reaper);
+ reparent_thread(p, father, 1);
+ }
+ }
+@@ -760,6 +792,9 @@ static void exit_notify(struct task_stru
+ && !capable(CAP_KILL))
+ tsk->exit_signal = SIGCHLD;
+
++ if (tsk->exit_signal != -1 && t == child_reaper)
++		/* We don't want people slaying init. */
++ tsk->exit_signal = SIGCHLD;
+
+ /* If something other than our normal parent is ptracing us, then
+ * send it a SIGCHLD instead of honoring exit_signal. exit_signal
+@@ -778,6 +813,7 @@ static void exit_notify(struct task_stru
+ unlikely(tsk->parent->signal->flags & SIGNAL_GROUP_EXIT)))
+ state = EXIT_DEAD;
+ tsk->exit_state = state;
++ nr_zombie++;
+
+ write_unlock_irq(&tasklist_lock);
+
+@@ -792,6 +828,82 @@ static void exit_notify(struct task_stru
+ release_task(tsk);
+ }
+
++#ifdef CONFIG_VE
++/*
++ * Handle exitting of init process, it's a special case for VE.
++ */
++static void do_initproc_exit(void)
++{
++ struct task_struct *tsk;
++ struct ve_struct *env;
++ struct siginfo info;
++ struct task_struct *g, *p;
++ long delay = 1L;
++
++ tsk = current;
++ env = VE_TASK_INFO(current)->owner_env;
++ if (env->init_entry != tsk)
++ return;
++
++ if (ve_is_super(env) && tsk->pid == 1)
++ panic("Attempted to kill init!");
++
++ memset(&info, 0, sizeof(info));
++ info.si_errno = 0;
++ info.si_code = SI_KERNEL;
++ info.si_pid = virt_pid(tsk);
++ info.si_uid = current->uid;
++ info.si_signo = SIGKILL;
++
++ /*
++	 * Here the VE changes its state to "not running".
++ * op_sem taken for write is a barrier to all VE manipulations from
++ * ioctl: it waits for operations currently in progress and blocks all
++ * subsequent operations until is_running is set to 0 and op_sem is
++ * released.
++ */
++ down_write(&env->op_sem);
++ env->is_running = 0;
++ up_write(&env->op_sem);
++
++ /* send kill to all processes of VE */
++ read_lock(&tasklist_lock);
++ do_each_thread_ve(g, p) {
++ force_sig_info(SIGKILL, &info, p);
++ } while_each_thread_ve(g, p);
++ read_unlock(&tasklist_lock);
++
++	/* wait for all of init's children to exit */
++ while (atomic_read(&env->pcounter) > 1) {
++ if (sys_wait4(-1, NULL, __WALL | WNOHANG, NULL) > 0)
++ continue;
++		/* it was -ECHILD or no more children somehow */
++ if (atomic_read(&env->pcounter) == 1)
++ break;
++
++ /* clear all signals to avoid wakeups */
++ if (signal_pending(tsk))
++ flush_signals(tsk);
++		/* we still have a child that has not been sent the signal */
++ __set_current_state(TASK_INTERRUPTIBLE);
++ schedule_timeout(delay);
++ delay = (delay < HZ) ? (delay << 1) : HZ;
++ read_lock(&tasklist_lock);
++ do_each_thread_ve(g, p) {
++ if (p != tsk)
++ force_sig_info(SIGKILL, &info, p);
++ } while_each_thread_ve(g, p);
++ read_unlock(&tasklist_lock);
++ }
++ env->init_entry = child_reaper;
++ write_lock_irq(&tasklist_lock);
++ REMOVE_LINKS(tsk);
++ tsk->parent = tsk->real_parent = child_reaper;
++ SET_LINKS(tsk);
++ write_unlock_irq(&tasklist_lock);
++}
++#endif
++
+ fastcall NORET_TYPE void do_exit(long code)
+ {
+ struct task_struct *tsk = current;
+@@ -805,14 +917,20 @@ fastcall NORET_TYPE void do_exit(long co
+ panic("Aiee, killing interrupt handler!");
+ if (unlikely(!tsk->pid))
+ panic("Attempted to kill the idle task!");
++#ifdef CONFIG_VE
++ do_initproc_exit();
++#else
+ if (unlikely(tsk->pid == 1))
+ panic("Attempted to kill init!");
++#endif
+ if (tsk->io_context)
+ exit_io_context();
+
+ if (unlikely(current->ptrace & PT_TRACE_EXIT)) {
+ current->ptrace_message = code;
++ set_pn_state(current, PN_STOP_EXIT);
+ ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP);
++ clear_pn_state(current);
+ }
+
+ /*
+@@ -828,14 +946,6 @@ fastcall NORET_TYPE void do_exit(long co
+
+ tsk->flags |= PF_EXITING;
+
+- /*
+- * Make sure we don't try to process any timer firings
+- * while we are already exiting.
+- */
+- tsk->it_virt_expires = cputime_zero;
+- tsk->it_prof_expires = cputime_zero;
+- tsk->it_sched_expires = 0;
+-
+ if (unlikely(in_atomic()))
+ printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
+ current->comm, current->pid,
+@@ -911,7 +1021,14 @@ asmlinkage long sys_exit(int error_code)
+
+ task_t fastcall *next_thread(const task_t *p)
+ {
+- return pid_task(p->pids[PIDTYPE_TGID].pid_list.next, PIDTYPE_TGID);
++ task_t *tsk;
++
++ tsk = pid_task(p->pids[PIDTYPE_TGID].pid_list.next, PIDTYPE_TGID);
++#ifdef CONFIG_VE
++ /* all threads should belong to ONE ve! */
++ BUG_ON(VE_TASK_INFO(tsk)->owner_env != VE_TASK_INFO(p)->owner_env);
++#endif
++ return tsk;
+ }
+
+ EXPORT_SYMBOL(next_thread);
+@@ -960,14 +1077,19 @@ asmlinkage void sys_exit_group(int error
+ static int eligible_child(pid_t pid, int options, task_t *p)
+ {
+ if (pid > 0) {
+- if (p->pid != pid)
++ if ((is_virtual_pid(pid) ? virt_pid(p) : p->pid) != pid)
+ return 0;
+ } else if (!pid) {
+ if (process_group(p) != process_group(current))
+ return 0;
+ } else if (pid != -1) {
+- if (process_group(p) != -pid)
+- return 0;
++ if (__is_virtual_pid(-pid)) {
++ if (virt_pgid(p) != -pid)
++ return 0;
++ } else {
++ if (process_group(p) != -pid)
++ return 0;
++ }
+ }
+
+ /*
+@@ -1157,7 +1279,7 @@ static int wait_task_zombie(task_t *p, i
+ p->exit_state = EXIT_ZOMBIE;
+ return retval;
+ }
+- retval = p->pid;
++ retval = get_task_pid(p);
+ if (p->real_parent != p->parent) {
+ write_lock_irq(&tasklist_lock);
+ /* Double-check with lock held. */
+@@ -1292,7 +1414,7 @@ bail_ref:
+ if (!retval && infop)
+ retval = put_user(p->uid, &infop->si_uid);
+ if (!retval)
+- retval = p->pid;
++ retval = get_task_pid(p);
+ put_task_struct(p);
+
+ BUG_ON(!retval);
+@@ -1574,6 +1696,7 @@ asmlinkage long sys_wait4(pid_t pid, int
+ prevent_tail_call(ret);
+ return ret;
+ }
++EXPORT_SYMBOL_GPL(sys_wait4);
+
+ #ifdef __ARCH_WANT_SYS_WAITPID
+
+diff -upr linux-2.6.16.orig/kernel/fairsched.c linux-2.6.16-026test015/kernel/fairsched.c
+--- linux-2.6.16.orig/kernel/fairsched.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/fairsched.c 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,1288 @@
++/*
++ * Fair Scheduler
++ *
++ * Copyright (C) 2000-2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ * Start-tag scheduling follows the theory presented in
++ * http://www.cs.utexas.edu/users/dmcl/papers/ps/SIGCOMM96.ps
++ */
++
++#include <linux/config.h>
++#include <linux/kernel.h>
++#include <asm/timex.h>
++#include <asm/atomic.h>
++#include <linux/spinlock.h>
++#include <asm/semaphore.h>
++#include <linux/init.h>
++#include <linux/slab.h>
++#include <linux/proc_fs.h>
++#include <linux/seq_file.h>
++#include <linux/fs.h>
++#include <linux/dcache.h>
++#include <linux/sysctl.h>
++#include <linux/module.h>
++#include <linux/vmalloc.h>
++#include <linux/sched.h>
++#include <linux/fairsched.h>
++#include <linux/vsched.h>
++
++/* we need it for vsched routines in sched.c */
++spinlock_t fairsched_lock = SPIN_LOCK_UNLOCKED;
++
++#ifdef CONFIG_FAIRSCHED
++
++#define FAIRSHED_DEBUG " debug"
++
++
++/*********************************************************************/
++/*
++ * Special arithmetics
++ */
++/*********************************************************************/
++
++#define CYCLES_SHIFT (8)
++#define SCYCLES_TIME(time) \
++ ((scycles_t) {((time) + (1 << CYCLES_SHIFT) - 1) >> CYCLES_SHIFT})
++
++#define CYCLES_ZERO (0)
++static inline int CYCLES_BEFORE(cycles_t x, cycles_t y)
++{
++ return (__s64)(x-y) < 0;
++}
++static inline int CYCLES_AFTER(cycles_t x, cycles_t y)
++{
++ return (__s64)(y-x) < 0;
++}
++static inline void CYCLES_DADD(cycles_t *x, fschdur_t y) {*x+=y.d;}
++
++#define FSCHDUR_ZERO (0)
++#define TICK_DUR ((fschdur_t){cycles_per_jiffy})
++static inline fschdur_t FSCHDURATION(cycles_t x, cycles_t y)
++{
++ return (fschdur_t){x - y};
++}
++static inline int FSCHDUR_CMP(fschdur_t x, fschdur_t y)
++{
++ if (x.d < y.d) return -1;
++ if (x.d > y.d) return 1;
++ return 0;
++}
++static inline fschdur_t FSCHDUR_SUB(fschdur_t x, fschdur_t y)
++{
++ return (fschdur_t){x.d - y.d};
++}
++
++#define FSCHTAG_ZERO ((fschtag_t){0})
++static inline int FSCHTAG_CMP(fschtag_t x, fschtag_t y)
++{
++ if (x.t < y.t) return -1;
++ if (x.t > y.t) return 1;
++ return 0;
++}
++static inline fschtag_t FSCHTAG_MAX(fschtag_t x, fschtag_t y)
++{
++ return x.t >= y.t ? x : y;
++}
++static inline int FSCHTAG_DADD(fschtag_t *tag, fschdur_t dur, unsigned w)
++{
++ cycles_t new_tag;
++ new_tag = tag->t + (cycles_t)dur.d * w;
++ if (new_tag < tag->t)
++ return -1;
++ /* DEBUG */
++ if (new_tag >= (1ULL << 48))
++ return -1;
++ tag->t = new_tag;
++ return 0;
++}
++static inline int FSCHTAG_ADD(fschtag_t *tag, fschtag_t y)
++{
++ cycles_t new_tag;
++ new_tag = tag->t + y.t;
++ if (new_tag < tag->t)
++ return -1;
++ tag->t = new_tag;
++ return 0;
++}
++static inline fschtag_t FSCHTAG_SUB(fschtag_t x, fschtag_t y)
++{
++ return (fschtag_t){x.t - y.t};
++}
++
++#define FSCHVALUE_ZERO ((fschvalue_t){0})
++#define TICK_VALUE ((fschvalue_t){(cycles_t)cycles_per_jiffy << FSCHRATE_SHIFT})
++static inline fschvalue_t FSCHVALUE(unsigned long t)
++{
++ return (fschvalue_t){(cycles_t)t << FSCHRATE_SHIFT};
++}
++static inline int FSCHVALUE_CMP(fschvalue_t x, fschvalue_t y)
++{
++ if (x.v < y.v) return -1;
++ if (x.v > y.v) return 1;
++ return 0;
++}
++static inline void FSCHVALUE_DADD(fschvalue_t *val, fschdur_t dur,
++ unsigned rate)
++{
++ val->v += (cycles_t)dur.d * rate;
++}
++static inline fschvalue_t FSCHVALUE_SUB(fschvalue_t x, fschvalue_t y)
++{
++ return (fschvalue_t){x.v - y.v};
++}
++static inline cycles_t FSCHVALUE_TO_DELAY(fschvalue_t val, unsigned rate)
++{
++ unsigned long t;
++ /*
++ * Here we lose precision to make the division 32-bit on IA-32.
++ * The value is not greater than TICK_VALUE.
++ * (TICK_VALUE >> FSCHRATE_SHIFT) fits unsigned long.
++ */
++ t = (val.v + (1 << FSCHRATE_SHIFT) - 1) >> FSCHRATE_SHIFT;
++ return (cycles_t)((t + rate - 1) / rate) << FSCHRATE_SHIFT;
++}
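For reference, with S = FSCHRATE_SHIFT the conversion above first rounds the scaled value up to whole cycles and then rounds the division by the rate up again, so the returned delay (in cycles) is

\[
\mathrm{delay}(v, r) \;=\; \Bigl\lceil \frac{\lceil v / 2^{S} \rceil}{r} \Bigr\rceil \cdot 2^{S},
\]

i.e. roughly v / r cycles; the precision loss mentioned in the comment comes from the first rounding step, which keeps the division 32-bit.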
++
++
++/*********************************************************************/
++/*
++ * Global data
++ */
++/*********************************************************************/
++
++#define fsch_assert(x) \
++ do { \
++ static int count; \
++ if (!(x) && count++ < 10) \
++ printk("fsch_assert " #x " failed\n"); \
++ } while (0)
++
++/*
++ * Configurable parameters
++ */
++unsigned fairsched_max_latency = 25; /* jiffies */
++
++/*
++ * Parameters initialized at startup
++ */
++/* Number of online CPUs */
++unsigned fairsched_nr_cpus;
++/* Token Bucket depth (burst size) */
++static fschvalue_t max_value;
++
++struct fairsched_node fairsched_init_node = {
++ .id = INT_MAX,
++#ifdef CONFIG_VE
++ .owner_env = get_ve0(),
++#endif
++ .weight = 1,
++};
++EXPORT_SYMBOL(fairsched_init_node);
++
++struct fairsched_node fairsched_idle_node = {
++ .id = -1,
++};
++
++static int fairsched_nr_nodes;
++static LIST_HEAD(fairsched_node_head);
++static LIST_HEAD(fairsched_running_head);
++static LIST_HEAD(fairsched_delayed_head);
++
++DEFINE_PER_CPU(cycles_t, prev_schedule);
++static fschtag_t max_latency;
++
++static DECLARE_MUTEX(fairsched_mutex);
++
++/*********************************************************************/
++/*
++ * Small helper routines
++ */
++/*********************************************************************/
++
++/* this didn't prove to be very valuable statistics... */
++#define fairsched_inc_ve_strv(node, cycles) do {} while(0)
++#define fairsched_dec_ve_strv(node, cycles) do {} while(0)
++
++/*********************************************************************/
++/*
++ * Runlist management
++ */
++/*********************************************************************/
++
++/*
++ * Returns the start_tag of the first runnable node, or 0.
++ */
++static inline fschtag_t virtual_time(void)
++{
++ struct fairsched_node *p;
++
++ if (!list_empty(&fairsched_running_head)) {
++ p = list_first_entry(&fairsched_running_head,
++ struct fairsched_node, runlist);
++ return p->start_tag;
++ }
++ return FSCHTAG_ZERO;
++}
++
++static void fairsched_recompute_max_latency(void)
++{
++ struct fairsched_node *p;
++ unsigned w;
++ fschtag_t tag;
++
++ w = FSCHWEIGHT_MAX;
++ list_for_each_entry(p, &fairsched_node_head, nodelist) {
++ if (p->weight < w)
++ w = p->weight;
++ }
++ tag = FSCHTAG_ZERO;
++ (void) FSCHTAG_DADD(&tag, TICK_DUR,
++ fairsched_nr_cpus * fairsched_max_latency * w);
++ max_latency = tag;
++}
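Spelled out, the bound computed above is (with w the smallest configured weight on the node list)

\[
\texttt{max\_latency} \;=\; \texttt{cycles\_per\_jiffy} \times \texttt{fairsched\_nr\_cpus} \times \texttt{fairsched\_max\_latency} \times \min_i w_i ,
\]

and fairsched_running_insert() later uses it to clamp a waking node's start tag to at most virtual_time() plus this value.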
++
++static void fairsched_reset_start_tags(void)
++{
++ struct fairsched_node *cnode;
++ fschtag_t min_tag;
++
++ min_tag = virtual_time();
++ list_for_each_entry(cnode, &fairsched_node_head, nodelist) {
++ if (FSCHTAG_CMP(cnode->start_tag, min_tag) > 0)
++ cnode->start_tag = FSCHTAG_SUB(cnode->start_tag,
++ min_tag);
++ else
++ cnode->start_tag = FSCHTAG_ZERO;
++ }
++}
++
++static void fairsched_running_insert(struct fairsched_node *node)
++{
++ struct list_head *tmp;
++ struct fairsched_node *p;
++ fschtag_t start_tag_max;
++
++ if (!list_empty(&fairsched_running_head)) {
++ start_tag_max = virtual_time();
++ if (!FSCHTAG_ADD(&start_tag_max, max_latency) &&
++ FSCHTAG_CMP(start_tag_max, node->start_tag) < 0)
++ node->start_tag = start_tag_max;
++ }
++
++ list_for_each(tmp, &fairsched_running_head) {
++ p = list_entry(tmp, struct fairsched_node, runlist);
++ if (FSCHTAG_CMP(node->start_tag, p->start_tag) <= 0)
++ break;
++ }
++ /* insert node just before tmp */
++ list_add_tail(&node->runlist, tmp);
++}
++
++static inline void fairsched_running_insert_fromsleep(
++ struct fairsched_node *node)
++{
++ node->start_tag = FSCHTAG_MAX(node->start_tag, virtual_time());
++ fairsched_running_insert(node);
++}
++
++
++/*********************************************************************/
++/*
++ * CPU limiting helper functions
++ *
++ * These functions compute rates and delays, manipulate the sleep
++ * lists, and so on.
++ */
++/*********************************************************************/
++
++/*
++ * Insert a node into the list of nodes removed from scheduling,
++ * sorted by the time at which the node is allowed to run,
++ * historically called `delay'.
++ */
++static void fairsched_delayed_insert(struct fairsched_node *node)
++{
++ struct fairsched_node *p;
++ struct list_head *tmp;
++
++ list_for_each(tmp, &fairsched_delayed_head) {
++ p = list_entry(tmp, struct fairsched_node,
++ runlist);
++ if (CYCLES_AFTER(p->delay, node->delay))
++ break;
++ }
++ /* insert node just before tmp */
++ list_add_tail(&node->runlist, tmp);
++}
++
++static inline void nodevalue_add(struct fairsched_node *node,
++ fschdur_t duration, unsigned rate)
++{
++ FSCHVALUE_DADD(&node->value, duration, rate);
++ if (FSCHVALUE_CMP(node->value, max_value) > 0)
++ node->value = max_value;
++}
++
++/*
++ * The node has been selected to run.
++ * This function accounts in advance for the time that the node will run.
++ * The advance not used by the node will be credited back.
++ */
++static void fairsched_ratelimit_charge_advance(
++ struct fairsched_node *node,
++ cycles_t time)
++{
++ fsch_assert(!node->delayed);
++ fsch_assert(FSCHVALUE_CMP(node->value, TICK_VALUE) >= 0);
++
++ /*
++ * Account for the time passed since last update.
++ * It might be needed if the node has become runnable because of
++ * a wakeup, but hasn't gone through other functions updating
++ * the bucket value.
++ */
++ if (CYCLES_AFTER(time, node->last_updated_at)) {
++ nodevalue_add(node, FSCHDURATION(time, node->last_updated_at),
++ node->rate);
++ node->last_updated_at = time;
++ }
++
++ /* charge for the full tick the node might be running */
++ node->value = FSCHVALUE_SUB(node->value, TICK_VALUE);
++ if (FSCHVALUE_CMP(node->value, TICK_VALUE) < 0) {
++ list_del(&node->runlist);
++ node->delayed = 1;
++ node->delay = node->last_updated_at + FSCHVALUE_TO_DELAY(
++ FSCHVALUE_SUB(TICK_VALUE, node->value),
++ node->rate);
++ node->nr_ready = 0;
++ fairsched_delayed_insert(node);
++ }
++}
++
++static void fairsched_ratelimit_credit_unused(
++ struct fairsched_node *node,
++ cycles_t time, fschdur_t duration)
++{
++ /* account for the time passed since last update */
++ if (CYCLES_AFTER(time, node->last_updated_at)) {
++ nodevalue_add(node, FSCHDURATION(time, node->last_updated_at),
++ node->rate);
++ node->last_updated_at = time;
++ }
++
++ /*
++ * When the node was given this CPU, it was charged for 1 tick.
++ * Credit back the unused time.
++ */
++ if (FSCHDUR_CMP(duration, TICK_DUR) < 0)
++ nodevalue_add(node, FSCHDUR_SUB(TICK_DUR, duration),
++ 1 << FSCHRATE_SHIFT);
++
++ /* check if the node is allowed to run */
++ if (FSCHVALUE_CMP(node->value, TICK_VALUE) < 0) {
++ /*
++ * The node was delayed and remains so.
++ * But since the bucket value has been updated,
++ * update the delay time and move the node within the list.
++ */
++ fsch_assert(node->delayed);
++ node->delay = node->last_updated_at + FSCHVALUE_TO_DELAY(
++ FSCHVALUE_SUB(TICK_VALUE, node->value),
++ node->rate);
++ } else if (node->delayed) {
++ /*
++ * The node was delayed, but now it is allowed to run.
++ * We do not manipulate the lists here; that is done by the
++ * caller.
++ */
++ node->nr_ready = node->nr_runnable;
++ node->delayed = 0;
++ }
++}
++
++static void fairsched_delayed_wake(cycles_t time)
++{
++ struct fairsched_node *p;
++
++ while (!list_empty(&fairsched_delayed_head)) {
++ p = list_entry(fairsched_delayed_head.next,
++ struct fairsched_node,
++ runlist);
++ if (CYCLES_AFTER(p->delay, time))
++ break;
++
++ /* ok, the delay period is completed */
++ /* account for the time passed since last update */
++ if (CYCLES_AFTER(time, p->last_updated_at)) {
++ nodevalue_add(p, FSCHDURATION(time, p->last_updated_at),
++ p->rate);
++ p->last_updated_at = time;
++ }
++
++ fsch_assert(FSCHVALUE_CMP(p->value, TICK_VALUE) >= 0);
++ p->nr_ready = p->nr_runnable;
++ p->delayed = 0;
++ list_del_init(&p->runlist);
++ if (p->nr_ready)
++ fairsched_running_insert_fromsleep(p);
++ }
++}
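The three routines above implement a fairly standard token bucket: value earns duration × rate tokens for each stretch of elapsed cycles (capped at max_value), a node is charged a full tick in advance when it is given a CPU, the unused part of that tick is credited back, and a node whose bucket drops below one tick is parked on the delayed list until it refills. The stand-alone user-space sketch below models that behaviour; SHIFT, CYC_PER_TICK, the burst size and the half-CPU rate are illustrative assumptions, not the kernel's real values.

/* tb_model.c -- toy model of the fairsched token-bucket limiter.
 * Build: cc -o tb_model tb_model.c && ./tb_model
 */
#include <stdio.h>

#define SHIFT         8ULL                     /* plays the role of FSCHRATE_SHIFT */
#define CYC_PER_TICK  1000ULL                  /* pretend cycles per jiffy          */
#define TICK_VALUE    (CYC_PER_TICK << SHIFT)  /* cost of one tick of CPU time      */

struct node {
        unsigned long long value;      /* bucket fill, scaled by 2^SHIFT            */
        unsigned long long max_value;  /* burst size                                */
        unsigned long long rate;       /* tokens per cycle; 1 << SHIFT == one CPU   */
        int delayed;                   /* parked until the bucket refills           */
};

/* earn tokens for `cycles` of elapsed wall-clock time, capped at max_value */
static void refill(struct node *n, unsigned long long cycles)
{
        n->value += cycles * n->rate;
        if (n->value > n->max_value)
                n->value = n->max_value;
}

/* charge a full tick in advance when the node is given a CPU */
static void charge_tick(struct node *n)
{
        n->value -= TICK_VALUE;
        n->delayed = (n->value < TICK_VALUE);
}

/* credit back the part of the advance the node did not actually use */
static void credit_unused(struct node *n, unsigned long long used_cycles)
{
        if (used_cycles < CYC_PER_TICK)
                n->value += (CYC_PER_TICK - used_cycles) << SHIFT;
        if (n->value > n->max_value)
                n->value = n->max_value;
        if (n->value >= TICK_VALUE)
                n->delayed = 0;
}

int main(void)
{
        /* a node limited to half a CPU with a two-tick burst */
        struct node n = {
                .value     = 2 * TICK_VALUE,
                .max_value = 2 * TICK_VALUE,
                .rate      = 1ULL << (SHIFT - 1),
                .delayed   = 0,
        };
        unsigned long long t, ran = 0;

        for (t = 0; t < 20; t++) {              /* 20 ticks of wall-clock time */
                refill(&n, CYC_PER_TICK);
                if (n.delayed && n.value >= TICK_VALUE)
                        n.delayed = 0;          /* fairsched_delayed_wake() analogue */
                if (!n.delayed) {
                        charge_tick(&n);
                        credit_unused(&n, CYC_PER_TICK);  /* it used the whole tick */
                        ran++;
                }
        }
        printf("ran %llu of 20 ticks (about half, plus the initial burst)\n", ran);
        return 0;
}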
++
++static struct fairsched_node *fairsched_find(unsigned int id);
++
++void fairsched_cpu_online_map(int id, cpumask_t *mask)
++{
++ /* FIXME - obtain real map */
++ *mask = cpu_online_map;
++#if 0
++ struct fairsched_node *node;
++
++ down(&fairsched_mutex);
++ node = fairsched_find(id);
++ if (node == NULL)
++ *mask = CPU_MASK_NONE;
++ else
++ vsched_cpu_online_map(node->vsched, mask);
++ up(&fairsched_mutex);
++#endif
++}
++
++/*********************************************************************/
++/*
++ * The heart of the algorithm:
++ * fairsched_incrun, fairsched_decrun, fairsched_schedule
++ *
++ * Note: old property nr_ready >= nr_pcpu doesn't hold anymore.
++ * However, nr_runnable, nr_ready and delayed are maintained in sync.
++ */
++/*********************************************************************/
++
++/*
++ * Called on a wakeup inside the node.
++ */
++void fairsched_incrun(struct fairsched_node *node)
++{
++ if (!node->delayed && !node->nr_ready++)
++ /* the node wasn't on the running list, insert */
++ fairsched_running_insert_fromsleep(node);
++ node->nr_runnable++;
++}
++
++/*
++ * Called from inside schedule() when a sleeping state is entered.
++ */
++void fairsched_decrun(struct fairsched_node *node)
++{
++ if (!node->delayed && !--node->nr_ready)
++ /* nr_ready changed 1->0, remove from the running list */
++ list_del_init(&node->runlist);
++ --node->nr_runnable;
++}
++
++void fairsched_inccpu(struct fairsched_node *node)
++{
++ node->nr_pcpu++;
++ fairsched_dec_ve_strv(node, cycles);
++}
++
++static inline void __fairsched_deccpu(struct fairsched_node *node)
++{
++ node->nr_pcpu--;
++ fairsched_inc_ve_strv(node, cycles);
++}
++
++void fairsched_deccpu(struct fairsched_node *node)
++{
++ if (node == &fairsched_idle_node)
++ return;
++
++ __fairsched_deccpu(node);
++}
++
++static void fairsched_account(struct fairsched_node *node,
++ cycles_t time)
++{
++ fschdur_t duration;
++
++ duration = FSCHDURATION(time, __get_cpu_var(prev_schedule));
++#ifdef CONFIG_VE
++ CYCLES_DADD(&node->owner_env->cpu_used_ve, duration);
++#endif
++
++ /*
++ * The duration is not greater than TICK_DUR since
++ * task->need_resched is always 1.
++ */
++ if (FSCHTAG_DADD(&node->start_tag, duration, node->weight)) {
++ fairsched_reset_start_tags();
++ (void) FSCHTAG_DADD(&node->start_tag, duration,
++ node->weight);
++ }
++
++ list_del_init(&node->runlist);
++ if (node->rate_limited)
++ fairsched_ratelimit_credit_unused(node, time, duration);
++ if (!node->delayed) {
++ if (node->nr_ready)
++ fairsched_running_insert(node);
++ } else
++ fairsched_delayed_insert(node);
++}
++
++/*
++ * Scheduling decision
++ *
++ * Updates CPU usage for the node releasing the CPU and selects a new node.
++ */
++struct fairsched_node *fairsched_schedule(
++ struct fairsched_node *prev_node,
++ struct fairsched_node *cur_node,
++ int cur_node_active,
++ cycles_t time)
++{
++ struct fairsched_node *p;
++
++ if (prev_node != &fairsched_idle_node)
++ fairsched_account(prev_node, time);
++ __get_cpu_var(prev_schedule) = time;
++
++ fairsched_delayed_wake(time);
++
++ list_for_each_entry(p, &fairsched_running_head, runlist) {
++ if (p->nr_pcpu < p->nr_ready ||
++ (cur_node_active && p == cur_node)) {
++ if (p->rate_limited)
++ fairsched_ratelimit_charge_advance(p, time);
++ return p;
++ }
++ }
++ return NULL;
++}
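Taken together, fairsched_account() and the sorted running list implement classic start-tag (virtual-time) scheduling: when a node has consumed Δ cycles its tag advances by Δ·w (the FSCHTAG_DADD above), and the CPU is handed to the first eligible node in tag order:

\[
\texttt{start\_tag}_i \;\leftarrow\; \texttt{start\_tag}_i + \Delta_i \, w_i ,
\qquad
\text{next} \;=\; \arg\min_{i\ \text{runnable}} \texttt{start\_tag}_i .
\]

A node's long-run CPU share is therefore inversely proportional to its weight, and virtual_time() (the tag at the head of the sorted list) is the reference point against which sleepers are re-inserted.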
++
++
++/*********************************************************************/
++/*
++ * System calls
++ *
++ * All do_xxx functions are called under fairsched semaphore and after
++ * capability check.
++ *
++ * The binary interfaces follow some other Fair Scheduler implementations
++ * (although some system call arguments are not needed for our implementation).
++ */
++/*********************************************************************/
++
++static struct fairsched_node *fairsched_find(unsigned int id)
++{
++ struct fairsched_node *p;
++
++ list_for_each_entry(p, &fairsched_node_head, nodelist) {
++ if (p->id == id)
++ return p;
++ }
++ return NULL;
++}
++
++static int do_fairsched_mknod(unsigned int parent, unsigned int weight,
++ unsigned int newid)
++{
++ struct fairsched_node *node;
++ int retval;
++
++ retval = -EINVAL;
++ if (weight < 1 || weight > FSCHWEIGHT_MAX)
++ goto out;
++ if (newid < 0 || newid > INT_MAX)
++ goto out;
++
++ retval = -EBUSY;
++ if (fairsched_find(newid) != NULL)
++ goto out;
++
++ retval = -ENOMEM;
++ node = kmalloc(sizeof(*node), GFP_KERNEL);
++ if (node == NULL)
++ goto out;
++
++ memset(node, 0, sizeof(*node));
++ node->weight = weight;
++ INIT_LIST_HEAD(&node->runlist);
++ node->id = newid;
++#ifdef CONFIG_VE
++ node->owner_env = get_exec_env();
++#endif
++
++ spin_lock_irq(&fairsched_lock);
++ list_add(&node->nodelist, &fairsched_node_head);
++ fairsched_nr_nodes++;
++ fairsched_recompute_max_latency();
++ spin_unlock_irq(&fairsched_lock);
++
++ retval = newid;
++out:
++ return retval;
++}
++
++asmlinkage int sys_fairsched_mknod(unsigned int parent, unsigned int weight,
++ unsigned int newid)
++{
++ int retval;
++
++ if (!capable(CAP_SETVEID))
++ return -EPERM;
++
++ down(&fairsched_mutex);
++ retval = do_fairsched_mknod(parent, weight, newid);
++ up(&fairsched_mutex);
++
++ return retval;
++}
++EXPORT_SYMBOL(sys_fairsched_mknod);
++
++static int do_fairsched_rmnod(unsigned int id)
++{
++ struct fairsched_node *node;
++ int retval;
++
++ retval = -EINVAL;
++ node = fairsched_find(id);
++ if (node == NULL)
++ goto out;
++ if (node == &fairsched_init_node)
++ goto out;
++
++ retval = vsched_destroy(node->vsched);
++ if (retval)
++ goto out;
++
++ spin_lock_irq(&fairsched_lock);
++ list_del(&node->runlist); /* required for delayed nodes */
++ list_del(&node->nodelist);
++ fairsched_nr_nodes--;
++ fairsched_recompute_max_latency();
++ spin_unlock_irq(&fairsched_lock);
++
++ kfree(node);
++ retval = 0;
++out:
++ return retval;
++}
++
++asmlinkage int sys_fairsched_rmnod(unsigned int id)
++{
++ int retval;
++
++ if (!capable(CAP_SETVEID))
++ return -EPERM;
++
++ down(&fairsched_mutex);
++ retval = do_fairsched_rmnod(id);
++ up(&fairsched_mutex);
++
++ return retval;
++}
++EXPORT_SYMBOL(sys_fairsched_rmnod);
++
++int do_fairsched_chwt(unsigned int id, unsigned weight)
++{
++ struct fairsched_node *node;
++
++ if (id == 0)
++ return -EINVAL;
++ if (weight < 1 || weight > FSCHWEIGHT_MAX)
++ return -EINVAL;
++
++ node = fairsched_find(id);
++ if (node == NULL)
++ return -ENOENT;
++
++ spin_lock_irq(&fairsched_lock);
++ node->weight = weight;
++ fairsched_recompute_max_latency();
++ spin_unlock_irq(&fairsched_lock);
++
++ return 0;
++}
++
++asmlinkage int sys_fairsched_chwt(unsigned int id, unsigned weight)
++{
++ int retval;
++
++ if (!capable(CAP_SETVEID))
++ return -EPERM;
++
++ down(&fairsched_mutex);
++ retval = do_fairsched_chwt(id, weight);
++ up(&fairsched_mutex);
++
++ return retval;
++}
++
++int do_fairsched_rate(unsigned int id, int op, unsigned rate)
++{
++ struct fairsched_node *node;
++ cycles_t time;
++ int retval;
++
++ if (id == 0)
++ return -EINVAL;
++ if (op == 0 && (rate < 1 || rate >= (1UL << 31)))
++ return -EINVAL;
++
++ node = fairsched_find(id);
++ if (node == NULL)
++ return -ENOENT;
++
++ retval = -EINVAL;
++ spin_lock_irq(&fairsched_lock);
++ time = get_cycles();
++ switch (op) {
++ case 0:
++ node->rate = rate;
++ if (node->rate > (fairsched_nr_cpus << FSCHRATE_SHIFT))
++ node->rate =
++ fairsched_nr_cpus << FSCHRATE_SHIFT;
++ node->rate_limited = 1;
++ node->value = max_value;
++ if (node->delayed) {
++ list_del(&node->runlist);
++ node->delay = time;
++ fairsched_delayed_insert(node);
++ node->last_updated_at = time;
++ fairsched_delayed_wake(time);
++ }
++ retval = node->rate;
++ break;
++ case 1:
++ node->rate = 0; /* This assignment is not needed
++ for the kernel code, and it should
++ not rely on rate being 0 when it's
++ unset. This is a band-aid for some
++ existing tools (don't know which one
++ exactly). --SAW */
++ node->rate_limited = 0;
++ node->value = max_value;
++ if (node->delayed) {
++ list_del(&node->runlist);
++ node->delay = time;
++ fairsched_delayed_insert(node);
++ node->last_updated_at = time;
++ fairsched_delayed_wake(time);
++ }
++ retval = 0;
++ break;
++ case 2:
++ if (node->rate_limited)
++ retval = node->rate;
++ else
++ retval = -ENODATA;
++ break;
++ }
++ spin_unlock_irq(&fairsched_lock);
++
++ return retval;
++}
++
++asmlinkage int sys_fairsched_rate(unsigned int id, int op, unsigned rate)
++{
++ int retval;
++
++ if (!capable(CAP_SETVEID))
++ return -EPERM;
++
++ down(&fairsched_mutex);
++ retval = do_fairsched_rate(id, op, rate);
++ up(&fairsched_mutex);
++
++ return retval;
++}
++
++/*
++ * Called under fairsched_mutex.
++ */
++static int __do_fairsched_mvpr(struct task_struct *p,
++ struct fairsched_node *node)
++{
++ int retval;
++
++ if (node->vsched == NULL) {
++ retval = vsched_create(node->id, node);
++ if (retval < 0)
++ return retval;
++ }
++
++ /* no need to destroy vsched in case of mvpr failure */
++ return vsched_mvpr(p, node->vsched);
++}
++
++int do_fairsched_mvpr(pid_t pid, unsigned int nodeid)
++{
++ struct task_struct *p;
++ struct fairsched_node *node;
++ int retval;
++
++ retval = -ENOENT;
++ node = fairsched_find(nodeid);
++ if (node == NULL)
++ goto out;
++
++ read_lock(&tasklist_lock);
++ retval = -ESRCH;
++ p = find_task_by_pid_all(pid);
++ if (p == NULL)
++ goto out_unlock;
++ get_task_struct(p);
++ read_unlock(&tasklist_lock);
++
++ retval = __do_fairsched_mvpr(p, node);
++ put_task_struct(p);
++ return retval;
++
++out_unlock:
++ read_unlock(&tasklist_lock);
++out:
++ return retval;
++}
++
++asmlinkage int sys_fairsched_mvpr(pid_t pid, unsigned int nodeid)
++{
++ int retval;
++
++ if (!capable(CAP_SETVEID))
++ return -EPERM;
++
++ down(&fairsched_mutex);
++ retval = do_fairsched_mvpr(pid, nodeid);
++ up(&fairsched_mutex);
++
++ return retval;
++}
++EXPORT_SYMBOL(sys_fairsched_mvpr);
++
++
++/*********************************************************************/
++/*
++ * proc interface
++ */
++/*********************************************************************/
++
++struct fairsched_node_dump {
++#ifdef CONFIG_VE
++ envid_t veid;
++#endif
++ int id;
++ unsigned weight;
++ unsigned rate;
++ unsigned rate_limited : 1,
++ delayed : 1;
++ fschtag_t start_tag;
++ fschvalue_t value;
++ cycles_t delay;
++ int nr_ready;
++ int nr_runnable;
++ int nr_pcpu;
++ int nr_tasks, nr_runtasks;
++};
++
++struct fairsched_dump {
++ int len, compat;
++ struct fairsched_node_dump nodes[0];
++};
++
++static struct fairsched_dump *fairsched_do_dump(int compat)
++{
++ int nr_nodes;
++ int len, i;
++ struct fairsched_dump *dump;
++ struct fairsched_node *node;
++ struct fairsched_node_dump *p;
++ unsigned long flags;
++
++start:
++ nr_nodes = (ve_is_super(get_exec_env()) ? fairsched_nr_nodes + 16 : 1);
++ len = sizeof(*dump) + nr_nodes * sizeof(dump->nodes[0]);
++ dump = ub_vmalloc(len);
++ if (dump == NULL)
++ goto out;
++
++ spin_lock_irqsave(&fairsched_lock, flags);
++ if (ve_is_super(get_exec_env()) && nr_nodes < fairsched_nr_nodes)
++ goto repeat;
++ p = dump->nodes;
++ list_for_each_entry_reverse(node, &fairsched_node_head, nodelist) {
++ if ((char *)p - (char *)dump >= len)
++ break;
++ p->nr_tasks = 0;
++ p->nr_runtasks = 0;
++#ifdef CONFIG_VE
++ if (!ve_accessible(node->owner_env, get_exec_env()))
++ continue;
++ p->veid = node->owner_env->veid;
++ if (compat) {
++ p->nr_tasks = atomic_read(&node->owner_env->pcounter);
++ for (i = 0; i < NR_CPUS; i++)
++ p->nr_runtasks +=
++ VE_CPU_STATS(node->owner_env, i)
++ ->nr_running;
++ if (p->nr_runtasks < 0)
++ p->nr_runtasks = 0;
++ }
++#endif
++ p->id = node->id;
++ p->weight = node->weight;
++ p->rate = node->rate;
++ p->rate_limited = node->rate_limited;
++ p->delayed = node->delayed;
++ p->start_tag = node->start_tag;
++ p->value = node->value;
++ p->delay = node->delay;
++ p->nr_ready = node->nr_ready;
++ p->nr_runnable = node->nr_runnable;
++ p->nr_pcpu = node->nr_pcpu;
++ p++;
++ }
++ dump->len = p - dump->nodes;
++ dump->compat = compat;
++ spin_unlock_irqrestore(&fairsched_lock, flags);
++
++out:
++ return dump;
++
++repeat:
++ spin_unlock_irqrestore(&fairsched_lock, flags);
++ vfree(dump);
++ goto start;
++}
++
++#define FAIRSCHED_PROC_HEADLINES 2
++
++#if defined(CONFIG_VE)
++/*
++ * File format is dictated by compatibility reasons.
++ */
++static int fairsched_seq_show(struct seq_file *m, void *v)
++{
++ struct fairsched_dump *dump;
++ struct fairsched_node_dump *p;
++ unsigned vid, nid, pid, r;
++
++ dump = m->private;
++ p = (struct fairsched_node_dump *)((unsigned long)v & ~3UL);
++ if (p - dump->nodes < FAIRSCHED_PROC_HEADLINES) {
++ if (p == dump->nodes)
++ seq_printf(m, "Version: 2.6 debug\n");
++ else if (p == dump->nodes + 1)
++ seq_printf(m,
++ " veid "
++ " id "
++ " parent "
++ "weight "
++ " rate "
++ "tasks "
++ " run "
++ "cpus"
++ " "
++ "flg "
++ "ready "
++ " start_tag "
++ " value "
++ " delay"
++ "\n");
++ } else {
++ p -= FAIRSCHED_PROC_HEADLINES;
++ vid = nid = pid = 0;
++ r = (unsigned long)v & 3;
++ if (p == dump->nodes) {
++ if (r == 2)
++ nid = p->id;
++ } else {
++ if (!r)
++ nid = p->id;
++ else if (r == 1)
++ vid = pid = p->id;
++ else
++ vid = p->id, nid = 1;
++ }
++ seq_printf(m,
++ "%10u "
++ "%10u %10u %6u %5u %5u %5u %4u"
++ " "
++ " %c%c %5u %20Lu %20Lu %20Lu"
++ "\n",
++ vid,
++ nid,
++ pid,
++ p->weight,
++ p->rate,
++ p->nr_tasks,
++ p->nr_runtasks,
++ p->nr_pcpu,
++ p->rate_limited ? 'L' : '.',
++ p->delayed ? 'D' : '.',
++ p->nr_ready,
++ p->start_tag.t,
++ p->value.v,
++ p->delay
++ );
++ }
++
++ return 0;
++}
++
++static void *fairsched_seq_start(struct seq_file *m, loff_t *pos)
++{
++ struct fairsched_dump *dump;
++ unsigned long l;
++
++ dump = m->private;
++ if (*pos >= dump->len * 3 - 1 + FAIRSCHED_PROC_HEADLINES)
++ return NULL;
++ if (*pos < FAIRSCHED_PROC_HEADLINES)
++ return dump->nodes + *pos;
++ /* guess why... */
++ l = (unsigned long)(dump->nodes +
++ ((unsigned long)*pos + FAIRSCHED_PROC_HEADLINES * 2 + 1) / 3);
++ l |= ((unsigned long)*pos + FAIRSCHED_PROC_HEADLINES * 2 + 1) % 3;
++ return (void *)l;
++}
++static void *fairsched_seq_next(struct seq_file *m, void *v, loff_t *pos)
++{
++ ++*pos;
++ return fairsched_seq_start(m, pos);
++}
++#endif
++
++static int fairsched2_seq_show(struct seq_file *m, void *v)
++{
++ struct fairsched_dump *dump;
++ struct fairsched_node_dump *p;
++
++ dump = m->private;
++ p = v;
++ if (p - dump->nodes < FAIRSCHED_PROC_HEADLINES) {
++ if (p == dump->nodes)
++ seq_printf(m, "Version: 2.7" FAIRSHED_DEBUG "\n");
++ else if (p == dump->nodes + 1)
++ seq_printf(m,
++ " id "
++ "weight "
++ " rate "
++ " run "
++ "cpus"
++#ifdef FAIRSHED_DEBUG
++ " "
++ "flg "
++ "ready "
++ " start_tag "
++ " value "
++ " delay"
++#endif
++ "\n");
++ } else {
++ p -= FAIRSCHED_PROC_HEADLINES;
++ seq_printf(m,
++ "%10u %6u %5u %5u %4u"
++#ifdef FAIRSHED_DEBUG
++ " "
++ " %c%c %5u %20Lu %20Lu %20Lu"
++#endif
++ "\n",
++ p->id,
++ p->weight,
++ p->rate,
++ p->nr_runnable,
++ p->nr_pcpu
++#ifdef FAIRSHED_DEBUG
++ ,
++ p->rate_limited ? 'L' : '.',
++ p->delayed ? 'D' : '.',
++ p->nr_ready,
++ p->start_tag.t,
++ p->value.v,
++ p->delay
++#endif
++ );
++ }
++
++ return 0;
++}
++
++static void *fairsched2_seq_start(struct seq_file *m, loff_t *pos)
++{
++ struct fairsched_dump *dump;
++
++ dump = m->private;
++ if (*pos >= dump->len + FAIRSCHED_PROC_HEADLINES)
++ return NULL;
++ return dump->nodes + *pos;
++}
++static void *fairsched2_seq_next(struct seq_file *m, void *v, loff_t *pos)
++{
++ ++*pos;
++ return fairsched2_seq_start(m, pos);
++}
++static void fairsched2_seq_stop(struct seq_file *m, void *v)
++{
++}
++
++#ifdef CONFIG_VE
++static struct seq_operations fairsched_seq_op = {
++ .start = fairsched_seq_start,
++ .next = fairsched_seq_next,
++ .stop = fairsched2_seq_stop,
++ .show = fairsched_seq_show
++};
++#endif
++static struct seq_operations fairsched2_seq_op = {
++ .start = fairsched2_seq_start,
++ .next = fairsched2_seq_next,
++ .stop = fairsched2_seq_stop,
++ .show = fairsched2_seq_show
++};
++static int fairsched_seq_open(struct inode *inode, struct file *file)
++{
++ int ret;
++ struct seq_file *m;
++ int compat;
++
++#ifdef CONFIG_VE
++ compat = (file->f_dentry->d_name.len == sizeof("fairsched") - 1);
++ ret = seq_open(file, compat ? &fairsched_seq_op : &fairsched2_seq_op);
++#else
++ compat = 0;
++ ret = seq_open(file, &fairsched2_seq_op);
++#endif
++ if (ret)
++ return ret;
++ m = file->private_data;
++ m->private = fairsched_do_dump(compat);
++ if (m->private == NULL) {
++ seq_release(inode, file);
++ ret = -ENOMEM;
++ }
++ return ret;
++}
++static int fairsched_seq_release(struct inode *inode, struct file *file)
++{
++ struct seq_file *m;
++ struct fairsched_dump *dump;
++
++ m = file->private_data;
++ dump = m->private;
++ m->private = NULL;
++ vfree(dump);
++ seq_release(inode, file);
++ return 0;
++}
++static struct file_operations proc_fairsched_operations = {
++ .open = fairsched_seq_open,
++ .read = seq_read,
++ .llseek = seq_lseek,
++ .release = fairsched_seq_release
++};
++
++
++/*********************************************************************/
++/*
++ * Fairsched initialization
++ */
++/*********************************************************************/
++
++int fsch_sysctl_latency(ctl_table *ctl, int write, struct file *filp,
++ void *buffer, size_t *lenp, loff_t *ppos)
++{
++ int *valp = ctl->data;
++ int val = *valp;
++ int ret;
++
++ ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
++
++ if (!write || *valp == val)
++ return ret;
++
++ spin_lock_irq(&fairsched_lock);
++ fairsched_recompute_max_latency();
++ spin_unlock_irq(&fairsched_lock);
++ return ret;
++}
++
++static void fairsched_calibrate(void)
++{
++ fairsched_nr_cpus = num_online_cpus();
++ max_value = FSCHVALUE(cycles_per_jiffy * (fairsched_nr_cpus + 1));
++}
++
++void __init fairsched_init_early(void)
++{
++ list_add(&fairsched_init_node.nodelist, &fairsched_node_head);
++ fairsched_nr_nodes++;
++}
++
++/*
++ * Note: this function is executed late in the initialization sequence.
++ * We ourselves need calibrated cycles and initialized procfs...
++ * The consequence of this late initialization is that start tags are
++ * effectively ignored and each node preempts others on insertion.
++ * But it isn't a problem (only the init node can be runnable).
++ */
++void __init fairsched_init_late(void)
++{
++ struct proc_dir_entry *entry;
++
++ if (get_cycles() == 0)
++ panic("FAIRSCHED: no TSC!\n");
++ fairsched_calibrate();
++ fairsched_recompute_max_latency();
++
++ entry = create_proc_glob_entry("fairsched", S_IRUGO, NULL);
++ if (entry)
++ entry->proc_fops = &proc_fairsched_operations;
++ entry = create_proc_glob_entry("fairsched2", S_IRUGO, NULL);
++ if (entry)
++ entry->proc_fops = &proc_fairsched_operations;
++}
++
++
++#else /* CONFIG_FAIRSCHED */
++
++
++/*********************************************************************/
++/*
++ * No Fairsched
++ */
++/*********************************************************************/
++
++asmlinkage int sys_fairsched_mknod(unsigned int parent, unsigned int weight,
++ unsigned int newid)
++{
++ return -ENOSYS;
++}
++
++asmlinkage int sys_fairsched_rmnod(unsigned int id)
++{
++ return -ENOSYS;
++}
++
++asmlinkage int sys_fairsched_chwt(unsigned int id, unsigned int weight)
++{
++ return -ENOSYS;
++}
++
++asmlinkage int sys_fairsched_mvpr(pid_t pid, unsigned int nodeid)
++{
++ return -ENOSYS;
++}
++
++asmlinkage int sys_fairsched_rate(unsigned int id, int op, unsigned rate)
++{
++ return -ENOSYS;
++}
++
++void __init fairsched_init_late(void)
++{
++}
++
++#endif /* CONFIG_FAIRSCHED */
+diff -upr linux-2.6.16.orig/kernel/fork.c linux-2.6.16-026test015/kernel/fork.c
+--- linux-2.6.16.orig/kernel/fork.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/kernel/fork.c 2006-07-04 14:41:39.000000000 +0400
+@@ -20,6 +20,7 @@
+ #include <linux/vmalloc.h>
+ #include <linux/completion.h>
+ #include <linux/namespace.h>
++#include <linux/file.h>
+ #include <linux/personality.h>
+ #include <linux/mempolicy.h>
+ #include <linux/sem.h>
+@@ -52,11 +53,15 @@
+ #include <asm/cacheflush.h>
+ #include <asm/tlbflush.h>
+
++#include <ub/ub_vmpages.h>
++#include <ub/ub_misc.h>
++
+ /*
+ * Protected counters by write_lock_irq(&tasklist_lock)
+ */
+ unsigned long total_forks; /* Handle normal Linux uptimes. */
+ int nr_threads; /* The idle threads do not count.. */
++EXPORT_SYMBOL(nr_threads);
+
+ int max_threads; /* tunable limit on nr_threads */
+
+@@ -103,6 +108,7 @@ static kmem_cache_t *mm_cachep;
+
+ void free_task(struct task_struct *tsk)
+ {
++ ub_task_uncharge(tsk);
+ free_thread_info(tsk->thread_info);
+ free_task_struct(tsk);
+ }
+@@ -122,9 +128,14 @@ void __put_task_struct_cb(struct rcu_hea
+ free_uid(tsk->user);
+ put_group_info(tsk->group_info);
+
++#ifdef CONFIG_VE
++ put_ve(VE_TASK_INFO(tsk)->owner_env);
++ atomic_dec(&nr_dead);
++#endif
+ if (!profile_handoff_task(tsk))
+ free_task(tsk);
+ }
++EXPORT_SYMBOL_GPL(__put_task_struct_cb);
+
+ void __init fork_init(unsigned long mempages)
+ {
+@@ -135,7 +146,7 @@ void __init fork_init(unsigned long memp
+ /* create a slab on which task_structs can be allocated */
+ task_struct_cachep =
+ kmem_cache_create("task_struct", sizeof(struct task_struct),
+- ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL, NULL);
++ ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_UBC, NULL, NULL);
+ #endif
+
+ /*
+@@ -166,22 +177,30 @@ static struct task_struct *dup_task_stru
+
+ tsk = alloc_task_struct();
+ if (!tsk)
+- return NULL;
++ goto out;
+
+ ti = alloc_thread_info(tsk);
+- if (!ti) {
+- free_task_struct(tsk);
+- return NULL;
+- }
++ if (!ti)
++ goto out_tsk;
+
+ *tsk = *orig;
+ tsk->thread_info = ti;
+ setup_thread_stack(tsk, orig);
+
++ if (ub_task_charge(orig, tsk))
++ goto out_ti;
++
+ /* One for us, one for whoever does the "release_task()" (usually parent) */
+ atomic_set(&tsk->usage,2);
+ atomic_set(&tsk->fs_excl, 0);
+ return tsk;
++
++out_ti:
++ free_thread_info(ti);
++out_tsk:
++ free_task_struct(tsk);
++out:
++ return NULL;
+ }
+
+ #ifdef CONFIG_MMU
+@@ -219,7 +238,12 @@ static inline int dup_mmap(struct mm_str
+ -pages);
+ continue;
+ }
++
+ charge = 0;
++ if (ub_memory_charge(mm, mpnt->vm_end - mpnt->vm_start,
++ mpnt->vm_flags & ~VM_LOCKED,
++ mpnt->vm_file, UB_HARD))
++ goto fail_noch;
+ if (mpnt->vm_flags & VM_ACCOUNT) {
+ unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
+ if (security_vm_enough_memory(len))
+@@ -238,6 +262,7 @@ static inline int dup_mmap(struct mm_str
+ tmp->vm_flags &= ~VM_LOCKED;
+ tmp->vm_mm = mm;
+ tmp->vm_next = NULL;
++ set_vma_rss(tmp, 0);
+ anon_vma_link(tmp);
+ file = tmp->vm_file;
+ if (file) {
+@@ -266,7 +291,7 @@ static inline int dup_mmap(struct mm_str
+ rb_parent = &tmp->vm_rb;
+
+ mm->map_count++;
+- retval = copy_page_range(mm, oldmm, mpnt);
++ retval = copy_page_range(mm, oldmm, tmp, mpnt);
+
+ if (tmp->vm_ops && tmp->vm_ops->open)
+ tmp->vm_ops->open(tmp);
+@@ -283,6 +308,9 @@ out:
+ fail_nomem_policy:
+ kmem_cache_free(vm_area_cachep, tmp);
+ fail_nomem:
++ ub_memory_uncharge(mm, mpnt->vm_end - mpnt->vm_start,
++ mpnt->vm_flags & ~VM_LOCKED, mpnt->vm_file);
++fail_noch:
+ retval = -ENOMEM;
+ vm_unacct_memory(charge);
+ goto out;
+@@ -313,7 +341,8 @@ static inline void mm_free_pgd(struct mm
+
+ #include <linux/init_task.h>
+
+-static struct mm_struct * mm_init(struct mm_struct * mm)
++static struct mm_struct * mm_init(struct mm_struct * mm,
++ struct task_struct *tsk)
+ {
+ atomic_set(&mm->mm_users, 1);
+ atomic_set(&mm->mm_count, 1);
+@@ -328,11 +357,14 @@ static struct mm_struct * mm_init(struct
+ mm->ioctx_list = NULL;
+ mm->free_area_cache = TASK_UNMAPPED_BASE;
+ mm->cached_hole_size = ~0UL;
++ set_mm_ub(mm, tsk);
+
+ if (likely(!mm_alloc_pgd(mm))) {
+ mm->def_flags = 0;
+ return mm;
+ }
++
++ put_mm_ub(mm);
+ free_mm(mm);
+ return NULL;
+ }
+@@ -347,10 +379,11 @@ struct mm_struct * mm_alloc(void)
+ mm = allocate_mm();
+ if (mm) {
+ memset(mm, 0, sizeof(*mm));
+- mm = mm_init(mm);
++ mm = mm_init(mm, NULL);
+ }
+ return mm;
+ }
++EXPORT_SYMBOL_GPL(mm_alloc);
+
+ /*
+ * Called when the last reference to the mm
+@@ -362,8 +395,10 @@ void fastcall __mmdrop(struct mm_struct
+ BUG_ON(mm == &init_mm);
+ mm_free_pgd(mm);
+ destroy_context(mm);
++ put_mm_ub(mm);
+ free_mm(mm);
+ }
++EXPORT_SYMBOL_GPL(__mmdrop);
+
+ /*
+ * Decrement the use count and release all resources for an mm.
+@@ -466,7 +501,7 @@ static struct mm_struct *dup_mm(struct t
+
+ memcpy(mm, oldmm, sizeof(*mm));
+
+- if (!mm_init(mm))
++ if (!mm_init(mm, tsk))
+ goto fail_nomem;
+
+ if (init_new_context(tsk, mm))
+@@ -720,7 +755,7 @@ out_release:
+ free_fdset (new_fdt->open_fds, new_fdt->max_fdset);
+ free_fd_array(new_fdt->fd, new_fdt->max_fds);
+ kmem_cache_free(files_cachep, newf);
+- goto out;
++ return NULL;
+ }
+
+ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
+@@ -896,7 +931,7 @@ asmlinkage long sys_set_tid_address(int
+ {
+ current->clear_child_tid = tidptr;
+
+- return current->pid;
++ return virt_pid(current);
+ }
+
+ /*
+@@ -913,7 +948,7 @@ static task_t *copy_process(unsigned lon
+ unsigned long stack_size,
+ int __user *parent_tidptr,
+ int __user *child_tidptr,
+- int pid)
++ int pid, long pid0)
+ {
+ int retval;
+ struct task_struct *p = NULL;
+@@ -974,12 +1009,20 @@ static task_t *copy_process(unsigned lon
+ p->did_exec = 0;
+ copy_flags(clone_flags, p);
+ p->pid = pid;
++#ifdef CONFIG_VE
++ set_virt_pid(p, alloc_vpid(p->pid, pid0 ? : -1));
++ if (virt_pid(p) < 0)
++ goto bad_fork_cleanup_module;
++#endif
+ retval = -EFAULT;
+ if (clone_flags & CLONE_PARENT_SETTID)
+- if (put_user(p->pid, parent_tidptr))
++ if (put_user(virt_pid(p), parent_tidptr))
+ goto bad_fork_cleanup;
+
+ p->proc_dentry = NULL;
++#ifdef CONFIG_VE
++ p->ve_task_info.glob_proc_dentry = NULL;
++#endif
+
+ INIT_LIST_HEAD(&p->children);
+ INIT_LIST_HEAD(&p->sibling);
+@@ -1027,8 +1070,13 @@ static task_t *copy_process(unsigned lon
+ #endif
+
+ p->tgid = p->pid;
+- if (clone_flags & CLONE_THREAD)
++ set_virt_tgid(p, virt_pid(p));
++ set_virt_pgid(p, virt_pgid(current));
++ set_virt_sid(p, virt_sid(current));
++ if (clone_flags & CLONE_THREAD) {
+ p->tgid = current->tgid;
++ set_virt_tgid(p, virt_tgid(current));
++ }
+
+ if ((retval = security_task_alloc(p)))
+ goto bad_fork_cleanup_policy;
+@@ -1111,8 +1159,8 @@ static task_t *copy_process(unsigned lon
+ */
+ p->cpus_allowed = current->cpus_allowed;
+ if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
+- !cpu_online(task_cpu(p))))
+- set_task_cpu(p, smp_processor_id());
++ !vcpu_online(task_cpu(p))))
++ set_task_cpu(p, task_cpu(current));
+
+ /*
+ * Check for pending SIGKILL! The new thread should not be allowed
+@@ -1181,6 +1229,12 @@ static task_t *copy_process(unsigned lon
+ if (unlikely(p->ptrace & PT_PTRACED))
+ __ptrace_link(p, current->parent);
+
++#ifdef CONFIG_VE
++ SET_VE_LINKS(p);
++ atomic_inc(&p->ve_task_info.owner_env->pcounter);
++ get_ve(p->ve_task_info.owner_env);
++ seqcount_init(&p->ve_task_info.wakeup_lock);
++#endif
+ if (thread_group_leader(p)) {
+ p->signal->tty = current->signal->tty;
+ p->signal->pgrp = process_group(current);
+@@ -1228,6 +1282,11 @@ bad_fork_cleanup_cpuset:
+ #endif
+ cpuset_exit(p);
+ bad_fork_cleanup:
++#ifdef CONFIG_VE
++ if (virt_pid(p) != p->pid && virt_pid(p) > 0)
++ free_vpid(virt_pid(p), get_exec_env());
++bad_fork_cleanup_module:
++#endif
+ if (p->binfmt)
+ module_put(p->binfmt->module);
+ bad_fork_cleanup_put_domain:
+@@ -1253,7 +1312,7 @@ task_t * __devinit fork_idle(int cpu)
+ task_t *task;
+ struct pt_regs regs;
+
+- task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, NULL, 0);
++ task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, NULL, 0, 0);
+ if (!task)
+ return ERR_PTR(-ENOMEM);
+ init_idle(task, cpu);
+@@ -1283,12 +1342,13 @@ static inline int fork_traceflag (unsign
+ * It copies the process, and if successful kick-starts
+ * it and waits for it to finish using the VM if required.
+ */
+-long do_fork(unsigned long clone_flags,
++long do_fork_pid(unsigned long clone_flags,
+ unsigned long stack_start,
+ struct pt_regs *regs,
+ unsigned long stack_size,
+ int __user *parent_tidptr,
+- int __user *child_tidptr)
++ int __user *child_tidptr,
++ long pid0)
+ {
+ struct task_struct *p;
+ int trace = 0;
+@@ -1302,7 +1362,8 @@ long do_fork(unsigned long clone_flags,
+ clone_flags |= CLONE_PTRACE;
+ }
+
+- p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid);
++ p = copy_process(clone_flags, stack_start, regs, stack_size,
++ parent_tidptr, child_tidptr, pid, pid0);
+ /*
+ * Do this prior waking up the new thread - the thread pointer
+ * might get invalid after that point, if the thread exits quickly.
+@@ -1310,6 +1371,7 @@ long do_fork(unsigned long clone_flags,
+ if (!IS_ERR(p)) {
+ struct completion vfork;
+
++ pid = virt_pid(p);
+ if (clone_flags & CLONE_VFORK) {
+ p->vfork_done = &vfork;
+ init_completion(&vfork);
+@@ -1330,13 +1392,18 @@ long do_fork(unsigned long clone_flags,
+
+ if (unlikely (trace)) {
+ current->ptrace_message = pid;
++ set_pn_state(current, PN_STOP_FORK);
+ ptrace_notify ((trace << 8) | SIGTRAP);
++ clear_pn_state(current);
+ }
+
+ if (clone_flags & CLONE_VFORK) {
+ wait_for_completion(&vfork);
+- if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE))
++ if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) {
++ set_pn_state(current, PN_STOP_VFORK);
+ ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
++ clear_pn_state(current);
++ }
+ }
+ } else {
+ free_pidmap(pid);
+@@ -1349,26 +1416,39 @@ long do_fork(unsigned long clone_flags,
+ #define ARCH_MIN_MMSTRUCT_ALIGN 0
+ #endif
+
++EXPORT_SYMBOL(do_fork_pid);
++
++long do_fork(unsigned long clone_flags,
++ unsigned long stack_start,
++ struct pt_regs *regs,
++ unsigned long stack_size,
++ int __user *parent_tidptr,
++ int __user *child_tidptr)
++{
++ return do_fork_pid(clone_flags, stack_start, regs, stack_size,
++ parent_tidptr, child_tidptr, 0);
++}
++
+ void __init proc_caches_init(void)
+ {
+ sighand_cachep = kmem_cache_create("sighand_cache",
+ sizeof(struct sighand_struct), 0,
+- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
++ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL, NULL);
+ signal_cachep = kmem_cache_create("signal_cache",
+ sizeof(struct signal_struct), 0,
+- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
++ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL, NULL);
+ files_cachep = kmem_cache_create("files_cache",
+ sizeof(struct files_struct), 0,
+- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
++ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL, NULL);
+ fs_cachep = kmem_cache_create("fs_cache",
+ sizeof(struct fs_struct), 0,
+- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
++ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL, NULL);
+ vm_area_cachep = kmem_cache_create("vm_area_struct",
+ sizeof(struct vm_area_struct), 0,
+ SLAB_PANIC, NULL, NULL);
+ mm_cachep = kmem_cache_create("mm_struct",
+ sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
+- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
++ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL, NULL);
+ }
+
+
+diff -upr linux-2.6.16.orig/kernel/hrtimer.c linux-2.6.16-026test015/kernel/hrtimer.c
+--- linux-2.6.16.orig/kernel/hrtimer.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/kernel/hrtimer.c 2006-07-04 14:41:39.000000000 +0400
+@@ -439,6 +439,7 @@ hrtimer_start(struct hrtimer *timer, kti
+
+ return ret;
+ }
++EXPORT_SYMBOL_GPL(hrtimer_start);
+
+ /**
+ * hrtimer_try_to_cancel - try to deactivate a timer
+@@ -467,6 +468,7 @@ int hrtimer_try_to_cancel(struct hrtimer
+ return ret;
+
+ }
++EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel);
+
+ /**
+ * hrtimer_cancel - cancel a timer and wait for the handler to finish.
+@@ -504,6 +506,7 @@ ktime_t hrtimer_get_remaining(const stru
+
+ return rem;
+ }
++EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
+
+ #ifdef CONFIG_NO_IDLE_HZ
+ /**
+@@ -670,7 +673,7 @@ void hrtimer_run_queues(void)
+ * The current task state is guaranteed to be TASK_RUNNING when this
+ * routine returns.
+ */
+-static ktime_t __sched
++ktime_t __sched
+ schedule_hrtimer(struct hrtimer *timer, const enum hrtimer_mode mode)
+ {
+ /* fn stays NULL, meaning single-shot wakeup: */
+@@ -697,7 +700,7 @@ schedule_hrtimer_interruptible(struct hr
+ return schedule_hrtimer(timer, mode);
+ }
+
+-static long __sched nanosleep_restart(struct restart_block *restart)
++long __sched nanosleep_restart(struct restart_block *restart)
+ {
+ struct timespec __user *rmtp;
+ struct timespec tu;
+@@ -726,6 +729,7 @@ static long __sched nanosleep_restart(st
+ /* The other values in restart are already filled in */
+ return -ERESTART_RESTARTBLOCK;
+ }
++EXPORT_SYMBOL_GPL(nanosleep_restart);
+
+ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
+ const enum hrtimer_mode mode, const clockid_t clockid)
+diff -upr linux-2.6.16.orig/kernel/irq/handle.c linux-2.6.16-026test015/kernel/irq/handle.c
+--- linux-2.6.16.orig/kernel/irq/handle.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/kernel/irq/handle.c 2006-07-04 14:41:37.000000000 +0400
+@@ -14,6 +14,8 @@
+
+ #include "internals.h"
+
++#include <ub/beancounter.h>
++
+ /*
+ * Linux has a controller-independent interrupt architecture.
+ * Every controller has a 'controller-template', that is used
+@@ -80,10 +82,12 @@ fastcall int handle_IRQ_event(unsigned i
+ struct irqaction *action)
+ {
+ int ret, retval = 0, status = 0;
++ struct user_beancounter *ub;
+
+ if (!(action->flags & SA_INTERRUPT))
+ local_irq_enable();
+
++ ub = set_exec_ub(get_ub0());
+ do {
+ ret = action->handler(irq, action->dev_id, regs);
+ if (ret == IRQ_HANDLED)
+@@ -91,6 +95,7 @@ fastcall int handle_IRQ_event(unsigned i
+ retval |= ret;
+ action = action->next;
+ } while (action);
++ (void)set_exec_ub(ub);
+
+ if (status & SA_SAMPLE_RANDOM)
+ add_interrupt_randomness(irq);
+diff -upr linux-2.6.16.orig/kernel/kmod.c linux-2.6.16-026test015/kernel/kmod.c
+--- linux-2.6.16.orig/kernel/kmod.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/kernel/kmod.c 2006-07-04 14:41:38.000000000 +0400
+@@ -78,6 +78,10 @@ int request_module(const char *fmt, ...)
+ #define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
+ static int kmod_loop_msg;
+
++ /* Don't allow request_module() inside VE. */
++ if (!ve_is_super(get_exec_env()))
++ return -EPERM;
++
+ va_start(args, fmt);
+ ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
+ va_end(args);
+@@ -246,6 +250,9 @@ int call_usermodehelper_keys(char *path,
+ };
+ DECLARE_WORK(work, __call_usermodehelper, &sub_info);
+
++ if (!ve_is_super(get_exec_env()))
++ return -EPERM;
++
+ if (!khelper_wq)
+ return -EBUSY;
+
+diff -upr linux-2.6.16.orig/kernel/kthread.c linux-2.6.16-026test015/kernel/kthread.c
+--- linux-2.6.16.orig/kernel/kthread.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/kernel/kthread.c 2006-07-04 14:41:38.000000000 +0400
+@@ -114,7 +114,7 @@ static void keventd_create_kthread(void
+ create->result = ERR_PTR(pid);
+ } else {
+ wait_for_completion(&create->started);
+- create->result = find_task_by_pid(pid);
++ create->result = find_task_by_pid_all(pid);
+ }
+ complete(&create->done);
+ }
+diff -upr linux-2.6.16.orig/kernel/module.c linux-2.6.16-026test015/kernel/module.c
+--- linux-2.6.16.orig/kernel/module.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/kernel/module.c 2006-07-04 14:41:38.000000000 +0400
+@@ -2130,6 +2130,8 @@ static void *m_start(struct seq_file *m,
+ loff_t n = 0;
+
+ down(&module_mutex);
++ if (!ve_is_super(get_exec_env()))
++ return NULL;
+ list_for_each(i, &modules) {
+ if (n++ == *pos)
+ break;
+diff -upr linux-2.6.16.orig/kernel/mutex-debug.c linux-2.6.16-026test015/kernel/mutex-debug.c
+--- linux-2.6.16.orig/kernel/mutex-debug.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/kernel/mutex-debug.c 2006-07-04 14:41:38.000000000 +0400
+@@ -193,12 +193,12 @@ retry:
+ if (count != 10)
+ printk(" locked it.\n");
+
+- do_each_thread(g, p) {
++ do_each_thread_all(g, p) {
+ show_task_locks(p);
+ if (!unlock)
+ if (read_trylock(&tasklist_lock))
+ unlock = 1;
+- } while_each_thread(g, p);
++ } while_each_thread_all(g, p);
+
+ printk("\n");
+ show_held_locks(NULL);
+diff -upr linux-2.6.16.orig/kernel/panic.c linux-2.6.16-026test015/kernel/panic.c
+--- linux-2.6.16.orig/kernel/panic.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/kernel/panic.c 2006-07-04 14:41:38.000000000 +0400
+@@ -23,6 +23,8 @@
+ int panic_timeout;
+ int panic_on_oops;
+ int tainted;
++int kernel_text_csum_broken;
++EXPORT_SYMBOL(kernel_text_csum_broken);
+
+ EXPORT_SYMBOL(panic_timeout);
+
+@@ -156,7 +158,8 @@ const char *print_tainted(void)
+ {
+ static char buf[20];
+ if (tainted) {
+- snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c",
++ snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c",
++ kernel_text_csum_broken ? 'B' : ' ',
+ tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G',
+ tainted & TAINT_FORCED_MODULE ? 'F' : ' ',
+ tainted & TAINT_UNSAFE_SMP ? 'S' : ' ',
+diff -upr linux-2.6.16.orig/kernel/pid.c linux-2.6.16-026test015/kernel/pid.c
+--- linux-2.6.16.orig/kernel/pid.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/kernel/pid.c 2006-07-04 14:41:39.000000000 +0400
+@@ -27,6 +27,10 @@
+ #include <linux/bootmem.h>
+ #include <linux/hash.h>
+
++#ifdef CONFIG_VE
++static void __free_vpid(int vpid, struct ve_struct *ve);
++#endif
++
+ #define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift)
+ static struct hlist_head *pid_hash[PIDTYPE_MAX];
+ static int pidhash_shift;
+@@ -57,8 +61,14 @@ typedef struct pidmap {
+ void *page;
+ } pidmap_t;
+
++#ifdef CONFIG_VE
++#define PIDMAP_NRFREE (BITS_PER_PAGE/2)
++#else
++#define PIDMAP_NRFREE BITS_PER_PAGE
++#endif
++
+ static pidmap_t pidmap_array[PIDMAP_ENTRIES] =
+- { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } };
++ { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(PIDMAP_NRFREE), NULL } };
+
+ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
+
+@@ -67,9 +77,13 @@ fastcall void free_pidmap(int pid)
+ pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE;
+ int offset = pid & BITS_PER_PAGE_MASK;
+
+- clear_bit(offset, map->page);
++ BUG_ON(__is_virtual_pid(pid) || pid == 1);
++
++ if (test_and_clear_bit(offset, map->page) == 0)
++ BUG();
+ atomic_inc(&map->nr_free);
+ }
++EXPORT_SYMBOL_GPL(free_pidmap);
+
+ int alloc_pidmap(void)
+ {
+@@ -77,6 +91,8 @@ int alloc_pidmap(void)
+ pidmap_t *map;
+
+ pid = last + 1;
++ if (__is_virtual_pid(pid))
++ pid += VPID_DIV;
+ if (pid >= pid_max)
+ pid = RESERVED_PIDS;
+ offset = pid & BITS_PER_PAGE_MASK;
+@@ -106,6 +122,8 @@ int alloc_pidmap(void)
+ return pid;
+ }
+ offset = find_next_offset(map, offset);
++ if (__is_virtual_pid(offset))
++ offset += VPID_DIV;
+ pid = mk_pid(map, offset);
+ /*
+ * find_next_offset() found a bit, the pid from it
+@@ -130,6 +148,7 @@ int alloc_pidmap(void)
+ }
+ return -1;
+ }
++EXPORT_SYMBOL_GPL(alloc_pidmap);
+
+ struct pid * fastcall find_pid(enum pid_type type, int nr)
+ {
+@@ -143,6 +162,7 @@ struct pid * fastcall find_pid(enum pid_
+ }
+ return NULL;
+ }
++EXPORT_SYMBOL(find_pid);
+
+ int fastcall attach_pid(task_t *task, enum pid_type type, int nr)
+ {
+@@ -162,6 +182,7 @@ int fastcall attach_pid(task_t *task, en
+
+ return 0;
+ }
++EXPORT_SYMBOL_GPL(attach_pid);
+
+ static fastcall int __detach_pid(task_t *task, enum pid_type type)
+ {
+@@ -201,13 +222,27 @@ void fastcall detach_pid(task_t *task, e
+ if (tmp != type && find_pid(tmp, nr))
+ return;
+
++#ifdef CONFIG_VE
++ __free_vpid(task->pids[type].vnr, VE_TASK_INFO(task)->owner_env);
++#endif
+ free_pidmap(nr);
+ }
++EXPORT_SYMBOL_GPL(detach_pid);
+
+ task_t *find_task_by_pid_type(int type, int nr)
+ {
++ BUG();
++ return NULL;
++}
++
++EXPORT_SYMBOL(find_task_by_pid_type);
++
++task_t *find_task_by_pid_type_all(int type, int nr)
++{
+ struct pid *pid;
+
++ BUG_ON(nr != -1 && is_virtual_pid(nr));
++
+ pid = find_pid(type, nr);
+ if (!pid)
+ return NULL;
+@@ -215,7 +250,35 @@ task_t *find_task_by_pid_type(int type,
+ return pid_task(&pid->pid_list, type);
+ }
+
+-EXPORT_SYMBOL(find_task_by_pid_type);
++EXPORT_SYMBOL(find_task_by_pid_type_all);
++
++#ifdef CONFIG_VE
++
++task_t *find_task_by_pid_type_ve(int type, int nr)
++{
++ task_t *tsk;
++ int gnr = nr;
++ struct pid *pid;
++
++ if (is_virtual_pid(nr)) {
++ gnr = __vpid_to_pid(nr);
++ if (unlikely(gnr == -1))
++ return NULL;
++ }
++
++ pid = find_pid(type, gnr);
++ if (!pid)
++ return NULL;
++
++ tsk = pid_task(&pid->pid_list, type);
++ if (!ve_accessible(VE_TASK_INFO(tsk)->owner_env, get_exec_env()))
++ return NULL;
++ return tsk;
++}
++
++EXPORT_SYMBOL(find_task_by_pid_type_ve);
++
++#endif
+
+ /*
+ * This function switches the PIDs if a non-leader thread calls
+@@ -234,12 +297,16 @@ void switch_exec_pids(task_t *leader, ta
+
+ leader->pid = leader->tgid = thread->pid;
+ thread->pid = thread->tgid;
++ set_virt_tgid(leader, virt_pid(thread));
++ set_virt_pid(leader, virt_pid(thread));
++ set_virt_pid(thread, virt_tgid(thread));
+
+ attach_pid(thread, PIDTYPE_PID, thread->pid);
+ attach_pid(thread, PIDTYPE_TGID, thread->tgid);
+ attach_pid(thread, PIDTYPE_PGID, thread->signal->pgrp);
+ attach_pid(thread, PIDTYPE_SID, thread->signal->session);
+ list_add_tail(&thread->tasks, &init_task.tasks);
++ SET_VE_LINKS(thread);
+
+ attach_pid(leader, PIDTYPE_PID, leader->pid);
+ attach_pid(leader, PIDTYPE_TGID, leader->tgid);
+@@ -247,6 +314,362 @@ void switch_exec_pids(task_t *leader, ta
+ attach_pid(leader, PIDTYPE_SID, leader->signal->session);
+ }
+
++#ifdef CONFIG_VE
++
++/* Virtual PID bits.
++ *
++ * At the moment all internal structures in the kernel store the real global pid.
++ * The only place where virtual PIDs are used is the user frontend. We
++ * remap virtual pids obtained from userspace to global ones (vpid_to_pid) and
++ * map globals to virtuals before showing them to userspace (virt_pid_type).
++ *
++ * We hold virtual PIDs inside struct pid, so the global -> virtual map is easy.
++ */
++
++pid_t _pid_type_to_vpid(int type, pid_t pid)
++{
++ struct pid * p;
++
++ if (unlikely(is_virtual_pid(pid)))
++ return -1;
++
++ read_lock(&tasklist_lock);
++ p = find_pid(type, pid);
++ if (p) {
++ pid = p->vnr;
++ } else {
++ pid = -1;
++ }
++ read_unlock(&tasklist_lock);
++ return pid;
++}
++EXPORT_SYMBOL_GPL(_pid_type_to_vpid);
++
++pid_t pid_type_to_vpid(int type, pid_t pid)
++{
++ int vpid;
++
++ if (unlikely(pid <= 0))
++ return pid;
++
++ BUG_ON(is_virtual_pid(pid));
++
++ if (ve_is_super(get_exec_env()))
++ return pid;
++
++ vpid = _pid_type_to_vpid(type, pid);
++ if (unlikely(vpid == -1)) {
++ /* It is allowed: global pid can be used everywhere.
++ * This can happen, when kernel remembers stray pids:
++ * signal queues, locks etc.
++ */
++ vpid = pid;
++ }
++ return vpid;
++}
++EXPORT_SYMBOL_GPL(pid_type_to_vpid);
++
++/* To map virtual pids to global ones we maintain a special hash table.
++ *
++ * Mapping entries are allocated when a process with a non-trivial
++ * mapping is forked, which is possible only after the VE has migrated.
++ * Mappings are destroyed when a global pid is removed from the global
++ * pidmap, which means we do not need to refcount mappings.
++ */
++
++static struct hlist_head *vpid_hash;
++
++struct vpid_mapping
++{
++ int vpid;
++ int veid;
++ int pid;
++ struct hlist_node link;
++ struct rcu_head rcu;
++};
++
++static kmem_cache_t *vpid_mapping_cachep;
++
++static inline int vpid_hashfn(int vnr, int veid)
++{
++ return hash_long((unsigned long)(vnr+(veid<<16)), pidhash_shift);
++}
++
++struct vpid_mapping *__lookup_vpid_mapping(int vnr, int veid)
++{
++ struct hlist_node *elem;
++ struct vpid_mapping *map;
++
++ hlist_for_each_entry_rcu(map, elem,
++ &vpid_hash[vpid_hashfn(vnr, veid)], link) {
++ if (map->vpid == vnr && map->veid == veid)
++ return map;
++ }
++ return NULL;
++}
++
++/* __vpid_to_pid() is the raw version of vpid_to_pid(). It is to be used
++ * only under tasklist_lock. In some places we must use only this version
++ * (e.g. __kill_pg_info is called under the write lock!)
++ *
++ * The caller should pass a virtual pid. This function returns an error when
++ * it sees a global pid.
++ */
++int __vpid_to_pid(int pid)
++{
++ struct vpid_mapping *map;
++
++ if (unlikely(!is_virtual_pid(pid) || ve_is_super(get_exec_env())))
++ return -1;
++
++ if (!get_exec_env()->sparse_vpid) {
++ if (pid != 1)
++ return pid - VPID_DIV;
++ return get_exec_env()->init_entry->pid;
++ }
++
++ map = __lookup_vpid_mapping(pid, VEID(get_exec_env()));
++ if (map)
++ return map->pid;
++ return -1;
++}
++EXPORT_SYMBOL_GPL(__vpid_to_pid);
++
++int vpid_to_pid(int pid)
++{
++ /* User gave bad pid. It is his problem. */
++ if (unlikely(pid <= 0))
++ return pid;
++
++ if (!is_virtual_pid(pid))
++ return pid;
++
++ read_lock(&tasklist_lock);
++ pid = __vpid_to_pid(pid);
++ read_unlock(&tasklist_lock);
++ return pid;
++}
++EXPORT_SYMBOL_GPL(vpid_to_pid);
++
++/* VEs which never migrated have a trivial "arithmetic" mapping pid <-> vpid:
++ *
++ * vpid == 1 -> ve->init_task->pid
++ * else pid & ~VPID_DIV
++ *
++ * In this case the VE has ve->sparse_vpid = 0 and we do not use the vpid hash table.
++ *
++ * When a VE migrates and we see a non-trivial mapping for the first time, we
++ * scan the process table and populate the mapping hash table.
++ */
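A minimal user-space model of this non-sparse mapping, purely for illustration: VPID_DIV's real value and the exact test behind __is_virtual_pid() live in the VE headers rather than in this hunk, so the power-of-two flag bit and the concrete pid numbers below are assumptions, chosen to be consistent with the pid + VPID_DIV / pid - VPID_DIV arithmetic used by alloc_vpid() and __vpid_to_pid().

/* vpid_toy.c -- toy model of the non-sparse vpid <-> pid mapping.
 * VPID_DIV and the init pid are made-up values for illustration only;
 * the real allocator additionally keeps global pids out of the "virtual"
 * ranges (see the alloc_pidmap() changes below), which this toy ignores.
 */
#include <assert.h>

#define VPID_DIV  (1 << 20)                    /* assumed power-of-two flag bit */
static const int ve_init_global_pid = 4620;    /* hypothetical global pid of the VE's init */

static int is_virtual_pid(int pid)
{
        return pid & VPID_DIV;                 /* assumption, consistent with +/- VPID_DIV */
}

/* global pid -> what the VE sees */
static int to_vpid(int pid)
{
        if (pid == ve_init_global_pid)
                return 1;                      /* the VE sees its own init as pid 1 */
        return pid + VPID_DIV;
}

/* what the VE sees -> global pid (non-sparse case of __vpid_to_pid()) */
static int to_global(int vpid)
{
        if (vpid == 1)
                return ve_init_global_pid;
        return vpid - VPID_DIV;
}

int main(void)
{
        assert(to_global(to_vpid(1234)) == 1234);
        assert(to_global(1) == ve_init_global_pid);
        assert(is_virtual_pid(to_vpid(1234)) && !is_virtual_pid(1234));
        return 0;
}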
++
++static int add_mapping(int pid, int vpid, int veid, struct hlist_head *cache)
++{
++ if (unlikely(pid <= 0 || vpid <= 0))
++ return 0;
++
++ /* VE can contain non-virtual (VE_ENTER'ed) processes when
++ * switching to sparse mapping. We should not create mappings
++ * for them. */
++ if (unlikely(!__is_virtual_pid(vpid) && vpid != 1)) {
++ printk("DEBUG (do not worry, but report): non-virtual pid while switching mode %d %d\n", pid, vpid);
++ return 0;
++ }
++
++ if (!__lookup_vpid_mapping(vpid, veid)) {
++ struct vpid_mapping *m;
++ if (hlist_empty(cache)) {
++ m = kmem_cache_alloc(vpid_mapping_cachep, GFP_ATOMIC);
++ if (unlikely(m == NULL))
++ return -ENOMEM;
++ } else {
++ m = hlist_entry(cache->first, struct vpid_mapping, link);
++ hlist_del_rcu(&m->link);
++ }
++ m->pid = pid;
++ m->vpid = vpid;
++ m->veid = veid;
++ hlist_add_head_rcu(&m->link,
++ &vpid_hash[vpid_hashfn(vpid, veid)]);
++ }
++ return 0;
++}
++
++static int switch_to_sparse_mapping(int pid)
++{
++ struct ve_struct *env = get_exec_env();
++ struct hlist_head cache;
++ task_t *g, *t;
++ int pcount;
++ int err;
++
++	/* The transition happens under write_lock_irq, so we try to make
++	 * it more reliable and fast by preallocating mapping entries.
++	 * pcounter may not be enough: we could have lots of orphaned
++	 * process groups and sessions, which also require mappings.
++ */
++ INIT_HLIST_HEAD(&cache);
++ pcount = atomic_read(&env->pcounter);
++ err = -ENOMEM;
++ while (pcount > 0) {
++ struct vpid_mapping *m;
++ m = kmem_cache_alloc(vpid_mapping_cachep, GFP_KERNEL);
++ if (!m)
++ goto out;
++ hlist_add_head(&m->link, &cache);
++ pcount--;
++ }
++
++ write_lock_irq(&tasklist_lock);
++ err = 0;
++ if (env->sparse_vpid)
++ goto out_unlock;
++
++ err = -ENOMEM;
++ do_each_thread_ve(g, t) {
++ if (t->pid == pid)
++ continue;
++ if (add_mapping(t->pid, virt_pid(t), VEID(env), &cache))
++ goto out_unlock;
++ } while_each_thread_ve(g, t);
++
++ for_each_process_ve(t) {
++ if (t->pid == pid)
++ continue;
++
++ if (add_mapping(t->tgid, virt_tgid(t), VEID(env), &cache))
++ goto out_unlock;
++ if (add_mapping(t->signal->pgrp, virt_pgid(t), VEID(env), &cache))
++ goto out_unlock;
++ if (add_mapping(t->signal->session, virt_sid(t), VEID(env), &cache))
++ goto out_unlock;
++ }
++ env->sparse_vpid = 1;
++ err = 0;
++
++out_unlock:
++ if (err) {
++ int i;
++
++ for (i=0; i<(1<<pidhash_shift); i++) {
++ struct hlist_node *elem, *next;
++ struct vpid_mapping *map;
++
++ hlist_for_each_entry_safe(map, elem, next, &vpid_hash[i], link) {
++ if (map->veid == VEID(env)) {
++ hlist_del(elem);
++ hlist_add_head(elem, &cache);
++ }
++ }
++ }
++ }
++ write_unlock_irq(&tasklist_lock);
++
++out:
++ while (!hlist_empty(&cache)) {
++ struct vpid_mapping *m;
++ m = hlist_entry(cache.first, struct vpid_mapping, link);
++ hlist_del_rcu(&m->link);
++ kmem_cache_free(vpid_mapping_cachep, m);
++ }
++ return err;
++}
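++/* The function above is an instance of a preallocate-then-commit pattern:
++ * allocate the worst-case number of entries with GFP_KERNEL while sleeping
++ * is still allowed, take write_lock_irq(&tasklist_lock), consume entries
++ * from the local cache (add_mapping() falls back to GFP_ATOMIC only if the
++ * cache runs dry), roll this VE's entries back into the cache on error,
++ * and free whatever is left once the lock is dropped.
++ */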
++
++int alloc_vpid(int pid, int virt_pid)
++{
++ int result;
++ struct vpid_mapping *m;
++ struct ve_struct *env = get_exec_env();
++
++ if (ve_is_super(env) || !env->virt_pids)
++ return pid;
++
++ if (!env->sparse_vpid) {
++ if (virt_pid == -1)
++ return pid + VPID_DIV;
++
++ if (virt_pid == 1 || virt_pid == pid + VPID_DIV)
++ return virt_pid;
++
++ if ((result = switch_to_sparse_mapping(pid)) < 0)
++ return result;
++ }
++
++ m = kmem_cache_alloc(vpid_mapping_cachep, GFP_KERNEL);
++ if (!m)
++ return -ENOMEM;
++
++ m->pid = pid;
++ m->veid = VEID(env);
++
++ result = (virt_pid == -1) ? pid + VPID_DIV : virt_pid;
++
++ write_lock_irq(&tasklist_lock);
++ if (unlikely(__lookup_vpid_mapping(result, m->veid))) {
++ if (virt_pid > 0) {
++ result = -EEXIST;
++ goto out;
++ }
++
++		/* No luck. Now we search for a vpid that is not in use yet.
++		 * This is a weak spot: we do a linear search. */
++ do {
++ result++;
++ if (!__is_virtual_pid(result))
++ result += VPID_DIV;
++ if (result >= pid_max)
++ result = RESERVED_PIDS + VPID_DIV;
++ } while (__lookup_vpid_mapping(result, m->veid) != NULL);
++
++		/* And set last_pid in the hope that future alloc_pidmap()
++		 * calls will avoid collisions. */
++ last_pid = result - VPID_DIV;
++ }
++ if (result > 0) {
++ m->vpid = result;
++ hlist_add_head_rcu(&m->link,
++ &vpid_hash[vpid_hashfn(result, m->veid)]);
++ }
++out:
++ write_unlock_irq(&tasklist_lock);
++ if (result < 0)
++ kmem_cache_free(vpid_mapping_cachep, m);
++ return result;
++}
++EXPORT_SYMBOL(alloc_vpid);
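++/* A hypothetical call-site sketch: when a new task gets global pid nr,
++ * the fork path would request a virtual pid with
++ *
++ *	vpid = alloc_vpid(nr, -1);	(-1 means no predetermined vpid)
++ *	if (vpid < 0)
++ *		goto fail;		(-ENOMEM; with an explicit vpid a
++ *					 duplicate also yields -EEXIST)
++ *
++ * and later releases it with free_vpid(vpid, ve).
++ */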
++
++static void vpid_free_cb(struct rcu_head *rhp)
++{
++ struct vpid_mapping *m;
++
++ m = container_of(rhp, struct vpid_mapping, rcu);
++ kmem_cache_free(vpid_mapping_cachep, m);
++}
++
++static void __free_vpid(int vpid, struct ve_struct *ve)
++{
++ struct vpid_mapping *m;
++
++ if (!ve->sparse_vpid)
++ return;
++
++ if (!__is_virtual_pid(vpid) && (vpid != 1 || ve_is_super(ve)))
++ return;
++
++ m = __lookup_vpid_mapping(vpid, ve->veid);
++ BUG_ON(m == NULL);
++ hlist_del_rcu(&m->link);
++ call_rcu(&m->rcu, vpid_free_cb);
++}
++
++void free_vpid(int vpid, struct ve_struct *ve)
++{
++ write_lock_irq(&tasklist_lock);
++ __free_vpid(vpid, ve);
++ write_unlock_irq(&tasklist_lock);
++}
++EXPORT_SYMBOL(free_vpid);
++#endif
++
+ /*
+ * The pid hash table is scaled according to the amount of memory in the
+ * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or
+@@ -273,6 +696,14 @@ void __init pidhash_init(void)
+ for (j = 0; j < pidhash_size; j++)
+ INIT_HLIST_HEAD(&pid_hash[i][j]);
+ }
++
++#ifdef CONFIG_VE
++ vpid_hash = alloc_bootmem(pidhash_size * sizeof(struct hlist_head));
++ if (!vpid_hash)
++ panic("Could not alloc vpid_hash!\n");
++ for (j = 0; j < pidhash_size; j++)
++ INIT_HLIST_HEAD(&vpid_hash[j]);
++#endif
+ }
+
+ void __init pidmap_init(void)
+@@ -289,4 +720,12 @@ void __init pidmap_init(void)
+
+ for (i = 0; i < PIDTYPE_MAX; i++)
+ attach_pid(current, i, 0);
++
++#ifdef CONFIG_VE
++ vpid_mapping_cachep =
++ kmem_cache_create("vpid_mapping",
++ sizeof(struct vpid_mapping),
++ __alignof__(struct vpid_mapping),
++ SLAB_PANIC|SLAB_UBC, NULL, NULL);
++#endif
+ }
+diff -upr linux-2.6.16.orig/kernel/posix-cpu-timers.c linux-2.6.16-026test015/kernel/posix-cpu-timers.c
+--- linux-2.6.16.orig/kernel/posix-cpu-timers.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/kernel/posix-cpu-timers.c 2006-07-04 14:41:38.000000000 +0400
+@@ -20,7 +20,7 @@ static int check_clock(const clockid_t w
+ return 0;
+
+ read_lock(&tasklist_lock);
+- p = find_task_by_pid(pid);
++ p = find_task_by_pid_ve(pid);
+ if (!p || (CPUCLOCK_PERTHREAD(which_clock) ?
+ p->tgid != current->tgid : p->tgid != pid)) {
+ error = -EINVAL;
+@@ -292,7 +292,7 @@ int posix_cpu_clock_get(const clockid_t
+ */
+ struct task_struct *p;
+ read_lock(&tasklist_lock);
+- p = find_task_by_pid(pid);
++ p = find_task_by_pid_ve(pid);
+ if (p) {
+ if (CPUCLOCK_PERTHREAD(which_clock)) {
+ if (p->tgid == current->tgid) {
+@@ -336,7 +336,7 @@ int posix_cpu_timer_create(struct k_itim
+ if (pid == 0) {
+ p = current;
+ } else {
+- p = find_task_by_pid(pid);
++ p = find_task_by_pid_ve(pid);
+ if (p && p->tgid != current->tgid)
+ p = NULL;
+ }
+@@ -344,7 +344,7 @@ int posix_cpu_timer_create(struct k_itim
+ if (pid == 0) {
+ p = current->group_leader;
+ } else {
+- p = find_task_by_pid(pid);
++ p = find_task_by_pid_ve(pid);
+ if (p && p->tgid != pid)
+ p = NULL;
+ }
+@@ -1173,6 +1173,9 @@ static void check_process_timers(struct
+ }
+ t = tsk;
+ do {
++ if (unlikely(t->flags & PF_EXITING))
++ continue;
++
+ ticks = cputime_add(cputime_add(t->utime, t->stime),
+ prof_left);
+ if (!cputime_eq(prof_expires, cputime_zero) &&
+@@ -1193,11 +1196,7 @@ static void check_process_timers(struct
+ t->it_sched_expires > sched)) {
+ t->it_sched_expires = sched;
+ }
+-
+- do {
+- t = next_thread(t);
+- } while (unlikely(t->flags & PF_EXITING));
+- } while (t != tsk);
++ } while ((t = next_thread(t)) != tsk);
+ }
+ }
+
+@@ -1289,30 +1288,30 @@ void run_posix_cpu_timers(struct task_st
+
+ #undef UNEXPIRED
+
+- BUG_ON(tsk->exit_state);
+-
+ /*
+ * Double-check with locks held.
+ */
+ read_lock(&tasklist_lock);
+- spin_lock(&tsk->sighand->siglock);
++ if (likely(tsk->signal != NULL)) {
++ spin_lock(&tsk->sighand->siglock);
+
+- /*
+- * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N]
+- * all the timers that are firing, and put them on the firing list.
+- */
+- check_thread_timers(tsk, &firing);
+- check_process_timers(tsk, &firing);
++ /*
++ * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N]
++ * all the timers that are firing, and put them on the firing list.
++ */
++ check_thread_timers(tsk, &firing);
++ check_process_timers(tsk, &firing);
+
+- /*
+- * We must release these locks before taking any timer's lock.
+- * There is a potential race with timer deletion here, as the
+- * siglock now protects our private firing list. We have set
+- * the firing flag in each timer, so that a deletion attempt
+- * that gets the timer lock before we do will give it up and
+- * spin until we've taken care of that timer below.
+- */
+- spin_unlock(&tsk->sighand->siglock);
++ /*
++ * We must release these locks before taking any timer's lock.
++ * There is a potential race with timer deletion here, as the
++ * siglock now protects our private firing list. We have set
++ * the firing flag in each timer, so that a deletion attempt
++ * that gets the timer lock before we do will give it up and
++ * spin until we've taken care of that timer below.
++ */
++ spin_unlock(&tsk->sighand->siglock);
++ }
+ read_unlock(&tasklist_lock);
+
+ /*
+diff -upr linux-2.6.16.orig/kernel/posix-timers.c linux-2.6.16-026test015/kernel/posix-timers.c
+--- linux-2.6.16.orig/kernel/posix-timers.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/kernel/posix-timers.c 2006-07-04 14:41:38.000000000 +0400
+@@ -31,6 +31,7 @@
+ * POSIX clocks & timers
+ */
+ #include <linux/mm.h>
++#include <linux/module.h>
+ #include <linux/smp_lock.h>
+ #include <linux/interrupt.h>
+ #include <linux/slab.h>
+@@ -48,6 +49,8 @@
+ #include <linux/workqueue.h>
+ #include <linux/module.h>
+
++#include <ub/beancounter.h>
++
+ /*
+ * Management arrays for POSIX timers. Timers are kept in slab memory
+ * Timer ids are allocated by an external routine that keeps track of the
+@@ -241,7 +244,8 @@ static __init int init_posix_timers(void
+ register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
+
+ posix_timers_cache = kmem_cache_create("posix_timers_cache",
+- sizeof (struct k_itimer), 0, 0, NULL, NULL);
++ sizeof (struct k_itimer), 0,
++ SLAB_UBC, NULL, NULL);
+ idr_init(&posix_timers_id);
+ return 0;
+ }
+@@ -294,6 +298,13 @@ void do_schedule_next_timer(struct sigin
+
+ int posix_timer_event(struct k_itimer *timr,int si_private)
+ {
++ int ret;
++ struct ve_struct *ve;
++ struct user_beancounter *ub;
++
++ ve = set_exec_env(timr->it_process->ve_task_info.owner_env);
++ ub = set_exec_ub(timr->it_process->task_bc.task_ub);
++
+ memset(&timr->sigq->info, 0, sizeof(siginfo_t));
+ timr->sigq->info.si_sys_private = si_private;
+ /* Send signal to the process that owns this timer.*/
+@@ -306,11 +317,11 @@ int posix_timer_event(struct k_itimer *t
+
+ if (timr->it_sigev_notify & SIGEV_THREAD_ID) {
+ struct task_struct *leader;
+- int ret = send_sigqueue(timr->it_sigev_signo, timr->sigq,
++ ret = send_sigqueue(timr->it_sigev_signo, timr->sigq,
+ timr->it_process);
+
+ if (likely(ret >= 0))
+- return ret;
++ goto out;
+
+ timr->it_sigev_notify = SIGEV_SIGNAL;
+ leader = timr->it_process->group_leader;
+@@ -318,8 +329,12 @@ int posix_timer_event(struct k_itimer *t
+ timr->it_process = leader;
+ }
+
+- return send_group_sigqueue(timr->it_sigev_signo, timr->sigq,
++ ret = send_group_sigqueue(timr->it_sigev_signo, timr->sigq,
+ timr->it_process);
++out:
++ (void)set_exec_ub(ub);
++ (void)set_exec_env(ve);
++ return ret;
+ }
+ EXPORT_SYMBOL_GPL(posix_timer_event);
+
+@@ -366,7 +381,7 @@ static struct task_struct * good_sigeven
+ struct task_struct *rtn = current->group_leader;
+
+ if ((event->sigev_notify & SIGEV_THREAD_ID ) &&
+- (!(rtn = find_task_by_pid(event->sigev_notify_thread_id)) ||
++ (!(rtn = find_task_by_pid_ve(event->sigev_notify_thread_id)) ||
+ rtn->tgid != current->tgid ||
+ (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL))
+ return NULL;
+diff -upr linux-2.6.16.orig/kernel/power/Kconfig linux-2.6.16-026test015/kernel/power/Kconfig
+--- linux-2.6.16.orig/kernel/power/Kconfig 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/kernel/power/Kconfig 2006-07-04 14:41:39.000000000 +0400
+@@ -38,7 +38,7 @@ config PM_DEBUG
+
+ config SOFTWARE_SUSPEND
+ bool "Software Suspend"
+- depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP)
++ depends on PM && SWAP && X86 || ((FRV || PPC32) && !SMP)
+ ---help---
+ Enable the possibility of suspending the machine.
+ It doesn't need APM.
+diff -upr linux-2.6.16.orig/kernel/power/process.c linux-2.6.16-026test015/kernel/power/process.c
+--- linux-2.6.16.orig/kernel/power/process.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/kernel/power/process.c 2006-07-04 14:41:39.000000000 +0400
+@@ -38,18 +38,23 @@ void refrigerator(void)
+ processes around? */
+ long save;
+ save = current->state;
++ current->state = TASK_UNINTERRUPTIBLE;
+ pr_debug("%s entered refrigerator\n", current->comm);
+- printk("=");
++ /* printk("="); */
+
+- frozen_process(current);
+ spin_lock_irq(&current->sighand->siglock);
+- recalc_sigpending(); /* We sent fake signal, clean it up */
++ if (test_and_clear_thread_flag(TIF_FREEZE)) {
++ recalc_sigpending(); /* We sent fake signal, clean it up */
++ current->flags |= PF_FROZEN;
++ } else {
++		/* The freeze request could have been canceled before we
++		 * entered refrigerator(); in this case we do nothing. */
++ current->state = save;
++ }
+ spin_unlock_irq(&current->sighand->siglock);
+
+- while (frozen(current)) {
+- current->state = TASK_UNINTERRUPTIBLE;
++ while (current->flags & PF_FROZEN)
+ schedule();
+- }
+ pr_debug("%s left refrigerator\n", current->comm);
+ current->state = save;
+ }
+@@ -67,7 +72,7 @@ int freeze_processes(void)
+ do {
+ todo = 0;
+ read_lock(&tasklist_lock);
+- do_each_thread(g, p) {
++ do_each_thread_all(g, p) {
+ if (!freezeable(p))
+ continue;
+ if (frozen(p))
+@@ -78,7 +83,7 @@ int freeze_processes(void)
+ signal_wake_up(p, 0);
+ spin_unlock_irqrestore(&p->sighand->siglock, flags);
+ todo++;
+- } while_each_thread(g, p);
++ } while_each_thread_all(g, p);
+ read_unlock(&tasklist_lock);
+ yield(); /* Yield is okay here */
+ if (todo && time_after(jiffies, start_time + TIMEOUT)) {
+@@ -95,15 +100,15 @@ int freeze_processes(void)
+ */
+ if (todo) {
+ read_lock(&tasklist_lock);
+- do_each_thread(g, p)
++ do_each_thread_all(g, p)
+ if (freezing(p)) {
+ pr_debug(" clean up: %s\n", p->comm);
+- p->flags &= ~PF_FREEZE;
+ spin_lock_irqsave(&p->sighand->siglock, flags);
++ clear_tsk_thread_flag(p, TIF_FREEZE);
+ recalc_sigpending_tsk(p);
+ spin_unlock_irqrestore(&p->sighand->siglock, flags);
+ }
+- while_each_thread(g, p);
++ while_each_thread_all(g, p);
+ read_unlock(&tasklist_lock);
+ return todo;
+ }
+@@ -119,12 +124,12 @@ void thaw_processes(void)
+
+ printk( "Restarting tasks..." );
+ read_lock(&tasklist_lock);
+- do_each_thread(g, p) {
++ do_each_thread_all(g, p) {
+ if (!freezeable(p))
+ continue;
+ if (!thaw_process(p))
+ printk(KERN_INFO " Strange, %s not stopped\n", p->comm );
+- } while_each_thread(g, p);
++ } while_each_thread_all(g, p);
+
+ read_unlock(&tasklist_lock);
+ schedule();
+diff -upr linux-2.6.16.orig/kernel/printk.c linux-2.6.16-026test015/kernel/printk.c
+--- linux-2.6.16.orig/kernel/printk.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/kernel/printk.c 2006-07-04 14:41:38.000000000 +0400
+@@ -30,7 +30,9 @@
+ #include <linux/smp.h>
+ #include <linux/security.h>
+ #include <linux/bootmem.h>
++#include <linux/vzratelimit.h>
+ #include <linux/syscalls.h>
++#include <linux/veprintk.h>
+
+ #include <asm/uaccess.h>
+
+@@ -83,7 +85,7 @@ static int console_locked;
+ * It is also used in interesting ways to provide interlocking in
+ * release_console_sem().
+ */
+-static DEFINE_SPINLOCK(logbuf_lock);
++DEFINE_SPINLOCK(logbuf_lock);
+
+ #define LOG_BUF_MASK (log_buf_len-1)
+ #define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK])
+@@ -114,6 +116,7 @@ static int preferred_console = -1;
+
+ /* Flag: console code may call schedule() */
+ static int console_may_schedule;
++int console_silence_loglevel;
+
+ #ifdef CONFIG_PRINTK
+
+@@ -160,6 +163,19 @@ static int __init console_setup(char *st
+
+ __setup("console=", console_setup);
+
++static int __init setup_console_silencelevel(char *str)
++{
++ int level;
++
++ if (get_option(&str, &level) != 1)
++ return 0;
++
++ console_silence_loglevel = level;
++ return 1;
++}
++
++__setup("silencelevel=", setup_console_silencelevel);
++
+ static int __init log_buf_len_setup(char *str)
+ {
+ unsigned long size = memparse(str, &str);
+@@ -223,6 +239,10 @@ int do_syslog(int type, char __user *buf
+ char c;
+ int error = 0;
+
++ if (!ve_is_super(get_exec_env()) &&
++ (type == 6 || type == 7 || type == 8))
++ goto out;
++
+ error = security_syslog(type);
+ if (error)
+ return error;
+@@ -243,15 +263,15 @@ int do_syslog(int type, char __user *buf
+ error = -EFAULT;
+ goto out;
+ }
+- error = wait_event_interruptible(log_wait,
+- (log_start - log_end));
++ error = wait_event_interruptible(ve_log_wait,
++ (ve_log_start - ve_log_end));
+ if (error)
+ goto out;
+ i = 0;
+ spin_lock_irq(&logbuf_lock);
+- while (!error && (log_start != log_end) && i < len) {
+- c = LOG_BUF(log_start);
+- log_start++;
++ while (!error && (ve_log_start != ve_log_end) && i < len) {
++ c = VE_LOG_BUF(ve_log_start);
++ ve_log_start++;
+ spin_unlock_irq(&logbuf_lock);
+ error = __put_user(c,buf);
+ buf++;
+@@ -277,15 +297,17 @@ int do_syslog(int type, char __user *buf
+ error = -EFAULT;
+ goto out;
+ }
++ if (ve_log_buf == NULL)
++ goto out;
+ count = len;
+- if (count > log_buf_len)
+- count = log_buf_len;
++ if (count > ve_log_buf_len)
++ count = ve_log_buf_len;
+ spin_lock_irq(&logbuf_lock);
+- if (count > logged_chars)
+- count = logged_chars;
++ if (count > ve_logged_chars)
++ count = ve_logged_chars;
+ if (do_clear)
+- logged_chars = 0;
+- limit = log_end;
++ ve_logged_chars = 0;
++ limit = ve_log_end;
+ /*
+ * __put_user() could sleep, and while we sleep
+ * printk() could overwrite the messages
+@@ -294,9 +316,9 @@ int do_syslog(int type, char __user *buf
+ */
+ for (i = 0; i < count && !error; i++) {
+ j = limit-1-i;
+- if (j + log_buf_len < log_end)
++ if (j + ve_log_buf_len < ve_log_end)
+ break;
+- c = LOG_BUF(j);
++ c = VE_LOG_BUF(j);
+ spin_unlock_irq(&logbuf_lock);
+ error = __put_user(c,&buf[count-1-i]);
+ cond_resched();
+@@ -320,7 +342,7 @@ int do_syslog(int type, char __user *buf
+ }
+ break;
+ case 5: /* Clear ring buffer */
+- logged_chars = 0;
++ ve_logged_chars = 0;
+ break;
+ case 6: /* Disable logging to console */
+ console_loglevel = minimum_console_loglevel;
+@@ -338,10 +360,10 @@ int do_syslog(int type, char __user *buf
+ error = 0;
+ break;
+ case 9: /* Number of chars in the log buffer */
+- error = log_end - log_start;
++ error = ve_log_end - ve_log_start;
+ break;
+ case 10: /* Size of the log buffer */
+- error = log_buf_len;
++ error = ve_log_buf_len;
+ break;
+ default:
+ error = -EINVAL;
+@@ -439,14 +461,14 @@ static void call_console_drivers(unsigne
+
+ static void emit_log_char(char c)
+ {
+- LOG_BUF(log_end) = c;
+- log_end++;
+- if (log_end - log_start > log_buf_len)
+- log_start = log_end - log_buf_len;
+- if (log_end - con_start > log_buf_len)
+- con_start = log_end - log_buf_len;
+- if (logged_chars < log_buf_len)
+- logged_chars++;
++ VE_LOG_BUF(ve_log_end) = c;
++ ve_log_end++;
++ if (ve_log_end - ve_log_start > ve_log_buf_len)
++ ve_log_start = ve_log_end - ve_log_buf_len;
++ if (ve_is_super(get_exec_env()) && ve_log_end - con_start > ve_log_buf_len)
++ con_start = ve_log_end - ve_log_buf_len;
++ if (ve_logged_chars < ve_log_buf_len)
++ ve_logged_chars++;
+ }
+
+ /*
+@@ -511,6 +533,30 @@ __attribute__((weak)) unsigned long long
+ * printf(3)
+ */
+
++static inline int ve_log_init(void)
++{
++#ifdef CONFIG_VE
++ if (ve_log_buf != NULL)
++ return 0;
++
++ if (ve_is_super(get_exec_env())) {
++ ve0._log_wait = &log_wait;
++ ve0._log_start = &log_start;
++ ve0._log_end = &log_end;
++ ve0._logged_chars = &logged_chars;
++ ve0.log_buf = log_buf;
++ return 0;
++ }
++
++ ve_log_buf = kmalloc(ve_log_buf_len, GFP_ATOMIC);
++ if (!ve_log_buf)
++ return -ENOMEM;
++
++ memset(ve_log_buf, 0, ve_log_buf_len);
++#endif
++ return 0;
++}
++
+ asmlinkage int printk(const char *fmt, ...)
+ {
+ va_list args;
+@@ -526,13 +572,14 @@ asmlinkage int printk(const char *fmt, .
+ /* cpu currently holding logbuf_lock */
+ static volatile unsigned int printk_cpu = UINT_MAX;
+
+-asmlinkage int vprintk(const char *fmt, va_list args)
++asmlinkage int __vprintk(const char *fmt, va_list args)
+ {
+ unsigned long flags;
+ int printed_len;
+ char *p;
+ static char printk_buf[1024];
+ static int log_level_unknown = 1;
++ int err, need_wake;
+
+ preempt_disable();
+ if (unlikely(oops_in_progress) && printk_cpu == smp_processor_id())
+@@ -544,6 +591,12 @@ asmlinkage int vprintk(const char *fmt,
+ spin_lock_irqsave(&logbuf_lock, flags);
+ printk_cpu = smp_processor_id();
+
++ err = ve_log_init();
++ if (err) {
++ spin_unlock_irqrestore(&logbuf_lock, flags);
++ return err;
++ }
++
+ /* Emit the output into the temporary buffer */
+ printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args);
+
+@@ -615,7 +668,12 @@ asmlinkage int vprintk(const char *fmt,
+ spin_unlock_irqrestore(&logbuf_lock, flags);
+ goto out;
+ }
+- if (!down_trylock(&console_sem)) {
++ if (!ve_is_super(get_exec_env())) {
++ need_wake = (ve_log_start != ve_log_end);
++ spin_unlock_irqrestore(&logbuf_lock, flags);
++ if (!oops_in_progress && need_wake)
++ wake_up_interruptible(&ve_log_wait);
++ } else if (!down_trylock(&console_sem)) {
+ console_locked = 1;
+ /*
+ * We own the drivers. We can drop the spinlock and let
+@@ -641,6 +699,38 @@ out:
+ EXPORT_SYMBOL(printk);
+ EXPORT_SYMBOL(vprintk);
+
++asmlinkage int vprintk(const char *fmt, va_list args)
++{
++ int i;
++ struct ve_struct *env;
++
++ env = set_exec_env(get_ve0());
++ i = __vprintk(fmt, args);
++ set_exec_env(env);
++ return i;
++}
++
++asmlinkage int ve_printk(int dst, const char *fmt, ...)
++{
++ va_list args;
++ int printed_len;
++
++ printed_len = 0;
++ if (ve_is_super(get_exec_env()) || (dst & VE0_LOG)) {
++ va_start(args, fmt);
++ printed_len = vprintk(fmt, args);
++ va_end(args);
++ }
++ if (!ve_is_super(get_exec_env()) && (dst & VE_LOG)) {
++ va_start(args, fmt);
++ printed_len = __vprintk(fmt, args);
++ va_end(args);
++ }
++ return printed_len;
++}
++EXPORT_SYMBOL(ve_printk);
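++/* A minimal usage sketch (hypothetical message): inside a VE,
++ *
++ *	ve_printk(VE_LOG, KERN_WARNING "quota exceeded\n");
++ *
++ * lands in the container's own log buffer (readable through its syslog),
++ * while a dst of VE0_LOG routes the message to the host log via vprintk();
++ * the two flags can be combined, since they are tested as bits above.
++ */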
++
++
+ #else
+
+ asmlinkage long sys_syslog(int type, char __user *buf, int len)
+@@ -732,6 +822,12 @@ int is_console_locked(void)
+ }
+ EXPORT_SYMBOL(is_console_locked);
+
++void wake_up_klogd(void)
++{
++ if (!oops_in_progress && waitqueue_active(&log_wait))
++ wake_up_interruptible(&log_wait);
++}
++
+ /**
+ * release_console_sem - unlock the console system
+ *
+@@ -768,8 +864,8 @@ void release_console_sem(void)
+ console_may_schedule = 0;
+ up(&console_sem);
+ spin_unlock_irqrestore(&logbuf_lock, flags);
+- if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait))
+- wake_up_interruptible(&log_wait);
++ if (wake_klogd)
++ wake_up_klogd();
+ }
+ EXPORT_SYMBOL(release_console_sem);
+
+@@ -1049,3 +1145,33 @@ int printk_ratelimit(void)
+ printk_ratelimit_burst);
+ }
+ EXPORT_SYMBOL(printk_ratelimit);
++
++/*
++ * Rate limiting stuff.
++ */
++int vz_ratelimit(struct vz_rate_info *p)
++{
++ unsigned long cjif, djif;
++ unsigned long flags;
++ static spinlock_t ratelimit_lock = SPIN_LOCK_UNLOCKED;
++ long new_bucket;
++
++ spin_lock_irqsave(&ratelimit_lock, flags);
++ cjif = jiffies;
++ djif = cjif - p->last;
++ if (djif < p->interval) {
++ if (p->bucket >= p->burst) {
++ spin_unlock_irqrestore(&ratelimit_lock, flags);
++ return 0;
++ }
++ p->bucket++;
++ } else {
++ new_bucket = p->bucket - (djif / (unsigned)p->interval);
++ if (new_bucket < 0)
++ new_bucket = 0;
++ p->bucket = new_bucket + 1;
++ }
++ p->last = cjif;
++ spin_unlock_irqrestore(&ratelimit_lock, flags);
++ return 1;
++}
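++/* A usage sketch with hypothetical settings; the field names follow how
++ * the function above reads struct vz_rate_info (last, interval, bucket,
++ * burst):
++ *
++ *	static struct vz_rate_info ri = { .interval = 5 * HZ, .burst = 10 };
++ *
++ *	if (vz_ratelimit(&ri))
++ *		ve_printk(VE_LOG, KERN_WARNING "noisy event\n");
++ *
++ * i.e. roughly `burst` messages are let through per `interval`, the rest
++ * are suppressed until the bucket drains.
++ */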
+diff -upr linux-2.6.16.orig/kernel/ptrace.c linux-2.6.16-026test015/kernel/ptrace.c
+--- linux-2.6.16.orig/kernel/ptrace.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/kernel/ptrace.c 2006-07-04 14:41:39.000000000 +0400
+@@ -57,10 +57,6 @@ void ptrace_untrace(task_t *child)
+ signal_wake_up(child, 1);
+ }
+ }
+- if (child->signal->flags & SIGNAL_GROUP_EXIT) {
+- sigaddset(&child->pending.signal, SIGKILL);
+- signal_wake_up(child, 1);
+- }
+ spin_unlock(&child->sighand->siglock);
+ }
+
+@@ -82,7 +78,8 @@ void __ptrace_unlink(task_t *child)
+ SET_LINKS(child);
+ }
+
+- ptrace_untrace(child);
++ if (child->state == TASK_TRACED)
++ ptrace_untrace(child);
+ }
+
+ /*
+@@ -136,7 +133,10 @@ static int may_attach(struct task_struct
+ smp_rmb();
+ if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE))
+ return -EPERM;
+-
++ if (!task->mm->vps_dumpable && !ve_is_super(get_exec_env()))
++ return -EPERM;
++ if (!ve_accessible(VE_TASK_INFO(task)->owner_env, get_exec_env()))
++ return -EPERM;
+ return security_ptrace(current, task);
+ }
+
+@@ -152,12 +152,34 @@ int ptrace_may_attach(struct task_struct
+ int ptrace_attach(struct task_struct *task)
+ {
+ int retval;
+- task_lock(task);
++
+ retval = -EPERM;
+ if (task->pid <= 1)
+- goto bad;
++ goto out;
+ if (task->tgid == current->tgid)
+- goto bad;
++ goto out;
++
++repeat:
++ /*
++ * Nasty, nasty.
++ *
++ * We want to hold both the task-lock and the
++ * tasklist_lock for writing at the same time.
++ * But that's against the rules (tasklist_lock
++ * is taken for reading by interrupts on other
++ * cpu's that may have task_lock).
++ */
++ task_lock(task);
++ local_irq_disable();
++ if (!write_trylock(&tasklist_lock)) {
++ local_irq_enable();
++ task_unlock(task);
++ do {
++ cpu_relax();
++ } while (!write_can_lock(&tasklist_lock));
++ goto repeat;
++ }
++
+ /* the same process cannot be attached many times */
+ if (task->ptrace & PT_PTRACED)
+ goto bad;
+@@ -170,17 +192,15 @@ int ptrace_attach(struct task_struct *ta
+ ? PT_ATTACHED : 0);
+ if (capable(CAP_SYS_PTRACE))
+ task->ptrace |= PT_PTRACE_CAP;
+- task_unlock(task);
+
+- write_lock_irq(&tasklist_lock);
+ __ptrace_link(task, current);
+- write_unlock_irq(&tasklist_lock);
+
+ force_sig_specific(SIGSTOP, task);
+- return 0;
+
+ bad:
++ write_unlock_irq(&tasklist_lock);
+ task_unlock(task);
++out:
+ return retval;
+ }
+
+@@ -263,6 +283,7 @@ int access_process_vm(struct task_struct
+
+ return buf - old_buf;
+ }
++EXPORT_SYMBOL_GPL(access_process_vm);
+
+ int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len)
+ {
+@@ -421,21 +442,22 @@ int ptrace_request(struct task_struct *c
+ */
+ int ptrace_traceme(void)
+ {
+- int ret;
++ int ret = -EPERM;
+
+ /*
+ * Are we already being traced?
+ */
+- if (current->ptrace & PT_PTRACED)
+- return -EPERM;
+- ret = security_ptrace(current->parent, current);
+- if (ret)
+- return -EPERM;
+- /*
+- * Set the ptrace bit in the process ptrace flags.
+- */
+- current->ptrace |= PT_PTRACED;
+- return 0;
++ task_lock(current);
++ if (!(current->ptrace & PT_PTRACED)) {
++ ret = security_ptrace(current->parent, current);
++ /*
++ * Set the ptrace bit in the process ptrace flags.
++ */
++ if (!ret)
++ current->ptrace |= PT_PTRACED;
++ }
++ task_unlock(current);
++ return ret;
+ }
+
+ /**
+@@ -459,7 +481,7 @@ struct task_struct *ptrace_get_task_stru
+ return ERR_PTR(-EPERM);
+
+ read_lock(&tasklist_lock);
+- child = find_task_by_pid(pid);
++ child = find_task_by_pid_ve(pid);
+ if (child)
+ get_task_struct(child);
+ read_unlock(&tasklist_lock);
+diff -upr linux-2.6.16.orig/kernel/sched.c linux-2.6.16-026test015/kernel/sched.c
+--- linux-2.6.16.orig/kernel/sched.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/kernel/sched.c 2006-07-04 14:41:39.000000000 +0400
+@@ -49,6 +49,8 @@
+ #include <linux/syscalls.h>
+ #include <linux/times.h>
+ #include <linux/acct.h>
++#include <linux/vsched.h>
++#include <linux/fairsched.h>
+ #include <asm/tlb.h>
+
+ #include <asm/unistd.h>
+@@ -134,7 +136,7 @@
+ #ifdef CONFIG_SMP
+ #define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
+ (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
+- num_online_cpus())
++ vsched_num_online_vcpus(task_vsched(p)))
+ #else
+ #define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
+ (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
+@@ -199,6 +201,7 @@ struct prio_array {
+ * (such as the load balancing or the thread migration code), lock
+ * acquire operations must be ordered by ascending &runqueue.
+ */
++typedef struct vcpu_info *vcpu_t;
+ struct runqueue {
+ spinlock_t lock;
+
+@@ -220,9 +223,12 @@ struct runqueue {
+ */
+ unsigned long nr_uninterruptible;
+
++ unsigned long nr_sleeping;
++ unsigned long nr_stopped;
++
+ unsigned long expired_timestamp;
+ unsigned long long timestamp_last_tick;
+- task_t *curr, *idle;
++ task_t *curr;
+ struct mm_struct *prev_mm;
+ prio_array_t *active, *expired, arrays[2];
+ int best_expired_prio;
+@@ -233,11 +239,12 @@ struct runqueue {
+
+ /* For active balancing */
+ int active_balance;
+- int push_cpu;
++#endif
++ vcpu_t push_cpu;
+
+ task_t *migration_thread;
+ struct list_head migration_queue;
+-#endif
++ int cpu;
+
+ #ifdef CONFIG_SCHEDSTATS
+ /* latency stats */
+@@ -260,7 +267,51 @@ struct runqueue {
+ #endif
+ };
+
+-static DEFINE_PER_CPU(struct runqueue, runqueues);
++/* VCPU scheduler state description */
++struct vcpu_info;
++struct vcpu_scheduler {
++ struct list_head idle_list;
++ struct list_head active_list;
++ struct list_head running_list;
++#ifdef CONFIG_FAIRSCHED
++ struct fairsched_node *node;
++#endif
++ struct vcpu_info *vcpu[NR_CPUS];
++ int id;
++ cpumask_t vcpu_online_map, vcpu_running_map;
++ cpumask_t pcpu_running_map;
++ int num_online_vcpus;
++} ____cacheline_internodealigned_in_smp;
++
++/* virtual CPU description */
++struct vcpu_info {
++ struct runqueue rq;
++#ifdef CONFIG_SCHED_VCPU
++ unsigned active : 1,
++ running : 1;
++ struct list_head list;
++ struct vcpu_scheduler *vsched;
++ int last_pcpu;
++ u32 start_time;
++#endif
++ int id;
++} ____cacheline_internodealigned_in_smp;
++
++/* physical CPU description */
++struct pcpu_info {
++ struct vcpu_scheduler *vsched;
++ struct vcpu_info *vcpu;
++ task_t *idle;
++#ifdef CONFIG_SMP
++ struct sched_domain *sd;
++#endif
++ int id;
++} ____cacheline_internodealigned_in_smp;
++
++struct pcpu_info pcpu_info[NR_CPUS];
++
++#define pcpu(nr) (&pcpu_info[nr])
++#define this_pcpu() (pcpu(smp_processor_id()))
+
+ /*
+ * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
+@@ -269,13 +320,399 @@ static DEFINE_PER_CPU(struct runqueue, r
+ * The domain tree of any CPU may only be accessed from within
+ * preempt-disabled sections.
+ */
++#define for_each_pdomain(sd, domain) \
++for (domain = rcu_dereference(sd); domain; domain = domain->parent)
++
+ #define for_each_domain(cpu, domain) \
+-for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent)
++ for_each_pdomain(vcpu_rq(cpu)->sd, domain)
++
++#ifdef CONFIG_SCHED_VCPU
++
++u32 vcpu_sched_timeslice = 5;
++u32 vcpu_timeslice = 0;
++EXPORT_SYMBOL(vcpu_sched_timeslice);
++EXPORT_SYMBOL(vcpu_timeslice);
++
++extern spinlock_t fairsched_lock;
++static struct vcpu_scheduler default_vsched, idle_vsched;
++static struct vcpu_info boot_vcpu, boot_idle_vcpu;
++
++#define vsched_default_vsched() (&default_vsched)
++#define vsched_default_vcpu(id) (default_vsched.vcpu[id])
++
++/*
++ * All macros below can be used without locks if there are no
++ * strict ordering requirements, because we assume that:
++ *
++ * 1. a VCPU cannot disappear "on the fly" (FIXME)
++ *
++ * 2. p->vsched access is atomic.
++ */
++
++#define task_vsched(tsk) ((tsk)->vsched)
++#define this_vsched() (task_vsched(current))
++
++#define vsched_vcpu(vsched, id) ((vsched)->vcpu[id])
++#define this_vcpu() (task_vcpu(current))
++#define task_vcpu(p) ((p)->vcpu)
++
++#define vsched_id(vsched) ((vsched)->id)
++#define vsched_vcpu_online_map(vsched) ((vsched)->vcpu_online_map)
++#define vsched_num_online_vcpus(vsched) ((vsched)->num_online_vcpus)
++#define vsched_pcpu_running_map(vsched) ((vsched)->pcpu_running_map)
++
++#define vcpu_vsched(vcpu) ((vcpu)->vsched)
++#define vcpu_last_pcpu(vcpu) ((vcpu)->last_pcpu)
++#define vcpu_isset(vcpu, mask) (cpu_isset((vcpu)->id, mask))
++#define vcpu_is_offline(vcpu) (!vcpu_isset(vcpu, \
++ vcpu_vsched(vcpu)->vcpu_online_map))
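++/* A tiny sketch built only from the accessors above (hypothetical caller):
++ *
++ *	vcpu_t v = task_vcpu(p);
++ *	struct vcpu_scheduler *vs = vcpu_vsched(v);
++ *
++ *	if (!vcpu_is_offline(v))
++ *		printk("vsched %d has %d online vcpus\n",
++ *				vsched_id(vs), vsched_num_online_vcpus(vs));
++ */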
++
++static int __add_vcpu(struct vcpu_scheduler *vsched, int id);
++
++#else /* CONFIG_SCHED_VCPU */
++
++static DEFINE_PER_CPU(struct vcpu_info, vcpu_info);
++
++#define task_vsched(p) NULL
++#define this_vcpu() (task_vcpu(current))
++#define task_vcpu(p) (vcpu(task_cpu(p)))
++
++#define vsched_vcpu(sched, id) (vcpu(id))
++#define vsched_id(vsched) 0
++#define vsched_default_vsched() NULL
++#define vsched_default_vcpu(id) (vcpu(id))
++
++#define vsched_vcpu_online_map(vsched) (cpu_online_map)
++#define vsched_num_online_vcpus(vsched) (num_online_cpus())
++#define vsched_pcpu_running_map(vsched) (cpu_online_map)
++
++#define vcpu(id) (&per_cpu(vcpu_info, id))
++
++#define vcpu_vsched(vcpu) NULL
++#define vcpu_last_pcpu(vcpu) ((vcpu)->id)
++#define vcpu_isset(vcpu, mask) (cpu_isset((vcpu)->id, mask))
++#define vcpu_is_offline(vcpu) (cpu_is_offline((vcpu)->id))
++
++#endif /* CONFIG_SCHED_VCPU */
++
++#define this_rq() (vcpu_rq(this_vcpu()))
++#define task_rq(p) (vcpu_rq(task_vcpu(p)))
++#define vcpu_rq(vcpu) (&(vcpu)->rq)
++#define get_vcpu() ({ preempt_disable(); this_vcpu(); })
++#define put_vcpu() ({ put_cpu(); })
++#define rq_vcpu(__rq) (container_of((__rq), struct vcpu_info, rq))
++
++/**
++ * idle_task - return the idle task for a given cpu.
++ * @cpu: the processor in question.
++ */
++task_t *idle_task(int cpu)
++{
++ return pcpu(cpu)->idle;
++}
++
++#ifdef CONFIG_SMP
++static inline void update_rq_cpu_load(runqueue_t *this_rq)
++{
++ unsigned long old_load, this_load;
++ int i;
++
++ if (unlikely(this_rq->nr_running == 0)) {
++ for (i = 0; i < 3; i++)
++ this_rq->cpu_load[i] = 0;
++ return;
++ }
++
++ this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
++ for (i = 0; i < 3; i++) {
++ unsigned long new_load = this_load;
++ int scale = 1 << i;
++ old_load = this_rq->cpu_load[i];
++ /*
++ * Round up the averaging division if load is increasing. This
++ * prevents us from getting stuck on 9 if the load is 10, for
++ * example.
++ */
++ if (new_load > old_load)
++ new_load += scale-1;
++ this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale;
++ }
++}
++#else /* CONFIG_SMP */
++static inline void update_rq_cpu_load(runqueue_t *this_rq)
++{
++}
++#endif /* CONFIG_SMP */
++
++#ifdef CONFIG_SCHED_VCPU
++
++void fastcall vsched_cpu_online_map(struct vcpu_scheduler *vsched,
++ cpumask_t *mask)
++{
++ unsigned long flags;
++
++ spin_lock_irqsave(&fairsched_lock, flags);
++ *mask = vsched->vcpu_online_map;
++ spin_unlock_irqrestore(&fairsched_lock, flags);
++}
++
++static inline void set_task_vsched(task_t *p, struct vcpu_scheduler *vsched)
++{
++ /* NOTE: set_task_cpu() is required after every set_task_vsched()! */
++ p->vsched = vsched;
++ p->vsched_id = vsched_id(vsched);
++}
++
++inline void set_task_cpu(struct task_struct *p, unsigned int vcpu_id)
++{
++ p->vcpu = vsched_vcpu(task_vsched(p), vcpu_id);
++ p->vcpu_id = vcpu_id;
++}
++
++static inline void set_task_vcpu(struct task_struct *p, vcpu_t vcpu)
++{
++ p->vcpu = vcpu;
++ p->vcpu_id = vcpu->id;
++}
++
++/* this is called when rq->nr_running changes from 0 to 1 */
++static void vcpu_attach(runqueue_t *rq)
++{
++ struct vcpu_scheduler *vsched;
++ vcpu_t vcpu;
++
++ vcpu = rq_vcpu(rq);
++ vsched = vcpu_vsched(vcpu);
++
++ BUG_ON(vcpu->active);
++ spin_lock(&fairsched_lock);
++ vcpu->active = 1;
++ if (!vcpu->running)
++ list_move_tail(&vcpu->list, &vsched->active_list);
++
++ fairsched_incrun(vsched->node);
++ spin_unlock(&fairsched_lock);
++}
++
++/* this is called when rq->nr_running changes from 1 to 0 */
++static void vcpu_detach(runqueue_t *rq)
++{
++ struct vcpu_scheduler *vsched;
++ vcpu_t vcpu;
++
++ vcpu = rq_vcpu(rq);
++ vsched = vcpu_vsched(vcpu);
++ BUG_ON(!vcpu->active);
++
++ spin_lock(&fairsched_lock);
++ fairsched_decrun(vsched->node);
++
++ vcpu->active = 0;
++ if (!vcpu->running)
++ list_move_tail(&vcpu->list, &vsched->idle_list);
++ spin_unlock(&fairsched_lock);
++}
++
++static inline void __vcpu_get(vcpu_t vcpu)
++{
++ struct pcpu_info *pcpu;
++ struct vcpu_scheduler *vsched;
++
++ BUG_ON(!this_vcpu()->running);
++
++ pcpu = this_pcpu();
++ vsched = vcpu_vsched(vcpu);
++
++ pcpu->vcpu = vcpu;
++ pcpu->vsched = vsched;
++
++ fairsched_inccpu(vsched->node);
++
++ list_move_tail(&vcpu->list, &vsched->running_list);
++ vcpu->start_time = jiffies;
++ vcpu->last_pcpu = pcpu->id;
++ vcpu->running = 1;
++ __set_bit(vcpu->id, vsched->vcpu_running_map.bits);
++ __set_bit(pcpu->id, vsched->pcpu_running_map.bits);
++#ifdef CONFIG_SMP
++ vcpu_rq(vcpu)->sd = pcpu->sd;
++#endif
++}
++
++static void vcpu_put(vcpu_t vcpu)
++{
++ struct vcpu_scheduler *vsched;
++ struct pcpu_info *cur_pcpu;
++ runqueue_t *rq;
++
++ vsched = vcpu_vsched(vcpu);
++ rq = vcpu_rq(vcpu);
++ cur_pcpu = this_pcpu();
++
++ BUG_ON(!vcpu->running);
++
++ spin_lock(&fairsched_lock);
++ vcpu->running = 0;
++ list_move_tail(&vcpu->list,
++ vcpu->active ? &vsched->active_list : &vsched->idle_list);
++ fairsched_deccpu(vsched->node);
++ __clear_bit(vcpu->id, vsched->vcpu_running_map.bits);
++ if (vsched != this_vsched())
++ __clear_bit(cur_pcpu->id, vsched->pcpu_running_map.bits);
++
++ if (!rq->nr_running)
++ rq->expired_timestamp = 0;
++ /* from this point task_running(prev_rq, prev) will be 0 */
++ rq->curr = cur_pcpu->idle;
++ update_rq_cpu_load(rq);
++ spin_unlock(&fairsched_lock);
++}
++
++static vcpu_t schedule_vcpu(vcpu_t cur_vcpu, cycles_t cycles)
++{
++ struct vcpu_scheduler *vsched;
++ vcpu_t vcpu;
++ runqueue_t *rq;
++#ifdef CONFIG_FAIRSCHED
++ struct fairsched_node *node, *nodec;
++
++ nodec = vcpu_vsched(cur_vcpu)->node;
++ node = nodec;
++#endif
++
++ BUG_ON(!cur_vcpu->running);
++restart:
++ if (unlikely(system_state == SYSTEM_BOOTING))
++ goto affine;
++
++ spin_lock(&fairsched_lock);
++#ifdef CONFIG_FAIRSCHED
++ node = fairsched_schedule(node, nodec,
++ cur_vcpu->active,
++ cycles);
++ if (unlikely(node == NULL))
++ goto idle;
++
++ vsched = node->vsched;
++#else
++ vsched = &default_vsched;
++#endif
++	/* FIXME: optimize vcpu switching, maybe we do not need to call
++	   fairsched_schedule() at all if the vcpu is still active and too
++	   little time has passed so far */
++ if (cur_vcpu->vsched == vsched && cur_vcpu->active &&
++ jiffies - cur_vcpu->start_time < msecs_to_jiffies(vcpu_sched_timeslice)) {
++ vcpu = cur_vcpu;
++ goto done;
++ }
++
++ if (list_empty(&vsched->active_list)) {
++ /* nothing except for this cpu can be scheduled */
++ if (likely(cur_vcpu->vsched == vsched && cur_vcpu->active)) {
++ /*
++ * Current vcpu is the one we need. We have not
++ * put it yet, so it's not on the active_list.
++ */
++ vcpu = cur_vcpu;
++ goto done;
++ } else
++ goto none;
++ }
++
++ /* select vcpu and add to running list */
++ vcpu = list_entry(vsched->active_list.next, struct vcpu_info, list);
++ __vcpu_get(vcpu);
++done:
++ spin_unlock(&fairsched_lock);
++
++ rq = vcpu_rq(vcpu);
++ if (unlikely(vcpu != cur_vcpu)) {
++ spin_unlock(&vcpu_rq(cur_vcpu)->lock);
++ spin_lock(&rq->lock);
++ if (unlikely(!rq->nr_running)) {
++ /* race with balancing? */
++ spin_unlock(&rq->lock);
++ vcpu_put(vcpu);
++ spin_lock(&vcpu_rq(cur_vcpu)->lock);
++ goto restart;
++ }
++ }
++ BUG_ON(!rq->nr_running);
++ return vcpu;
++
++none:
++#ifdef CONFIG_FAIRSCHED
++ spin_unlock(&fairsched_lock);
++
++ /* fairsched doesn't schedule more CPUs than we have active */
++ BUG_ON(1);
++#else
++ goto idle;
++#endif
++
++idle:
++ vcpu = task_vcpu(this_pcpu()->idle);
++ __vcpu_get(vcpu);
++ spin_unlock(&fairsched_lock);
++ spin_unlock(&vcpu_rq(cur_vcpu)->lock);
++
++ spin_lock(&vcpu_rq(vcpu)->lock);
++ return vcpu;
++
++affine:
++ vcpu = vsched_vcpu(&default_vsched, raw_smp_processor_id());
++ /* current VCPU busy, continue */
++ if (cur_vcpu == vcpu && vcpu->active)
++ return cur_vcpu;
++ /* current is idle and nothing to run, keep idle */
++ if (vcpu_vsched(cur_vcpu) == &idle_vsched && !vcpu->active)
++ return cur_vcpu;
++
++ /* need to switch to idle... */
++ if (cur_vcpu == vcpu) {
++ spin_lock(&fairsched_lock);
++ goto idle;
++ }
++
++ /* ... and from idle */
++ spin_lock(&fairsched_lock);
++ __vcpu_get(vcpu);
++ goto done;
++}
++
++#else /* CONFIG_SCHED_VCPU */
++
++#define set_task_vsched(task, vsched) do { } while (0)
++
++static inline void vcpu_attach(runqueue_t *rq)
++{
++}
++
++static inline void vcpu_detach(runqueue_t *rq)
++{
++}
++
++static inline void vcpu_put(vcpu_t vcpu)
++{
++}
++
++static inline vcpu_t schedule_vcpu(vcpu_t prev_vcpu, cycles_t cycles)
++{
++ return prev_vcpu;
++}
++
++static inline void set_task_vcpu(struct task_struct *p, vcpu_t vcpu)
++{
++ set_task_pcpu(p, vcpu->id);
++}
++
++#endif /* CONFIG_SCHED_VCPU */
++
++int vcpu_online(int cpu)
++{
++ return cpu_isset(cpu, vsched_vcpu_online_map(this_vsched()));
++}
+
+-#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
+-#define this_rq() (&__get_cpu_var(runqueues))
+-#define task_rq(p) cpu_rq(task_cpu(p))
+-#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
+
+ #ifndef prepare_arch_switch
+ # define prepare_arch_switch(next) do { } while (0)
+@@ -284,6 +721,11 @@ for (domain = rcu_dereference(cpu_rq(cpu
+ # define finish_arch_switch(prev) do { } while (0)
+ #endif
+
++struct kernel_stat_glob kstat_glob;
++spinlock_t kstat_glb_lock = SPIN_LOCK_UNLOCKED;
++EXPORT_SYMBOL(kstat_glob);
++EXPORT_SYMBOL(kstat_glb_lock);
++
+ #ifndef __ARCH_WANT_UNLOCKED_CTXSW
+ static inline int task_running(runqueue_t *rq, task_t *p)
+ {
+@@ -300,7 +742,7 @@ static inline void finish_lock_switch(ru
+ /* this is a valid case when another task releases the spinlock */
+ rq->lock.owner = current;
+ #endif
+- spin_unlock_irq(&rq->lock);
++ spin_unlock(&rq->lock);
+ }
+
+ #else /* __ARCH_WANT_UNLOCKED_CTXSW */
+@@ -374,6 +816,208 @@ static inline void task_rq_unlock(runque
+ spin_unlock_irqrestore(&rq->lock, *flags);
+ }
+
++#ifdef CONFIG_VE
++#define ve_nr_iowait_inc(env, cpu) \
++ do { \
++ VE_CPU_STATS((env), (cpu))->nr_iowait++; \
++ } while(0)
++#define ve_nr_iowait_dec(env, cpu) \
++ do { \
++ VE_CPU_STATS((env), (cpu))->nr_iowait--; \
++ } while(0)
++#define ve_nr_unint_inc(env, cpu) \
++ do { \
++ VE_CPU_STATS((env), (cpu))->nr_unint++; \
++ } while(0)
++#define ve_nr_unint_dec(env, cpu) \
++ do { \
++ VE_CPU_STATS((env), (cpu))->nr_unint--; \
++ } while(0)
++
++#define cycles_after(a, b) ((long long)(b) - (long long)(a) < 0)
++
++cycles_t ve_sched_get_idle_time(struct ve_struct *ve, int cpu)
++{
++ struct ve_cpu_stats *ve_stat;
++ unsigned v;
++ cycles_t strt, ret, cycles;
++
++ ve_stat = VE_CPU_STATS(ve, cpu);
++ do {
++ v = read_seqcount_begin(&ve_stat->stat_lock);
++ ret = ve_stat->idle_time;
++ strt = ve_stat->strt_idle_time;
++ if (strt && nr_uninterruptible_ve(ve) == 0) {
++ cycles = get_cycles();
++ if (cycles_after(cycles, strt))
++ ret += cycles - strt;
++ }
++ } while (read_seqcount_retry(&ve_stat->stat_lock, v));
++ return ret;
++}
++EXPORT_SYMBOL(ve_sched_get_idle_time);
++
++cycles_t ve_sched_get_iowait_time(struct ve_struct *ve, int cpu)
++{
++ struct ve_cpu_stats *ve_stat;
++ unsigned v;
++ cycles_t strt, ret, cycles;
++
++ ve_stat = VE_CPU_STATS(ve, cpu);
++ do {
++ v = read_seqcount_begin(&ve_stat->stat_lock);
++ ret = ve_stat->iowait_time;
++ strt = ve_stat->strt_idle_time;
++ if (strt && nr_iowait_ve(ve) > 0) {
++ cycles = get_cycles();
++ if (cycles_after(cycles, strt))
++ ret += cycles - strt;
++ }
++ } while (read_seqcount_retry(&ve_stat->stat_lock, v));
++ return ret;
++}
++
++EXPORT_SYMBOL(ve_sched_get_iowait_time);
++
++static inline void ve_stop_idle(struct ve_struct *ve,
++ unsigned int cpu, cycles_t cycles)
++{
++ struct ve_cpu_stats *ve_stat;
++
++ ve_stat = VE_CPU_STATS(ve, cpu);
++
++ write_seqcount_begin(&ve_stat->stat_lock);
++ if (ve_stat->strt_idle_time) {
++ if (cycles_after(cycles, ve_stat->strt_idle_time)) {
++ if (nr_iowait_ve(ve) == 0)
++ ve_stat->idle_time += cycles -
++ ve_stat->strt_idle_time;
++ else
++ ve_stat->iowait_time += cycles -
++ ve_stat->strt_idle_time;
++ }
++ ve_stat->strt_idle_time = 0;
++ }
++ write_seqcount_end(&ve_stat->stat_lock);
++}
++
++static inline void ve_strt_idle(struct ve_struct *ve,
++ unsigned int cpu, cycles_t cycles)
++{
++ struct ve_cpu_stats *ve_stat;
++
++ ve_stat = VE_CPU_STATS(ve, cpu);
++
++ write_seqcount_begin(&ve_stat->stat_lock);
++ ve_stat->strt_idle_time = cycles;
++ write_seqcount_end(&ve_stat->stat_lock);
++}
++
++#define ve_nr_running_inc(env, cpu, cycles) do { \
++ if (++VE_CPU_STATS((env), (cpu))->nr_running == 1) \
++ ve_stop_idle(env, cpu, cycles); \
++ } while (0)
++#define ve_nr_running_dec(env, cpu, cycles) do {	\
++ if (--VE_CPU_STATS((env), (cpu))->nr_running == 0) \
++ ve_strt_idle(env, cpu, cycles); \
++ } while (0)
++
++void ve_sched_attach(struct ve_struct *envid)
++{
++ struct task_struct *tsk;
++ unsigned int cpu;
++ cycles_t cycles;
++
++ tsk = current;
++ preempt_disable();
++ cycles = get_cycles();
++ cpu = task_cpu(tsk);
++ ve_nr_running_dec(VE_TASK_INFO(tsk)->owner_env, cpu, cycles);
++ ve_nr_running_inc(envid, cpu, cycles);
++ preempt_enable();
++}
++EXPORT_SYMBOL(ve_sched_attach);
++
++static inline void write_wakeup_stamp(struct task_struct *p, cycles_t cyc)
++{
++ struct ve_task_info *ti;
++
++ ti = VE_TASK_INFO(p);
++ write_seqcount_begin(&ti->wakeup_lock);
++ ti->wakeup_stamp = cyc;
++ write_seqcount_end(&ti->wakeup_lock);
++}
++
++static inline void update_sched_lat(struct task_struct *t, cycles_t cycles)
++{
++ int cpu;
++ cycles_t ve_wstamp;
++
++ /* safe due to runqueue lock */
++ cpu = smp_processor_id();
++ ve_wstamp = t->ve_task_info.wakeup_stamp;
++
++ if (ve_wstamp && cycles > ve_wstamp) {
++ KSTAT_LAT_PCPU_ADD(&kstat_glob.sched_lat,
++ cpu, cycles - ve_wstamp);
++ KSTAT_LAT_PCPU_ADD(&t->ve_task_info.exec_env->sched_lat_ve,
++ cpu, cycles - ve_wstamp);
++ }
++}
++
++static inline void update_ve_task_info(task_t *prev, cycles_t cycles)
++{
++#ifdef CONFIG_FAIRSCHED
++ if (prev != this_pcpu()->idle) {
++#else
++ if (prev != this_rq()->idle) {
++#endif
++ VE_CPU_STATS(prev->ve_task_info.owner_env,
++ smp_processor_id())->used_time +=
++ cycles - prev->ve_task_info.sched_time;
++
++ prev->ve_task_info.sched_time = cycles;
++ }
++}
++
++#else
++#define ve_nr_running_inc(env, cpu, cycles) do { } while(0)
++#define ve_nr_running_dec(env, cpu, cycles) do { } while(0)
++#define ve_nr_iowait_inc(env, cpu) do { } while(0)
++#define ve_nr_iowait_dec(env, cpu) do { } while(0)
++#define ve_nr_unint_inc(env, cpu) do { } while(0)
++#define ve_nr_unint_dec(env, cpu) do { } while(0)
++#define update_ve_task_info(prev, cycles) do { } while (0)
++#endif
++
++struct task_nrs_struct {
++ long nr_running;
++ long nr_unint;
++ long nr_stopped;
++ long nr_sleeping;
++ long nr_iowait;
++ long long nr_switches;
++} ____cacheline_aligned_in_smp;
++
++static struct task_nrs_struct glob_task_nrs[NR_CPUS];
++#define nr_running_inc(cpu) do { glob_task_nrs[cpu].nr_running++; } while (0)
++#define nr_running_dec(cpu) do { glob_task_nrs[cpu].nr_running--; } while (0)
++#define nr_unint_inc(cpu) do { glob_task_nrs[cpu].nr_unint++; } while (0)
++#define nr_unint_dec(cpu) do { glob_task_nrs[cpu].nr_unint--; } while (0)
++#define nr_stopped_inc(cpu) do { glob_task_nrs[cpu].nr_stopped++; } while (0)
++#define nr_stopped_dec(cpu) do { glob_task_nrs[cpu].nr_stopped--; } while (0)
++#define nr_sleeping_inc(cpu) do { glob_task_nrs[cpu].nr_sleeping++; } while (0)
++#define nr_sleeping_dec(cpu) do { glob_task_nrs[cpu].nr_sleeping--; } while (0)
++#define nr_iowait_inc(cpu) do { glob_task_nrs[cpu].nr_iowait++; } while (0)
++#define nr_iowait_dec(cpu) do { glob_task_nrs[cpu].nr_iowait--; } while (0)
++
++
++unsigned long nr_zombie = 0; /* protected by tasklist_lock */
++EXPORT_SYMBOL(nr_zombie);
++
++atomic_t nr_dead = ATOMIC_INIT(0);
++EXPORT_SYMBOL(nr_dead);
++
+ #ifdef CONFIG_SCHEDSTATS
+ /*
+ * bump this up when changing the output format or the meaning of an existing
+@@ -666,8 +1310,19 @@ static int effective_prio(task_t *p)
+ */
+ static inline void __activate_task(task_t *p, runqueue_t *rq)
+ {
++ cycles_t cycles;
++
++#ifdef CONFIG_VE
++ cycles = get_cycles();
++ write_wakeup_stamp(p, cycles);
++ p->ve_task_info.sleep_time += cycles;
++#endif
+ enqueue_task(p, rq->active);
+ rq->nr_running++;
++ ve_nr_running_inc(VE_TASK_INFO(p)->owner_env, task_cpu(p), cycles);
++ nr_running_inc(smp_processor_id());
++ if (rq->nr_running == 1)
++ vcpu_attach(rq);
+ }
+
+ /*
+@@ -800,9 +1455,38 @@ static void activate_task(task_t *p, run
+ */
+ static void deactivate_task(struct task_struct *p, runqueue_t *rq)
+ {
++ cycles_t cycles;
++#ifdef CONFIG_VE
++ unsigned int cpu, pcpu;
++ struct ve_struct *ve;
++
++ cycles = get_cycles();
++ cpu = task_cpu(p);
++ pcpu = smp_processor_id();
++ ve = p->ve_task_info.owner_env;
++
++ p->ve_task_info.sleep_time -= cycles;
++#endif
++ if (p->state == TASK_UNINTERRUPTIBLE) {
++ ve_nr_unint_inc(ve, cpu);
++ nr_unint_inc(pcpu);
++ }
++ if (p->state == TASK_INTERRUPTIBLE) {
++ rq->nr_sleeping++;
++ nr_sleeping_inc(pcpu);
++ }
++ if (p->state == TASK_STOPPED) {
++ rq->nr_stopped++;
++ nr_stopped_inc(pcpu);
++ }
++
++ ve_nr_running_dec(VE_TASK_INFO(p)->owner_env, cpu, cycles);
++ nr_running_dec(pcpu);
+ rq->nr_running--;
+ dequeue_task(p, p->array);
+ p->array = NULL;
++ if (rq->nr_running == 0)
++ vcpu_detach(rq);
+ }
+
+ /*
+@@ -813,18 +1497,22 @@ static void deactivate_task(struct task_
+ * the target CPU.
+ */
+ #ifdef CONFIG_SMP
++/* FIXME: need to add vsched arg */
+ static void resched_task(task_t *p)
+ {
+ int cpu;
+
++#if 0
++	/* FIXME: this fails because for the idle rq, rq->curr == idle */
+ assert_spin_locked(&task_rq(p)->lock);
++#endif
+
+ if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
+ return;
+
+ set_tsk_thread_flag(p, TIF_NEED_RESCHED);
+
+- cpu = task_cpu(p);
++ cpu = task_pcpu(p);
+ if (cpu == smp_processor_id())
+ return;
+
+@@ -847,15 +1535,35 @@ static inline void resched_task(task_t *
+ */
+ inline int task_curr(const task_t *p)
+ {
+- return cpu_curr(task_cpu(p)) == p;
++ return task_rq(p)->curr == p;
+ }
+
+-#ifdef CONFIG_SMP
++/**
++ * idle_cpu - is a given cpu idle currently?
++ * @cpu: the processor in question.
++ */
++inline int idle_cpu(int cpu)
++{
++ return pcpu(cpu)->vsched == &idle_vsched;
++}
++
++EXPORT_SYMBOL_GPL(idle_cpu);
++
++static inline int idle_vcpu(vcpu_t cpu)
++{
++#ifdef CONFIG_SCHED_VCPU
++ return !cpu->active;
++#else
++ return idle_cpu(cpu->id);
++#endif
++}
++
++#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_VCPU)
+ typedef struct {
+ struct list_head list;
+
+ task_t *task;
+- int dest_cpu;
++ vcpu_t dest_cpu;
+
+ struct completion done;
+ } migration_req_t;
+@@ -864,7 +1572,7 @@ typedef struct {
+ * The task's runqueue lock must be held.
+ * Returns true if you have to wait for migration thread.
+ */
+-static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
++static int migrate_task(task_t *p, vcpu_t dest_cpu, migration_req_t *req)
+ {
+ runqueue_t *rq = task_rq(p);
+
+@@ -872,8 +1580,13 @@ static int migrate_task(task_t *p, int d
+ * If the task is not on a runqueue (and not running), then
+ * it is sufficient to simply update the task's cpu field.
+ */
++#ifdef CONFIG_SCHED_VCPU
++ BUG_ON(task_vsched(p) == &idle_vsched);
++ BUG_ON(vcpu_vsched(dest_cpu) == &idle_vsched);
++#endif
+ if (!p->array && !task_running(rq, p)) {
+- set_task_cpu(p, dest_cpu);
++ set_task_vsched(p, vcpu_vsched(dest_cpu));
++ set_task_vcpu(p, dest_cpu);
+ return 0;
+ }
+
+@@ -913,6 +1626,7 @@ repeat:
+ }
+ task_rq_unlock(rq, &flags);
+ }
++EXPORT_SYMBOL_GPL(wait_task_inactive);
+
+ /***
+ * kick_process - kick a running thread to enter/exit the kernel
+@@ -932,21 +1646,26 @@ void kick_process(task_t *p)
+ int cpu;
+
+ preempt_disable();
+- cpu = task_cpu(p);
++ cpu = task_pcpu(p);
+ if ((cpu != smp_processor_id()) && task_curr(p))
++ /* FIXME: ??? think over */
++ /* should add something like get_pcpu(cpu)->vcpu->id == task_cpu(p),
++ but with serialization of vcpu access... */
+ smp_send_reschedule(cpu);
+ preempt_enable();
+ }
++#endif
+
++#ifdef CONFIG_SMP
+ /*
+ * Return a low guess at the load of a migration-source cpu.
+ *
+ * We want to under-estimate the load of migration sources, to
+ * balance conservatively.
+ */
+-static inline unsigned long source_load(int cpu, int type)
++static inline unsigned long source_load(vcpu_t cpu, int type)
+ {
+- runqueue_t *rq = cpu_rq(cpu);
++ runqueue_t *rq = vcpu_rq(cpu);
+ unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+ if (type == 0)
+ return load_now;
+@@ -957,9 +1676,9 @@ static inline unsigned long source_load(
+ /*
+ * Return a high guess at the load of a migration-target cpu
+ */
+-static inline unsigned long target_load(int cpu, int type)
++static inline unsigned long target_load(vcpu_t cpu, int type)
+ {
+- runqueue_t *rq = cpu_rq(cpu);
++ runqueue_t *rq = vcpu_rq(cpu);
+ unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+ if (type == 0)
+ return load_now;
+@@ -972,33 +1691,35 @@ static inline unsigned long target_load(
+ * domain.
+ */
+ static struct sched_group *
+-find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
++find_idlest_group(struct sched_domain *sd, struct task_struct *p, vcpu_t this_cpu)
+ {
+ struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
+ unsigned long min_load = ULONG_MAX, this_load = 0;
+ int load_idx = sd->forkexec_idx;
+ int imbalance = 100 + (sd->imbalance_pct-100)/2;
++ struct vcpu_scheduler *vsched;
++ vcpu_t vcpu;
++ int this_pcpu;
+
++ vsched = vcpu_vsched(this_cpu);
++ this_pcpu = vcpu_last_pcpu(this_cpu);
+ do {
+ unsigned long load, avg_load;
+ int local_group;
+ int i;
+
+- /* Skip over this group if it has no CPUs allowed */
+- if (!cpus_intersects(group->cpumask, p->cpus_allowed))
+- goto nextgroup;
+-
+- local_group = cpu_isset(this_cpu, group->cpumask);
++ local_group = cpu_isset(this_pcpu, group->cpumask);
+
+ /* Tally up the load of all CPUs in the group */
+ avg_load = 0;
+
+ for_each_cpu_mask(i, group->cpumask) {
++ vcpu = pcpu(i)->vcpu;
+ /* Bias balancing toward cpus of our domain */
+ if (local_group)
+- load = source_load(i, load_idx);
++ load = source_load(vcpu, load_idx);
+ else
+- load = target_load(i, load_idx);
++ load = target_load(vcpu, load_idx);
+
+ avg_load += load;
+ }
+@@ -1013,7 +1734,6 @@ find_idlest_group(struct sched_domain *s
+ min_load = avg_load;
+ idlest = group;
+ }
+-nextgroup:
+ group = group->next;
+ } while (group != sd->groups);
+
+@@ -1025,23 +1745,31 @@ nextgroup:
+ /*
+ * find_idlest_queue - find the idlest runqueue among the cpus in group.
+ */
+-static int
+-find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
++static vcpu_t
++find_idlest_cpu(struct sched_group *group, struct task_struct *p, vcpu_t this_cpu)
+ {
+- cpumask_t tmp;
+ unsigned long load, min_load = ULONG_MAX;
+- int idlest = -1;
++ cpumask_t vmask;
++ struct vcpu_scheduler *vsched;
++ vcpu_t idlest = (vcpu_t)-1;
++ vcpu_t vcpu;
+ int i;
+
+- /* Traverse only the allowed CPUs */
+- cpus_and(tmp, group->cpumask, p->cpus_allowed);
++ vsched = vcpu_vsched(this_cpu);
++ BUG_ON(vsched != task_vsched(p));
+
+- for_each_cpu_mask(i, tmp) {
+- load = source_load(i, 0);
++ cpus_and(vmask, vsched_vcpu_online_map(vsched), p->cpus_allowed);
++ for_each_cpu_mask(i, vmask) {
++ vcpu = vsched_vcpu(vsched, i);
+
+- if (load < min_load || (load == min_load && i == this_cpu)) {
++ if (!cpu_isset(vcpu_last_pcpu(vcpu), group->cpumask))
++ continue;
++
++ load = source_load(vcpu, 0);
++
++ if (load < min_load || (load == min_load && vcpu == this_cpu)) {
+ min_load = load;
+- idlest = i;
++ idlest = vcpu;
+ }
+ }
+
+@@ -1059,7 +1787,7 @@ find_idlest_cpu(struct sched_group *grou
+ *
+ * preempt must be disabled.
+ */
+-static int sched_balance_self(int cpu, int flag)
++static vcpu_t sched_balance_self(vcpu_t cpu, int flag)
+ {
+ struct task_struct *t = current;
+ struct sched_domain *tmp, *sd = NULL;
+@@ -1071,7 +1799,7 @@ static int sched_balance_self(int cpu, i
+ while (sd) {
+ cpumask_t span;
+ struct sched_group *group;
+- int new_cpu;
++ vcpu_t new_cpu;
+ int weight;
+
+ span = sd->span;
+@@ -1080,7 +1808,7 @@ static int sched_balance_self(int cpu, i
+ goto nextlevel;
+
+ new_cpu = find_idlest_cpu(group, t, cpu);
+- if (new_cpu == -1 || new_cpu == cpu)
++ if (new_cpu == (vcpu_t)(-1) || new_cpu == cpu)
+ goto nextlevel;
+
+ /* Now try balancing at a lower domain level */
+@@ -1111,21 +1839,27 @@ nextlevel:
+ * Returns the CPU we should wake onto.
+ */
+ #if defined(ARCH_HAS_SCHED_WAKE_IDLE)
+-static int wake_idle(int cpu, task_t *p)
++static vcpu_t wake_idle(vcpu_t cpu, task_t *p)
+ {
+- cpumask_t tmp;
++ cpumask_t vtmp;
+ struct sched_domain *sd;
++ struct vcpu_scheduler *vsched;
+ int i;
+
+- if (idle_cpu(cpu))
++ if (idle_vcpu(cpu))
+ return cpu;
+
++ vsched = vcpu_vsched(cpu);
++ cpus_and(vtmp, vsched_vcpu_online_map(vsched), p->cpus_allowed);
+ for_each_domain(cpu, sd) {
+ if (sd->flags & SD_WAKE_IDLE) {
+- cpus_and(tmp, sd->span, p->cpus_allowed);
+- for_each_cpu_mask(i, tmp) {
+- if (idle_cpu(i))
+- return i;
++ for_each_cpu_mask(i, vtmp) {
++ vcpu_t vcpu;
++ vcpu = vsched_vcpu(vsched, i);
++ if (!cpu_isset(vcpu_last_pcpu(vcpu), sd->span))
++ continue;
++ if (idle_vcpu(vcpu))
++ return vcpu;
+ }
+ }
+ else
+@@ -1134,7 +1868,7 @@ static int wake_idle(int cpu, task_t *p)
+ return cpu;
+ }
+ #else
+-static inline int wake_idle(int cpu, task_t *p)
++static inline vcpu_t wake_idle(vcpu_t cpu, task_t *p)
+ {
+ return cpu;
+ }
+@@ -1156,15 +1890,17 @@ static inline int wake_idle(int cpu, tas
+ */
+ static int try_to_wake_up(task_t *p, unsigned int state, int sync)
+ {
+- int cpu, this_cpu, success = 0;
++ vcpu_t cpu, this_cpu;
++ int success = 0;
+ unsigned long flags;
+ long old_state;
+ runqueue_t *rq;
+ #ifdef CONFIG_SMP
+ unsigned long load, this_load;
+ struct sched_domain *sd, *this_sd = NULL;
+- int new_cpu;
++ vcpu_t new_cpu;
+ #endif
++ cpu = NULL;
+
+ rq = task_rq_lock(p, &flags);
+ old_state = p->state;
+@@ -1174,8 +1910,8 @@ static int try_to_wake_up(task_t *p, uns
+ if (p->array)
+ goto out_running;
+
+- cpu = task_cpu(p);
+- this_cpu = smp_processor_id();
++ cpu = task_vcpu(p);
++ this_cpu = this_vcpu();
+
+ #ifdef CONFIG_SMP
+ if (unlikely(task_running(rq, p)))
+@@ -1184,20 +1920,23 @@ static int try_to_wake_up(task_t *p, uns
+ new_cpu = cpu;
+
+ schedstat_inc(rq, ttwu_cnt);
++	/* FIXME: add a vsched->last_vcpu array to optimize wakeups in a different vsched */
++ if (vcpu_vsched(cpu) != vcpu_vsched(this_cpu))
++ goto out_set_cpu;
+ if (cpu == this_cpu) {
+ schedstat_inc(rq, ttwu_local);
+ goto out_set_cpu;
+ }
+
+ for_each_domain(this_cpu, sd) {
+- if (cpu_isset(cpu, sd->span)) {
++ if (cpu_isset(vcpu_last_pcpu(cpu), sd->span)) {
+ schedstat_inc(sd, ttwu_wake_remote);
+ this_sd = sd;
+ break;
+ }
+ }
+
+- if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
++ if (unlikely(!vcpu_isset(this_cpu, p->cpus_allowed)))
+ goto out_set_cpu;
+
+ /*
+@@ -1253,7 +1992,7 @@ static int try_to_wake_up(task_t *p, uns
+ out_set_cpu:
+ new_cpu = wake_idle(new_cpu, p);
+ if (new_cpu != cpu) {
+- set_task_cpu(p, new_cpu);
++ set_task_vcpu(p, new_cpu);
+ task_rq_unlock(rq, &flags);
+ /* might preempt at this point */
+ rq = task_rq_lock(p, &flags);
+@@ -1263,13 +2002,21 @@ out_set_cpu:
+ if (p->array)
+ goto out_running;
+
+- this_cpu = smp_processor_id();
+- cpu = task_cpu(p);
++ this_cpu = this_vcpu();
++ cpu = task_vcpu(p);
+ }
+
+ out_activate:
+ #endif /* CONFIG_SMP */
+- if (old_state == TASK_UNINTERRUPTIBLE) {
++ if (old_state == TASK_INTERRUPTIBLE) {
++ nr_sleeping_dec(smp_processor_id());
++ rq->nr_sleeping--;
++ } else if (old_state == TASK_STOPPED) {
++ nr_stopped_dec(smp_processor_id());
++ rq->nr_stopped--;
++ } else if (old_state == TASK_UNINTERRUPTIBLE) {
++ nr_unint_dec(smp_processor_id());
++ ve_nr_unint_dec(p->ve_task_info.owner_env, task_cpu(p));
+ rq->nr_uninterruptible--;
+ /*
+ * Tasks on involuntary sleep don't earn
+@@ -1324,17 +2071,45 @@ int fastcall wake_up_state(task_t *p, un
+ }
+
+ /*
++ * init is special, it is forked from swapper (idle_vsched) and should
++ * belong to default_vsched, so we have to change its vsched/fairsched manually
++ */
++static void wake_up_init(task_t *p)
++{
++ runqueue_t *rq;
++ unsigned long flags;
++
++ /* we should change both fairsched node and vsched here */
++ set_task_vsched(p, &default_vsched);
++ set_task_cpu(p, 0);
++
++ /*
++ * can't call wake_up_new_task() directly here,
++ * since it assumes that a child belongs to the same vsched
++ */
++ p->state = TASK_RUNNING;
++ p->sleep_avg = 0;
++ p->prio = effective_prio(p);
++
++ rq = task_rq_lock(p, &flags);
++ __activate_task(p, rq);
++ task_rq_unlock(rq, &flags);
++}
++
++/*
+ * Perform scheduler related setup for a newly forked process p.
+ * p is forked by current.
+ */
+ void fastcall sched_fork(task_t *p, int clone_flags)
+ {
+- int cpu = get_cpu();
+-
++ vcpu_t cpu;
++
++ preempt_disable();
++ cpu = this_vcpu();
+ #ifdef CONFIG_SMP
+ cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
+ #endif
+- set_task_cpu(p, cpu);
++ set_task_vcpu(p, cpu);
+
+ /*
+ * We mark the process as running here, but have not actually
+@@ -1369,6 +2144,10 @@ void fastcall sched_fork(task_t *p, int
+ p->first_time_slice = 1;
+ current->time_slice >>= 1;
+ p->timestamp = sched_clock();
++#ifdef CONFIG_VE
++	/* cosmetic: sleep till wakeup below */
++ p->ve_task_info.sleep_time -= get_cycles();
++#endif
+ if (unlikely(!current->time_slice)) {
+ /*
+ * This case is rare, it happens when the parent has only
+@@ -1379,7 +2158,7 @@ void fastcall sched_fork(task_t *p, int
+ scheduler_tick();
+ }
+ local_irq_enable();
+- put_cpu();
++ preempt_enable();
+ }
+
+ /*
+@@ -1392,13 +2171,19 @@ void fastcall sched_fork(task_t *p, int
+ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
+ {
+ unsigned long flags;
+- int this_cpu, cpu;
++ vcpu_t this_cpu, cpu;
+ runqueue_t *rq, *this_rq;
+
++ if (unlikely(p->pid == 1)) {
++ wake_up_init(p);
++ return;
++ }
++
+ rq = task_rq_lock(p, &flags);
+ BUG_ON(p->state != TASK_RUNNING);
+- this_cpu = smp_processor_id();
+- cpu = task_cpu(p);
++ BUG_ON(task_vsched(current) != task_vsched(p));
++ this_cpu = this_vcpu();
++ cpu = task_vcpu(p);
+
+ /*
+ * We decrease the sleep average of forking parents
+@@ -1426,6 +2211,9 @@ void fastcall wake_up_new_task(task_t *p
+ p->array = current->array;
+ p->array->nr_active++;
+ rq->nr_running++;
++ ve_nr_running_inc(VE_TASK_INFO(p)->owner_env,
++ task_cpu(p), get_cycles());
++ nr_running_inc(smp_processor_id());
+ }
+ set_need_resched();
+ } else
+@@ -1439,7 +2227,7 @@ void fastcall wake_up_new_task(task_t *p
+ */
+ this_rq = rq;
+ } else {
+- this_rq = cpu_rq(this_cpu);
++ this_rq = vcpu_rq(this_cpu);
+
+ /*
+ * Not the local CPU - must adjust timestamp. This should
+@@ -1482,7 +2270,7 @@ void fastcall sched_exit(task_t *p)
+ * the sleep_avg of the parent as well.
+ */
+ rq = task_rq_lock(p->parent, &flags);
+- if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) {
++ if (p->first_time_slice && task_vcpu(p) == task_vcpu(p->parent)) {
+ p->parent->time_slice += p->time_slice;
+ if (unlikely(p->parent->time_slice > task_timeslice(p)))
+ p->parent->time_slice = task_timeslice(p);
+@@ -1532,7 +2320,10 @@ static inline void finish_task_switch(ru
+ {
+ struct mm_struct *mm = rq->prev_mm;
+ unsigned long prev_task_flags;
++ vcpu_t prev_vcpu, vcpu;
+
++ prev_vcpu = task_vcpu(prev);
++ vcpu = rq_vcpu(rq);
+ rq->prev_mm = NULL;
+
+ /*
+@@ -1549,6 +2340,10 @@ static inline void finish_task_switch(ru
+ prev_task_flags = prev->flags;
+ finish_arch_switch(prev);
+ finish_lock_switch(rq, prev);
++ if (prev_vcpu != vcpu)
++ vcpu_put(prev_vcpu);
++ local_irq_enable();
++
+ if (mm)
+ mmdrop(mm);
+ if (unlikely(prev_task_flags & PF_DEAD))
+@@ -1569,8 +2364,9 @@ asmlinkage void schedule_tail(task_t *pr
+ preempt_enable();
+ #endif
+ if (current->set_child_tid)
+- put_user(current->pid, current->set_child_tid);
++ put_user(virt_pid(current), current->set_child_tid);
+ }
++EXPORT_SYMBOL_GPL(schedule_tail);
+
+ /*
+ * context_switch - switch to the new MM and the new
+@@ -1610,20 +2406,26 @@ task_t * context_switch(runqueue_t *rq,
+ */
+ unsigned long nr_running(void)
+ {
+- unsigned long i, sum = 0;
++ unsigned long i, sum;
+
++ sum = 0;
+ for_each_online_cpu(i)
+- sum += cpu_rq(i)->nr_running;
++ sum += glob_task_nrs[i].nr_running;
++
++ if (unlikely((long)sum < 0))
++ sum = 0;
+
+ return sum;
+ }
++EXPORT_SYMBOL(nr_running);
+
+ unsigned long nr_uninterruptible(void)
+ {
+- unsigned long i, sum = 0;
++ unsigned long i, sum;
+
++ sum = 0;
+ for_each_cpu(i)
+- sum += cpu_rq(i)->nr_uninterruptible;
++ sum += glob_task_nrs[i].nr_unint;
+
+ /*
+ * Since we read the counters lockless, it might be slightly
+@@ -1635,31 +2437,133 @@ unsigned long nr_uninterruptible(void)
+ return sum;
+ }
+
++EXPORT_SYMBOL(nr_uninterruptible);
++
+ unsigned long long nr_context_switches(void)
+ {
+- unsigned long long i, sum = 0;
++ unsigned long long i, sum;
+
++ sum = 0;
+ for_each_cpu(i)
+- sum += cpu_rq(i)->nr_switches;
++ sum += glob_task_nrs[i].nr_switches;
++
++ if (unlikely((long)sum < 0))
++ sum = 0;
+
+ return sum;
+ }
+
++EXPORT_SYMBOL(nr_context_switches);
++
+ unsigned long nr_iowait(void)
+ {
+- unsigned long i, sum = 0;
++ unsigned long i, sum;
+
++ sum = 0;
+ for_each_cpu(i)
+- sum += atomic_read(&cpu_rq(i)->nr_iowait);
++ sum += glob_task_nrs[i].nr_iowait;
++
++ if (unlikely((long)sum < 0))
++ sum = 0;
+
+ return sum;
+ }
+
+-#ifdef CONFIG_SMP
++EXPORT_SYMBOL(nr_iowait);
++
++unsigned long nr_stopped(void)
++{
++ unsigned long i, sum;
++
++ sum = 0;
++ for_each_cpu(i)
++ sum += glob_task_nrs[i].nr_stopped;
++
++ if (unlikely((long)sum < 0))
++ sum = 0;
++
++ return sum;
++}
++
++EXPORT_SYMBOL(nr_stopped);
++
++unsigned long nr_sleeping(void)
++{
++ unsigned long i, sum;
++
++ sum = 0;
++ for_each_cpu(i)
++ sum += glob_task_nrs[i].nr_sleeping;
++
++ if (unlikely((long)sum < 0))
++ sum = 0;
++
++ return sum;
++}
++
++EXPORT_SYMBOL(nr_sleeping);
++
++#ifdef CONFIG_VE
++unsigned long nr_running_ve(struct ve_struct *ve)
++{
++ int i;
++ long sum;
++ cpumask_t ve_cpus;
++
++ sum = 0;
++ ve_cpu_online_map(ve, &ve_cpus);
++ for_each_cpu_mask(i, ve_cpus)
++ sum += VE_CPU_STATS(ve, i)->nr_running;
++ return (unsigned long)(sum < 0 ? 0 : sum);
++}
++
++EXPORT_SYMBOL(nr_running_ve);
++
++unsigned long nr_uninterruptible_ve(struct ve_struct *ve)
++{
++ int i;
++ long sum;
++ cpumask_t ve_cpus;
++
++ sum = 0;
++ ve_cpu_online_map(ve, &ve_cpus);
++ for_each_cpu_mask(i, ve_cpus)
++ sum += VE_CPU_STATS(ve, i)->nr_unint;
++ return (unsigned long)(sum < 0 ? 0 : sum);
++}
++
++EXPORT_SYMBOL(nr_uninterruptible_ve);
++
++unsigned long nr_iowait_ve(struct ve_struct *ve)
++{
++ int i;
++ long sum;
++ cpumask_t ve_cpus;
++
++ sum = 0;
++ ve_cpu_online_map(ve, &ve_cpus);
++ for_each_cpu_mask(i, ve_cpus)
++ sum += VE_CPU_STATS(ve, i)->nr_iowait;
++ return (unsigned long)(sum < 0 ? 0 : sum);
++}
++
++EXPORT_SYMBOL(nr_iowait_ve);
++#endif
++
++#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_VCPU)
++
++#ifdef CONFIG_SCHED_VCPU
++#define rq_compare(rq1, rq2) (rq1 < rq2)
++#else
++#define rq_compare(rq1, rq2) (rq1->cpu < rq2->cpu)
++#endif
+
+ /*
+ * double_rq_lock - safely lock two runqueues
+ *
++ * We must take them in cpu order to match code in
++ * dependent_sleeper and wake_dependent_sleeper.
++ *
+ * Note this does not disable interrupts like task_rq_lock,
+ * you need to do so manually before calling.
+ */
+@@ -1671,7 +2575,7 @@ static void double_rq_lock(runqueue_t *r
+ spin_lock(&rq1->lock);
+ __acquire(rq2->lock); /* Fake it out ;) */
+ } else {
+- if (rq1 < rq2) {
++ if (rq_compare(rq1, rq2)) {
+ spin_lock(&rq1->lock);
+ spin_lock(&rq2->lock);
+ } else {
+@@ -1699,38 +2603,20 @@ static void double_rq_unlock(runqueue_t
+ }
+
+ /*
+- * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+- */
+-static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
+- __releases(this_rq->lock)
+- __acquires(busiest->lock)
+- __acquires(this_rq->lock)
+-{
+- if (unlikely(!spin_trylock(&busiest->lock))) {
+- if (busiest < this_rq) {
+- spin_unlock(&this_rq->lock);
+- spin_lock(&busiest->lock);
+- spin_lock(&this_rq->lock);
+- } else
+- spin_lock(&busiest->lock);
+- }
+-}
+-
+-/*
+ * If dest_cpu is allowed for this process, migrate the task to it.
+ * This is accomplished by forcing the cpu_allowed mask to only
+ * allow dest_cpu, which will force the cpu onto dest_cpu. Then
+ * the cpu_allowed mask is restored.
+ */
+-static void sched_migrate_task(task_t *p, int dest_cpu)
++static void sched_migrate_task(task_t *p, vcpu_t dest_cpu)
+ {
+ migration_req_t req;
+ runqueue_t *rq;
+ unsigned long flags;
+
+ rq = task_rq_lock(p, &flags);
+- if (!cpu_isset(dest_cpu, p->cpus_allowed)
+- || unlikely(cpu_is_offline(dest_cpu)))
++ if (!vcpu_isset(dest_cpu, p->cpus_allowed)
++ || unlikely(vcpu_is_offline(dest_cpu)))
+ goto out;
+
+ /* force the process onto the specified CPU */
+@@ -1747,6 +2633,26 @@ static void sched_migrate_task(task_t *p
+ out:
+ task_rq_unlock(rq, &flags);
+ }
++#endif
++
++#ifdef CONFIG_SMP
++/*
++ * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
++ */
++static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
++ __releases(this_rq->lock)
++ __acquires(busiest->lock)
++ __acquires(this_rq->lock)
++{
++ if (unlikely(!spin_trylock(&busiest->lock))) {
++ if (rq_compare(busiest, this_rq)) {
++ spin_unlock(&this_rq->lock);
++ spin_lock(&busiest->lock);
++ spin_lock(&this_rq->lock);
++ } else
++ spin_lock(&busiest->lock);
++ }
++}
+
+ /*
+ * sched_exec - execve() is a valuable balancing opportunity, because at
+@@ -1754,9 +2660,12 @@ out:
+ */
+ void sched_exec(void)
+ {
+- int new_cpu, this_cpu = get_cpu();
++ vcpu_t new_cpu, this_cpu;
++
++ preempt_disable();
++ this_cpu = this_vcpu();
+ new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
+- put_cpu();
++ preempt_enable();
+ if (new_cpu != this_cpu)
+ sched_migrate_task(current, new_cpu);
+ }
+@@ -1767,12 +2676,24 @@ void sched_exec(void)
+ */
+ static
+ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
+- runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
++ runqueue_t *this_rq, prio_array_t *this_array, vcpu_t this_cpu)
+ {
++ struct ve_struct *ve;
++ cycles_t cycles;
++
++ cycles = get_cycles();
++ ve = VE_TASK_INFO(p)->owner_env;
++
+ dequeue_task(p, src_array);
+ src_rq->nr_running--;
+- set_task_cpu(p, this_cpu);
++ ve_nr_running_dec(ve, task_cpu(p), cycles);
++ if (src_rq->nr_running == 0)
++ vcpu_detach(src_rq);
++ set_task_vcpu(p, this_cpu);
++ if (this_rq->nr_running == 0)
++ vcpu_attach(this_rq);
+ this_rq->nr_running++;
++ ve_nr_running_inc(ve, task_cpu(p), cycles);
+ enqueue_task(p, this_array);
+ p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
+ + this_rq->timestamp_last_tick;
+@@ -1788,7 +2709,7 @@ void pull_task(runqueue_t *src_rq, prio_
+ * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
+ */
+ static
+-int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
++int can_migrate_task(task_t *p, runqueue_t *rq, vcpu_t this_cpu,
+ struct sched_domain *sd, enum idle_type idle,
+ int *all_pinned)
+ {
+@@ -1798,7 +2719,7 @@ int can_migrate_task(task_t *p, runqueue
+ * 2) cannot be migrated to this CPU due to cpus_allowed, or
+ * 3) are cache-hot on their current CPU.
+ */
+- if (!cpu_isset(this_cpu, p->cpus_allowed))
++ if (!vcpu_isset(this_cpu, p->cpus_allowed))
+ return 0;
+ *all_pinned = 0;
+
+@@ -1826,7 +2747,7 @@ int can_migrate_task(task_t *p, runqueue
+ *
+ * Called with both runqueues locked.
+ */
+-static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
++static int move_tasks(runqueue_t *this_rq, vcpu_t this_cpu, runqueue_t *busiest,
+ unsigned long max_nr_move, struct sched_domain *sd,
+ enum idle_type idle, int *all_pinned)
+ {
+@@ -1919,13 +2840,19 @@ out:
+ * moved to restore balance via the imbalance parameter.
+ */
+ static struct sched_group *
+-find_busiest_group(struct sched_domain *sd, int this_cpu,
++find_busiest_group(struct sched_domain *sd, vcpu_t this_cpu,
+ unsigned long *imbalance, enum idle_type idle, int *sd_idle)
+ {
+ struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
+ unsigned long max_load, avg_load, total_load, this_load, total_pwr;
+ unsigned long max_pull;
+ int load_idx;
++ struct vcpu_scheduler *vsched;
++ vcpu_t vcpu;
++ int this_pcpu;
++
++ vsched = vcpu_vsched(this_cpu);
++ this_pcpu = vcpu_last_pcpu(this_cpu);
+
+ max_load = this_load = total_load = total_pwr = 0;
+ if (idle == NOT_IDLE)
+@@ -1936,24 +2863,27 @@ find_busiest_group(struct sched_domain *
+ load_idx = sd->idle_idx;
+
+ do {
++ cpumask_t tmp;
+ unsigned long load;
+ int local_group;
+ int i;
+
+- local_group = cpu_isset(this_cpu, group->cpumask);
++ local_group = cpu_isset(this_pcpu, group->cpumask);
+
+ /* Tally up the load of all CPUs in the group */
+ avg_load = 0;
++ cpus_and(tmp, group->cpumask, vsched_pcpu_running_map(vsched));
+
+- for_each_cpu_mask(i, group->cpumask) {
++ for_each_cpu_mask(i, tmp) {
++ vcpu = pcpu(i)->vcpu;
+ if (*sd_idle && !idle_cpu(i))
+ *sd_idle = 0;
+
+ /* Bias balancing toward cpus of our domain */
+ if (local_group)
+- load = target_load(i, load_idx);
++ load = target_load(vcpu, load_idx);
+ else
+- load = source_load(i, load_idx);
++ load = source_load(vcpu, load_idx);
+
+ avg_load += load;
+ }
+@@ -1976,6 +2906,8 @@ find_busiest_group(struct sched_domain *
+
+ if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE)
+ goto out_balanced;
++ if (!this)
++ this = busiest; /* this->cpu_power is needed below */
+
+ avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
+
+@@ -2058,25 +2990,57 @@ out_balanced:
+ /*
+ * find_busiest_queue - find the busiest runqueue among the cpus in group.
+ */
+-static runqueue_t *find_busiest_queue(struct sched_group *group,
++static vcpu_t find_busiest_queue(vcpu_t this_cpu, struct sched_group *group,
+ enum idle_type idle)
+ {
+ unsigned long load, max_load = 0;
+- runqueue_t *busiest = NULL;
++ struct vcpu_scheduler *vsched;
++ vcpu_t vcpu, busiest = NULL;
++ cpumask_t tmp;
+ int i;
+
++ vsched = vcpu_vsched(this_cpu);
+ for_each_cpu_mask(i, group->cpumask) {
+- load = source_load(i, 0);
++ vcpu = pcpu(i)->vcpu;
++ if (vcpu_vsched(vcpu) != vsched && idle != SCHED_IDLE)
++ continue;
++ load = source_load(vcpu, 0);
++ if (load > max_load) {
++ max_load = load;
++ busiest = vcpu;
++ }
++ }
+
++#ifdef CONFIG_SCHED_VCPU
++ cpus_andnot(tmp, vsched->vcpu_online_map, vsched->vcpu_running_map);
++ for_each_cpu_mask(i, tmp) {
++ vcpu = vsched_vcpu(vsched, i);
++ load = source_load(vcpu, 0);
+ if (load > max_load) {
+ max_load = load;
+- busiest = cpu_rq(i);
++ busiest = vcpu;
+ }
+ }
++#endif
+
+ return busiest;
+ }
+
++#ifdef CONFIG_SCHED_VCPU
++vcpu_t find_idle_vcpu(struct vcpu_scheduler *vsched)
++{
++ vcpu_t vcpu;
++
++ vcpu = NULL;
++ spin_lock(&fairsched_lock);
++ if (!list_empty(&vsched->idle_list))
++ vcpu = list_entry(vsched->idle_list.next,
++ struct vcpu_info, list);
++ spin_unlock(&fairsched_lock);
++ return vcpu;
++}
++#endif
++
+ /*
+ * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
+ * so long as it is large enough.
+@@ -2089,10 +3053,11 @@ static runqueue_t *find_busiest_queue(st
+ *
+ * Called with this_rq unlocked.
+ */
+-static int load_balance(int this_cpu, runqueue_t *this_rq,
++static int load_balance(vcpu_t this_cpu, runqueue_t *this_rq,
+ struct sched_domain *sd, enum idle_type idle)
+ {
+ struct sched_group *group;
++ vcpu_t busiest_vcpu;
+ runqueue_t *busiest;
+ unsigned long imbalance;
+ int nr_moved, all_pinned = 0;
+@@ -2110,13 +3075,24 @@ static int load_balance(int this_cpu, ru
+ goto out_balanced;
+ }
+
+- busiest = find_busiest_queue(group, idle);
+- if (!busiest) {
++ busiest_vcpu = find_busiest_queue(this_cpu, group, idle);
++ if (!busiest_vcpu) {
+ schedstat_inc(sd, lb_nobusyq[idle]);
+ goto out_balanced;
+ }
+
+- BUG_ON(busiest == this_rq);
++#ifdef CONFIG_SCHED_VCPU
++ if (vcpu_vsched(this_cpu) != vcpu_vsched(busiest_vcpu)) {
++ this_cpu = find_idle_vcpu(vcpu_vsched(busiest_vcpu));
++ if (!this_cpu)
++ goto out_one_pinned;
++ this_rq = vcpu_rq(this_cpu);
++ }
++#endif
++ busiest = vcpu_rq(busiest_vcpu);
++
++ if (unlikely(busiest == this_rq))
++ goto out_balanced;
+
+ schedstat_add(sd, lb_imbalance[idle], imbalance);
+
+@@ -2149,7 +3125,7 @@ static int load_balance(int this_cpu, ru
+ /* don't kick the migration_thread, if the curr
+ * task on busiest cpu can't be moved to this_cpu
+ */
+- if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
++ if (!vcpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
+ spin_unlock(&busiest->lock);
+ all_pinned = 1;
+ goto out_one_pinned;
+@@ -2214,11 +3190,12 @@ out_one_pinned:
+ * Called from schedule when this_rq is about to become idle (NEWLY_IDLE).
+ * this_rq is locked.
+ */
+-static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
++static int load_balance_newidle(vcpu_t this_cpu, runqueue_t *this_rq,
+ struct sched_domain *sd)
+ {
+ struct sched_group *group;
+- runqueue_t *busiest = NULL;
++ runqueue_t *busiest;
++ vcpu_t busiest_vcpu;
+ unsigned long imbalance;
+ int nr_moved = 0;
+ int sd_idle = 0;
+@@ -2233,13 +3210,12 @@ static int load_balance_newidle(int this
+ goto out_balanced;
+ }
+
+- busiest = find_busiest_queue(group, NEWLY_IDLE);
+- if (!busiest) {
++ busiest_vcpu = find_busiest_queue(this_cpu, group, NEWLY_IDLE);
++ if (!busiest_vcpu || busiest_vcpu == this_cpu) {
+ schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
+ goto out_balanced;
+ }
+-
+- BUG_ON(busiest == this_rq);
++ busiest = vcpu_rq(busiest_vcpu);
+
+ schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
+
+@@ -2272,8 +3248,11 @@ out_balanced:
+ /*
+ * idle_balance is called by schedule() if this_cpu is about to become
+ * idle. Attempts to pull tasks from other CPUs.
++ *
++ * Returns whether to continue with another runqueue
++ * instead of switching to idle.
+ */
+-static void idle_balance(int this_cpu, runqueue_t *this_rq)
++static int idle_balance(vcpu_t this_cpu, runqueue_t *this_rq)
+ {
+ struct sched_domain *sd;
+
+@@ -2281,10 +3260,11 @@ static void idle_balance(int this_cpu, r
+ if (sd->flags & SD_BALANCE_NEWIDLE) {
+ if (load_balance_newidle(this_cpu, this_rq, sd)) {
+ /* We've pulled tasks over so stop searching */
+- break;
++ return 1;
+ }
+ }
+ }
++ return 0;
+ }
+
+ /*
+@@ -2294,18 +3274,26 @@ static void idle_balance(int this_cpu, r
+ * logical imbalances.
+ *
+ * Called with busiest_rq locked.
++ *
++ * In human terms: balancing of CPU load by moving tasks between CPUs is
++ * performed by two methods: push and pull.
++ * In certain places, when a CPU is found to be idle, it pulls tasks from a
++ * busy CPU to the current (idle) CPU.
++ * active_load_balance() implements the push method: the migration thread gets
++ * scheduled on a busy CPU (hence making all running processes on that CPU sit
++ * in the queue) and selects where to push and which task.
+ */
+-static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
++static void active_load_balance(runqueue_t *busiest_rq, vcpu_t busiest_cpu)
+ {
+ struct sched_domain *sd;
+ runqueue_t *target_rq;
+- int target_cpu = busiest_rq->push_cpu;
++ vcpu_t target_cpu = busiest_rq->push_cpu;
+
+ if (busiest_rq->nr_running <= 1)
+ /* no task to move */
+ return;
+
+- target_rq = cpu_rq(target_cpu);
++ target_rq = vcpu_rq(target_cpu);
+
+ /*
+ * This condition is "impossible", if it occurs
+@@ -2317,10 +3305,17 @@ static void active_load_balance(runqueue
+ /* move a task from busiest_rq to target_rq */
+ double_lock_balance(busiest_rq, target_rq);
+
++ /*
++	 * Our main candidate for where to push our tasks is busiest->push_cpu.
++	 * First, find the domain that spans both that candidate CPU and
++ * the current one.
++ *
++ * FIXME: make sure that push_cpu doesn't disappear before we get here.
++ */
+ /* Search for an sd spanning us and the target CPU. */
+ for_each_domain(target_cpu, sd)
+ if ((sd->flags & SD_LOAD_BALANCE) &&
+- cpu_isset(busiest_cpu, sd->span))
++ cpu_isset(vcpu_last_pcpu(busiest_cpu), sd->span))
+ break;
+
+ if (unlikely(sd == NULL))
+@@ -2346,31 +3341,17 @@ out:
+ */
+
+ /* Don't have all balancing operations going off at once */
+-#define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS)
++#define CPU_OFFSET(cpu) (HZ * (cpu) / NR_CPUS)
+
+-static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
++static void rebalance_tick(vcpu_t this_cpu, runqueue_t *this_rq,
+ enum idle_type idle)
+ {
+- unsigned long old_load, this_load;
+- unsigned long j = jiffies + CPU_OFFSET(this_cpu);
++ unsigned long j;
+ struct sched_domain *sd;
+- int i;
+
+- this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
+ /* Update our load */
+- for (i = 0; i < 3; i++) {
+- unsigned long new_load = this_load;
+- int scale = 1 << i;
+- old_load = this_rq->cpu_load[i];
+- /*
+- * Round up the averaging division if load is increasing. This
+- * prevents us from getting stuck on 9 if the load is 10, for
+- * example.
+- */
+- if (new_load > old_load)
+- new_load += scale-1;
+- this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale;
+- }
++ update_rq_cpu_load(this_rq);
++ j = jiffies + CPU_OFFSET(smp_processor_id());
+
+ for_each_domain(this_cpu, sd) {
+ unsigned long interval;
+@@ -2404,17 +3385,19 @@ static void rebalance_tick(int this_cpu,
+ /*
+ * on UP we do not need to balance between CPUs:
+ */
+-static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle)
++static inline void rebalance_tick(vcpu_t cpu, runqueue_t *rq, enum idle_type idle)
+ {
+ }
+-static inline void idle_balance(int cpu, runqueue_t *rq)
++static inline void idle_balance(vcpu_t cpu, runqueue_t *rq)
+ {
+ }
+ #endif
+
+-static inline int wake_priority_sleeper(runqueue_t *rq)
++static inline int wake_priority_sleeper(runqueue_t *rq, task_t *idle)
+ {
+ int ret = 0;
++#ifndef CONFIG_SCHED_VCPU
++ /* FIXME: can we implement SMT priority sleeping for this? */
+ #ifdef CONFIG_SCHED_SMT
+ spin_lock(&rq->lock);
+ /*
+@@ -2422,11 +3405,13 @@ static inline int wake_priority_sleeper(
+ * reasons reschedule the idle task to see if it can now run.
+ */
+ if (rq->nr_running) {
+- resched_task(rq->idle);
++ /* FIXME */
++ resched_task(idle);
+ ret = 1;
+ }
+ spin_unlock(&rq->lock);
+ #endif
++#endif
+ return ret;
+ }
+
+@@ -2476,6 +3461,15 @@ unsigned long long current_sched_time(co
+ STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
+ ((rq)->curr->static_prio > (rq)->best_expired_prio))
+
++#ifdef CONFIG_VE
++#define update_ve_cpu_time(p, time, tick) do { \
++ VE_CPU_STATS((p)->ve_task_info.owner_env, \
++ task_cpu(p))->time += tick; \
++ } while (0)
++#else
++#define update_ve_cpu_time(p, time, tick) do { } while (0)
++#endif
++
+ /*
+ * Account user cpu time to a process.
+ * @p: the process that the cpu time gets accounted to
+@@ -2491,10 +3485,13 @@ void account_user_time(struct task_struc
+
+ /* Add user time to cpustat. */
+ tmp = cputime_to_cputime64(cputime);
+- if (TASK_NICE(p) > 0)
++ if (TASK_NICE(p) > 0) {
+ cpustat->nice = cputime64_add(cpustat->nice, tmp);
+- else
++ update_ve_cpu_time(p, nice, tmp);
++ } else {
+ cpustat->user = cputime64_add(cpustat->user, tmp);
++ update_ve_cpu_time(p, user, tmp);
++ }
+ }
+
+ /*
+@@ -2511,14 +3508,16 @@ void account_system_time(struct task_str
+ cputime64_t tmp;
+
+ p->stime = cputime_add(p->stime, cputime);
++ tmp = cputime_to_cputime64(cputime);
++
++ update_ve_cpu_time(p, system, tmp);
+
+ /* Add system time to cpustat. */
+- tmp = cputime_to_cputime64(cputime);
+ if (hardirq_count() - hardirq_offset)
+ cpustat->irq = cputime64_add(cpustat->irq, tmp);
+ else if (softirq_count())
+ cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+- else if (p != rq->idle)
++ else if (p != this_pcpu()->idle)
+ cpustat->system = cputime64_add(cpustat->system, tmp);
+ else if (atomic_read(&rq->nr_iowait) > 0)
+ cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
+@@ -2539,7 +3538,7 @@ void account_steal_time(struct task_stru
+ cputime64_t tmp = cputime_to_cputime64(steal);
+ runqueue_t *rq = this_rq();
+
+- if (p == rq->idle) {
++ if (p == this_pcpu()->idle) {
+ p->stime = cputime_add(p->stime, steal);
+ if (atomic_read(&rq->nr_iowait) > 0)
+ cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
+@@ -2559,18 +3558,23 @@ void account_steal_time(struct task_stru
+ void scheduler_tick(void)
+ {
+ int cpu = smp_processor_id();
+- runqueue_t *rq = this_rq();
++ vcpu_t vcpu;
++ runqueue_t *rq;
+ task_t *p = current;
+ unsigned long long now = sched_clock();
+
++ vcpu = this_vcpu();
++ rq = vcpu_rq(vcpu);
+ update_cpu_clock(p, rq, now);
+
+ rq->timestamp_last_tick = now;
+
+- if (p == rq->idle) {
+- if (wake_priority_sleeper(rq))
++	set_tsk_need_resched(p); /* FIXME */
++
++ if (p == pcpu(cpu)->idle) {
++ if (wake_priority_sleeper(rq, pcpu(cpu)->idle))
+ goto out;
+- rebalance_tick(cpu, rq, SCHED_IDLE);
++ rebalance_tick(vcpu, rq, SCHED_IDLE);
+ return;
+ }
+
+@@ -2646,10 +3650,14 @@ void scheduler_tick(void)
+ out_unlock:
+ spin_unlock(&rq->lock);
+ out:
+- rebalance_tick(cpu, rq, NOT_IDLE);
++ rebalance_tick(vcpu, rq, NOT_IDLE);
+ }
+
+-#ifdef CONFIG_SCHED_SMT
++#if defined(CONFIG_SCHED_SMT) && !defined(CONFIG_SCHED_VCPU)
++/* FIXME: SMT scheduling
++ * rq->cpu is initialized with the rq address if FAIRSCHED is on;
++ * this is not correct for the SMT case
++ */
+ static inline void wakeup_busy_runqueue(runqueue_t *rq)
+ {
+ /* If an SMT runqueue is sleeping due to priority reasons wake it up */
+@@ -2657,7 +3665,7 @@ static inline void wakeup_busy_runqueue(
+ resched_task(rq->idle);
+ }
+
+-static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
++static void wake_sleeping_dependent(vcpu_t this_cpu)
+ {
+ struct sched_domain *tmp, *sd = NULL;
+ cpumask_t sibling_map;
+@@ -2711,7 +3719,7 @@ static inline unsigned long smt_slice(ta
+ return p->time_slice * (100 - sd->per_cpu_gain) / 100;
+ }
+
+-static int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
++static int dependent_sleeper(vcpu_t this_cpu)
+ {
+ struct sched_domain *tmp, *sd = NULL;
+ cpumask_t sibling_map;
+@@ -2812,11 +3820,11 @@ out_unlock:
+ return ret;
+ }
+ #else
+-static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
++static inline void wake_sleeping_dependent(vcpu_t this_cpu)
+ {
+ }
+
+-static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
++static inline int dependent_sleeper(vcpu_t this_cpu)
+ {
+ return 0;
+ }
+@@ -2866,7 +3874,9 @@ asmlinkage void __sched schedule(void)
+ struct list_head *queue;
+ unsigned long long now;
+ unsigned long run_time;
+- int cpu, idx, new_prio;
++ int idx, new_prio;
++ vcpu_t vcpu;
++ cycles_t cycles;
+
+ /*
+ * Test if we are atomic. Since do_exit() needs to call into
+@@ -2888,13 +3898,14 @@ need_resched:
+ prev = current;
+ release_kernel_lock(prev);
+ need_resched_nonpreemptible:
++ cycles = get_cycles();
+ rq = this_rq();
+
+ /*
+ * The idle thread is not allowed to schedule!
+ * Remove this check after it has been exercised a bit.
+ */
+- if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) {
++ if (unlikely(prev == this_pcpu()->idle) && prev->state != TASK_RUNNING) {
+ printk(KERN_ERR "bad: scheduling from the idle thread!\n");
+ dump_stack();
+ }
+@@ -2932,25 +3943,35 @@ need_resched_nonpreemptible:
+ }
+ }
+
+- cpu = smp_processor_id();
++ prev->sleep_avg -= run_time;
++ if ((long)prev->sleep_avg <= 0)
++ prev->sleep_avg = 0;
++
++ vcpu = rq_vcpu(rq);
++ if (rq->nr_running &&
++ jiffies - vcpu->start_time < msecs_to_jiffies(vcpu_timeslice))
++ goto same_vcpu;
++
++ if (unlikely(!rq->nr_running))
++ idle_balance(vcpu, rq);
++ vcpu = schedule_vcpu(vcpu, cycles);
++ rq = vcpu_rq(vcpu);
++
+ if (unlikely(!rq->nr_running)) {
+ go_idle:
+- idle_balance(cpu, rq);
+- if (!rq->nr_running) {
+- next = rq->idle;
+- rq->expired_timestamp = 0;
+- wake_sleeping_dependent(cpu, rq);
+- /*
+- * wake_sleeping_dependent() might have released
+- * the runqueue, so break out if we got new
+- * tasks meanwhile:
+- */
+- if (!rq->nr_running)
+- goto switch_tasks;
+- }
++ next = this_pcpu()->idle;
++ rq->expired_timestamp = 0;
++ wake_sleeping_dependent(vcpu);
++ /*
++ * wake_sleeping_dependent() might have released
++ * the runqueue, so break out if we got new
++ * tasks meanwhile:
++ */
++ if (!rq->nr_running)
++ goto switch_tasks;
+ } else {
+- if (dependent_sleeper(cpu, rq)) {
+- next = rq->idle;
++ if (dependent_sleeper(vcpu)) {
++ next = this_pcpu()->idle;
+ goto switch_tasks;
+ }
+ /*
+@@ -2962,6 +3983,7 @@ go_idle:
+ goto go_idle;
+ }
+
++same_vcpu:
+ array = rq->active;
+ if (unlikely(!array->nr_active)) {
+ /*
+@@ -2998,28 +4020,50 @@ go_idle:
+ requeue_task(next, array);
+ }
+ next->activated = 0;
++
+ switch_tasks:
+- if (next == rq->idle)
++ if (next == this_pcpu()->idle)
+ schedstat_inc(rq, sched_goidle);
+ prefetch(next);
+ prefetch_stack(next);
+ clear_tsk_need_resched(prev);
+- rcu_qsctr_inc(task_cpu(prev));
++ rcu_qsctr_inc(task_pcpu(prev));
+
+ update_cpu_clock(prev, rq, now);
+
+- prev->sleep_avg -= run_time;
+- if ((long)prev->sleep_avg <= 0)
+- prev->sleep_avg = 0;
++ /* updated w/o rq->lock, which is ok due to after-read-checks */
+ prev->timestamp = prev->last_ran = now;
+
+ sched_info_switch(prev, next);
+ if (likely(prev != next)) {
++ cycles_t cycles;
++
++ /* current physical CPU id should be valid after switch */
++ set_task_vcpu(next, vcpu);
++ set_task_pcpu(next, task_pcpu(prev));
++ cycles = get_cycles();
+ next->timestamp = now;
+ rq->nr_switches++;
++ glob_task_nrs[smp_processor_id()].nr_switches++;
+ rq->curr = next;
+ ++*switch_count;
+
++#ifdef CONFIG_VE
++ prev->ve_task_info.sleep_stamp = cycles;
++ if (prev->state == TASK_RUNNING && prev != this_pcpu()->idle)
++ write_wakeup_stamp(prev, cycles);
++ update_sched_lat(next, cycles);
++
++		/* because next & prev are protected by the
++		 * runqueue lock, we need not worry about
++		 * wakeup_stamp and sched_time protection
++ * (same thing in 'else' branch below)
++ */
++ update_ve_task_info(prev, cycles);
++ next->ve_task_info.sched_time = cycles;
++ write_wakeup_stamp(next, 0);
++#endif
++
+ prepare_task_switch(rq, next);
+ prev = context_switch(rq, prev, next);
+ barrier();
+@@ -3029,8 +4073,10 @@ switch_tasks:
+ * frame will be invalid.
+ */
+ finish_task_switch(this_rq(), prev);
+- } else
++ } else {
++ update_ve_task_info(prev, get_cycles());
+ spin_unlock_irq(&rq->lock);
++ }
+
+ prev = current;
+ if (unlikely(reacquire_kernel_lock(prev) < 0))
+@@ -3565,27 +4611,9 @@ int task_prio(const task_t *p)
+ */
+ int task_nice(const task_t *p)
+ {
+- return TASK_NICE(p);
+-}
+-EXPORT_SYMBOL_GPL(task_nice);
+-
+-/**
+- * idle_cpu - is a given cpu idle currently?
+- * @cpu: the processor in question.
+- */
+-int idle_cpu(int cpu)
+-{
+- return cpu_curr(cpu) == cpu_rq(cpu)->idle;
+-}
+-
+-/**
+- * idle_task - return the idle task for a given cpu.
+- * @cpu: the processor in question.
+- */
+-task_t *idle_task(int cpu)
+-{
+- return cpu_rq(cpu)->idle;
++ return TASK_NICE(p);
+ }
++EXPORT_SYMBOL_GPL(task_nice);
+
+ /**
+ * find_process_by_pid - find a process with a matching PID value.
+@@ -3593,7 +4621,7 @@ task_t *idle_task(int cpu)
+ */
+ static inline task_t *find_process_by_pid(pid_t pid)
+ {
+- return pid ? find_task_by_pid(pid) : current;
++ return pid ? find_task_by_pid_ve(pid) : current;
+ }
+
+ /* Actually do priority change: must hold rq lock. */
+@@ -3653,7 +4681,7 @@ recheck:
+ /*
+ * Allow unprivileged RT tasks to decrease priority:
+ */
+- if (!capable(CAP_SYS_NICE)) {
++ if (!capable(CAP_SYS_ADMIN)) {
+ /*
+ * can't change policy, except between SCHED_NORMAL
+ * and SCHED_BATCH:
+@@ -4110,10 +5138,19 @@ EXPORT_SYMBOL(yield);
+ */
+ void __sched io_schedule(void)
+ {
+- struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id());
++ struct runqueue *rq = this_rq();
++
++#ifdef CONFIG_VE
++ struct ve_struct *ve;
++ ve = current->ve_task_info.owner_env;
++#endif
+
+ atomic_inc(&rq->nr_iowait);
++ ve_nr_iowait_inc(ve, task_cpu(current));
++ nr_iowait_inc(smp_processor_id());
+ schedule();
++ nr_iowait_dec(smp_processor_id());
++ ve_nr_iowait_dec(ve, task_cpu(current));
+ atomic_dec(&rq->nr_iowait);
+ }
+
+@@ -4121,11 +5158,20 @@ EXPORT_SYMBOL(io_schedule);
+
+ long __sched io_schedule_timeout(long timeout)
+ {
+- struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id());
++ struct runqueue *rq = this_rq();
+ long ret;
+
++#ifdef CONFIG_VE
++ struct ve_struct *ve;
++ ve = current->ve_task_info.owner_env;
++#endif
++
+ atomic_inc(&rq->nr_iowait);
++ ve_nr_iowait_inc(ve, task_cpu(current));
++ nr_iowait_inc(smp_processor_id());
+ ret = schedule_timeout(timeout);
++ nr_iowait_dec(smp_processor_id());
++ ve_nr_iowait_dec(ve, task_cpu(current));
+ atomic_dec(&rq->nr_iowait);
+ return ret;
+ }
+@@ -4248,15 +5294,9 @@ static void show_task(task_t *p)
+ else
+ printk("?");
+ #if (BITS_PER_LONG == 32)
+- if (state == TASK_RUNNING)
+- printk(" running ");
+- else
+- printk(" %08lX ", thread_saved_pc(p));
++ printk(" %08lX ", (unsigned long)p);
+ #else
+- if (state == TASK_RUNNING)
+- printk(" running task ");
+- else
+- printk(" %016lx ", thread_saved_pc(p));
++ printk(" %016lx ", (unsigned long)p);
+ #endif
+ #ifdef CONFIG_DEBUG_STACK_USAGE
+ {
+@@ -4295,26 +5335,41 @@ void show_state(void)
+ #if (BITS_PER_LONG == 32)
+ printk("\n"
+ " sibling\n");
+- printk(" task PC pid father child younger older\n");
++ printk(" task taskaddr pid father child younger older\n");
+ #else
+ printk("\n"
+ " sibling\n");
+- printk(" task PC pid father child younger older\n");
++ printk(" task taskaddr pid father child younger older\n");
+ #endif
+ read_lock(&tasklist_lock);
+- do_each_thread(g, p) {
++ do_each_thread_all(g, p) {
+ /*
+ * reset the NMI-timeout, listing all files on a slow
+ * console might take alot of time:
+ */
+ touch_nmi_watchdog();
+ show_task(p);
+- } while_each_thread(g, p);
++ } while_each_thread_all(g, p);
+
+ read_unlock(&tasklist_lock);
+ mutex_debug_show_all_locks();
+ }
+
++static void init_boot_vcpus(long cpu)
++{
++ if (vsched_vcpu(&idle_vsched, cpu) != NULL)
++ return;
++
++ if (__add_vcpu(&idle_vsched, cpu) != 0)
++ panic("Can't create idle vcpu %ld\n", cpu);
++
++ /* Also create vcpu for default_vsched */
++ if (__add_vcpu(&default_vsched, cpu) != 0)
++ panic("Can't create default vcpu %ld\n", cpu);
++
++ cpu_set(cpu, idle_vsched.pcpu_running_map);
++}
++
+ /**
+ * init_idle - set up an idle thread for a given CPU
+ * @idle: task in question
+@@ -4325,22 +5380,47 @@ void show_state(void)
+ */
+ void __devinit init_idle(task_t *idle, int cpu)
+ {
+- runqueue_t *rq = cpu_rq(cpu);
++ struct vcpu_scheduler *vsched;
++ vcpu_t vcpu;
++ runqueue_t *rq;
+ unsigned long flags;
+
++#ifdef CONFIG_SCHED_VCPU
++ init_boot_vcpus(cpu);
++#endif
++ vsched = &idle_vsched;
++ vcpu = vsched_vcpu(vsched, cpu);
++ rq = vcpu_rq(vcpu);
++
+ idle->timestamp = sched_clock();
+ idle->sleep_avg = 0;
+ idle->array = NULL;
+ idle->prio = MAX_PRIO;
+ idle->state = TASK_RUNNING;
+ idle->cpus_allowed = cpumask_of_cpu(cpu);
++ set_task_vsched(idle, &idle_vsched);
+ set_task_cpu(idle, cpu);
+
+ spin_lock_irqsave(&rq->lock, flags);
+- rq->curr = rq->idle = idle;
++ pcpu(cpu)->idle = idle;
++ rq->curr = idle;
+ #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
+ idle->oncpu = 1;
+ #endif
++ set_task_pcpu(idle, cpu);
++ set_task_vsched(idle, vsched);
++ set_task_vcpu(idle, vcpu);
++#ifdef CONFIG_SCHED_VCPU
++ /* the following code is very close to vcpu_get */
++ spin_lock(&fairsched_lock);
++ pcpu(cpu)->vcpu = vcpu;
++ pcpu(cpu)->vsched = vcpu->vsched;
++ list_move_tail(&vcpu->list, &vsched->running_list);
++ __set_bit(cpu, vsched->vcpu_running_map.bits);
++ __set_bit(cpu, vsched->pcpu_running_map.bits);
++ vcpu->running = 1;
++ spin_unlock(&fairsched_lock);
++#endif
+ spin_unlock_irqrestore(&rq->lock, flags);
+
+ /* Set the preempt count _outside_ the spinlocks! */
+@@ -4360,7 +5440,6 @@ void __devinit init_idle(task_t *idle, i
+ */
+ cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
+
+-#ifdef CONFIG_SMP
+ /*
+ * This is how migration works:
+ *
+@@ -4377,6 +5456,7 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
+ * 7) we wake up and the migration is done.
+ */
+
++#ifdef CONFIG_SMP
+ /*
+ * Change a given task's CPU affinity. Migrate the thread to a
+ * proper CPU and schedule it away if the CPU it's executing on
+@@ -4392,9 +5472,11 @@ int set_cpus_allowed(task_t *p, cpumask_
+ int ret = 0;
+ migration_req_t req;
+ runqueue_t *rq;
++ struct vcpu_scheduler *vsched;
+
++ vsched = task_vsched(p);
+ rq = task_rq_lock(p, &flags);
+- if (!cpus_intersects(new_mask, cpu_online_map)) {
++ if (!cpus_intersects(new_mask, vsched_vcpu_online_map(vsched))) {
+ ret = -EINVAL;
+ goto out;
+ }
+@@ -4404,7 +5486,8 @@ int set_cpus_allowed(task_t *p, cpumask_
+ if (cpu_isset(task_cpu(p), new_mask))
+ goto out;
+
+- if (migrate_task(p, any_online_cpu(new_mask), &req)) {
++ if (migrate_task(p, vsched_vcpu(vsched, any_online_cpu(new_mask)),
++ &req)) {
+ /* Need help from migration thread: drop lock and wait. */
+ task_rq_unlock(rq, &flags);
+ wake_up_process(rq->migration_thread);
+@@ -4418,6 +5501,7 @@ out:
+ }
+
+ EXPORT_SYMBOL_GPL(set_cpus_allowed);
++#endif
+
+ /*
+ * Move (not current) task off this cpu, onto dest cpu. We're doing
+@@ -4428,25 +5512,30 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed);
+ * So we race with normal scheduler movements, but that's OK, as long
+ * as the task is no longer on this CPU.
+ */
+-static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
++static void __migrate_task(struct task_struct *p, vcpu_t src_cpu, vcpu_t dest_cpu)
+ {
+ runqueue_t *rq_dest, *rq_src;
+
+- if (unlikely(cpu_is_offline(dest_cpu)))
++ if (unlikely(vcpu_is_offline(dest_cpu)))
+ return;
+
+- rq_src = cpu_rq(src_cpu);
+- rq_dest = cpu_rq(dest_cpu);
++#ifdef CONFIG_SCHED_VCPU
++ BUG_ON(vcpu_vsched(src_cpu) == &idle_vsched);
++#endif
++ rq_src = vcpu_rq(src_cpu);
++ rq_dest = vcpu_rq(dest_cpu);
+
+ double_rq_lock(rq_src, rq_dest);
+ /* Already moved. */
+- if (task_cpu(p) != src_cpu)
++ if (task_vcpu(p) != src_cpu)
+ goto out;
+ /* Affinity changed (again). */
+- if (!cpu_isset(dest_cpu, p->cpus_allowed))
++ if (!vcpu_isset(dest_cpu, p->cpus_allowed))
+ goto out;
+
+- set_task_cpu(p, dest_cpu);
++ BUG_ON(task_running(rq_src, p));
++ set_task_vsched(p, vcpu_vsched(dest_cpu));
++ set_task_vcpu(p, dest_cpu);
+ if (p->array) {
+ /*
+ * Sync timestamp with rq_dest's before activating.
+@@ -4474,9 +5563,9 @@ out:
+ static int migration_thread(void *data)
+ {
+ runqueue_t *rq;
+- int cpu = (long)data;
++ vcpu_t cpu = (vcpu_t)data;
+
+- rq = cpu_rq(cpu);
++ rq = vcpu_rq(cpu);
+ BUG_ON(rq->migration_thread != current);
+
+ set_current_state(TASK_INTERRUPTIBLE);
+@@ -4488,15 +5577,17 @@ static int migration_thread(void *data)
+
+ spin_lock_irq(&rq->lock);
+
+- if (cpu_is_offline(cpu)) {
++ if (vcpu_is_offline(cpu)) {
+ spin_unlock_irq(&rq->lock);
+ goto wait_to_die;
+ }
+
++#ifdef CONFIG_SMP
+ if (rq->active_balance) {
+ active_load_balance(rq, cpu);
+ rq->active_balance = 0;
+ }
++#endif
+
+ head = &rq->migration_queue;
+
+@@ -4529,14 +5620,16 @@ wait_to_die:
+ return 0;
+ }
+
+-#ifdef CONFIG_HOTPLUG_CPU
+ /* Figure out where task on dead CPU should go, use force if neccessary. */
+-static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk)
++static void move_task_off_dead_cpu(vcpu_t dead_cpu, struct task_struct *tsk)
+ {
+ int dest_cpu;
++ struct vcpu_scheduler *vsched;
+ cpumask_t mask;
+
++#if defined(CONFIG_HOTPLUG_CPU) && !defined(CONFIG_SCHED_VCPU)
+ /* On same node? */
++#error FIXME: wrong code
+ mask = node_to_cpumask(cpu_to_node(dead_cpu));
+ cpus_and(mask, mask, tsk->cpus_allowed);
+ dest_cpu = any_online_cpu(mask);
+@@ -4560,9 +5653,20 @@ static void move_task_off_dead_cpu(int d
+ "longer affine to cpu%d\n",
+ tsk->pid, tsk->comm, dead_cpu);
+ }
+- __migrate_task(tsk, dead_cpu, dest_cpu);
++#elif defined(CONFIG_SCHED_VCPU)
++ vsched = vcpu_vsched(dead_cpu);
++ mask = vsched_vcpu_online_map(vsched);
++ cpus_and(mask, mask, tsk->cpus_allowed);
++ dest_cpu = any_online_cpu(mask);
++
++ /* On any allowed CPU? */
++ if (dest_cpu == NR_CPUS)
++ dest_cpu = any_online_cpu(vsched_vcpu_online_map(vsched));
++#endif
++ __migrate_task(tsk, dead_cpu, vsched_vcpu(vsched, dest_cpu));
+ }
+
++#ifdef CONFIG_HOTPLUG_CPU
+ /*
+ * While a dead CPU has no uninterruptible tasks queued at this point,
+ * it might still have a nonzero ->nr_uninterruptible counter, because
+@@ -4582,25 +5686,30 @@ static void migrate_nr_uninterruptible(r
+ double_rq_unlock(rq_src, rq_dest);
+ local_irq_restore(flags);
+ }
++#endif
+
+ /* Run through task list and migrate tasks from the dead cpu. */
+-static void migrate_live_tasks(int src_cpu)
++static void migrate_live_tasks(vcpu_t src_cpu)
+ {
+ struct task_struct *tsk, *t;
+
++ BUG_ON(vcpu_isset(src_cpu, vsched_vcpu_online_map(vcpu_vsched(src_cpu))));
+ write_lock_irq(&tasklist_lock);
+
+- do_each_thread(t, tsk) {
++ do_each_thread_all(t, tsk) {
+ if (tsk == current)
+ continue;
++ if (tsk == vcpu_rq(src_cpu)->migration_thread)
++ continue;
+
+- if (task_cpu(tsk) == src_cpu)
++ if (task_vcpu(tsk) == src_cpu)
+ move_task_off_dead_cpu(src_cpu, tsk);
+- } while_each_thread(t, tsk);
++ } while_each_thread_all(t, tsk);
+
+ write_unlock_irq(&tasklist_lock);
+ }
+
++#ifdef CONFIG_HOTPLUG_CPU
+ /* Schedules idle task to be the next runnable task on current CPU.
+ * It does so by boosting its priority to highest possible and adding it to
+ * the _front_ of runqueue. Used by CPU offline code.
+@@ -4622,6 +5731,9 @@ void sched_idle_next(void)
+
+ __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
+ /* Add idle task to _front_ of it's priority queue */
++#ifdef CONFIG_SCHED_VCPU
++#error "FIXME: VCPU vs. HOTPLUG: fix the code below"
++#endif
+ __activate_idle_task(p, rq);
+
+ spin_unlock_irqrestore(&rq->lock, flags);
+@@ -4683,48 +5795,83 @@ static void migrate_dead_tasks(unsigned
+ }
+ #endif /* CONFIG_HOTPLUG_CPU */
+
++static void migration_thread_bind(struct task_struct *k, vcpu_t cpu)
++{
++ BUG_ON(k->state != TASK_INTERRUPTIBLE);
++ /* Must have done schedule() in kthread() before we set_task_cpu */
++ wait_task_inactive(k);
++
++ set_task_vsched(k, vcpu_vsched(cpu));
++ set_task_vcpu(k, cpu);
++ k->cpus_allowed = cpumask_of_cpu(cpu->id);
++}
++
++static void migration_thread_stop(runqueue_t *rq)
++{
++ struct task_struct *thread;
++
++ thread = rq->migration_thread;
++ if (thread == NULL)
++ return;
++
++ get_task_struct(thread);
++ kthread_stop(thread);
++
++	/* We MUST ensure that the do_exit() of the migration thread has
++	 * completed and that it will never be scheduled again before
++	 * vsched_destroy. A task with the PF_DEAD flag, once unscheduled,
++	 * will never receive CPU time again. */
++ while (!(thread->flags & PF_DEAD) || task_running(rq, thread))
++ yield();
++ put_task_struct(thread);
++
++ rq->migration_thread = NULL;
++}
++
+ /*
+ * migration_call - callback that gets triggered when a CPU is added.
+ * Here we can start up the necessary migration thread for the new CPU.
+ */
+-static int migration_call(struct notifier_block *nfb, unsigned long action,
++static int vmigration_call(struct notifier_block *nfb, unsigned long action,
+ void *hcpu)
+ {
+- int cpu = (long)hcpu;
++ vcpu_t cpu = (vcpu_t)hcpu;
+ struct task_struct *p;
+ struct runqueue *rq;
+ unsigned long flags;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+- p = kthread_create(migration_thread, hcpu, "migration/%d",cpu);
++ p = kthread_create(migration_thread, hcpu, "migration/%d/%d",
++ vsched_id(vcpu_vsched(cpu)), cpu->id);
+ if (IS_ERR(p))
+ return NOTIFY_BAD;
+ p->flags |= PF_NOFREEZE;
+- kthread_bind(p, cpu);
+- /* Must be high prio: stop_machine expects to yield to it. */
++
++ migration_thread_bind(p, cpu);
+ rq = task_rq_lock(p, &flags);
++ /* Must be high prio: stop_machine expects to yield to it. */
+ __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
+ task_rq_unlock(rq, &flags);
+- cpu_rq(cpu)->migration_thread = p;
++ vcpu_rq(cpu)->migration_thread = p;
+ break;
+ case CPU_ONLINE:
+ /* Strictly unneccessary, as first user will wake it. */
+- wake_up_process(cpu_rq(cpu)->migration_thread);
++ wake_up_process(vcpu_rq(cpu)->migration_thread);
+ break;
+-#ifdef CONFIG_HOTPLUG_CPU
++#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_SCHED_VCPU)
++#error "FIXME: CPU down code doesn't work yet with VCPUs"
++#endif
+ case CPU_UP_CANCELED:
+ /* Unbind it from offline cpu so it can run. Fall thru. */
+- kthread_bind(cpu_rq(cpu)->migration_thread,
+- any_online_cpu(cpu_online_map));
+- kthread_stop(cpu_rq(cpu)->migration_thread);
+- cpu_rq(cpu)->migration_thread = NULL;
++ migration_thread_bind(vcpu_rq(cpu)->migration_thread, this_vcpu());
++ migration_thread_stop(vcpu_rq(cpu));
+ break;
+ case CPU_DEAD:
+ migrate_live_tasks(cpu);
+- rq = cpu_rq(cpu);
+- kthread_stop(rq->migration_thread);
+- rq->migration_thread = NULL;
++ rq = vcpu_rq(cpu);
++ migration_thread_stop(rq);
++#ifdef CONFIG_HOTPLUG_CPU
+ /* Idle task back to normal (off runqueue, low prio) */
+ rq = task_rq_lock(rq->idle, &flags);
+ deactivate_task(rq->idle, rq);
+@@ -4734,6 +5881,7 @@ static int migration_call(struct notifie
+ task_rq_unlock(rq, &flags);
+ migrate_nr_uninterruptible(rq);
+ BUG_ON(rq->nr_running != 0);
++#endif
+
+ /* No need to migrate the tasks: it was best-effort if
+ * they didn't do lock_cpu_hotplug(). Just wake up
+@@ -4748,11 +5896,19 @@ static int migration_call(struct notifie
+ }
+ spin_unlock_irq(&rq->lock);
+ break;
+-#endif
+ }
+ return NOTIFY_OK;
+ }
+
++static int migration_call(struct notifier_block *nfb, unsigned long action,
++ void *hcpu)
++{
++ if (action == CPU_UP_PREPARE)
++ init_boot_vcpus((long)hcpu);
++ /* we need to translate pcpu to vcpu */
++ return vmigration_call(nfb, action, vsched_default_vcpu((long)hcpu));
++}
++
+ /* Register at highest priority so that task migration (migrate_all_tasks)
+ * happens before everything else.
+ */
+@@ -4770,7 +5926,6 @@ int __init migration_init(void)
+ register_cpu_notifier(&migration_notifier);
+ return 0;
+ }
+-#endif
+
+ #ifdef CONFIG_SMP
+ #undef SCHED_DOMAIN_DEBUG
+@@ -4798,7 +5953,7 @@ static void sched_domain_debug(struct sc
+ printk(KERN_DEBUG);
+ for (i = 0; i < level + 1; i++)
+ printk(" ");
+- printk("domain %d: ", level);
++ printk("domain %d, flags %x: ", level, sd->flags);
+
+ if (!(sd->flags & SD_LOAD_BALANCE)) {
+ printk("does not load-balance\n");
+@@ -4923,7 +6078,7 @@ static int sd_parent_degenerate(struct s
+ */
+ static void cpu_attach_domain(struct sched_domain *sd, int cpu)
+ {
+- runqueue_t *rq = cpu_rq(cpu);
++ runqueue_t *rq = vcpu_rq(vsched_default_vcpu(cpu));
+ struct sched_domain *tmp;
+
+ /* Remove the sched domains which do not contribute to scheduling. */
+@@ -4940,6 +6095,7 @@ static void cpu_attach_domain(struct sch
+
+ sched_domain_debug(sd, cpu);
+
++ rcu_assign_pointer(pcpu(cpu)->sd, sd);
+ rcu_assign_pointer(rq->sd, sd);
+ }
+
+@@ -5118,7 +6274,7 @@ static unsigned long domain_distance(int
+ unsigned long distance = 0;
+ struct sched_domain *sd;
+
+- for_each_domain(cpu1, sd) {
++ for_each_pdomain(pcpu(cpu1)->sd, sd) {
+ WARN_ON(!cpu_isset(cpu1, sd->span));
+ if (cpu_isset(cpu2, sd->span))
+ return distance;
+@@ -5440,7 +6596,7 @@ static void calibrate_migration_costs(co
+ */
+ for_each_cpu_mask(cpu, *cpu_map) {
+ distance = 0;
+- for_each_domain(cpu, sd) {
++ for_each_pdomain(pcpu(cpu)->sd, sd) {
+ sd->cache_hot_time = migration_cost[distance];
+ distance++;
+ }
+@@ -6012,42 +7168,398 @@ int in_sched_functions(unsigned long add
+ && addr < (unsigned long)__sched_text_end);
+ }
+
+-void __init sched_init(void)
++static void init_rq(struct runqueue *rq, int cpu)
++{
++ int j, k;
++ prio_array_t *array;
++
++ spin_lock_init(&rq->lock);
++ rq->nr_running = 0;
++ rq->active = rq->arrays;
++ rq->expired = rq->arrays + 1;
++ rq->best_expired_prio = MAX_PRIO;
++
++#ifdef CONFIG_SMP
++ rq->sd = NULL;
++ for (j = 0; j < 3; j++)
++ rq->cpu_load[j] = 0;
++ rq->active_balance = 0;
++#endif
++ rq->push_cpu = 0;
++ rq->migration_thread = NULL;
++ INIT_LIST_HEAD(&rq->migration_queue);
++ rq->cpu = cpu;
++ atomic_set(&rq->nr_iowait, 0);
++
++ for (j = 0; j < 2; j++) {
++ array = rq->arrays + j;
++ for (k = 0; k < MAX_PRIO; k++) {
++ INIT_LIST_HEAD(array->queue + k);
++ __clear_bit(k, array->bitmap);
++ }
++ // delimiter for bitsearch
++ __set_bit(MAX_PRIO, array->bitmap);
++ }
++}
++
++#if defined(CONFIG_SCHED_VCPU) || defined(CONFIG_FAIRSCHED)
++static void init_vcpu(vcpu_t vcpu, int id)
++{
++ memset(vcpu, 0, sizeof(struct vcpu_info));
++ vcpu->id = id;
++#ifdef CONFIG_SCHED_VCPU
++ vcpu->last_pcpu = id;
++#endif
++ init_rq(vcpu_rq(vcpu), id);
++}
++
++/* both rq and vsched lock should be taken */
++static void __install_vcpu(struct vcpu_scheduler *vsched, vcpu_t vcpu)
++{
++ int id;
++
++ id = vcpu->id;
++ vcpu->vsched = vsched;
++ vsched->vcpu[id] = vcpu;
++ vcpu->last_pcpu = id;
++ wmb();
++	/* FIXME: probably the locking should be reworked, e.g.
++	   we don't have a corresponding rmb(), so we need to update the mask
++	   only after a quiescent state */
++ /* init_boot_vcpu() should be remade if RCU is used here */
++ list_add(&vcpu->list, &vsched->idle_list);
++ cpu_set(id, vsched->vcpu_online_map);
++ vsched->num_online_vcpus++;
++}
++
++static int install_vcpu(vcpu_t vcpu, struct vcpu_scheduler *vsched)
+ {
+ runqueue_t *rq;
+- int i, j, k;
++ unsigned long flags;
++ int res = 0;
+
+- for_each_cpu(i) {
+- prio_array_t *array;
++ rq = vcpu_rq(vcpu);
++ spin_lock_irqsave(&rq->lock, flags);
++ spin_lock(&fairsched_lock);
+
+- rq = cpu_rq(i);
+- spin_lock_init(&rq->lock);
+- rq->nr_running = 0;
+- rq->active = rq->arrays;
+- rq->expired = rq->arrays + 1;
+- rq->best_expired_prio = MAX_PRIO;
++ if (vsched->vcpu[vcpu->id] != NULL)
++ res = -EBUSY;
++ else
++ __install_vcpu(vsched, vcpu);
+
+-#ifdef CONFIG_SMP
+- rq->sd = NULL;
+- for (j = 1; j < 3; j++)
+- rq->cpu_load[j] = 0;
+- rq->active_balance = 0;
+- rq->push_cpu = 0;
+- rq->migration_thread = NULL;
+- INIT_LIST_HEAD(&rq->migration_queue);
+-#endif
+- atomic_set(&rq->nr_iowait, 0);
+-
+- for (j = 0; j < 2; j++) {
+- array = rq->arrays + j;
+- for (k = 0; k < MAX_PRIO; k++) {
+- INIT_LIST_HEAD(array->queue + k);
+- __clear_bit(k, array->bitmap);
+- }
+- // delimiter for bitsearch
+- __set_bit(MAX_PRIO, array->bitmap);
++ spin_unlock(&fairsched_lock);
++ spin_unlock_irqrestore(&rq->lock, flags);
++ return res;
++}
++
++static int __add_vcpu(struct vcpu_scheduler *vsched, int id)
++{
++ vcpu_t vcpu;
++ int res;
++
++ res = -ENOMEM;
++ vcpu = kmalloc(sizeof(struct vcpu_info), GFP_KERNEL);
++ if (vcpu == NULL)
++ goto out;
++
++ init_vcpu(vcpu, id);
++ vcpu_rq(vcpu)->curr = this_pcpu()->idle;
++ res = install_vcpu(vcpu, vsched);
++ if (res < 0)
++ goto out_free;
++ return 0;
++
++out_free:
++ kfree(vcpu);
++out:
++ return res;
++}
++
++void vsched_init(struct vcpu_scheduler *vsched, int id)
++{
++ memset(vsched, 0, sizeof(*vsched));
++
++ INIT_LIST_HEAD(&vsched->idle_list);
++ INIT_LIST_HEAD(&vsched->active_list);
++ INIT_LIST_HEAD(&vsched->running_list);
++ vsched->num_online_vcpus = 0;
++ vsched->vcpu_online_map = CPU_MASK_NONE;
++ vsched->vcpu_running_map = CPU_MASK_NONE;
++ vsched->pcpu_running_map = CPU_MASK_NONE;
++ vsched->id = id;
++}
++
++#ifdef CONFIG_FAIRSCHED
++
++/* No locks supposed to be held */
++static void vsched_del_vcpu(vcpu_t vcpu);
++static int vsched_add_vcpu(struct vcpu_scheduler *vsched)
++{
++ int res, err;
++ vcpu_t vcpu;
++ int id;
++ static DECLARE_MUTEX(id_mutex);
++
++ down(&id_mutex);
++ id = find_first_zero_bit(vsched->vcpu_online_map.bits, NR_CPUS);
++ if (id >= NR_CPUS) {
++ err = -EBUSY;
++ goto out_up;
++ }
++
++ err = __add_vcpu(vsched, id);
++ if (err < 0)
++ goto out_up;
++
++ vcpu = vsched_vcpu(vsched, id);
++ err = -ENOMEM;
++
++ res = vmigration_call(&migration_notifier, CPU_UP_PREPARE, vcpu);
++ if (res != NOTIFY_OK)
++ goto out_del_up;
++
++ res = vmigration_call(&migration_notifier, CPU_ONLINE, vcpu);
++ if (res != NOTIFY_OK)
++ goto out_cancel_del_up;
++
++ err = 0;
++
++out_up:
++ up(&id_mutex);
++ return err;
++
++out_cancel_del_up:
++ vmigration_call(&migration_notifier, CPU_UP_CANCELED, vcpu);
++out_del_up:
++ vsched_del_vcpu(vcpu);
++ goto out_up;
++}
++
++static void vsched_del_vcpu(vcpu_t vcpu)
++{
++ struct vcpu_scheduler *vsched;
++ runqueue_t *rq;
++
++ vsched = vcpu_vsched(vcpu);
++ rq = vcpu_rq(vcpu);
++
++ spin_lock_irq(&rq->lock);
++ spin_lock(&fairsched_lock);
++ cpu_clear(vcpu->id, vsched->vcpu_online_map);
++ vsched->num_online_vcpus--;
++ spin_unlock(&fairsched_lock);
++ spin_unlock_irq(&rq->lock);
++
++ /*
++ * FIXME: ideas for VCPU hotplug:
++ *
++ * - push_cpu should be checked/cleanuped
++ * - serialization
++ */
++
++ /*
++	 * all tasks should migrate from this VCPU somewhere;
++	 * also, from this moment the VCPU is offline, so migration_thread
++	 * won't accept any new tasks...
++ */
++ vmigration_call(&migration_notifier, CPU_DEAD, vcpu);
++ BUG_ON(rq->nr_running != 0);
++
++	/* vcpu_put() is called after deactivate_task(). This loop makes sure
++	 * that vcpu_put() has finished and the vcpu can be freed. */
++ while ((volatile int)vcpu->running)
++ yield();
++
++ BUG_ON(vcpu->active); /* should be in idle_list */
++ BUG_ON(vcpu_rq(vcpu)->prev_mm != NULL);
++
++ spin_lock_irq(&fairsched_lock);
++ list_del(&vcpu->list);
++ vsched_vcpu(vsched, vcpu->id) = NULL;
++ spin_unlock_irq(&fairsched_lock);
++
++ kfree(vcpu);
++}
++
++int vsched_mvpr(struct task_struct *p, struct vcpu_scheduler *vsched)
++{
++ vcpu_t dest_vcpu;
++ int id;
++ int res;
++
++ res = 0;
++ while(1) {
++ /* FIXME: we assume here that a vcpu can't disappear on the fly */
++ for(id = first_cpu(vsched->vcpu_online_map); id < NR_CPUS;
++ id++) {
++ if ((vsched->vcpu[id] != NULL) &&
++ !vcpu_isset(vsched->vcpu[id], p->cpus_allowed))
++ continue;
++ else
++ break;
++ }
++ if (id >= NR_CPUS) {
++ res = -EINVAL;
++ goto out;
++ }
++
++ dest_vcpu = vsched_vcpu(vsched, id);
++ while(1) {
++ sched_migrate_task(p, dest_vcpu);
++ if (task_vsched_id(p) == vsched_id(vsched))
++ goto out;
++ if (!vcpu_isset(vsched->vcpu[id], p->cpus_allowed))
++ break;
+ }
+ }
++out:
++ return res;
++}
++
++void vsched_fairsched_link(struct vcpu_scheduler *vsched,
++ struct fairsched_node *node)
++{
++ vsched->node = node;
++ node->vsched = vsched;
++}
++
++void vsched_fairsched_unlink(struct vcpu_scheduler *vsched,
++ struct fairsched_node *node)
++{
++ vsched->node = NULL;
++ node->vsched = NULL;
++}
++
++int vsched_create(int id, struct fairsched_node *node)
++{
++ struct vcpu_scheduler *vsched;
++ int i, res;
++
++ vsched = kmalloc(sizeof(*vsched), GFP_KERNEL);
++ if (vsched == NULL)
++ return -ENOMEM;
++
++ vsched_init(vsched, node->id);
++ vsched_fairsched_link(vsched, node);
++
++ for(i = 0; i < num_online_cpus(); i++) {
++ res = vsched_add_vcpu(vsched);
++ if (res < 0)
++ goto err_add;
++ }
++ return 0;
++
++err_add:
++ vsched_destroy(vsched);
++ return res;
++}
++
++int vsched_destroy(struct vcpu_scheduler *vsched)
++{
++ vcpu_t vcpu;
++
++ if (vsched == NULL)
++ return 0;
++
++ spin_lock_irq(&fairsched_lock);
++ while(1) {
++ if (!list_empty(&vsched->running_list))
++ vcpu = list_entry(vsched->running_list.next,
++ struct vcpu_info, list);
++ else if (!list_empty(&vsched->active_list))
++ vcpu = list_entry(vsched->active_list.next,
++ struct vcpu_info, list);
++ else if (!list_empty(&vsched->idle_list))
++ vcpu = list_entry(vsched->idle_list.next,
++ struct vcpu_info, list);
++ else
++ break;
++ spin_unlock_irq(&fairsched_lock);
++ vsched_del_vcpu(vcpu);
++ spin_lock_irq(&fairsched_lock);
++ }
++ if (vsched->num_online_vcpus)
++ goto err_busy;
++ spin_unlock_irq(&fairsched_lock);
++
++ vsched_fairsched_unlink(vsched, vsched->node);
++ kfree(vsched);
++ return 0;
++
++err_busy:
++ printk(KERN_ERR "BUG in vsched_destroy, vsched id %d\n",
++ vsched->id);
++ spin_unlock_irq(&fairsched_lock);
++ return -EBUSY;
++
++}
++#endif /* defined(CONFIG_FAIRSCHED) */
++#endif /* defined(CONFIG_SCHED_VCPU) || defined(CONFIG_FAIRSCHED) */
++
++static void init_boot_vcpu(void)
++{
++ int res;
++
++ /*
++ * We set up boot_vcpu and its runqueue until init_idle() happens
++ * on cpu0. This is required since timer interrupts can happen
++ * between sched_init() and init_idle().
++ */
++ init_vcpu(&boot_idle_vcpu, 0);
++ vcpu_rq(&boot_idle_vcpu)->curr = current;
++ res = install_vcpu(&boot_idle_vcpu, &idle_vsched);
++ if (res < 0)
++ panic("Can't install boot idle vcpu");
++
++ init_vcpu(&boot_vcpu, 0);
++ vcpu_rq(&boot_vcpu)->curr = current;
++ res = install_vcpu(&boot_vcpu, &default_vsched);
++ if (res < 0)
++ panic("Can't install boot vcpu");
++
++ this_pcpu()->vcpu = &boot_idle_vcpu;
++ this_pcpu()->vsched = &idle_vsched;
++}
++
++static void init_pcpu(int id)
++{
++ struct pcpu_info *pcpu;
++
++ pcpu = pcpu(id);
++ pcpu->id = id;
++#ifdef CONFIG_SMP
++ pcpu->sd = NULL;
++#endif
++
++#ifndef CONFIG_SCHED_VCPU
++ init_vcpu(vcpu(id), id);
++#endif
++}
++
++static void init_pcpus(void)
++{
++ int i;
++ for (i = 0; i < NR_CPUS; i++)
++ init_pcpu(i);
++}
++
++void __init sched_init(void)
++{
++ init_pcpus();
++#if defined(CONFIG_SCHED_VCPU)
++ vsched_init(&idle_vsched, -1);
++ vsched_init(&default_vsched, 0);
++#if defined(CONFIG_FAIRSCHED)
++ fairsched_init_early();
++ vsched_fairsched_link(&idle_vsched, &fairsched_idle_node);
++ vsched_fairsched_link(&default_vsched, &fairsched_init_node);
++#endif
++ init_boot_vcpu();
++#else
++#if defined(CONFIG_FAIRSCHED)
++ fairsched_init_early();
++#endif
++#endif
+
+ /*
+ * The boot idle thread does lazy MMU switching as well:
+@@ -6064,6 +7576,149 @@ void __init sched_init(void)
+ init_idle(current, smp_processor_id());
+ }
+
++#ifdef CONFIG_SCHED_VCPU
++static void show_vcpu_list(struct vcpu_scheduler *vsched, struct list_head *lh)
++{
++ cpumask_t m;
++ vcpu_t vcpu;
++ int i;
++
++ cpus_clear(m);
++ list_for_each_entry(vcpu, lh, list)
++ cpu_set(vcpu->id, m);
++
++ for (i = 0; i < NR_CPUS; i++)
++ if (cpu_isset(i, m))
++ printk("%d ", i);
++}
++
++#define PRINT(s, sz, fmt...) \
++ do { \
++ int __out; \
++ __out = scnprintf(*s, *sz, fmt); \
++ *s += __out; \
++ *sz -= __out; \
++ } while(0)
++
++static void show_rq_array(prio_array_t *array, char *header, char **s, int *sz)
++{
++ struct list_head *list;
++ task_t *p;
++ int k, h;
++
++ h = 0;
++ for (k = 0; k < MAX_PRIO; k++) {
++ list = array->queue + k;
++ if (list_empty(list))
++ continue;
++
++ if (!h) {
++ PRINT(s, sz, header);
++ h = 1;
++ }
++
++ PRINT(s, sz, " prio %d (", k);
++ list_for_each_entry(p, list, run_list)
++ PRINT(s, sz, "%s[%d] ", p->comm, p->pid);
++ PRINT(s, sz, ")");
++ }
++ if (h)
++ PRINT(s, sz, "\n");
++}
++
++static void show_vcpu(vcpu_t vcpu)
++{
++ runqueue_t *rq;
++ char buf[1024], *s;
++ unsigned long flags;
++ int sz;
++
++ if (vcpu == NULL)
++ return;
++
++ rq = vcpu_rq(vcpu);
++ spin_lock_irqsave(&rq->lock, flags);
++ printk(" vcpu %d: last_pcpu %d, state %s%s\n",
++ vcpu->id, vcpu->last_pcpu,
++ vcpu->active ? "A" : "",
++ vcpu->running ? "R" : "");
++
++ printk(" rq: running %lu, load {%lu,%lu,%lu}, sw %Lu, sd %p, curr %p\n",
++ rq->nr_running,
++#ifdef CONFIG_SMP
++ rq->cpu_load[0], rq->cpu_load[1], rq->cpu_load[2],
++#else
++ 0LU, 0LU, 0LU,
++#endif
++ rq->nr_switches,
++#ifdef CONFIG_SMP
++ rq->sd,
++#else
++ NULL,
++#endif
++ rq->curr
++ );
++
++ s = buf;
++ sz = sizeof(buf) - 1;
++
++ show_rq_array(rq->active, " active:", &s, &sz);
++ show_rq_array(rq->expired, " expired:", &s, &sz);
++ spin_unlock_irqrestore(&rq->lock, flags);
++
++ *s = 0;
++ printk(buf);
++}
++
++static inline void fairsched_show_node(struct vcpu_scheduler *vsched)
++{
++#ifdef CONFIG_FAIRSCHED
++ struct fairsched_node *node;
++
++ node = vsched->node;
++ printk("fsnode: ready %d run %d cpu %d vsched %p, pcpu %d\n",
++ node->nr_ready, node->nr_runnable, node->nr_pcpu,
++ node->vsched, smp_processor_id());
++#endif
++}
++
++static void __show_vsched(struct vcpu_scheduler *vsched)
++{
++ char mask[NR_CPUS + 1];
++ int i;
++ unsigned long flags;
++
++ spin_lock_irqsave(&fairsched_lock, flags);
++ printk("vsched id=%d\n", vsched_id(vsched));
++ fairsched_show_node(vsched);
++
++ printk(" idle cpus ");
++ show_vcpu_list(vsched, &vsched->idle_list);
++ printk("; active cpus ");
++ show_vcpu_list(vsched, &vsched->active_list);
++ printk("; running cpus ");
++ show_vcpu_list(vsched, &vsched->running_list);
++ printk("\n");
++
++ cpumask_scnprintf(mask, NR_CPUS, vsched->vcpu_online_map);
++ printk(" num_online_cpus=%d, mask=%s (w=%d)\n",
++ vsched->num_online_vcpus, mask,
++ cpus_weight(vsched->vcpu_online_map));
++ spin_unlock_irqrestore(&fairsched_lock, flags);
++
++ for (i = 0; i < NR_CPUS; i++)
++ show_vcpu(vsched->vcpu[i]);
++}
++
++void show_vsched(void)
++{
++ oops_in_progress = 1;
++ __show_vsched(&idle_vsched);
++ __show_vsched(&default_vsched);
++ oops_in_progress = 0;
++}
++#endif /* CONFIG_SCHED_VCPU */
++
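The PRINT() helper above appends formatted text into a fixed buffer by advancing the cursor and shrinking the remaining size after every write, so dumping many runqueue entries can never overflow the buffer. A small userspace sketch of the same bounded-append pattern, using snprintf() with an explicit clamp to emulate scnprintf() (buf_append is an illustrative name):

#include <stdio.h>
#include <string.h>

/* Append formatted text, advancing the cursor and shrinking the
 * remaining space, as the PRINT() macro does with scnprintf(). */
static void buf_append(char **s, int *sz, const char *text)
{
	int out;

	if (*sz <= 0)
		return;
	out = snprintf(*s, *sz, "%s", text);
	if (out > *sz - 1)
		out = *sz - 1;	/* snprintf reports the untruncated length */
	*s += out;
	*sz -= out;
}

int main(void)
{
	char buf[16];
	char *s = buf;
	int sz = sizeof(buf);

	buf_append(&s, &sz, "active: ");
	buf_append(&s, &sz, "prio 120 (init[1])");
	printf("%s\n", buf);	/* truncated safely, always NUL-terminated */
	return 0;
}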
+ #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
+ void __might_sleep(char *file, int line)
+ {
+@@ -6095,7 +7750,7 @@ void normalize_rt_tasks(void)
+ runqueue_t *rq;
+
+ read_lock_irq(&tasklist_lock);
+- for_each_process (p) {
++ for_each_process_all (p) {
+ if (!rt_task(p))
+ continue;
+
+@@ -6136,7 +7791,7 @@ void normalize_rt_tasks(void)
+ */
+ task_t *curr_task(int cpu)
+ {
+- return cpu_curr(cpu);
++ return vcpu_rq(pcpu(cpu)->vcpu)->curr;
+ }
+
+ /**
+@@ -6156,7 +7811,7 @@ task_t *curr_task(int cpu)
+ */
+ void set_curr_task(int cpu, task_t *p)
+ {
+- cpu_curr(cpu) = p;
++ vcpu_rq(pcpu(cpu)->vcpu)->curr = p;
+ }
+
+ #endif
+diff -upr linux-2.6.16.orig/kernel/signal.c linux-2.6.16-026test015/kernel/signal.c
+--- linux-2.6.16.orig/kernel/signal.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/kernel/signal.c 2006-07-04 14:41:39.000000000 +0400
+@@ -25,17 +25,20 @@
+ #include <linux/posix-timers.h>
+ #include <linux/signal.h>
+ #include <linux/audit.h>
++#include <linux/kmem_cache.h>
+ #include <linux/capability.h>
+ #include <asm/param.h>
+ #include <asm/uaccess.h>
+ #include <asm/unistd.h>
+ #include <asm/siginfo.h>
++#include <ub/ub_misc.h>
+
+ /*
+ * SLAB caches for signal bits.
+ */
+
+-static kmem_cache_t *sigqueue_cachep;
++kmem_cache_t *sigqueue_cachep;
++EXPORT_SYMBOL_GPL(sigqueue_cachep);
+
+ /*
+ * In POSIX a signal is sent either to a specific thread (Linux task)
+@@ -221,6 +224,7 @@ fastcall void recalc_sigpending_tsk(stru
+ else
+ clear_tsk_thread_flag(t, TIF_SIGPENDING);
+ }
++EXPORT_SYMBOL_GPL(recalc_sigpending_tsk);
+
+ void recalc_sigpending(void)
+ {
+@@ -271,8 +275,13 @@ static struct sigqueue *__sigqueue_alloc
+ atomic_inc(&t->user->sigpending);
+ if (override_rlimit ||
+ atomic_read(&t->user->sigpending) <=
+- t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur)
++ t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) {
+ q = kmem_cache_alloc(sigqueue_cachep, flags);
++ if (q && ub_siginfo_charge(q, get_task_ub(t))) {
++ kmem_cache_free(sigqueue_cachep, q);
++ q = NULL;
++ }
++ }
+ if (unlikely(q == NULL)) {
+ atomic_dec(&t->user->sigpending);
+ } else {
+@@ -289,6 +298,7 @@ static void __sigqueue_free(struct sigqu
+ return;
+ atomic_dec(&q->user->sigpending);
+ free_uid(q->user);
++ ub_siginfo_uncharge(q);
+ kmem_cache_free(sigqueue_cachep, q);
+ }
+
+@@ -378,8 +388,11 @@ void __exit_signal(struct task_struct *t
+ wake_up_process(sig->group_exit_task);
+ sig->group_exit_task = NULL;
+ }
+- if (tsk == sig->curr_target)
++ if (tsk == sig->curr_target) {
+ sig->curr_target = next_thread(tsk);
++ if (tsk == sig->curr_target)
++ sig->curr_target = NULL;
++ }
+ tsk->signal = NULL;
+ /*
+ * Accumulate here the counters for all threads but the
+@@ -524,7 +537,16 @@ static int __dequeue_signal(struct sigpe
+ {
+ int sig = 0;
+
+- sig = next_signal(pending, mask);
++ /* SIGKILL must have priority, otherwise it is quite easy
++ * to create an unkillable process by sending a sig < SIGKILL
++ * to itself */
++ if (unlikely(sigismember(&pending->signal, SIGKILL))) {
++ if (!sigismember(mask, SIGKILL))
++ sig = SIGKILL;
++ }
++
++ if (likely(!sig))
++ sig = next_signal(pending, mask);
+ if (sig) {
+ if (current->notifier) {
+ if (sigismember(current->notifier_mask, sig)) {
+@@ -618,6 +640,7 @@ void signal_wake_up(struct task_struct *
+ if (!wake_up_state(t, mask))
+ kick_process(t);
+ }
++EXPORT_SYMBOL_GPL(signal_wake_up);
+
+ /*
+ * Remove signals in mask from the pending set and queue.
+@@ -838,7 +861,7 @@ static int send_signal(int sig, struct s
+ q->info.si_signo = sig;
+ q->info.si_errno = 0;
+ q->info.si_code = SI_USER;
+- q->info.si_pid = current->pid;
++ q->info.si_pid = virt_pid(current);
+ q->info.si_uid = current->uid;
+ break;
+ case (unsigned long) SEND_SIG_PRIV:
+@@ -975,7 +998,6 @@ __group_complete_signal(int sig, struct
+ if (t == NULL)
+ /* restart balancing at this thread */
+ t = p->signal->curr_target = p;
+- BUG_ON(t->tgid != p->tgid);
+
+ while (!wants_signal(sig, t)) {
+ t = next_thread(t);
+@@ -1159,13 +1181,18 @@ int __kill_pg_info(int sig, struct sigin
+ if (pgrp <= 0)
+ return -EINVAL;
+
++ /* Use __vpid_to_pid(). This function is used under write_lock
++ * tasklist_lock. */
++ if (is_virtual_pid(pgrp))
++ pgrp = __vpid_to_pid(pgrp);
++
+ success = 0;
+ retval = -ESRCH;
+- do_each_task_pid(pgrp, PIDTYPE_PGID, p) {
++ do_each_task_pid_ve(pgrp, PIDTYPE_PGID, p) {
+ int err = group_send_sig_info(sig, info, p);
+ success |= !err;
+ retval = err;
+- } while_each_task_pid(pgrp, PIDTYPE_PGID, p);
++ } while_each_task_pid_ve(pgrp, PIDTYPE_PGID, p);
+ return success ? 0 : retval;
+ }
+
+@@ -1193,7 +1220,7 @@ kill_proc_info(int sig, struct siginfo *
+ read_lock(&tasklist_lock);
+ acquired_tasklist_lock = 1;
+ }
+- p = find_task_by_pid(pid);
++ p = find_task_by_pid_ve(pid);
+ error = -ESRCH;
+ if (p)
+ error = group_send_sig_info(sig, info, p);
+@@ -1214,7 +1241,7 @@ int kill_proc_info_as_uid(int sig, struc
+ return ret;
+
+ read_lock(&tasklist_lock);
+- p = find_task_by_pid(pid);
++ p = find_task_by_pid_ve(pid);
+ if (!p) {
+ ret = -ESRCH;
+ goto out_unlock;
+@@ -1253,8 +1280,8 @@ static int kill_something_info(int sig,
+ struct task_struct * p;
+
+ read_lock(&tasklist_lock);
+- for_each_process(p) {
+- if (p->pid > 1 && p->tgid != current->tgid) {
++ for_each_process_ve(p) {
++ if (virt_pid(p) > 1 && p->tgid != current->tgid) {
+ int err = group_send_sig_info(sig, info, p);
+ ++count;
+ if (err != -EPERM)
+@@ -1562,9 +1589,17 @@ void do_notify_parent(struct task_struct
+ BUG_ON(!tsk->ptrace &&
+ (tsk->group_leader != tsk || !thread_group_empty(tsk)));
+
++#ifdef CONFIG_VE
++ /* Allow to send only SIGCHLD from VE */
++ if (sig != SIGCHLD &&
++ tsk->ve_task_info.owner_env !=
++ tsk->parent->ve_task_info.owner_env)
++ sig = SIGCHLD;
++#endif
++
+ info.si_signo = sig;
+ info.si_errno = 0;
+- info.si_pid = tsk->pid;
++ info.si_pid = get_task_pid_ve(tsk, tsk->parent->ve_task_info.owner_env);
+ info.si_uid = tsk->uid;
+
+ /* FIXME: find out whether or not this is supposed to be c*time. */
+@@ -1629,7 +1664,7 @@ static void do_notify_parent_cldstop(str
+
+ info.si_signo = SIGCHLD;
+ info.si_errno = 0;
+- info.si_pid = tsk->pid;
++ info.si_pid = get_task_pid_ve(tsk, VE_TASK_INFO(parent)->owner_env);
+ info.si_uid = tsk->uid;
+
+ /* FIXME: find out whether or not this is supposed to be c*time. */
+@@ -1763,7 +1798,9 @@ finish_stop(int stop_count)
+ read_unlock(&tasklist_lock);
+
+ out:
++ set_stop_state(current);
+ schedule();
++ clear_stop_state(current);
+ /*
+ * Now we don't run again until continued.
+ */
+@@ -1940,11 +1977,13 @@ relock:
+ ptrace_signal_deliver(regs, cookie);
+
+ /* Let the debugger run. */
++ set_pn_state(current, PN_STOP_SIGNAL);
+ ptrace_stop(signr, signr, info);
++ clear_pn_state(current);
+
+- /* We're back. Did the debugger cancel the sig or group_exit? */
++ /* We're back. Did the debugger cancel the sig? */
+ signr = current->exit_code;
+- if (signr == 0 || current->signal->flags & SIGNAL_GROUP_EXIT)
++ if (signr == 0)
+ continue;
+
+ current->exit_code = 0;
+@@ -1957,7 +1996,7 @@ relock:
+ info->si_signo = signr;
+ info->si_errno = 0;
+ info->si_code = SI_USER;
+- info->si_pid = current->parent->pid;
++ info->si_pid = virt_pid(current->parent);
+ info->si_uid = current->parent->uid;
+ }
+
+@@ -1988,8 +2027,14 @@ relock:
+ continue;
+
+ /* Init gets no signals it doesn't want. */
+- if (current->pid == 1)
++ if (virt_pid(current) == 1) {
++ /* Allow SIGKILL for non-root VE */
++#ifdef CONFIG_VE
++ if (current->pid == 1 ||
++ signr != SIGKILL)
++#endif
+ continue;
++ }
+
+ if (sig_kernel_stop(signr)) {
+ /*
+@@ -2307,7 +2352,6 @@ sys_rt_sigtimedwait(const sigset_t __use
+
+ timeout = schedule_timeout_interruptible(timeout);
+
+- try_to_freeze();
+ spin_lock_irq(&current->sighand->siglock);
+ sig = dequeue_signal(current, &these, &info);
+ current->blocked = current->real_blocked;
+@@ -2340,7 +2384,7 @@ sys_kill(int pid, int sig)
+ info.si_signo = sig;
+ info.si_errno = 0;
+ info.si_code = SI_USER;
+- info.si_pid = current->tgid;
++ info.si_pid = virt_tgid(current);
+ info.si_uid = current->uid;
+
+ return kill_something_info(sig, &info, pid);
+@@ -2356,12 +2400,12 @@ static int do_tkill(int tgid, int pid, i
+ info.si_signo = sig;
+ info.si_errno = 0;
+ info.si_code = SI_TKILL;
+- info.si_pid = current->tgid;
++ info.si_pid = virt_tgid(current);
+ info.si_uid = current->uid;
+
+ read_lock(&tasklist_lock);
+- p = find_task_by_pid(pid);
+- if (p && (tgid <= 0 || p->tgid == tgid)) {
++ p = find_task_by_pid_ve(pid);
++ if (p && (tgid <= 0 || virt_tgid(p) == tgid)) {
+ error = check_kill_permission(sig, &info, p);
+ /*
+ * The null signal is a permissions and process existence
+diff -upr linux-2.6.16.orig/kernel/softirq.c linux-2.6.16-026test015/kernel/softirq.c
+--- linux-2.6.16.orig/kernel/softirq.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/kernel/softirq.c 2006-07-04 14:41:38.000000000 +0400
+@@ -13,10 +13,13 @@
+ #include <linux/mm.h>
+ #include <linux/notifier.h>
+ #include <linux/percpu.h>
++#include <linux/sysctl.h>
+ #include <linux/cpu.h>
+ #include <linux/kthread.h>
+ #include <linux/rcupdate.h>
+
++#include <ub/beancounter.h>
++
+ #include <asm/irq.h>
+ /*
+ - No shared variables, all the data are CPU local.
+@@ -44,6 +47,8 @@ EXPORT_SYMBOL(irq_stat);
+ static struct softirq_action softirq_vec[32] __cacheline_aligned_in_smp;
+
+ static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
++static DEFINE_PER_CPU(struct task_struct *, ksoftirqd_wakeup);
++static int ksoftirqd_stat[NR_CPUS];
+
+ /*
+ * we cannot loop indefinitely here to avoid userspace starvation,
+@@ -54,7 +59,7 @@ static DEFINE_PER_CPU(struct task_struct
+ static inline void wakeup_softirqd(void)
+ {
+ /* Interrupts are disabled: no need to stop preemption */
+- struct task_struct *tsk = __get_cpu_var(ksoftirqd);
++ struct task_struct *tsk = __get_cpu_var(ksoftirqd_wakeup);
+
+ if (tsk && tsk->state != TASK_RUNNING)
+ wake_up_process(tsk);
+@@ -73,10 +78,14 @@ static inline void wakeup_softirqd(void)
+
+ asmlinkage void __do_softirq(void)
+ {
++ struct user_beancounter *ub;
+ struct softirq_action *h;
+ __u32 pending;
+ int max_restart = MAX_SOFTIRQ_RESTART;
+ int cpu;
++ struct ve_struct *envid;
++
++ envid = set_exec_env(get_ve0());
+
+ pending = local_softirq_pending();
+
+@@ -90,6 +99,7 @@ restart:
+
+ h = softirq_vec;
+
++ ub = set_exec_ub(get_ub0());
+ do {
+ if (pending & 1) {
+ h->action(h);
+@@ -98,6 +108,7 @@ restart:
+ h++;
+ pending >>= 1;
+ } while (pending);
++ (void)set_exec_ub(ub);
+
+ local_irq_disable();
+
+@@ -108,6 +119,7 @@ restart:
+ if (pending)
+ wakeup_softirqd();
+
++ (void)set_exec_env(envid);
+ __local_bh_enable();
+ }
+
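The __do_softirq() hunk above brackets the handler loop with set_exec_env()/set_exec_ub(), switching to the host context (VE0/UB0) before running handlers and restoring the interrupted context afterwards, so softirq work is never charged to a container. A hedged userspace sketch of that save/switch/restore discipline, with set_exec_ctx() standing in for the real helpers:

#include <stdio.h>

struct ctx { const char *name; };

static struct ctx host_ctx  = { "ve0" };
static struct ctx guest_ctx = { "container" };
static struct ctx *current_ctx = &guest_ctx;

/* Stand-in for set_exec_env()/set_exec_ub(): switch the execution
 * context and hand back the old one so the caller can restore it. */
static struct ctx *set_exec_ctx(struct ctx *new_ctx)
{
	struct ctx *old = current_ctx;
	current_ctx = new_ctx;
	return old;
}

static void handle_pending_work(void)
{
	printf("handler runs in context %s\n", current_ctx->name);
}

static void do_softirq_like(void)
{
	struct ctx *saved;

	saved = set_exec_ctx(&host_ctx);	/* charge work to the host */
	handle_pending_work();
	(void)set_exec_ctx(saved);		/* restore the interrupted context */
}

int main(void)
{
	do_softirq_like();
	printf("back in context %s\n", current_ctx->name);
	return 0;
}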
+@@ -483,6 +495,52 @@ static int __devinit cpu_callback(struct
+ return NOTIFY_OK;
+ }
+
++static int proc_ksoftirqd(ctl_table *ctl, int write, struct file *filp,
++ void __user *buffer, size_t *lenp, loff_t *ppos)
++{
++ int ret, cpu;
++
++ ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
++ if (!write)
++ return ret;
++
++ for_each_online_cpu(cpu) {
++ per_cpu(ksoftirqd_wakeup, cpu) =
++ ksoftirqd_stat[cpu] ? per_cpu(ksoftirqd, cpu) : NULL;
++ }
++ return ret;
++}
++
++static int sysctl_ksoftirqd(ctl_table *table, int *name, int nlen,
++ void *oldval, size_t *oldlenp, void *newval, size_t newlen,
++ void **context)
++{
++ return -EINVAL;
++}
++
++static ctl_table debug_table[] = {
++ {
++ .ctl_name = 1246,
++ .procname = "ksoftirqd",
++ .data = ksoftirqd_stat,
++ .maxlen = sizeof(ksoftirqd_stat),
++ .mode = 0644,
++ .proc_handler = &proc_ksoftirqd,
++ .strategy = &sysctl_ksoftirqd
++ },
++ {0}
++};
++
++static ctl_table root_table[] = {
++ {
++ .ctl_name = CTL_DEBUG,
++ .procname = "debug",
++ .mode = 0555,
++ .child = debug_table
++ },
++ {0}
++};
++
+ static struct notifier_block __devinitdata cpu_nfb = {
+ .notifier_call = cpu_callback
+ };
+@@ -493,5 +551,6 @@ __init int spawn_ksoftirqd(void)
+ cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+ cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
+ register_cpu_notifier(&cpu_nfb);
++ register_sysctl_table(root_table, 0);
+ return 0;
+ }
+diff -upr linux-2.6.16.orig/kernel/stop_machine.c linux-2.6.16-026test015/kernel/stop_machine.c
+--- linux-2.6.16.orig/kernel/stop_machine.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/kernel/stop_machine.c 2006-07-04 14:41:39.000000000 +0400
+@@ -96,7 +96,7 @@ static int stop_machine(void)
+ stopmachine_state = STOPMACHINE_WAIT;
+
+ for_each_online_cpu(i) {
+- if (i == raw_smp_processor_id())
++ if (i == task_cpu(current))
+ continue;
+ ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL);
+ if (ret < 0)
+@@ -178,7 +178,7 @@ struct task_struct *__stop_machine_run(i
+
+ /* If they don't care which CPU fn runs on, bind to any online one. */
+ if (cpu == NR_CPUS)
+- cpu = raw_smp_processor_id();
++ cpu = task_cpu(current);
+
+ p = kthread_create(do_stop, &smdata, "kstopmachine");
+ if (!IS_ERR(p)) {
+diff -upr linux-2.6.16.orig/kernel/sys.c linux-2.6.16-026test015/kernel/sys.c
+--- linux-2.6.16.orig/kernel/sys.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/kernel/sys.c 2006-07-04 14:41:39.000000000 +0400
+@@ -11,6 +11,7 @@
+ #include <linux/mman.h>
+ #include <linux/smp_lock.h>
+ #include <linux/notifier.h>
++#include <linux/virtinfo.h>
+ #include <linux/reboot.h>
+ #include <linux/prctl.h>
+ #include <linux/init.h>
+@@ -236,6 +237,94 @@ int capable(int cap)
+ EXPORT_SYMBOL(capable);
+ #endif
+
++static DECLARE_MUTEX(virtinfo_sem);
++static struct vnotifier_block *virtinfo_chain[VIRT_TYPES];
++
++void virtinfo_notifier_register(int type, struct vnotifier_block *nb)
++{
++ struct vnotifier_block **p;
++
++ down(&virtinfo_sem);
++ for (p = &virtinfo_chain[type];
++ *p != NULL && nb->priority < (*p)->priority;
++ p = &(*p)->next);
++ nb->next = *p;
++ smp_wmb();
++ *p = nb;
++ up(&virtinfo_sem);
++}
++
++EXPORT_SYMBOL(virtinfo_notifier_register);
++
++struct virtinfo_cnt_struct {
++ volatile unsigned long exit[NR_CPUS];
++ volatile unsigned long entry;
++};
++static DEFINE_PER_CPU(struct virtinfo_cnt_struct, virtcnt);
++
++void virtinfo_notifier_unregister(int type, struct vnotifier_block *nb)
++{
++ struct vnotifier_block **p;
++ int entry_cpu, exit_cpu;
++ unsigned long cnt, ent;
++
++ down(&virtinfo_sem);
++ for (p = &virtinfo_chain[type]; *p != nb; p = &(*p)->next);
++ *p = nb->next;
++ smp_mb();
++
++ for_each_cpu_mask(entry_cpu, cpu_possible_map) {
++ while (1) {
++ cnt = 0;
++ for_each_cpu_mask(exit_cpu, cpu_possible_map)
++ cnt +=
++ per_cpu(virtcnt, entry_cpu).exit[exit_cpu];
++ smp_rmb();
++ ent = per_cpu(virtcnt, entry_cpu).entry;
++ if (cnt == ent)
++ break;
++ __set_current_state(TASK_UNINTERRUPTIBLE);
++ schedule_timeout(HZ / 100);
++ }
++ }
++ up(&virtinfo_sem);
++}
++
++EXPORT_SYMBOL(virtinfo_notifier_unregister);
++
++int virtinfo_notifier_call(int type, unsigned long n, void *data)
++{
++ int ret;
++ int entry_cpu, exit_cpu;
++ struct vnotifier_block *nb;
++
++ entry_cpu = get_cpu();
++ per_cpu(virtcnt, entry_cpu).entry++;
++ smp_wmb();
++ put_cpu();
++
++ nb = virtinfo_chain[type];
++ ret = NOTIFY_DONE;
++ while (nb)
++ {
++ ret = nb->notifier_call(nb, n, data, ret);
++ if(ret & NOTIFY_STOP_MASK) {
++ ret &= ~NOTIFY_STOP_MASK;
++ break;
++ }
++ nb = nb->next;
++ }
++
++ exit_cpu = get_cpu();
++ smp_wmb();
++ per_cpu(virtcnt, entry_cpu).exit[exit_cpu]++;
++ put_cpu();
++
++ return ret;
++}
++
++EXPORT_SYMBOL(virtinfo_notifier_call);
++
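virtinfo_notifier_unregister() above takes no lock on the call path; instead virtinfo_notifier_call() bumps a per-cpu entry counter before walking the chain and an exit counter afterwards, and unregistration waits until the summed exits catch up with the entries, i.e. until no in-flight caller can still see the removed block. A simplified, single-counter sketch of that quiescence test (illustration only, not the per-cpu implementation):

#include <stdio.h>

/* Simplified, single-CPU version of the entry/exit bookkeeping used
 * by virtinfo_notifier_call()/virtinfo_notifier_unregister(). */
static unsigned long entry_count;
static unsigned long exit_count;

static void call_chain(void)
{
	entry_count++;		/* mark that we are inside the chain */
	/* ... walk the notifier chain here ... */
	exit_count++;		/* mark that we have left it */
}

/* Unregister may free the block only once every caller that entered has left. */
static int chain_is_quiescent(void)
{
	return exit_count == entry_count;
}

int main(void)
{
	call_chain();
	call_chain();
	printf("quiescent: %s\n", chain_is_quiescent() ? "yes" : "no");
	return 0;
}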
+ static int set_one_prio(struct task_struct *p, int niceval, int error)
+ {
+ int no_nice;
+@@ -281,17 +370,19 @@ asmlinkage long sys_setpriority(int whic
+ switch (which) {
+ case PRIO_PROCESS:
+ if (!who)
+- who = current->pid;
+- p = find_task_by_pid(who);
++ who = virt_pid(current);
++ p = find_task_by_pid_ve(who);
+ if (p)
+ error = set_one_prio(p, niceval, error);
+ break;
+ case PRIO_PGRP:
+ if (!who)
+ who = process_group(current);
+- do_each_task_pid(who, PIDTYPE_PGID, p) {
++ else
++ who = vpid_to_pid(who);
++ do_each_task_pid_ve(who, PIDTYPE_PGID, p) {
+ error = set_one_prio(p, niceval, error);
+- } while_each_task_pid(who, PIDTYPE_PGID, p);
++ } while_each_task_pid_ve(who, PIDTYPE_PGID, p);
+ break;
+ case PRIO_USER:
+ user = current->user;
+@@ -301,10 +392,10 @@ asmlinkage long sys_setpriority(int whic
+ if ((who != current->uid) && !(user = find_user(who)))
+ goto out_unlock; /* No processes for this user */
+
+- do_each_thread(g, p)
++ do_each_thread_ve(g, p)
+ if (p->uid == who)
+ error = set_one_prio(p, niceval, error);
+- while_each_thread(g, p);
++ while_each_thread_ve(g, p);
+ if (who != current->uid)
+ free_uid(user); /* For find_user() */
+ break;
+@@ -334,8 +425,8 @@ asmlinkage long sys_getpriority(int whic
+ switch (which) {
+ case PRIO_PROCESS:
+ if (!who)
+- who = current->pid;
+- p = find_task_by_pid(who);
++ who = virt_pid(current);
++ p = find_task_by_pid_ve(who);
+ if (p) {
+ niceval = 20 - task_nice(p);
+ if (niceval > retval)
+@@ -345,11 +436,13 @@ asmlinkage long sys_getpriority(int whic
+ case PRIO_PGRP:
+ if (!who)
+ who = process_group(current);
+- do_each_task_pid(who, PIDTYPE_PGID, p) {
++ else
++ who = vpid_to_pid(who);
++ do_each_task_pid_ve(who, PIDTYPE_PGID, p) {
+ niceval = 20 - task_nice(p);
+ if (niceval > retval)
+ retval = niceval;
+- } while_each_task_pid(who, PIDTYPE_PGID, p);
++ } while_each_task_pid_ve(who, PIDTYPE_PGID, p);
+ break;
+ case PRIO_USER:
+ user = current->user;
+@@ -359,13 +452,13 @@ asmlinkage long sys_getpriority(int whic
+ if ((who != current->uid) && !(user = find_user(who)))
+ goto out_unlock; /* No processes for this user */
+
+- do_each_thread(g, p)
++ do_each_thread_ve(g, p)
+ if (p->uid == who) {
+ niceval = 20 - task_nice(p);
+ if (niceval > retval)
+ retval = niceval;
+ }
+- while_each_thread(g, p);
++ while_each_thread_ve(g, p);
+ if (who != current->uid)
+ free_uid(user); /* for find_user() */
+ break;
+@@ -497,6 +590,35 @@ asmlinkage long sys_reboot(int magic1, i
+ magic2 != LINUX_REBOOT_MAGIC2C))
+ return -EINVAL;
+
++#ifdef CONFIG_VE
++ if (!ve_is_super(get_exec_env()))
++ switch (cmd) {
++ case LINUX_REBOOT_CMD_RESTART:
++ case LINUX_REBOOT_CMD_HALT:
++ case LINUX_REBOOT_CMD_POWER_OFF:
++ case LINUX_REBOOT_CMD_RESTART2: {
++ struct siginfo info;
++
++ info.si_errno = 0;
++ info.si_code = SI_KERNEL;
++ info.si_pid = virt_pid(current);
++ info.si_uid = current->uid;
++ info.si_signo = SIGKILL;
++
++ /* Sending to real init is safe */
++ send_sig_info(SIGKILL, &info,
++ get_exec_env()->init_entry);
++ }
++
++ case LINUX_REBOOT_CMD_CAD_ON:
++ case LINUX_REBOOT_CMD_CAD_OFF:
++ return 0;
++
++ default:
++ return -EINVAL;
++ }
++#endif
++
+ /* Instead of trying to make the power_off code look like
+ * halt when pm_power_off is not set do it the easy way.
+ */
+@@ -686,7 +808,7 @@ asmlinkage long sys_setgid(gid_t gid)
+ return 0;
+ }
+
+-static int set_user(uid_t new_ruid, int dumpclear)
++int set_user(uid_t new_ruid, int dumpclear)
+ {
+ struct user_struct *new_user;
+
+@@ -711,6 +833,7 @@ static int set_user(uid_t new_ruid, int
+ current->uid = new_ruid;
+ return 0;
+ }
++EXPORT_SYMBOL(set_user);
+
+ /*
+ * Unprivileged users may change the real uid to the effective uid
+@@ -1079,7 +1202,12 @@ asmlinkage long sys_times(struct tms __u
+ if (copy_to_user(tbuf, &tmp, sizeof(struct tms)))
+ return -EFAULT;
+ }
++#ifndef CONFIG_VE
+ return (long) jiffies_64_to_clock_t(get_jiffies_64());
++#else
++ return (long) jiffies_64_to_clock_t(get_jiffies_64() -
++ get_exec_env()->start_jiffies);
++#endif
+ }
+
+ /*
+@@ -1100,21 +1228,24 @@ asmlinkage long sys_setpgid(pid_t pid, p
+ struct task_struct *p;
+ struct task_struct *group_leader = current->group_leader;
+ int err = -EINVAL;
++ int _pgid;
+
+ if (!pid)
+- pid = group_leader->pid;
++ pid = virt_pid(group_leader);
+ if (!pgid)
+ pgid = pid;
+ if (pgid < 0)
+ return -EINVAL;
+
++ _pgid = vpid_to_pid(pgid);
++
+ /* From this point forward we keep holding onto the tasklist lock
+ * so that our parent does not change from under us. -DaveM
+ */
+ write_lock_irq(&tasklist_lock);
+
+ err = -ESRCH;
+- p = find_task_by_pid(pid);
++ p = find_task_by_pid_ve(pid);
+ if (!p)
+ goto out;
+
+@@ -1139,25 +1270,35 @@ asmlinkage long sys_setpgid(pid_t pid, p
+ if (p->signal->leader)
+ goto out;
+
+- if (pgid != pid) {
++ pgid = virt_pid(p);
++ if (_pgid != p->pid) {
+ struct task_struct *p;
+
+- do_each_task_pid(pgid, PIDTYPE_PGID, p) {
+- if (p->signal->session == group_leader->signal->session)
++ do_each_task_pid_ve(_pgid, PIDTYPE_PGID, p) {
++ if (p->signal->session == group_leader->signal->session) {
++ pgid = virt_pgid(p);
+ goto ok_pgid;
+- } while_each_task_pid(pgid, PIDTYPE_PGID, p);
++ }
++ } while_each_task_pid_ve(_pgid, PIDTYPE_PGID, p);
+ goto out;
+ }
+
+ ok_pgid:
+- err = security_task_setpgid(p, pgid);
++ err = security_task_setpgid(p, _pgid);
+ if (err)
+ goto out;
+
+- if (process_group(p) != pgid) {
++ if (process_group(p) != _pgid) {
+ detach_pid(p, PIDTYPE_PGID);
+- p->signal->pgrp = pgid;
+- attach_pid(p, PIDTYPE_PGID, pgid);
++ p->signal->pgrp = _pgid;
++ set_virt_pgid(p, pgid);
++ attach_pid(p, PIDTYPE_PGID, _pgid);
++ if (atomic_read(&p->signal->count) != 1) {
++ task_t *t;
++ for (t = next_thread(p); t != p; t = next_thread(t)) {
++ set_virt_pgid(t, pgid);
++ }
++ }
+ }
+
+ err = 0;
+@@ -1170,19 +1311,19 @@ out:
+ asmlinkage long sys_getpgid(pid_t pid)
+ {
+ if (!pid) {
+- return process_group(current);
++ return virt_pgid(current);
+ } else {
+ int retval;
+ struct task_struct *p;
+
+ read_lock(&tasklist_lock);
+- p = find_task_by_pid(pid);
++ p = find_task_by_pid_ve(pid);
+
+ retval = -ESRCH;
+ if (p) {
+ retval = security_task_getpgid(p);
+ if (!retval)
+- retval = process_group(p);
++ retval = virt_pgid(p);
+ }
+ read_unlock(&tasklist_lock);
+ return retval;
+@@ -1194,7 +1335,7 @@ asmlinkage long sys_getpgid(pid_t pid)
+ asmlinkage long sys_getpgrp(void)
+ {
+ /* SMP - assuming writes are word atomic this is fine */
+- return process_group(current);
++ return virt_pgid(current);
+ }
+
+ #endif
+@@ -1202,19 +1343,19 @@ asmlinkage long sys_getpgrp(void)
+ asmlinkage long sys_getsid(pid_t pid)
+ {
+ if (!pid) {
+- return current->signal->session;
++ return virt_sid(current);
+ } else {
+ int retval;
+ struct task_struct *p;
+
+ read_lock(&tasklist_lock);
+- p = find_task_by_pid(pid);
++ p = find_task_by_pid_ve(pid);
+
+ retval = -ESRCH;
+ if(p) {
+ retval = security_task_getsid(p);
+ if (!retval)
+- retval = p->signal->session;
++ retval = virt_sid(p);
+ }
+ read_unlock(&tasklist_lock);
+ return retval;
+@@ -1236,9 +1377,20 @@ asmlinkage long sys_setsid(void)
+
+ group_leader->signal->leader = 1;
+ __set_special_pids(group_leader->pid, group_leader->pid);
++ set_virt_pgid(group_leader, virt_pid(group_leader));
++ set_virt_sid(group_leader, virt_pid(group_leader));
+ group_leader->signal->tty = NULL;
+ group_leader->signal->tty_old_pgrp = 0;
+- err = process_group(group_leader);
++ if (atomic_read(&group_leader->signal->count) != 1) {
++ task_t *t;
++ for (t = next_thread(group_leader); t != group_leader;
++ t = next_thread(t)) {
++ set_virt_pgid(t, virt_pid(group_leader));
++ set_virt_sid(t, virt_pid(group_leader));
++ }
++ }
++
++ err = virt_pgid(group_leader);
+ out:
+ write_unlock_irq(&tasklist_lock);
+ up(&tty_sem);
+@@ -1518,7 +1670,7 @@ asmlinkage long sys_newuname(struct new_
+ int errno = 0;
+
+ down_read(&uts_sem);
+- if (copy_to_user(name,&system_utsname,sizeof *name))
++ if (copy_to_user(name,&ve_utsname,sizeof *name))
+ errno = -EFAULT;
+ up_read(&uts_sem);
+ return errno;
+@@ -1529,15 +1681,15 @@ asmlinkage long sys_sethostname(char __u
+ int errno;
+ char tmp[__NEW_UTS_LEN];
+
+- if (!capable(CAP_SYS_ADMIN))
++ if (!capable(CAP_VE_SYS_ADMIN))
+ return -EPERM;
+ if (len < 0 || len > __NEW_UTS_LEN)
+ return -EINVAL;
+ down_write(&uts_sem);
+ errno = -EFAULT;
+ if (!copy_from_user(tmp, name, len)) {
+- memcpy(system_utsname.nodename, tmp, len);
+- system_utsname.nodename[len] = 0;
++ memcpy(ve_utsname.nodename, tmp, len);
++ ve_utsname.nodename[len] = 0;
+ errno = 0;
+ }
+ up_write(&uts_sem);
+@@ -1553,11 +1705,11 @@ asmlinkage long sys_gethostname(char __u
+ if (len < 0)
+ return -EINVAL;
+ down_read(&uts_sem);
+- i = 1 + strlen(system_utsname.nodename);
++ i = 1 + strlen(ve_utsname.nodename);
+ if (i > len)
+ i = len;
+ errno = 0;
+- if (copy_to_user(name, system_utsname.nodename, i))
++ if (copy_to_user(name, ve_utsname.nodename, i))
+ errno = -EFAULT;
+ up_read(&uts_sem);
+ return errno;
+@@ -1574,7 +1726,7 @@ asmlinkage long sys_setdomainname(char _
+ int errno;
+ char tmp[__NEW_UTS_LEN];
+
+- if (!capable(CAP_SYS_ADMIN))
++ if (!capable(CAP_VE_SYS_ADMIN))
+ return -EPERM;
+ if (len < 0 || len > __NEW_UTS_LEN)
+ return -EINVAL;
+@@ -1582,8 +1734,8 @@ asmlinkage long sys_setdomainname(char _
+ down_write(&uts_sem);
+ errno = -EFAULT;
+ if (!copy_from_user(tmp, name, len)) {
+- memcpy(system_utsname.domainname, tmp, len);
+- system_utsname.domainname[len] = 0;
++ memcpy(ve_utsname.domainname, tmp, len);
++ ve_utsname.domainname[len] = 0;
+ errno = 0;
+ }
+ up_write(&uts_sem);
+@@ -1657,7 +1809,19 @@ asmlinkage long sys_setrlimit(unsigned i
+ (cputime_eq(current->signal->it_prof_expires, cputime_zero) ||
+ new_rlim.rlim_cur <= cputime_to_secs(
+ current->signal->it_prof_expires))) {
+- cputime_t cputime = secs_to_cputime(new_rlim.rlim_cur);
++ unsigned long rlim_cur = new_rlim.rlim_cur;
++ cputime_t cputime;
++
++ if (rlim_cur == 0) {
++ /*
++ * The caller is asking for an immediate RLIMIT_CPU
++ * expiry. But we use the zero value to mean "it was
++ * never set". So let's cheat and make it one second
++ * instead.
++ */
++ rlim_cur = 1;
++ }
++ cputime = secs_to_cputime(rlim_cur);
+ read_lock(&tasklist_lock);
+ spin_lock_irq(&current->sighand->siglock);
+ set_process_cpu_timer(current, CPUCLOCK_PROF,
+diff -upr linux-2.6.16.orig/kernel/sysctl.c linux-2.6.16-026test015/kernel/sysctl.c
+--- linux-2.6.16.orig/kernel/sysctl.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/kernel/sysctl.c 2006-07-04 14:41:39.000000000 +0400
+@@ -25,6 +25,8 @@
+ #include <linux/slab.h>
+ #include <linux/sysctl.h>
+ #include <linux/proc_fs.h>
++#include <linux/ve_owner.h>
++#include <linux/ve.h>
+ #include <linux/capability.h>
+ #include <linux/ctype.h>
+ #include <linux/utsname.h>
+@@ -63,6 +65,7 @@ extern int max_threads;
+ extern int sysrq_enabled;
+ extern int core_uses_pid;
+ extern int suid_dumpable;
++extern int sysctl_at_vsyscall;
+ extern char core_pattern[];
+ extern int cad_pid;
+ extern int pid_max;
+@@ -72,6 +75,12 @@ extern int printk_ratelimit_burst;
+ extern int pid_max_min, pid_max_max;
+ extern int sysctl_drop_caches;
+ extern int percpu_pagelist_fraction;
++#ifdef CONFIG_VE
++int glob_virt_pids = 1;
++EXPORT_SYMBOL(glob_virt_pids);
++#endif
++
++extern int ve_area_access_check; /* fs/namei.c */
+
+ #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
+ int unknown_nmi_panic;
+@@ -101,6 +110,10 @@ extern int msg_ctlmnb;
+ extern int msg_ctlmni;
+ extern int sem_ctls[];
+ #endif
++#ifdef CONFIG_SCHED_VCPU
++extern u32 vcpu_sched_timeslice;
++extern u32 vcpu_timeslice;
++#endif
+
+ #ifdef __sparc__
+ extern char reboot_command [];
+@@ -108,6 +121,8 @@ extern int stop_a_enabled;
+ extern int scons_pwroff;
+ #endif
+
++extern int alloc_fail_warn;
++
+ #ifdef __hppa__
+ extern int pwrsw_enabled;
+ extern int unaligned_enabled;
+@@ -122,6 +137,7 @@ extern int spin_retry;
+ #endif
+
+ extern int sysctl_hz_timer;
++int decode_call_traces = 1;
+
+ #ifdef CONFIG_BSD_PROCESS_ACCT
+ extern int acct_parm[];
+@@ -131,10 +147,14 @@ extern int acct_parm[];
+ extern int no_unaligned_warning;
+ #endif
+
++#ifdef CONFIG_FAIRSCHED
++extern int fairsched_max_latency;
++int fsch_sysctl_latency(ctl_table *ctl, int write, struct file *filp,
++ void __user *buffer, size_t *lenp, loff_t *ppos);
++#endif
++
+ static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t,
+ ctl_table *, void **);
+-static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
+- void __user *buffer, size_t *lenp, loff_t *ppos);
+
+ static ctl_table root_table[];
+ static struct ctl_table_header root_table_header =
+@@ -178,6 +198,8 @@ static void register_proc_table(ctl_tabl
+ static void unregister_proc_table(ctl_table *, struct proc_dir_entry *);
+ #endif
+
++extern struct new_utsname virt_utsname;
++
+ /* The default sysctl tables: */
+
+ static ctl_table root_table[] = {
+@@ -276,6 +298,15 @@ static ctl_table kern_table[] = {
+ .strategy = &sysctl_string,
+ },
+ {
++ .ctl_name = KERN_VIRT_OSRELEASE,
++ .procname = "virt_osrelease",
++ .data = virt_utsname.release,
++ .maxlen = sizeof(virt_utsname.release),
++ .mode = 0644,
++ .proc_handler = &proc_doutsstring,
++ .strategy = &sysctl_string,
++ },
++ {
+ .ctl_name = KERN_PANIC,
+ .procname = "panic",
+ .data = &panic_timeout,
+@@ -353,6 +384,22 @@ static ctl_table kern_table[] = {
+ .proc_handler = &proc_dointvec,
+ },
+ #endif
++ {
++ .ctl_name = KERN_SILENCE_LEVEL,
++ .procname = "silence-level",
++ .data = &console_silence_loglevel,
++ .maxlen = sizeof(int),
++ .mode = 0644,
++ .proc_handler = &proc_dointvec
++ },
++ {
++ .ctl_name = KERN_ALLOC_FAIL_WARN,
++ .procname = "alloc_fail_warn",
++ .data = &alloc_fail_warn,
++ .maxlen = sizeof(int),
++ .mode = 0644,
++ .proc_handler = &proc_dointvec
++ },
+ #ifdef __hppa__
+ {
+ .ctl_name = KERN_HPPA_PWRSW,
+@@ -579,6 +626,24 @@ static ctl_table kern_table[] = {
+ .proc_handler = &proc_dointvec,
+ },
+ #endif
++#ifdef CONFIG_SCHED_VCPU
++ {
++ .ctl_name = KERN_VCPU_SCHED_TIMESLICE,
++ .procname = "vcpu_sched_timeslice",
++ .data = &vcpu_sched_timeslice,
++ .maxlen = sizeof(int),
++ .mode = 0644,
++ .proc_handler = &proc_dointvec,
++ },
++ {
++ .ctl_name = KERN_VCPU_TIMESLICE,
++ .procname = "vcpu_timeslice",
++ .data = &vcpu_timeslice,
++ .maxlen = sizeof(int),
++ .mode = 0644,
++ .proc_handler = &proc_dointvec,
++ },
++#endif
+ {
+ .ctl_name = KERN_PIDMAX,
+ .procname = "pid_max",
+@@ -590,6 +655,16 @@ static ctl_table kern_table[] = {
+ .extra1 = &pid_max_min,
+ .extra2 = &pid_max_max,
+ },
++#ifdef CONFIG_VE
++ {
++ .ctl_name = KERN_VIRT_PIDS,
++ .procname = "virt_pids",
++ .data = &glob_virt_pids,
++ .maxlen = sizeof(int),
++ .mode = 0644,
++ .proc_handler = &proc_dointvec,
++ },
++#endif
+ {
+ .ctl_name = KERN_PANIC_ON_OOPS,
+ .procname = "panic_on_oops",
+@@ -683,6 +758,16 @@ static ctl_table kern_table[] = {
+ .proc_handler = &proc_dointvec,
+ },
+ #endif
++#ifdef CONFIG_FAIRSCHED
++ {
++ .ctl_name = KERN_FAIRSCHED_MAX_LATENCY,
++ .procname = "fairsched-max-latency",
++ .data = &fairsched_max_latency,
++ .maxlen = sizeof(int),
++ .mode = 0644,
++ .proc_handler = &fsch_sysctl_latency
++ },
++#endif
+ { .ctl_name = 0 }
+ };
+
+@@ -1046,10 +1131,26 @@ static ctl_table fs_table[] = {
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
++ {
++ .ctl_name = FS_AT_VSYSCALL,
++ .procname = "vsyscall",
++ .data = &sysctl_at_vsyscall,
++ .maxlen = sizeof(int),
++ .mode = 0644,
++ .proc_handler = &proc_dointvec
++ },
+ { .ctl_name = 0 }
+ };
+
+ static ctl_table debug_table[] = {
++ {
++ .ctl_name = DBG_DECODE_CALLTRACES,
++ .procname = "decode_call_traces",
++ .data = &decode_call_traces,
++ .maxlen = sizeof(int),
++ .mode = 0644,
++ .proc_handler = &proc_dointvec
++ },
+ { .ctl_name = 0 }
+ };
+
+@@ -1113,6 +1214,7 @@ int do_sysctl(int __user *name, int nlen
+ {
+ struct list_head *tmp;
+ int error = -ENOTDIR;
++ struct ve_struct *ve;
+
+ if (nlen <= 0 || nlen >= CTL_MAXNAME)
+ return -ENOTDIR;
+@@ -1121,13 +1223,24 @@ int do_sysctl(int __user *name, int nlen
+ if (!oldlenp || get_user(old_len, oldlenp))
+ return -EFAULT;
+ }
++ ve = get_exec_env();
+ spin_lock(&sysctl_lock);
++#ifdef CONFIG_VE
++ tmp = ve->sysctl_lh.next;
++#else
+ tmp = &root_table_header.ctl_entry;
++#endif
+ do {
+- struct ctl_table_header *head =
+- list_entry(tmp, struct ctl_table_header, ctl_entry);
++ struct ctl_table_header *head;
+ void *context = NULL;
+
++#ifdef CONFIG_VE
++ if (tmp == &ve->sysctl_lh)
++ /* second pass over global variables */
++ tmp = &root_table_header.ctl_entry;
++#endif
++
++ head = list_entry(tmp, struct ctl_table_header, ctl_entry);
+ if (!use_table(head))
+ continue;
+
+@@ -1181,10 +1294,14 @@ static int test_perm(int mode, int op)
+ static inline int ctl_perm(ctl_table *table, int op)
+ {
+ int error;
++ int mode = table->mode;
++
+ error = security_sysctl(table, op);
+ if (error)
+ return error;
+- return test_perm(table->mode, op);
++ if (!ve_accessible(table->owner_env, get_exec_env()))
++ mode &= ~0222; /* disable write access */
++ return test_perm(mode, op);
+ }
+
+ static int parse_table(int __user *name, int nlen,
+@@ -1350,6 +1467,8 @@ struct ctl_table_header *register_sysctl
+ int insert_at_head)
+ {
+ struct ctl_table_header *tmp;
++ struct list_head *lh;
++
+ tmp = kmalloc(sizeof(struct ctl_table_header), GFP_KERNEL);
+ if (!tmp)
+ return NULL;
+@@ -1358,17 +1477,52 @@ struct ctl_table_header *register_sysctl
+ tmp->used = 0;
+ tmp->unregistering = NULL;
+ spin_lock(&sysctl_lock);
++#ifdef CONFIG_VE
++ lh = &get_exec_env()->sysctl_lh;
++#else
++ lh = &root_table_header.ctl_entry;
++#endif
+ if (insert_at_head)
+- list_add(&tmp->ctl_entry, &root_table_header.ctl_entry);
++ list_add(&tmp->ctl_entry, lh);
+ else
+- list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry);
++ list_add_tail(&tmp->ctl_entry, lh);
+ spin_unlock(&sysctl_lock);
+ #ifdef CONFIG_PROC_FS
++#ifdef CONFIG_VE
++ register_proc_table(table, get_exec_env()->proc_sys_root, tmp);
++#else
+ register_proc_table(table, proc_sys_root, tmp);
+ #endif
++#endif
+ return tmp;
+ }
+
++void free_sysctl_clone(ctl_table *clone)
++{
++ kfree(clone);
++}
++
++ctl_table *clone_sysctl_template(ctl_table *tmpl, int nr)
++{
++ int i;
++ ctl_table *clone;
++
++ clone = kmalloc(nr * sizeof(ctl_table), GFP_KERNEL);
++ if (clone == NULL)
++ return NULL;
++
++ memcpy(clone, tmpl, nr * sizeof(ctl_table));
++ for (i = 0; i < nr; i++) {
++ if (tmpl[i].ctl_name == 0)
++ continue;
++ clone[i].owner_env = get_exec_env();
++ if (tmpl[i].child == NULL)
++ continue;
++ clone[i].child = clone + (tmpl[i].child - tmpl);
++ }
++ return clone;
++}
++
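clone_sysctl_template() above copies a ctl_table array and then rebases each child pointer by the offset of the corresponding entry within the template, so the clone's subtables point into the clone rather than back into the shared template. The same pointer-rebasing idiom in a small self-contained sketch (node and clone_tree are illustrative names):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct node {
	const char *name;
	struct node *child;	/* points at another entry in the same array */
};

/* Copy an array of nodes and rebase internal child pointers so they
 * reference the copy, mirroring clone_sysctl_template(). */
static struct node *clone_tree(const struct node *tmpl, int nr)
{
	struct node *clone;
	int i;

	clone = malloc(nr * sizeof(*clone));
	if (clone == NULL)
		return NULL;

	memcpy(clone, tmpl, nr * sizeof(*clone));
	for (i = 0; i < nr; i++)
		if (tmpl[i].child != NULL)
			clone[i].child = clone + (tmpl[i].child - tmpl);
	return clone;
}

int main(void)
{
	struct node tmpl[2] = { { "parent", &tmpl[1] }, { "leaf", NULL } };
	struct node *copy = clone_tree(tmpl, 2);

	printf("clone child points into clone: %s\n",
	       copy[0].child == &copy[1] ? "yes" : "no");
	free(copy);
	return 0;
}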
+ /**
+ * unregister_sysctl_table - unregister a sysctl table hierarchy
+ * @header: the header returned from register_sysctl_table
+@@ -1382,8 +1536,12 @@ void unregister_sysctl_table(struct ctl_
+ spin_lock(&sysctl_lock);
+ start_unregistering(header);
+ #ifdef CONFIG_PROC_FS
++#ifdef CONFIG_VE
++ unregister_proc_table(header->ctl_table, get_exec_env()->proc_sys_root);
++#else
+ unregister_proc_table(header->ctl_table, proc_sys_root);
+ #endif
++#endif
+ spin_unlock(&sysctl_lock);
+ kfree(header);
+ }
+@@ -1469,11 +1627,6 @@ static void unregister_proc_table(ctl_ta
+ * its fields. We are under sysctl_lock here.
+ */
+ de->data = NULL;
+-
+- /* Don't unregister proc entries that are still being used.. */
+- if (atomic_read(&de->count))
+- continue;
+-
+ table->de = NULL;
+ remove_proc_entry(table->procname, root);
+ }
+@@ -1615,7 +1768,7 @@ int proc_dostring(ctl_table *table, int
+ * to observe. Should this be in kernel/sys.c ????
+ */
+
+-static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
++int proc_doutsstring(ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+ {
+ int r;
+@@ -2190,7 +2343,7 @@ int proc_dostring(ctl_table *table, int
+ return -ENOSYS;
+ }
+
+-static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
++int proc_doutsstring(ctl_table *table, int write, struct file *filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+ {
+ return -ENOSYS;
+@@ -2494,6 +2647,14 @@ void unregister_sysctl_table(struct ctl_
+ {
+ }
+
++ctl_table * clone_sysctl_template(ctl_table *tmpl, int nr)
++{
++ return NULL;
++}
++
++void free_sysctl_clone(ctl_table *tmpl)
++{
++}
+ #endif /* CONFIG_SYSCTL */
+
+ /*
+@@ -2506,6 +2667,7 @@ EXPORT_SYMBOL(proc_dointvec_minmax);
+ EXPORT_SYMBOL(proc_dointvec_userhz_jiffies);
+ EXPORT_SYMBOL(proc_dointvec_ms_jiffies);
+ EXPORT_SYMBOL(proc_dostring);
++EXPORT_SYMBOL(proc_doutsstring);
+ EXPORT_SYMBOL(proc_doulongvec_minmax);
+ EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);
+ EXPORT_SYMBOL(register_sysctl_table);
+@@ -2514,3 +2676,5 @@ EXPORT_SYMBOL(sysctl_jiffies);
+ EXPORT_SYMBOL(sysctl_ms_jiffies);
+ EXPORT_SYMBOL(sysctl_string);
+ EXPORT_SYMBOL(unregister_sysctl_table);
++EXPORT_SYMBOL(clone_sysctl_template);
++EXPORT_SYMBOL(free_sysctl_clone);
+diff -upr linux-2.6.16.orig/kernel/timer.c linux-2.6.16-026test015/kernel/timer.c
+--- linux-2.6.16.orig/kernel/timer.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/kernel/timer.c 2006-07-04 14:41:38.000000000 +0400
+@@ -460,7 +460,11 @@ static inline void __run_timers(tvec_bas
+ spin_unlock_irq(&base->t_base.lock);
+ {
+ int preempt_count = preempt_count();
++ struct ve_struct *ve;
++
++ ve = set_exec_env(get_ve0());
+ fn(data);
++ (void)set_exec_env(ve);
+ if (preempt_count != preempt_count()) {
+ printk(KERN_WARNING "huh, entered %p "
+ "with preempt_count %08x, exited"
+@@ -868,6 +872,23 @@ EXPORT_SYMBOL(avenrun);
+ * calc_load - given tick count, update the avenrun load estimates.
+ * This is called while holding a write_lock on xtime_lock.
+ */
++
++static void calc_load_ve(void)
++{
++ unsigned long flags, nr_unint;
++
++ nr_unint = nr_uninterruptible() * FIXED_1;
++ spin_lock_irqsave(&kstat_glb_lock, flags);
++ CALC_LOAD(kstat_glob.nr_unint_avg[0], EXP_1, nr_unint);
++ CALC_LOAD(kstat_glob.nr_unint_avg[1], EXP_5, nr_unint);
++ CALC_LOAD(kstat_glob.nr_unint_avg[2], EXP_15, nr_unint);
++ spin_unlock_irqrestore(&kstat_glb_lock, flags);
++
++#ifdef CONFIG_VE
++ do_update_load_avg_ve();
++#endif
++}
++
+ static inline void calc_load(unsigned long ticks)
+ {
+ unsigned long active_tasks; /* fixed-point */
+@@ -880,6 +901,7 @@ static inline void calc_load(unsigned lo
+ CALC_LOAD(avenrun[0], EXP_1, active_tasks);
+ CALC_LOAD(avenrun[1], EXP_5, active_tasks);
+ CALC_LOAD(avenrun[2], EXP_15, active_tasks);
++ calc_load_ve();
+ }
+ }
+
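calc_load_ve() above reuses the kernel's CALC_LOAD exponential moving average: in each tick window the old average is decayed by a fixed-point factor and the current sample contributes the remaining weight, load = (load*EXP + n*(FIXED_1-EXP)) >> FSHIFT. A small fixed-point sketch of that update; the constants mirror the standard 1-minute values and the helper name is illustrative:

#include <stdio.h>

/* Fixed-point constants as used by the kernel's load average code. */
#define FSHIFT	11
#define FIXED_1	(1 << FSHIFT)		/* 1.0 in fixed point */
#define EXP_1	1884			/* 1/exp(5s/1min) in fixed point */

/* One CALC_LOAD step: decay the old average and blend in the new sample. */
static unsigned long calc_load_step(unsigned long load, unsigned long exp,
				    unsigned long n)
{
	load *= exp;
	load += n * (FIXED_1 - exp);
	return load >> FSHIFT;
}

int main(void)
{
	unsigned long avg = 0;
	int i;

	/* Feed a constant 3 runnable tasks; avg converges towards 3.0. */
	for (i = 0; i < 100; i++)
		avg = calc_load_step(avg, EXP_1, 3 * FIXED_1);

	printf("load average ~ %lu.%02lu\n", avg >> FSHIFT,
	       (avg & (FIXED_1 - 1)) * 100 / FIXED_1);
	return 0;
}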
+@@ -990,7 +1012,7 @@ asmlinkage unsigned long sys_alarm(unsig
+ */
+ asmlinkage long sys_getpid(void)
+ {
+- return current->tgid;
++ return virt_tgid(current);
+ }
+
+ /*
+@@ -1012,12 +1034,13 @@ asmlinkage long sys_getpid(void)
+ asmlinkage long sys_getppid(void)
+ {
+ int pid;
++#ifndef CONFIG_DEBUG_SLAB
+ struct task_struct *me = current;
+ struct task_struct *parent;
+
+ parent = me->group_leader->real_parent;
+ for (;;) {
+- pid = parent->tgid;
++ pid = virt_tgid(parent);
+ #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
+ {
+ struct task_struct *old = parent;
+@@ -1034,6 +1057,16 @@ asmlinkage long sys_getppid(void)
+ #endif
+ break;
+ }
++#else
++ /*
++ * ->real_parent could be released before the dereference, and
++ * we would access freed kernel memory, which faults with debugging on.
++ * Keep it simple and stupid.
++ */
++ read_lock(&tasklist_lock);
++ pid = virt_tgid(current->group_leader->real_parent);
++ read_unlock(&tasklist_lock);
++#endif
+ return pid;
+ }
+
+@@ -1164,7 +1197,7 @@ EXPORT_SYMBOL(schedule_timeout_uninterru
+ /* Thread ID - the internal kernel "pid" */
+ asmlinkage long sys_gettid(void)
+ {
+- return current->pid;
++ return virt_pid(current);
+ }
+
+ /*
+@@ -1176,11 +1209,12 @@ asmlinkage long sys_sysinfo(struct sysin
+ unsigned long mem_total, sav_total;
+ unsigned int mem_unit, bitcount;
+ unsigned long seq;
++ unsigned long *__avenrun;
++ struct timespec tp;
+
+ memset((char *)&val, 0, sizeof(struct sysinfo));
+
+ do {
+- struct timespec tp;
+ seq = read_seqbegin(&xtime_lock);
+
+ /*
+@@ -1197,14 +1231,25 @@ asmlinkage long sys_sysinfo(struct sysin
+ tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC;
+ tp.tv_sec++;
+ }
+- val.uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
+-
+- val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
+- val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
+- val.loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
++ } while (read_seqretry(&xtime_lock, seq));
+
++ if (ve_is_super(get_exec_env())) {
++ val.uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
++ __avenrun = &avenrun[0];
+ val.procs = nr_threads;
+- } while (read_seqretry(&xtime_lock, seq));
++ }
++#ifdef CONFIG_VE
++ else {
++ struct ve_struct *ve;
++ ve = get_exec_env();
++ __avenrun = &ve->avenrun[0];
++ val.procs = atomic_read(&ve->pcounter);
++ val.uptime = tp.tv_sec - ve->start_timespec.tv_sec;
++ }
++#endif
++ val.loads[0] = __avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
++ val.loads[1] = __avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
++ val.loads[2] = __avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
+
+ si_meminfo(&val);
+ si_swapinfo(&val);
+diff -upr linux-2.6.16.orig/kernel/ub/Kconfig linux-2.6.16-026test015/kernel/ub/Kconfig
+--- linux-2.6.16.orig/kernel/ub/Kconfig 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/ub/Kconfig 2006-07-04 14:41:37.000000000 +0400
+@@ -0,0 +1,89 @@
++#
++# User resources part (UBC)
++#
++# Copyright (C) 2005 SWsoft
++# All rights reserved.
++#
++# Licensing governed by "linux/COPYING.SWsoft" file.
++
++menu "User resources"
++
++config USER_RESOURCE
++ bool "Enable user resource accounting"
++ default y
++ help
++ This patch provides accounting and allows configuring limits
++ on a user's consumption of exhaustible system resources.
++ The most important resource controlled by this patch is unswappable
++ memory (either mlock'ed or used by internal kernel structures and
++ buffers). The main goal of this patch is to protect processes
++ from running short of important resources because of accidental
++ misbehavior of processes or malicious activity aiming to ``kill''
++ the system. It is worth mentioning that resource limits configured
++ by setrlimit(2) do not give an acceptable level of protection,
++ because they cover only a small fraction of resources and work on
++ a per-process basis. Per-process accounting does not prevent
++ malicious users from spawning a lot of resource-consuming processes.
++
++config USER_RSS_ACCOUNTING
++ bool "Account physical memory usage"
++ default y
++ depends on USER_RESOURCE
++ help
++ This allows estimating per-beancounter physical memory usage.
++ The implemented algorithm accounts shared pages of memory as well,
++ dividing them by the number of beancounters which use the page.
++
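The help text above describes charging a shared page fractionally: each beancounter that maps the page is charged roughly the page size divided by the number of users. A back-of-the-envelope sketch of that split, illustrating the accounting idea only and not the actual ub_page_bc.c implementation:

#include <stdio.h>

#define PAGE_SIZE 4096UL

/* Charge a shared page fractionally: each of the 'users' beancounters
 * that maps the page is accounted roughly PAGE_SIZE / users bytes. */
static unsigned long shared_page_charge(unsigned long users)
{
	return users ? PAGE_SIZE / users : 0;
}

int main(void)
{
	unsigned long users;

	for (users = 1; users <= 4; users++)
		printf("%lu user(s): %lu bytes charged to each\n",
		       users, shared_page_charge(users));
	return 0;
}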
++config USER_SWAP_ACCOUNTING
++ bool "Account swap usage"
++ default y
++ depends on USER_RESOURCE
++ help
++ This allows accounting of swap usage.
++
++config USER_RESOURCE_PROC
++ bool "Report resource usage in /proc"
++ default y
++ depends on USER_RESOURCE
++ help
++ Allows a system administrator to inspect resource accounts and limits.
++
++config UBC_DEBUG
++ bool "User resources debug features"
++ default n
++ depends on USER_RESOURCE
++ help
++ Enables setting up debug features for user resource accounting.
++
++config UBC_DEBUG_KMEM
++ bool "Debug kmemsize with cache counters"
++ default n
++ depends on UBC_DEBUG
++ help
++ Adds a /proc/user_beancounters_debug entry to get statistics
++ about the cache usage of each beancounter.
++
++config UBC_KEEP_UNUSED
++ bool "Keep unused beancounter alive"
++ default y
++ depends on UBC_DEBUG
++ help
++ If on, unused beancounters are kept on the hash and their maxheld
++ values can still be inspected.
++
++config UBC_DEBUG_ITEMS
++ bool "Account resources in items rather than in bytes"
++ default y
++ depends on UBC_DEBUG
++ help
++ When true, some of the resources (e.g. kmemsize) are accounted
++ in items instead of bytes.
++
++config UBC_UNLIMITED
++ bool "Use unlimited ubc settings"
++ default y
++ depends on UBC_DEBUG
++ help
++ When ON, all limits and barriers are set to their maximum values.
++
++endmenu
+diff -upr linux-2.6.16.orig/kernel/ub/Makefile linux-2.6.16-026test015/kernel/ub/Makefile
+--- linux-2.6.16.orig/kernel/ub/Makefile 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/ub/Makefile 2006-07-04 14:41:37.000000000 +0400
+@@ -0,0 +1,20 @@
++#
++# User resources part (UBC)
++#
++# Copyright (C) 2005 SWsoft
++# All rights reserved.
++#
++# Licensing governed by "linux/COPYING.SWsoft" file.
++
++obj-y := ub_sys.o
++obj-$(CONFIG_USER_RESOURCE) += beancounter.o
++obj-$(CONFIG_USER_RESOURCE) += ub_dcache.o
++obj-$(CONFIG_USER_RESOURCE) += ub_mem.o
++obj-$(CONFIG_USER_RESOURCE) += ub_misc.o
++obj-$(CONFIG_USER_RESOURCE) += ub_net.o
++obj-$(CONFIG_USER_RESOURCE) += ub_pages.o
++obj-$(CONFIG_USER_RESOURCE) += ub_stat.o
++# obj-$(CONFIG_USER_RESOURCE) += ub_oom.o
++
++obj-$(CONFIG_USER_RSS_ACCOUNTING) += ub_page_bc.o
++obj-$(CONFIG_USER_RESOURCE_PROC) += ub_proc.o
+diff -upr linux-2.6.16.orig/kernel/ub/beancounter.c linux-2.6.16-026test015/kernel/ub/beancounter.c
+--- linux-2.6.16.orig/kernel/ub/beancounter.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/ub/beancounter.c 2006-07-04 14:41:37.000000000 +0400
+@@ -0,0 +1,675 @@
++/*
++ * linux/kernel/ub/beancounter.c
++ *
++ * Copyright (C) 1998 Alan Cox
++ * 1998-2000 Andrey V. Savochkin <saw@saw.sw.com.sg>
++ * Copyright (C) 2000-2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ * TODO:
++ * - more intelligent limit check in mremap(): currently the new size is
++ * charged and _then_ old size is uncharged
++ * (almost done: !move_vma case is completely done,
++ * move_vma in its current implementation requires too many conditions to
++ * do things right, because it may be not only expansion, but shrinking
++ * also, plus do_munmap will require an additional parameter...)
++ * - problem: bad pmd page handling
++ * - consider /proc redesign
++ * - TCP/UDP ports
++ * + consider whether __charge_beancounter_locked should be inline
++ *
++ * Changes:
++ * 1999/08/17 Marcelo Tosatti <marcelo@conectiva.com.br>
++ * - Set "barrier" and "limit" parts of limits atomically.
++ * 1999/10/06 Marcelo Tosatti <marcelo@conectiva.com.br>
++ * - setublimit system call.
++ */
++
++#include <linux/slab.h>
++#include <linux/module.h>
++
++#include <ub/beancounter.h>
++#include <ub/ub_hash.h>
++#include <ub/ub_vmpages.h>
++
++static kmem_cache_t *ub_cachep;
++static struct user_beancounter default_beancounter;
++struct user_beancounter ub0;
++
++const char *ub_rnames[] = {
++ "kmemsize", /* 0 */
++ "lockedpages",
++ "privvmpages",
++ "shmpages",
++ "dummy",
++ "numproc", /* 5 */
++ "physpages",
++ "vmguarpages",
++ "oomguarpages",
++ "numtcpsock",
++ "numflock", /* 10 */
++ "numpty",
++ "numsiginfo",
++ "tcpsndbuf",
++ "tcprcvbuf",
++ "othersockbuf", /* 15 */
++ "dgramrcvbuf",
++ "numothersock",
++ "dcachesize",
++ "numfile",
++ "dummy", /* 20 */
++ "dummy",
++ "dummy",
++ "numiptent",
++ "unused_privvmpages", /* UB_RESOURCES */
++ "tmpfs_respages",
++ "swap_pages",
++ "held_pages",
++};
++
++static void init_beancounter_struct(struct user_beancounter *ub);
++static void init_beancounter_store(struct user_beancounter *ub);
++static void init_beancounter_nolimits(struct user_beancounter *ub);
++
++void print_ub_uid(struct user_beancounter *ub, char *buf, int size)
++{
++ if (ub->parent != NULL)
++ snprintf(buf, size, "%u.%u", ub->parent->ub_uid, ub->ub_uid);
++ else
++ snprintf(buf, size, "%u", ub->ub_uid);
++}
++EXPORT_SYMBOL(print_ub_uid);
++
++#define ub_hash_fun(x) ((((x) >> 8) ^ (x)) & (UB_HASH_SIZE - 1))
++#define ub_subhash_fun(p, id) ub_hash_fun((p)->ub_uid + (id) * 17)
++struct ub_hash_slot ub_hash[UB_HASH_SIZE];
++spinlock_t ub_hash_lock;
++EXPORT_SYMBOL(ub_hash);
++EXPORT_SYMBOL(ub_hash_lock);
++
++/*
++ * Per user resource beancounting. Resources are tied to their luid.
++ * The resource structure itself is tagged both to the process and
++ * the charging resources (a socket doesn't want to have to search for
++ * things at irq time for example). Reference counters keep things in
++ * hand.
++ *
++ * The case where a user creates a resource, kills all his processes and
++ * then starts new ones is handled correctly this way. The refcounters
++ * ensure the old entry is still around with its resources tied to it.
++ */
++struct user_beancounter *get_beancounter_byuid(uid_t uid, int create)
++{
++ struct user_beancounter *new_ub, *ub;
++ unsigned long flags;
++ struct ub_hash_slot *slot;
++
++ slot = &ub_hash[ub_hash_fun(uid)];
++ new_ub = NULL;
++
++retry:
++ spin_lock_irqsave(&ub_hash_lock, flags);
++ ub = slot->ubh_beans;
++ while (ub != NULL && (ub->ub_uid != uid || ub->parent != NULL))
++ ub = ub->ub_next;
++
++ if (ub != NULL) {
++ /* found */
++ get_beancounter(ub);
++ spin_unlock_irqrestore(&ub_hash_lock, flags);
++ if (new_ub != NULL)
++ kmem_cache_free(ub_cachep, new_ub);
++ return ub;
++ }
++
++ if (!create) {
++ /* no ub found */
++ spin_unlock_irqrestore(&ub_hash_lock, flags);
++ return NULL;
++ }
++
++ if (new_ub != NULL) {
++ /* install new ub */
++ new_ub->ub_next = slot->ubh_beans;
++ slot->ubh_beans = new_ub;
++ spin_unlock_irqrestore(&ub_hash_lock, flags);
++ return new_ub;
++ }
++ spin_unlock_irqrestore(&ub_hash_lock, flags);
++
++ /* alloc new ub */
++ new_ub = (struct user_beancounter *)kmem_cache_alloc(ub_cachep,
++ GFP_KERNEL);
++ if (new_ub == NULL)
++ return NULL;
++
++ ub_debug(UBD_ALLOC, "Creating ub %p in slot %p\n", new_ub, slot);
++ memcpy(new_ub, &default_beancounter, sizeof(*new_ub));
++ init_beancounter_struct(new_ub);
++ new_ub->ub_uid = uid;
++ goto retry;
++}
++EXPORT_SYMBOL(get_beancounter_byuid);
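The lookup above follows a common "optimistic allocation" idiom: search the hash under the spinlock, drop the lock so the GFP_KERNEL allocation may sleep, then jump back to retry and repeat the search before installing the new object, since another CPU may have created it in the meantime. A minimal userspace sketch of the same idiom, with a pthread mutex and a single list standing in for the hash slot and spinlock (all names here are illustrative, not part of the patch):

#include <pthread.h>
#include <stdlib.h>

struct obj { int id; struct obj *next; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct obj *head;

/* Find an object by id, creating it if missing; mirrors the
 * lock / search / unlock / allocate / retry structure above. */
struct obj *find_or_create(int id)
{
	struct obj *o, *new_o = NULL;

retry:
	pthread_mutex_lock(&lock);
	for (o = head; o != NULL; o = o->next)
		if (o->id == id)
			break;
	if (o != NULL) {			/* found: drop any speculative allocation */
		pthread_mutex_unlock(&lock);
		free(new_o);
		return o;
	}
	if (new_o != NULL) {			/* second pass: install what we allocated */
		new_o->next = head;
		head = new_o;
		pthread_mutex_unlock(&lock);
		return new_o;
	}
	pthread_mutex_unlock(&lock);		/* drop the lock before a sleeping allocation */

	new_o = malloc(sizeof(*new_o));
	if (new_o == NULL)
		return NULL;
	new_o->id = id;
	goto retry;				/* re-check: another thread may have raced us */
}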
++
++struct user_beancounter *get_subbeancounter_byid(struct user_beancounter *p,
++ int id, int create)
++{
++ struct user_beancounter *new_ub, *ub;
++ unsigned long flags;
++ struct ub_hash_slot *slot;
++
++ slot = &ub_hash[ub_subhash_fun(p, id)];
++ new_ub = NULL;
++
++retry:
++ spin_lock_irqsave(&ub_hash_lock, flags);
++ ub = slot->ubh_beans;
++ while (ub != NULL && (ub->parent != p || ub->ub_uid != id))
++ ub = ub->ub_next;
++
++ if (ub != NULL) {
++ /* found */
++ get_beancounter(ub);
++ spin_unlock_irqrestore(&ub_hash_lock, flags);
++ if (new_ub != NULL) {
++ put_beancounter(new_ub->parent);
++ kmem_cache_free(ub_cachep, new_ub);
++ }
++ return ub;
++ }
++
++ if (!create) {
++ /* no ub found */
++ spin_unlock_irqrestore(&ub_hash_lock, flags);
++ return NULL;
++ }
++
++ if (new_ub != NULL) {
++ /* install new ub */
++ get_beancounter(new_ub);
++ new_ub->ub_next = slot->ubh_beans;
++ slot->ubh_beans = new_ub;
++ spin_unlock_irqrestore(&ub_hash_lock, flags);
++ return new_ub;
++ }
++ spin_unlock_irqrestore(&ub_hash_lock, flags);
++
++ /* alloc new ub */
++ new_ub = (struct user_beancounter *)kmem_cache_alloc(ub_cachep,
++ GFP_KERNEL);
++ if (new_ub == NULL)
++ return NULL;
++
++ ub_debug(UBD_ALLOC, "Creating sub %p in slot %p\n", new_ub, slot);
++ memset(new_ub, 0, sizeof(*new_ub));
++ init_beancounter_nolimits(new_ub);
++ init_beancounter_store(new_ub);
++ init_beancounter_struct(new_ub);
++ atomic_set(&new_ub->ub_refcount, 0);
++ new_ub->ub_uid = id;
++ new_ub->parent = get_beancounter(p);
++ goto retry;
++}
++EXPORT_SYMBOL(get_subbeancounter_byid);
++
++struct user_beancounter *subbeancounter_findcreate(struct user_beancounter *p,
++ int id)
++{
++ struct user_beancounter *ub;
++ unsigned long flags;
++ struct ub_hash_slot *slot;
++
++ slot = &ub_hash[ub_subhash_fun(p, id)];
++
++ spin_lock_irqsave(&ub_hash_lock, flags);
++ ub = slot->ubh_beans;
++ while (ub != NULL && (ub->parent != p || ub->ub_uid != id))
++ ub = ub->ub_next;
++
++ if (ub != NULL) {
++ /* found */
++ get_beancounter(ub);
++ goto done;
++ }
++
++ /* alloc new ub */
++ /* Can be called from non-atomic contexts. Den */
++ ub = (struct user_beancounter *)kmem_cache_alloc(ub_cachep, GFP_ATOMIC);
++ if (ub == NULL)
++ goto done;
++
++ ub_debug(UBD_ALLOC, "Creating sub %p in slot %p\n", ub, slot);
++ memset(ub, 0, sizeof(*ub));
++ init_beancounter_nolimits(ub);
++ init_beancounter_store(ub);
++ init_beancounter_struct(ub);
++ atomic_set(&ub->ub_refcount, 0);
++ ub->ub_uid = id;
++ ub->parent = get_beancounter(p);
++
++ /* install new ub */
++ get_beancounter(ub);
++ ub->ub_next = slot->ubh_beans;
++ slot->ubh_beans = ub;
++
++done:
++ spin_unlock_irqrestore(&ub_hash_lock, flags);
++ return ub;
++}
++EXPORT_SYMBOL(subbeancounter_findcreate);
++#ifndef CONFIG_UBC_KEEP_UNUSED
++
++static int verify_res(struct user_beancounter *ub, int resource,
++ unsigned long held)
++{
++ char id[64];
++
++ if (likely(held == 0))
++ return 1;
++
++ print_ub_uid(ub, id, sizeof(id));
++	printk(KERN_WARNING "Ub %s holds %lu in %s on put\n",
++ id, held, ub_rnames[resource]);
++ return 0;
++}
++
++static inline void verify_held(struct user_beancounter *ub)
++{
++ int i, clean;
++
++ clean = 1;
++ for (i = 0; i < UB_RESOURCES; i++)
++ clean &= verify_res(ub, i, ub->ub_parms[i].held);
++
++ clean &= verify_res(ub, UB_UNUSEDPRIVVM, ub->ub_unused_privvmpages);
++ clean &= verify_res(ub, UB_TMPFSPAGES, ub->ub_tmpfs_respages);
++ clean &= verify_res(ub, UB_SWAPPAGES, ub->ub_swap_pages);
++ clean &= verify_res(ub, UB_HELDPAGES, (unsigned long)ub->ub_held_pages);
++
++ ub_debug_trace(!clean, 5, 60*HZ);
++}
++
++static void __unhash_beancounter(struct user_beancounter *ub)
++{
++ struct user_beancounter **ubptr;
++ struct ub_hash_slot *slot;
++
++ if (ub->parent != NULL)
++ slot = &ub_hash[ub_subhash_fun(ub->parent, ub->ub_uid)];
++ else
++ slot = &ub_hash[ub_hash_fun(ub->ub_uid)];
++ ubptr = &slot->ubh_beans;
++
++ while (*ubptr != NULL) {
++ if (*ubptr == ub) {
++ verify_held(ub);
++ *ubptr = ub->ub_next;
++ return;
++ }
++ ubptr = &((*ubptr)->ub_next);
++ }
++ printk(KERN_ERR "Invalid beancounter %p, luid=%d on free, slot %p\n",
++ ub, ub->ub_uid, slot);
++}
++#endif
++
++void __put_beancounter(struct user_beancounter *ub)
++{
++ unsigned long flags;
++ struct user_beancounter *parent;
++
++again:
++ parent = ub->parent;
++ ub_debug(UBD_ALLOC, "__put bc %p (cnt %d) for %.20s pid %d "
++ "cur %08lx cpu %d.\n",
++ ub, atomic_read(&ub->ub_refcount),
++ current->comm, current->pid,
++ (unsigned long)current, smp_processor_id());
++
++	/* equivalent to atomic_dec_and_lock_irqsave() */
++ local_irq_save(flags);
++ if (likely(!atomic_dec_and_lock(&ub->ub_refcount, &ub_hash_lock))) {
++ if (unlikely(atomic_read(&ub->ub_refcount) < 0))
++ printk(KERN_ERR "UB: Bad ub refcount: ub=%p, "
++ "luid=%d, ref=%d\n",
++ ub, ub->ub_uid,
++ atomic_read(&ub->ub_refcount));
++ local_irq_restore(flags);
++ return;
++ }
++
++ if (unlikely(ub == get_ub0())) {
++ printk(KERN_ERR "Trying to put ub0\n");
++ spin_unlock_irqrestore(&ub_hash_lock, flags);
++ return;
++ }
++
++#ifndef CONFIG_UBC_KEEP_UNUSED
++ __unhash_beancounter(ub);
++ spin_unlock_irqrestore(&ub_hash_lock, flags);
++ ub_free_counters(ub);
++ kmem_cache_free(ub_cachep, ub);
++#else
++ spin_unlock_irqrestore(&ub_hash_lock, flags);
++#endif
++ ub = parent;
++ if (ub != NULL)
++ goto again;
++}
++EXPORT_SYMBOL(__put_beancounter);
++
++/*
++ * Generic resource charging stuff
++ */
++
++int __charge_beancounter_locked(struct user_beancounter *ub,
++ int resource, unsigned long val, enum severity strict)
++{
++ ub_debug_resource(resource, "Charging %lu for %d of %p with %lu\n",
++ val, resource, ub, ub->ub_parms[resource].held);
++ /*
++	 * The held value <= UB_MAXVALUE, val <= UB_MAXVALUE, and only one
++	 * addition can be in progress at a time, so an overflow is impossible.
++ */
++ ub->ub_parms[resource].held += val;
++
++ switch (strict) {
++ case UB_HARD:
++ if (ub->ub_parms[resource].held >
++ ub->ub_parms[resource].barrier)
++ break;
++ case UB_SOFT:
++ if (ub->ub_parms[resource].held >
++ ub->ub_parms[resource].limit)
++ break;
++ case UB_FORCE:
++ ub_adjust_maxheld(ub, resource);
++ return 0;
++ default:
++ BUG();
++ }
++
++ if (strict == UB_SOFT && ub_ratelimit(&ub->ub_limit_rl))
++ printk(KERN_INFO "Fatal resource shortage: %s, UB %d.\n",
++ ub_rnames[resource], ub->ub_uid);
++ ub->ub_parms[resource].failcnt++;
++ ub->ub_parms[resource].held -= val;
++ return -ENOMEM;
++}
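The switch above relies on intentional fall-through: a UB_HARD charge fails once held exceeds the barrier, a UB_SOFT charge only once it exceeds the (higher) limit, and a UB_FORCE charge always succeeds. The same decision in a self-contained form, with the locking and statistics stripped out (the helper name and enum are invented for illustration):

enum severity { HARD, SOFT, FORCE };

/* Return 0 if charging up to `held` is allowed at this severity, -1 otherwise. */
int charge_allowed(unsigned long held, unsigned long barrier,
		   unsigned long limit, enum severity sv)
{
	switch (sv) {
	case HARD:
		if (held > barrier)
			return -1;
		/* fall through */
	case SOFT:
		if (held > limit)
			return -1;
		/* fall through */
	case FORCE:
		return 0;
	}
	return -1;
}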
++
++int charge_beancounter(struct user_beancounter *ub,
++ int resource, unsigned long val, enum severity strict)
++{
++ int retval;
++ struct user_beancounter *p, *q;
++ unsigned long flags;
++
++ retval = -EINVAL;
++ if (val > UB_MAXVALUE)
++ goto out;
++
++ local_irq_save(flags);
++ for (p = ub; p != NULL; p = p->parent) {
++ spin_lock(&p->ub_lock);
++ retval = __charge_beancounter_locked(p, resource, val, strict);
++ spin_unlock(&p->ub_lock);
++ if (retval)
++ goto unroll;
++ }
++out_restore:
++ local_irq_restore(flags);
++out:
++ return retval;
++
++unroll:
++ for (q = ub; q != p; q = q->parent) {
++ spin_lock(&q->ub_lock);
++ __uncharge_beancounter_locked(q, resource, val);
++ spin_unlock(&q->ub_lock);
++ }
++ goto out_restore;
++}
++
++EXPORT_SYMBOL(charge_beancounter);
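The unroll loop above stops at p, the beancounter whose charge failed, so only the ancestors that were actually charged get uncharged and the whole parent chain stays consistent on failure. A small userspace model of the same walk-up-and-roll-back pattern (the node type and the try_charge/undo helpers are invented for illustration):

struct node { struct node *parent; long held; long limit; };

static int try_charge(struct node *n, long val)
{
	if (n->held + val > n->limit)
		return -1;
	n->held += val;
	return 0;
}

static void undo(struct node *n, long val) { n->held -= val; }

/* Charge `val` on every node from n up to the root, or on none at all. */
int charge_chain(struct node *n, long val)
{
	struct node *p, *q;

	for (p = n; p != NULL; p = p->parent)
		if (try_charge(p, val))
			goto unroll;
	return 0;

unroll:
	for (q = n; q != p; q = q->parent)	/* stop before the node that failed */
		undo(q, val);
	return -1;
}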
++
++void charge_beancounter_notop(struct user_beancounter *ub,
++ int resource, unsigned long val)
++{
++ struct user_beancounter *p;
++ unsigned long flags;
++
++ local_irq_save(flags);
++ for (p = ub; p->parent != NULL; p = p->parent) {
++ spin_lock(&p->ub_lock);
++ __charge_beancounter_locked(p, resource, val, UB_FORCE);
++ spin_unlock(&p->ub_lock);
++ }
++ local_irq_restore(flags);
++}
++
++EXPORT_SYMBOL(charge_beancounter_notop);
++
++void uncharge_warn(struct user_beancounter *ub, int resource,
++ unsigned long val, unsigned long held)
++{
++ char id[64];
++
++ print_ub_uid(ub, id, sizeof(id));
++ printk(KERN_ERR "Uncharging too much %lu h %lu, res %s ub %s\n",
++ val, held, ub_rnames[resource], id);
++ ub_debug_trace(1, 10, 10*HZ);
++}
++
++void __uncharge_beancounter_locked(struct user_beancounter *ub,
++ int resource, unsigned long val)
++{
++ ub_debug_resource(resource, "Uncharging %lu for %d of %p with %lu\n",
++ val, resource, ub, ub->ub_parms[resource].held);
++ if (ub->ub_parms[resource].held < val) {
++ uncharge_warn(ub, resource,
++ val, ub->ub_parms[resource].held);
++ val = ub->ub_parms[resource].held;
++ }
++ ub->ub_parms[resource].held -= val;
++}
++
++void uncharge_beancounter(struct user_beancounter *ub,
++ int resource, unsigned long val)
++{
++ unsigned long flags;
++ struct user_beancounter *p;
++
++ for (p = ub; p != NULL; p = p->parent) {
++ spin_lock_irqsave(&p->ub_lock, flags);
++ __uncharge_beancounter_locked(p, resource, val);
++ spin_unlock_irqrestore(&p->ub_lock, flags);
++ }
++}
++
++EXPORT_SYMBOL(uncharge_beancounter);
++
++void uncharge_beancounter_notop(struct user_beancounter *ub,
++ int resource, unsigned long val)
++{
++ struct user_beancounter *p;
++ unsigned long flags;
++
++ local_irq_save(flags);
++ for (p = ub; p->parent != NULL; p = p->parent) {
++ spin_lock(&p->ub_lock);
++ __uncharge_beancounter_locked(p, resource, val);
++ spin_unlock(&p->ub_lock);
++ }
++ local_irq_restore(flags);
++}
++
++EXPORT_SYMBOL(uncharge_beancounter_notop);
++
++
++/*
++ * Rate limiting stuff.
++ */
++int ub_ratelimit(struct ub_rate_info *p)
++{
++ unsigned long cjif, djif;
++ unsigned long flags;
++ static spinlock_t ratelimit_lock = SPIN_LOCK_UNLOCKED;
++ long new_bucket;
++
++ spin_lock_irqsave(&ratelimit_lock, flags);
++ cjif = jiffies;
++ djif = cjif - p->last;
++ if (djif < p->interval) {
++ if (p->bucket >= p->burst) {
++ spin_unlock_irqrestore(&ratelimit_lock, flags);
++ return 0;
++ }
++ p->bucket++;
++ } else {
++ new_bucket = p->bucket - (djif / (unsigned)p->interval);
++ if (new_bucket < 0)
++ new_bucket = 0;
++ p->bucket = new_bucket + 1;
++ }
++ p->last = cjif;
++ spin_unlock_irqrestore(&ratelimit_lock, flags);
++ return 1;
++}
++EXPORT_SYMBOL(ub_ratelimit);
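ub_ratelimit() is a small leaky-bucket limiter: within one interval it lets at most burst events through, and every full interval of silence drains one token from the bucket. The same arithmetic in a self-contained userspace form, with a plain counter standing in for jiffies and the spinlock omitted (all names are illustrative):

struct rate_info { long bucket, burst, interval, last; };

/* Return 1 if the event may be reported, 0 if it should be suppressed. */
int ratelimit(struct rate_info *p, long now)
{
	long delta = now - p->last;
	long bucket;

	if (delta < p->interval) {
		if (p->bucket >= p->burst)
			return 0;		/* burst exhausted in this interval */
		p->bucket++;
	} else {
		bucket = p->bucket - delta / p->interval;
		if (bucket < 0)
			bucket = 0;
		p->bucket = bucket + 1;		/* idle time drains the bucket */
	}
	p->last = now;
	return 1;
}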
++
++
++/*
++ * Initialization
++ *
++ * struct user_beancounter contains
++ * - limits and other configuration settings,
++ * with a copy stored for accounting purposes,
++ * - structural fields: lists, spinlocks and so on.
++ *
++ * Before these parts are initialized, the structure should be memset
++ * to 0 or copied from a known clean structure. That takes care of a lot
++ * of fields not initialized explicitly.
++ */
++
++static void init_beancounter_struct(struct user_beancounter *ub)
++{
++ ub->ub_magic = UB_MAGIC;
++ atomic_set(&ub->ub_refcount, 1);
++ spin_lock_init(&ub->ub_lock);
++ INIT_LIST_HEAD(&ub->ub_tcp_sk_list);
++ INIT_LIST_HEAD(&ub->ub_other_sk_list);
++#ifdef CONFIG_UBC_DEBUG_KMEM
++ INIT_LIST_HEAD(&ub->ub_cclist);
++#endif
++}
++
++static void init_beancounter_store(struct user_beancounter *ub)
++{
++ int k;
++
++ for (k = 0; k < UB_RESOURCES; k++) {
++ memcpy(&ub->ub_store[k], &ub->ub_parms[k],
++ sizeof(struct ubparm));
++ }
++}
++
++static void init_beancounter_nolimits(struct user_beancounter *ub)
++{
++ int k;
++
++ for (k = 0; k < UB_RESOURCES; k++) {
++ ub->ub_parms[k].limit = UB_MAXVALUE;
++		/* FIXME: is this right for physpages and guarantees? */
++ ub->ub_parms[k].barrier = UB_MAXVALUE;
++ }
++
++ /* FIXME: set unlimited rate? */
++ ub->ub_limit_rl.burst = 4;
++ ub->ub_limit_rl.interval = 300*HZ;
++}
++
++static void init_beancounter_syslimits(struct user_beancounter *ub,
++ unsigned long mp)
++{
++ extern int max_threads;
++ int k;
++
++ ub->ub_parms[UB_KMEMSIZE].limit =
++ mp > (192*1024*1024 >> PAGE_SHIFT) ?
++ 32*1024*1024 : (mp << PAGE_SHIFT) / 6;
++ ub->ub_parms[UB_LOCKEDPAGES].limit = 8;
++ ub->ub_parms[UB_PRIVVMPAGES].limit = UB_MAXVALUE;
++ ub->ub_parms[UB_SHMPAGES].limit = 64;
++ ub->ub_parms[UB_NUMPROC].limit = max_threads / 2;
++ ub->ub_parms[UB_NUMTCPSOCK].limit = 1024;
++ ub->ub_parms[UB_TCPSNDBUF].limit = 1024*4*1024; /* 4k per socket */
++ ub->ub_parms[UB_TCPRCVBUF].limit = 1024*6*1024; /* 6k per socket */
++ ub->ub_parms[UB_NUMOTHERSOCK].limit = 256;
++ ub->ub_parms[UB_DGRAMRCVBUF].limit = 256*4*1024; /* 4k per socket */
++ ub->ub_parms[UB_OTHERSOCKBUF].limit = 256*8*1024; /* 8k per socket */
++ ub->ub_parms[UB_NUMFLOCK].limit = 1024;
++ ub->ub_parms[UB_NUMPTY].limit = 16;
++ ub->ub_parms[UB_NUMSIGINFO].limit = 1024;
++ ub->ub_parms[UB_DCACHESIZE].limit = 1024*1024;
++ ub->ub_parms[UB_NUMFILE].limit = 1024;
++
++ for (k = 0; k < UB_RESOURCES; k++)
++ ub->ub_parms[k].barrier = ub->ub_parms[k].limit;
++
++ ub->ub_limit_rl.burst = 4;
++ ub->ub_limit_rl.interval = 300*HZ;
++}
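For scale, with 4 KB pages the UB_KMEMSIZE default above works out as follows: a machine with more than 192 MB of RAM (more than 49152 pages) gets a flat 32 MB limit, while a 96 MB machine gets 96 MB / 6 = 16 MB; every barrier is then set equal to its limit.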
++
++void __init ub_init_ub0(void)
++{
++ struct user_beancounter *ub;
++
++ init_cache_counters();
++ ub = get_ub0();
++ memset(ub, 0, sizeof(*ub));
++ ub->ub_uid = 0;
++ init_beancounter_nolimits(ub);
++ init_beancounter_store(ub);
++ init_beancounter_struct(ub);
++
++ memset(&current->task_bc, 0, sizeof(struct task_beancounter));
++ (void)set_exec_ub(get_ub0());
++ current->task_bc.fork_sub = get_beancounter(get_ub0());
++ init_mm.mm_ub = get_beancounter(ub);
++}
++
++void __init ub_hash_init(void)
++{
++ struct ub_hash_slot *slot;
++
++ spin_lock_init(&ub_hash_lock);
++ /* insert ub0 into the hash */
++ slot = &ub_hash[ub_hash_fun(get_ub0()->ub_uid)];
++ slot->ubh_beans = get_ub0();
++}
++
++void __init ub_init_cache(unsigned long mempages)
++{
++ extern int skbc_cache_init(void);
++ int res;
++
++ res = 0; /* skbc_cache_init(); */
++ ub_cachep = kmem_cache_create("user_beancounters",
++ sizeof(struct user_beancounter),
++ 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
++ if (res < 0 || ub_cachep == NULL)
++ panic("Can't create ubc caches\n");
++
++ memset(&default_beancounter, 0, sizeof(default_beancounter));
++#ifdef CONFIG_UBC_UNLIMITED
++ init_beancounter_nolimits(&default_beancounter);
++#else
++ init_beancounter_syslimits(&default_beancounter, mempages);
++#endif
++ init_beancounter_store(&default_beancounter);
++ init_beancounter_struct(&default_beancounter);
++
++ ub_hash_init();
++}
+diff -upr linux-2.6.16.orig/kernel/ub/ub_dcache.c linux-2.6.16-026test015/kernel/ub/ub_dcache.c
+--- linux-2.6.16.orig/kernel/ub/ub_dcache.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/ub/ub_dcache.c 2006-07-04 14:41:37.000000000 +0400
+@@ -0,0 +1,325 @@
++/*
++ * kernel/ub/ub_dcache.c
++ *
++ * Copyright (C) 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/config.h>
++#include <linux/dcache.h>
++#include <linux/slab.h>
++#include <linux/kmem_cache.h>
++#include <linux/fs.h>
++#include <linux/err.h>
++
++#include <ub/beancounter.h>
++#include <ub/ub_mem.h>
++#include <ub/ub_dcache.h>
++
++/*
++ * Locking
++ * traverse dcache_lock d_lock
++ * ub_dentry_charge + + +
++ * ub_dentry_uncharge + - +
++ * ub_dentry_charge_nofail + + -
++ *
++ * d_inuse is atomic so that we can increment the parent's d_inuse in
++ * ub_dentry_charge while holding only the dentry's d_lock.
++ *
++ * Race in uncharge vs charge_nofail is handled with dcache_lock.
++ * Race in charge vs charge_nofail is inessential since they both inc d_inuse.
++ * Race in uncharge vs charge is handled by altering d_inuse under d_lock.
++ *
++ * Race with d_move is handled this way:
++ * - charge_nofail and uncharge are protected by dcache_lock;
++ * - charge works only with dentry and dentry->d_parent->d_inuse, so
++ * it's enough to lock only the dentry.
++ */
++
++/*
++ * Beancounting
++ * UB argument must NOT be NULL
++ */
++
++static int do_charge_dcache(struct user_beancounter *ub, unsigned long size,
++ enum severity sv)
++{
++ unsigned long flags;
++
++ spin_lock_irqsave(&ub->ub_lock, flags);
++ if (__charge_beancounter_locked(ub, UB_KMEMSIZE, CHARGE_SIZE(size), sv))
++ goto out_mem;
++ if (__charge_beancounter_locked(ub, UB_DCACHESIZE, size, sv))
++ goto out_dcache;
++ spin_unlock_irqrestore(&ub->ub_lock, flags);
++ return 0;
++
++out_dcache:
++ __uncharge_beancounter_locked(ub, UB_KMEMSIZE, CHARGE_SIZE(size));
++out_mem:
++ spin_unlock_irqrestore(&ub->ub_lock, flags);
++ return -ENOMEM;
++}
++
++static void do_uncharge_dcache(struct user_beancounter *ub,
++ unsigned long size)
++{
++ unsigned long flags;
++
++ spin_lock_irqsave(&ub->ub_lock, flags);
++ __uncharge_beancounter_locked(ub, UB_KMEMSIZE, CHARGE_SIZE(size));
++ __uncharge_beancounter_locked(ub, UB_DCACHESIZE, size);
++ spin_unlock_irqrestore(&ub->ub_lock, flags);
++}
++
++static int charge_dcache(struct user_beancounter *ub, unsigned long size,
++ enum severity sv)
++{
++ struct user_beancounter *p, *q;
++
++ for (p = ub; p != NULL; p = p->parent) {
++ if (do_charge_dcache(p, size, sv))
++ goto unroll;
++ }
++ return 0;
++
++unroll:
++ for (q = ub; q != p; q = q->parent)
++ do_uncharge_dcache(q, size);
++ return -ENOMEM;
++}
++
++void uncharge_dcache(struct user_beancounter *ub, unsigned long size)
++{
++ for (; ub != NULL; ub = ub->parent)
++ do_uncharge_dcache(ub, size);
++}
++
++static inline void charge_dcache_forced(struct user_beancounter *ub,
++ unsigned long size)
++{
++ charge_dcache(ub, size, UB_FORCE);
++}
++
++static inline void d_forced_charge(struct dentry_beancounter *d_bc)
++{
++ d_bc->d_ub = get_beancounter(get_exec_ub());
++ if (d_bc->d_ub == NULL)
++ return;
++
++ charge_dcache_forced(d_bc->d_ub, d_bc->d_ubsize);
++}
++
++static inline void d_uncharge(struct dentry_beancounter *d_bc)
++{
++ if (d_bc->d_ub == NULL)
++ return;
++
++ uncharge_dcache(d_bc->d_ub, d_bc->d_ubsize);
++ put_beancounter(d_bc->d_ub);
++ d_bc->d_ub = NULL;
++}
++
++/*
++ * Alloc / free dentry_beancounter
++ */
++
++static inline int d_alloc_beancounter(struct dentry *d)
++{
++ return 0;
++}
++
++static inline void d_free_beancounter(struct dentry_beancounter *d_bc)
++{
++}
++
++static inline unsigned long d_charge_size(struct dentry *dentry)
++{
++ /* dentry's d_name is already set to appropriate value (see d_alloc) */
++ return inode_cachep->objuse + dentry_cache->objuse +
++ (dname_external(dentry) ?
++ kmem_obj_memusage((void *)dentry->d_name.name) : 0);
++}
++
++/*
++ * dentry mark in use operation
++ * d_lock is held
++ */
++
++static int d_inc_inuse(struct dentry *dentry)
++{
++ struct user_beancounter *ub;
++ struct dentry_beancounter *d_bc;
++
++ if (dentry != dentry->d_parent) {
++ struct dentry *parent;
++
++ /*
++ * Increment d_inuse of parent.
++ * It can't change since dentry->d_lock is held.
++ */
++ parent = dentry->d_parent;
++ if (ub_dget_testone(parent))
++ BUG();
++ }
++
++ d_bc = &dentry->dentry_bc;
++ ub = get_beancounter(get_exec_ub());
++
++ if (ub != NULL && charge_dcache(ub, d_bc->d_ubsize, UB_SOFT))
++ goto out_err;
++
++ d_bc->d_ub = ub;
++ return 0;
++
++out_err:
++ put_beancounter(ub);
++ d_bc->d_ub = NULL;
++ return -ENOMEM;
++}
++
++/*
++ * no locks
++ */
++int ub_dentry_alloc(struct dentry *dentry)
++{
++ int err;
++ struct dentry_beancounter *d_bc;
++
++ err = d_alloc_beancounter(dentry);
++ if (err < 0)
++ return err;
++
++ d_bc = &dentry->dentry_bc;
++ d_bc->d_ub = get_beancounter(get_exec_ub());
++ atomic_set(&d_bc->d_inuse, INUSE_INIT); /* see comment in ub_dcache.h */
++ d_bc->d_ubsize = d_charge_size(dentry);
++
++ err = 0;
++ if (d_bc->d_ub != NULL &&
++ charge_dcache(d_bc->d_ub, d_bc->d_ubsize, UB_HARD)) {
++ put_beancounter(d_bc->d_ub);
++ d_free_beancounter(d_bc);
++ err = -ENOMEM;
++ }
++
++ return err;
++}
++
++/*
++ * Charge / uncharge functions.
++ *
++ * We take d_lock to protect dentry_bc from concurrent access
++ * when simultaneous __d_lookup and d_put happen on one dentry.
++ */
++
++/*
++ * no dcache_lock, d_lock and rcu_read_lock are held
++ * drops d_lock, rcu_read_lock and returns error if any
++ */
++int ub_dentry_charge(struct dentry *dentry)
++{
++ int err;
++
++ err = 0;
++ if (ub_dget_testone(dentry))
++ err = d_inc_inuse(dentry);
++
++ /*
++ * d_lock and rcu_read_lock are dropped here
++ * (see also __d_lookup)
++ */
++ spin_unlock(&dentry->d_lock);
++ rcu_read_unlock();
++
++ if (!err)
++ return 0;
++
++ /*
++	 * d_invalidate is required for real_lookup
++	 * since it tries to create a new dentry on
++	 * d_lookup failure.
++ */
++ if (!d_invalidate(dentry))
++ return err;
++
++	/* didn't succeed, force the dentry to be charged */
++ d_forced_charge(&dentry->dentry_bc);
++ return 0;
++}
++
++/*
++ * dcache_lock is held
++ * no d_locks, sequentially takes and drops d_lock from the dentry upward
++ */
++void ub_dentry_uncharge(struct dentry *dentry)
++{
++ struct dentry *parent;
++
++	/* walk up while the in-use status keeps changing and the root is not reached */
++ while (1) {
++ /*
++ * We need d_lock here to handle
++ * the race with ub_dentry_charge
++ */
++ spin_lock(&dentry->d_lock);
++ if (!ub_dput_testzero(dentry)) {
++ spin_unlock(&dentry->d_lock);
++ break;
++ }
++
++ /* state transition 0 => -1 */
++ d_uncharge(&dentry->dentry_bc);
++ parent = dentry->d_parent;
++ spin_unlock(&dentry->d_lock);
++
++ /*
++ * dcache_lock is held (see comment in __dget_locked)
++ * so we can safely move upwards.
++ */
++ if (dentry == parent)
++ break;
++ dentry = parent;
++ }
++}
++
++/*
++ * forced version. for dget in clean cache, when error is not an option
++ *
++ * dcache_lock is held
++ * no d_locks
++ */
++void ub_dentry_charge_nofail(struct dentry *dentry)
++{
++ struct dentry *parent;
++
++	/* walk up while the in-use status keeps changing and the root is not reached */
++ while (1) {
++ if (!ub_dget_testone(dentry))
++ break;
++
++ /*
++ * state transition -1 => 0
++ *
++ * No need to lock dentry before atomic_inc
++ * like we do in ub_dentry_uncharge.
++ * We can't race with ub_dentry_uncharge due
++ * to dcache_lock. The only possible race with
++ * ub_dentry_charge is OK since they both
++ * do atomic_inc.
++ */
++ d_forced_charge(&dentry->dentry_bc);
++ /*
++ * dcache_lock is held (see comment in __dget_locked)
++ * so we can safely move upwards.
++ */
++ parent = dentry->d_parent;
++
++ if (dentry == parent)
++ break;
++ dentry = parent;
++ }
++}
+diff -upr linux-2.6.16.orig/kernel/ub/ub_mem.c linux-2.6.16-026test015/kernel/ub/ub_mem.c
+--- linux-2.6.16.orig/kernel/ub/ub_mem.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/ub/ub_mem.c 2006-07-04 14:41:37.000000000 +0400
+@@ -0,0 +1,384 @@
++/*
++ * kernel/ub/ub_mem.c
++ *
++ * Copyright (C) 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/slab.h>
++#include <linux/kmem_cache.h>
++#include <linux/kmem_slab.h>
++#include <linux/highmem.h>
++#include <linux/vmalloc.h>
++#include <linux/mm.h>
++#include <linux/gfp.h>
++#include <linux/swap.h>
++#include <linux/spinlock.h>
++#include <linux/sched.h>
++#include <linux/module.h>
++#include <ub/beancounter.h>
++#include <ub/ub_mem.h>
++#include <ub/ub_hash.h>
++
++/*
++ * Initialization
++ */
++
++/*
++ * Slab accounting
++ */
++
++#ifdef CONFIG_UBC_DEBUG_KMEM
++
++#define CC_HASH_SIZE 1024
++static struct ub_cache_counter *cc_hash[CC_HASH_SIZE];
++spinlock_t cc_lock;
++
++static void __free_cache_counters(struct user_beancounter *ub,
++ kmem_cache_t *cachep)
++{
++ struct ub_cache_counter *cc, **pprev, *del;
++ int i;
++ unsigned long flags;
++
++ del = NULL;
++ spin_lock_irqsave(&cc_lock, flags);
++ for (i = 0; i < CC_HASH_SIZE; i++) {
++ pprev = &cc_hash[i];
++ cc = cc_hash[i];
++ while (cc != NULL) {
++ if (cc->ub != ub && cc->cachep != cachep) {
++ pprev = &cc->next;
++ cc = cc->next;
++ continue;
++ }
++
++ list_del(&cc->ulist);
++ *pprev = cc->next;
++ cc->next = del;
++ del = cc;
++ cc = *pprev;
++ }
++ }
++ spin_unlock_irqrestore(&cc_lock, flags);
++
++ while (del != NULL) {
++ cc = del->next;
++ kfree(del);
++ del = cc;
++ }
++}
++
++void ub_free_counters(struct user_beancounter *ub)
++{
++ __free_cache_counters(ub, NULL);
++}
++
++void ub_kmemcache_free(kmem_cache_t *cachep)
++{
++ __free_cache_counters(NULL, cachep);
++}
++
++void __init init_cache_counters(void)
++{
++ memset(cc_hash, 0, CC_HASH_SIZE * sizeof(cc_hash[0]));
++ spin_lock_init(&cc_lock);
++}
++
++#define cc_hash_fun(ub, cachep) ( \
++ (((unsigned long)(ub) >> L1_CACHE_SHIFT) ^ \
++ ((unsigned long)(ub) >> (BITS_PER_LONG / 2)) ^ \
++ ((unsigned long)(cachep) >> L1_CACHE_SHIFT) ^ \
++ ((unsigned long)(cachep) >> (BITS_PER_LONG / 2)) \
++ ) & (CC_HASH_SIZE - 1))
++
++static int change_slab_charged(struct user_beancounter *ub, void *objp,
++ unsigned long val, int mask)
++{
++ struct ub_cache_counter *cc, *new_cnt, **pprev;
++ kmem_cache_t *cachep;
++ unsigned long flags;
++
++ cachep = virt_to_cache(objp);
++ new_cnt = NULL;
++
++again:
++ spin_lock_irqsave(&cc_lock, flags);
++ cc = cc_hash[cc_hash_fun(ub, cachep)];
++ while (cc) {
++ if (cc->ub == ub && cc->cachep == cachep)
++ goto found;
++ cc = cc->next;
++ }
++
++ if (new_cnt != NULL)
++ goto insert;
++
++ spin_unlock_irqrestore(&cc_lock, flags);
++
++ new_cnt = kmalloc(sizeof(*new_cnt), mask & ~__GFP_UBC);
++ if (new_cnt == NULL)
++ return -ENOMEM;
++
++ new_cnt->counter = 0;
++ new_cnt->ub = ub;
++ new_cnt->cachep = cachep;
++ goto again;
++
++insert:
++ pprev = &cc_hash[cc_hash_fun(ub, cachep)];
++ new_cnt->next = *pprev;
++ *pprev = new_cnt;
++ list_add(&new_cnt->ulist, &ub->ub_cclist);
++ cc = new_cnt;
++ new_cnt = NULL;
++
++found:
++ cc->counter += val;
++ spin_unlock_irqrestore(&cc_lock, flags);
++ if (new_cnt)
++ kfree(new_cnt);
++ return 0;
++}
++
++static inline int inc_slab_charged(struct user_beancounter *ub,
++ void *objp, int mask)
++{
++ return change_slab_charged(ub, objp, 1, mask);
++}
++
++static inline void dec_slab_charged(struct user_beancounter *ub, void *objp)
++{
++ if (change_slab_charged(ub, objp, -1, 0) < 0)
++ BUG();
++}
++
++#include <linux/vmalloc.h>
++
++static inline int inc_pages_charged(struct user_beancounter *ub,
++ struct page *pg, int order)
++{
++ int cpu;
++
++ cpu = get_cpu();
++ ub->ub_stat[cpu].pages_charged += (1 << order);
++ put_cpu();
++ return 0;
++}
++
++static inline void dec_pages_charged(struct user_beancounter *ub,
++ struct page *pg, int order)
++{
++ int cpu;
++
++ cpu = get_cpu();
++ ub->ub_stat[cpu].pages_charged -= (1 << order);
++ put_cpu();
++}
++
++void inc_vmalloc_charged(struct vm_struct *vm, int flags)
++{
++ int cpu;
++ struct user_beancounter *ub;
++
++ if (!(flags & __GFP_UBC))
++ return;
++
++ ub = get_exec_ub();
++ if (ub == NULL)
++ return;
++
++ cpu = get_cpu();
++ ub->ub_stat[cpu].vmalloc_charged += vm->nr_pages;
++ put_cpu();
++}
++
++void dec_vmalloc_charged(struct vm_struct *vm)
++{
++ int cpu;
++ struct user_beancounter *ub;
++
++ ub = page_ub(vm->pages[0]);
++ if (ub == NULL)
++ return;
++
++ cpu = get_cpu();
++ ub->ub_stat[cpu].vmalloc_charged -= vm->nr_pages;
++ put_cpu();
++}
++
++#else
++#define inc_slab_charged(ub, o, m) (0)
++#define dec_slab_charged(ub, o) do { } while (0)
++#define inc_pages_charged(ub, pg, o) (0)
++#define dec_pages_charged(ub, pg, o) do { } while (0)
++#endif
++
++static inline struct user_beancounter **slab_ub_ref(void *objp)
++{
++ kmem_cache_t *cachep;
++ struct slab *slabp;
++ int objnr;
++
++ cachep = virt_to_cache(objp);
++ BUG_ON(!(cachep->flags & SLAB_UBC));
++ slabp = virt_to_slab(objp);
++ objnr = (objp - slabp->s_mem) / cachep->buffer_size;
++ return slab_ubcs(cachep, slabp) + objnr;
++}
++
++struct user_beancounter *slab_ub(void *objp)
++{
++ struct user_beancounter **ub_ref;
++
++ ub_ref = slab_ub_ref(objp);
++ return *ub_ref;
++}
++
++EXPORT_SYMBOL(slab_ub);
++
++static inline int should_charge(void *objp, int flags)
++{
++ kmem_cache_t *cachep;
++
++ cachep = virt_to_cache(objp);
++ if (!(cachep->flags & SLAB_UBC))
++ return 0;
++ if ((cachep->flags & SLAB_NO_CHARGE) && !(flags & __GFP_UBC))
++ return 0;
++ return 1;
++}
++
++#define should_uncharge(objp) should_charge(objp, __GFP_UBC)
++
++int ub_slab_charge(void *objp, int flags)
++{
++ unsigned int size;
++ struct user_beancounter *ub;
++
++ if (!should_charge(objp, flags))
++ return 0;
++
++ ub = get_beancounter(get_exec_ub());
++ if (ub == NULL)
++ return 0;
++
++ size = CHARGE_SIZE(kmem_obj_memusage(objp));
++ if (charge_beancounter(ub, UB_KMEMSIZE, size,
++ (flags & __GFP_SOFT_UBC ? UB_SOFT : UB_HARD)))
++ goto out_err;
++
++ if (inc_slab_charged(ub, objp, flags) < 0) {
++ uncharge_beancounter(ub, UB_KMEMSIZE, size);
++ goto out_err;
++ }
++ *slab_ub_ref(objp) = ub;
++ return 0;
++
++out_err:
++ put_beancounter(ub);
++ return -ENOMEM;
++}
++
++void ub_slab_uncharge(void *objp)
++{
++ unsigned int size;
++ struct user_beancounter **ub_ref;
++
++ if (!should_uncharge(objp))
++ return;
++
++ ub_ref = slab_ub_ref(objp);
++ if (*ub_ref == NULL)
++ return;
++
++ dec_slab_charged(*ub_ref, objp);
++ size = CHARGE_SIZE(kmem_obj_memusage(objp));
++ uncharge_beancounter(*ub_ref, UB_KMEMSIZE, size);
++ put_beancounter(*ub_ref);
++ *ub_ref = NULL;
++}
++
++/*
++ * Pages accounting
++ */
++
++inline int ub_page_charge(struct page *page, int order, int mask)
++{
++ struct user_beancounter *ub;
++
++ ub = NULL;
++ if (!(mask & __GFP_UBC))
++ goto out;
++
++ ub = get_beancounter(get_exec_ub());
++ if (ub == NULL)
++ goto out;
++
++ if (charge_beancounter(ub, UB_KMEMSIZE, CHARGE_ORDER(order),
++ (mask & __GFP_SOFT_UBC ? UB_SOFT : UB_HARD)))
++ goto err;
++ if (inc_pages_charged(ub, page, order) < 0) {
++ uncharge_beancounter(ub, UB_KMEMSIZE, CHARGE_ORDER(order));
++ goto err;
++ }
++out:
++ BUG_ON(page_ub(page) != NULL);
++ page_ub(page) = ub;
++ return 0;
++
++err:
++ BUG_ON(page_ub(page) != NULL);
++ put_beancounter(ub);
++ return -ENOMEM;
++}
++
++inline void ub_page_uncharge(struct page *page, int order)
++{
++ struct user_beancounter *ub;
++
++ ub = page_ub(page);
++ if (ub == NULL)
++ return;
++
++ dec_pages_charged(ub, page, order);
++ BUG_ON(ub->ub_magic != UB_MAGIC);
++ uncharge_beancounter(ub, UB_KMEMSIZE, CHARGE_ORDER(order));
++ put_beancounter(ub);
++ page_ub(page) = NULL;
++}
++
++/*
++ * takes init_mm.page_table_lock
++ * some outer lock to protect pages from vmalloced area must be held
++ */
++struct user_beancounter *vmalloc_ub(void *obj)
++{
++ struct page *pg;
++
++ pg = vmalloc_to_page(obj);
++ if (pg == NULL)
++ return NULL;
++
++ return page_ub(pg);
++}
++
++EXPORT_SYMBOL(vmalloc_ub);
++
++struct user_beancounter *mem_ub(void *obj)
++{
++ struct user_beancounter *ub;
++
++ if ((unsigned long)obj >= VMALLOC_START &&
++ (unsigned long)obj < VMALLOC_END)
++ ub = vmalloc_ub(obj);
++ else
++ ub = slab_ub(obj);
++
++ return ub;
++}
++
++EXPORT_SYMBOL(mem_ub);
+diff -upr linux-2.6.16.orig/kernel/ub/ub_misc.c linux-2.6.16-026test015/kernel/ub/ub_misc.c
+--- linux-2.6.16.orig/kernel/ub/ub_misc.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/ub/ub_misc.c 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,244 @@
++/*
++ * kernel/ub/ub_misc.c
++ *
++ * Copyright (C) 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/tty.h>
++#include <linux/tty_driver.h>
++#include <linux/signal.h>
++#include <linux/slab.h>
++#include <linux/fs.h>
++#include <linux/sched.h>
++#include <linux/kmem_cache.h>
++#include <linux/module.h>
++
++#include <ub/beancounter.h>
++#include <ub/ub_mem.h>
++
++/*
++ * Task staff
++ */
++
++static void init_task_sub(struct task_struct *tsk,
++ struct task_beancounter *old_bc)
++{
++ struct task_beancounter *new_bc;
++ struct user_beancounter *sub;
++
++ new_bc = &tsk->task_bc;
++ sub = old_bc->fork_sub;
++ new_bc->fork_sub = get_beancounter(sub);
++ new_bc->task_fnode = NULL;
++ new_bc->task_freserv = old_bc->task_freserv;
++ old_bc->task_freserv = NULL;
++ memset(&new_bc->task_data, 0, sizeof(new_bc->task_data));
++}
++
++int ub_task_charge(struct task_struct *parent, struct task_struct *task)
++{
++ struct task_beancounter *old_bc;
++ struct task_beancounter *new_bc;
++ struct user_beancounter *ub;
++
++ old_bc = &parent->task_bc;
++#if 0
++ if (old_bc->exec_ub == NULL) {
++ /* FIXME: this won't work if task_bc is outside task_struct */
++ init_task_sub(task, old_bc);
++ return 0;
++ }
++#endif
++ ub = old_bc->fork_sub;
++
++ if (charge_beancounter(ub, UB_NUMPROC, 1, UB_HARD) < 0)
++ return -ENOMEM;
++
++ new_bc = &task->task_bc;
++ new_bc->task_ub = get_beancounter(ub);
++ new_bc->exec_ub = get_beancounter(ub);
++ init_task_sub(task, old_bc);
++ return 0;
++}
++
++void ub_task_uncharge(struct task_struct *task)
++{
++ struct task_beancounter *task_bc;
++
++ task_bc = &task->task_bc;
++ if (task_bc->task_ub != NULL)
++ uncharge_beancounter(task_bc->task_ub, UB_NUMPROC, 1);
++
++ put_beancounter(task_bc->exec_ub);
++ put_beancounter(task_bc->task_ub);
++ put_beancounter(task_bc->fork_sub);
++ /* can't be freed elsewhere, failures possible in the middle of fork */
++ if (task_bc->task_freserv != NULL)
++ kfree(task_bc->task_freserv);
++
++ task_bc->exec_ub = (struct user_beancounter *)0xdeadbcbc;
++}
++
++/*
++ * Files and file locks.
++ */
++
++int ub_file_charge(struct file *f)
++{
++ struct user_beancounter *ub;
++
++	/* No need to get_beancounter here: the slab already holds a reference */
++ ub = slab_ub(f);
++ if (ub == NULL)
++ return 0;
++
++ return charge_beancounter(ub, UB_NUMFILE, 1, UB_HARD);
++}
++
++void ub_file_uncharge(struct file *f)
++{
++ struct user_beancounter *ub;
++
++ /* Ub will be put in slab */
++ ub = slab_ub(f);
++ if (ub == NULL)
++ return;
++
++ uncharge_beancounter(ub, UB_NUMFILE, 1);
++}
++
++int ub_flock_charge(struct file_lock *fl, int hard)
++{
++ struct user_beancounter *ub;
++ int err;
++
++	/* No need to get_beancounter here: the slab already holds a reference */
++ ub = slab_ub(fl);
++ if (ub == NULL)
++ return 0;
++
++ err = charge_beancounter(ub, UB_NUMFLOCK, 1, hard ? UB_HARD : UB_SOFT);
++ if (!err)
++ fl->fl_charged = 1;
++ return err;
++}
++
++void ub_flock_uncharge(struct file_lock *fl)
++{
++ struct user_beancounter *ub;
++
++ /* Ub will be put in slab */
++ ub = slab_ub(fl);
++ if (ub == NULL || !fl->fl_charged)
++ return;
++
++ uncharge_beancounter(ub, UB_NUMFLOCK, 1);
++ fl->fl_charged = 0;
++}
++
++/*
++ * Signal handling
++ */
++
++static int do_ub_siginfo_charge(struct user_beancounter *ub,
++ unsigned long size)
++{
++ unsigned long flags;
++
++ spin_lock_irqsave(&ub->ub_lock, flags);
++ if (__charge_beancounter_locked(ub, UB_KMEMSIZE, size, UB_HARD))
++ goto out_kmem;
++
++ if (__charge_beancounter_locked(ub, UB_NUMSIGINFO, 1, UB_HARD))
++ goto out_num;
++
++ spin_unlock_irqrestore(&ub->ub_lock, flags);
++ return 0;
++
++out_num:
++ __uncharge_beancounter_locked(ub, UB_KMEMSIZE, size);
++out_kmem:
++ spin_unlock_irqrestore(&ub->ub_lock, flags);
++ return -ENOMEM;
++}
++
++static void do_ub_siginfo_uncharge(struct user_beancounter *ub,
++ unsigned long size)
++{
++ unsigned long flags;
++
++ spin_lock_irqsave(&ub->ub_lock, flags);
++ __uncharge_beancounter_locked(ub, UB_KMEMSIZE, size);
++ __uncharge_beancounter_locked(ub, UB_NUMSIGINFO, 1);
++ spin_unlock_irqrestore(&ub->ub_lock, flags);
++}
++
++int ub_siginfo_charge(struct sigqueue *sq, struct user_beancounter *ub)
++{
++ unsigned long size;
++ struct user_beancounter *p, *q;
++
++ size = CHARGE_SIZE(kmem_obj_memusage(sq));
++ for (p = ub; p != NULL; p = p->parent) {
++ if (do_ub_siginfo_charge(p, size))
++ goto unroll;
++ }
++
++ sq->sig_ub = get_beancounter(ub);
++ return 0;
++
++unroll:
++ for (q = ub; q != p; q = q->parent)
++ do_ub_siginfo_uncharge(q, size);
++ return -ENOMEM;
++}
++EXPORT_SYMBOL(ub_siginfo_charge);
++
++void ub_siginfo_uncharge(struct sigqueue *sq)
++{
++ unsigned long size;
++ struct user_beancounter *ub, *p;
++
++ p = ub = sq->sig_ub;
++ sq->sig_ub = NULL;
++ size = CHARGE_SIZE(kmem_obj_memusage(sq));
++ for (; ub != NULL; ub = ub->parent)
++ do_ub_siginfo_uncharge(ub, size);
++ put_beancounter(p);
++}
++
++/*
++ * PTYs
++ */
++
++int ub_pty_charge(struct tty_struct *tty)
++{
++ struct user_beancounter *ub;
++ int retval;
++
++ ub = slab_ub(tty);
++ retval = 0;
++ if (ub && tty->driver->subtype == PTY_TYPE_MASTER &&
++ !test_bit(TTY_CHARGED, &tty->flags)) {
++ retval = charge_beancounter(ub, UB_NUMPTY, 1, UB_HARD);
++ if (!retval)
++ set_bit(TTY_CHARGED, &tty->flags);
++ }
++ return retval;
++}
++
++void ub_pty_uncharge(struct tty_struct *tty)
++{
++ struct user_beancounter *ub;
++
++ ub = slab_ub(tty);
++ if (ub && tty->driver->subtype == PTY_TYPE_MASTER &&
++ test_bit(TTY_CHARGED, &tty->flags)) {
++ uncharge_beancounter(ub, UB_NUMPTY, 1);
++ clear_bit(TTY_CHARGED, &tty->flags);
++ }
++}
+diff -upr linux-2.6.16.orig/kernel/ub/ub_net.c linux-2.6.16-026test015/kernel/ub/ub_net.c
+--- linux-2.6.16.orig/kernel/ub/ub_net.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/ub/ub_net.c 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,1044 @@
++/*
++ * linux/kernel/ub/ub_net.c
++ *
++ * Copyright (C) 1998-2004 Andrey V. Savochkin <saw@saw.sw.com.sg>
++ * Copyright (C) 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ * TODO:
++ * - sizeof(struct inode) charge
++ * = tcp_mem_schedule() feedback based on ub limits
++ * + measures so that one socket won't exhaust all send buffers,
++ * see bug in bugzilla
++ * = sk->socket check for NULL in snd_wakeups
++ * (tcp_write_space checks for NULL itself)
++ * + in tcp_close(), orphaned socket abortion should be based on ubc
++ * resources (same in tcp_out_of_resources)
++ * Beancounter should also have separate orphaned socket counter...
++ * + for rcv, in-order segment should be accepted
++ * if only barrier is exceeded
++ * = tcp_rmem_schedule() feedback based on ub limits
++ * - repair forward_alloc mechanism for receive buffers
++ *  Its idea is that some buffer space is pre-charged so that the receive
++ *  fast path doesn't need to take spinlocks and do other heavy stuff
++ * + tcp_prune_queue actions based on ub limits
++ * + window adjustments depending on available buffers for receive
++ * - window adjustments depending on available buffers for send
++ * + race around usewreserv
++ * + avoid allocating new page for each tiny-gram, see letter from ANK
++ * + rename ub_sock_lock
++ * + sk->sleep wait queue probably can be used for all wakeups, and
++ * sk->ub_wait is unnecessary
++ * + for UNIX sockets, the current algorithm will lead to
++ * UB_UNIX_MINBUF-sized messages only for non-blocking case
++ * - charge for af_packet sockets
++ * + all datagram sockets should be charged to NUMUNIXSOCK
++ * - we do not charge for skb copies and clones staying in device queues
++ * + live-lock if number of sockets is big and buffer limits are small
++ * [diff-ubc-dbllim3]
++ * - check that multiple readers/writers on the same socket won't cause fatal
++ * consequences
++ * - check allocation/charge orders
++ * + There is a potential problem with callback_lock. In *snd_wakeup we take
++ *   the beancounter first; in sock_def_error_report, callback_lock first,
++ *   then the beancounter. This is not a problem if callback_lock is taken
++ *   read-only, but anyway...
++ * - SKB_CHARGE_SIZE doesn't include the space wasted by slab allocator
++ * General kernel problems:
++ * - in tcp_sendmsg(), if allocation fails, non-blocking sockets with ASYNC
++ * notification won't get signals
++ * - datagram_poll looks racy
++ *
++ */
++
++#include <linux/net.h>
++#include <linux/slab.h>
++#include <linux/kmem_cache.h>
++#include <linux/gfp.h>
++#include <linux/err.h>
++#include <linux/socket.h>
++#include <linux/module.h>
++#include <linux/sched.h>
++
++#include <net/sock.h>
++
++#include <ub/beancounter.h>
++#include <ub/ub_net.h>
++#include <ub/ub_debug.h>
++
++
++/* Skb truesize definition. Bad place. Den */
++
++static inline int skb_chargesize_head(struct sk_buff *skb)
++{
++ return skb_charge_size(skb->end - skb->head +
++ sizeof(struct skb_shared_info));
++}
++
++int skb_charge_fullsize(struct sk_buff *skb)
++{
++ int chargesize;
++ struct sk_buff *skbfrag;
++
++ chargesize = skb_chargesize_head(skb) +
++ PAGE_SIZE * skb_shinfo(skb)->nr_frags;
++ if (likely(skb_shinfo(skb)->frag_list == NULL))
++ return chargesize;
++ for (skbfrag = skb_shinfo(skb)->frag_list;
++ skbfrag != NULL;
++ skbfrag = skbfrag->next) {
++ chargesize += skb_charge_fullsize(skbfrag);
++ }
++ return chargesize;
++}
++EXPORT_SYMBOL(skb_charge_fullsize);
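In other words, a purely linear skb is charged just for its head (the data area plus struct skb_shared_info, rounded up by skb_charge_size()), an skb carrying three page fragments is charged an additional 3 * PAGE_SIZE, and any skbs chained on frag_list are charged recursively by the same rule.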
++
++static int ub_sock_makewreserv_locked(struct sock *sk,
++ int bufid, int sockid, unsigned long size);
++
++int __ub_too_many_orphans(struct sock *sk, int count)
++{
++ struct user_beancounter *ub;
++
++ if (sock_has_ubc(sk)) {
++ for (ub = sock_bc(sk)->ub; ub->parent != NULL; ub = ub->parent);
++ if (count >= ub->ub_parms[UB_NUMTCPSOCK].barrier >> 2)
++ return 1;
++ }
++ return 0;
++}
++
++/*
++ * Queueing
++ */
++
++static void ub_sock_snd_wakeup(struct user_beancounter *ub)
++{
++ struct list_head *p;
++ struct sock_beancounter *skbc;
++ struct sock *sk;
++ struct user_beancounter *cub;
++ unsigned long added;
++
++ while (!list_empty(&ub->ub_other_sk_list)) {
++ p = ub->ub_other_sk_list.next;
++ skbc = list_entry(p, struct sock_beancounter, ub_sock_list);
++ sk = skbc_sock(skbc);
++ ub_debug(UBD_NET_SLEEP, "Found sock to wake up\n");
++ added = -skbc->poll_reserv;
++ if (ub_sock_makewreserv_locked(sk, UB_OTHERSOCKBUF,
++ UB_NUMOTHERSOCK, skbc->ub_waitspc))
++ break;
++ added += skbc->poll_reserv;
++
++ /*
++ * See comments in ub_tcp_snd_wakeup.
++ * Locking note: both unix_write_space and
++ * sock_def_write_space take callback_lock themselves.
++ * We take it here just to be on the safe side and to
++ * act the same way as ub_tcp_snd_wakeup does.
++ */
++ sk->sk_write_space(sk);
++
++ list_del_init(&skbc->ub_sock_list);
++
++ if (skbc->ub != ub && added) {
++ cub = get_beancounter(skbc->ub);
++ spin_unlock(&ub->ub_lock);
++ charge_beancounter_notop(cub, UB_OTHERSOCKBUF, added);
++ put_beancounter(cub);
++ spin_lock(&ub->ub_lock);
++ }
++ }
++}
++
++static void ub_tcp_snd_wakeup(struct user_beancounter *ub)
++{
++ struct list_head *p;
++ struct sock *sk;
++ struct sock_beancounter *skbc;
++ struct socket *sock;
++ struct user_beancounter *cub;
++ unsigned long added;
++
++ while (!list_empty(&ub->ub_tcp_sk_list)) {
++ p = ub->ub_tcp_sk_list.next;
++ skbc = list_entry(p, struct sock_beancounter, ub_sock_list);
++ sk = skbc_sock(skbc);
++
++ added = 0;
++ sock = sk->sk_socket;
++ if (sock == NULL)
++ /* sk being destroyed */
++ goto cont;
++
++ ub_debug(UBD_NET_SLEEP,
++ "Checking queue, waiting %lu, reserv %lu\n",
++ skbc->ub_waitspc, skbc->poll_reserv);
++ added = -skbc->poll_reserv;
++ if (ub_sock_makewreserv_locked(sk, UB_TCPSNDBUF,
++ UB_NUMTCPSOCK, skbc->ub_waitspc))
++ break;
++ added += skbc->poll_reserv;
++
++ /*
++ * Send async notifications and wake up.
++ * Locking note: we get callback_lock here because
++ * tcp_write_space is over-optimistic about calling context
++ * (socket lock is presumed). So we get the lock here although
++ * it belongs to the callback.
++ */
++ sk->sk_write_space(sk);
++
++cont:
++ list_del_init(&skbc->ub_sock_list);
++
++ if (skbc->ub != ub && added) {
++ cub = get_beancounter(skbc->ub);
++ spin_unlock(&ub->ub_lock);
++ charge_beancounter_notop(cub, UB_TCPSNDBUF, added);
++ put_beancounter(cub);
++ spin_lock(&ub->ub_lock);
++ }
++ }
++}
++
++void ub_sock_snd_queue_add(struct sock *sk, int res, unsigned long size)
++{
++ unsigned long flags;
++ struct sock_beancounter *skbc;
++ struct user_beancounter *ub;
++ unsigned long added_reserv;
++
++ if (!sock_has_ubc(sk))
++ return;
++
++ skbc = sock_bc(sk);
++ for (ub = skbc->ub; ub->parent != NULL; ub = ub->parent);
++ spin_lock_irqsave(&ub->ub_lock, flags);
++ ub_debug(UBD_NET_SLEEP, "attempt to charge for %lu\n", size);
++ added_reserv = -skbc->poll_reserv;
++ if (!ub_sock_makewreserv_locked(sk, res, bid2sid(res), size)) {
++ /*
++ * It looks a bit hackish, but it is compatible with both
++ * wait_for_xx_ubspace and poll.
++ * This __set_current_state is equivalent to a wakeup event
++ * right after spin_unlock_irqrestore.
++ */
++ __set_current_state(TASK_RUNNING);
++ added_reserv += skbc->poll_reserv;
++ spin_unlock_irqrestore(&ub->ub_lock, flags);
++ if (added_reserv)
++ charge_beancounter_notop(skbc->ub, res, added_reserv);
++ return;
++ }
++
++ ub_debug(UBD_NET_SLEEP, "Adding sk to queue\n");
++ skbc->ub_waitspc = size;
++ if (!list_empty(&skbc->ub_sock_list)) {
++ ub_debug(UBD_NET_SOCKET,
++ "re-adding socket to beancounter %p.\n", ub);
++ goto out;
++ }
++
++ switch (res) {
++ case UB_TCPSNDBUF:
++ list_add_tail(&skbc->ub_sock_list,
++ &ub->ub_tcp_sk_list);
++ break;
++ case UB_OTHERSOCKBUF:
++ list_add_tail(&skbc->ub_sock_list,
++ &ub->ub_other_sk_list);
++ break;
++ default:
++ BUG();
++ }
++out:
++ spin_unlock_irqrestore(&ub->ub_lock, flags);
++}
++
++
++/*
++ * Helpers
++ */
++
++void ub_skb_set_charge(struct sk_buff *skb, struct sock *sk,
++ unsigned long size, int resource)
++{
++ if (!sock_has_ubc(sk))
++ return;
++
++ if (sock_bc(sk)->ub == NULL)
++ BUG();
++ skb_bc(skb)->ub = sock_bc(sk)->ub;
++ skb_bc(skb)->charged = size;
++ skb_bc(skb)->resource = resource;
++
++ /* Ugly. Ugly. Skb in sk writequeue can live without ref to sk */
++ if (skb->sk == NULL)
++ skb->sk = sk;
++}
++
++static inline void ub_skb_set_uncharge(struct sk_buff *skb)
++{
++ skb_bc(skb)->ub = NULL;
++ skb_bc(skb)->charged = 0;
++ skb_bc(skb)->resource = 0;
++}
++
++static inline void __uncharge_sockbuf(struct sock_beancounter *skbc,
++ struct user_beancounter *ub, int resource, unsigned long size)
++{
++ if (ub != NULL)
++ __uncharge_beancounter_locked(ub, resource, size);
++
++ if (skbc != NULL) {
++ if (skbc->ub_wcharged > size)
++ skbc->ub_wcharged -= size;
++ else
++ skbc->ub_wcharged = 0;
++ }
++}
++
++static void ub_update_rmem_thres(struct sock_beancounter *skub)
++{
++ struct user_beancounter *ub;
++
++ if (skub && skub->ub) {
++ for (ub = skub->ub; ub->parent != NULL; ub = ub->parent);
++ ub->ub_rmem_thres = ub->ub_parms[UB_TCPRCVBUF].barrier /
++ (ub->ub_parms[UB_NUMTCPSOCK].held + 1);
++ }
++}
++inline int ub_skb_alloc_bc(struct sk_buff *skb, int gfp_mask)
++{
++ memset(skb_bc(skb), 0, sizeof(struct skb_beancounter));
++ return 0;
++}
++
++inline void ub_skb_free_bc(struct sk_buff *skb)
++{
++}
++
++
++/*
++ * Charge socket number
++ */
++
++static inline int sk_alloc_beancounter(struct sock *sk)
++{
++ struct sock_beancounter *skbc;
++
++ skbc = sock_bc(sk);
++ memset(skbc, 0, sizeof(struct sock_beancounter));
++ return 0;
++}
++
++static inline void sk_free_beancounter(struct sock *sk)
++{
++}
++
++static int __sock_charge(struct sock *sk, int res)
++{
++ struct sock_beancounter *skbc;
++ struct user_beancounter *ub;
++
++ ub = get_exec_ub();
++ if (ub == NULL)
++ return 0;
++ if (sk_alloc_beancounter(sk) < 0)
++ return -ENOMEM;
++
++ skbc = sock_bc(sk);
++ INIT_LIST_HEAD(&skbc->ub_sock_list);
++
++ if (charge_beancounter(ub, res, 1, UB_HARD) < 0)
++ goto out_limit;
++
++	/* TCP listen sock or process keeps a reference to the UB */
++ skbc->ub = get_beancounter(ub);
++ return 0;
++
++out_limit:
++ sk_free_beancounter(sk);
++ return -ENOMEM;
++}
++
++int ub_tcp_sock_charge(struct sock *sk)
++{
++ int ret;
++
++ ret = __sock_charge(sk, UB_NUMTCPSOCK);
++ ub_update_rmem_thres(sock_bc(sk));
++
++ return ret;
++}
++
++int ub_other_sock_charge(struct sock *sk)
++{
++ return __sock_charge(sk, UB_NUMOTHERSOCK);
++}
++
++EXPORT_SYMBOL(ub_other_sock_charge);
++
++int ub_sock_charge(struct sock *sk, int family, int type)
++{
++ return (IS_TCP_SOCK(family, type) ?
++ ub_tcp_sock_charge(sk) : ub_other_sock_charge(sk));
++}
++EXPORT_SYMBOL(ub_sock_charge);
++
++/*
++ * Uncharge socket number
++ */
++
++void ub_sock_uncharge(struct sock *sk)
++{
++ int is_tcp_sock;
++ unsigned long flags;
++ struct sock_beancounter *skbc;
++ struct user_beancounter *ub;
++ unsigned long reserv;
++
++ if (!sock_has_ubc(sk))
++ return;
++
++ is_tcp_sock = IS_TCP_SOCK(sk->sk_family, sk->sk_type);
++ skbc = sock_bc(sk);
++ ub_debug(UBD_NET_SOCKET, "Calling ub_sock_uncharge on %p\n", sk);
++
++ for (ub = skbc->ub; ub->parent != NULL; ub = ub->parent);
++
++ spin_lock_irqsave(&ub->ub_lock, flags);
++ if (!list_empty(&skbc->ub_sock_list)) {
++ ub_debug(UBD_NET_SOCKET,
++ "ub_sock_uncharge: removing from ub(%p) queue.\n",
++ skbc);
++ list_del_init(&skbc->ub_sock_list);
++ }
++
++ reserv = skbc->poll_reserv;
++ __uncharge_beancounter_locked(ub,
++ (is_tcp_sock ? UB_TCPSNDBUF : UB_OTHERSOCKBUF),
++ reserv);
++ __uncharge_beancounter_locked(ub,
++ (is_tcp_sock ? UB_NUMTCPSOCK : UB_NUMOTHERSOCK), 1);
++
++	/* The sk->sk_family != PF_NETLINK check is made because the skb is
++	 * queued to the kernel end of the socket while charged to the user one.
++	 * Den */
++ if (skbc->ub_wcharged > reserv &&
++ sk->sk_family != PF_NETLINK) {
++ skbc->ub_wcharged -= reserv;
++ printk(KERN_WARNING
++ "ub_sock_uncharge: wch=%lu for ub %p (%d).\n",
++ skbc->ub_wcharged, skbc->ub, skbc->ub->ub_uid);
++ } else
++ skbc->ub_wcharged = 0;
++ skbc->poll_reserv = 0;
++ spin_unlock_irqrestore(&ub->ub_lock, flags);
++
++ uncharge_beancounter_notop(skbc->ub,
++ (is_tcp_sock ? UB_TCPSNDBUF : UB_OTHERSOCKBUF),
++ reserv);
++ uncharge_beancounter_notop(skbc->ub,
++ (is_tcp_sock ? UB_NUMTCPSOCK : UB_NUMOTHERSOCK), 1);
++
++ put_beancounter(skbc->ub);
++ sk_free_beancounter(sk);
++}
++
++/*
++ * Send - receive buffers
++ */
++
++/* Special case for netlink_dump - (un)charges precalculated size */
++int ub_nlrcvbuf_charge(struct sk_buff *skb, struct sock *sk)
++{
++ int ret;
++ unsigned long chargesize;
++
++ if (!sock_has_ubc(sk))
++ return 0;
++
++ chargesize = skb_charge_fullsize(skb);
++ ret = charge_beancounter(sock_bc(sk)->ub,
++ UB_DGRAMRCVBUF, chargesize, UB_HARD);
++ if (ret < 0)
++ return ret;
++ ub_skb_set_charge(skb, sk, chargesize, UB_DGRAMRCVBUF);
++ return ret;
++}
++
++/*
++ * Poll reserv accounting
++ */
++static int ub_sock_makewreserv_locked(struct sock *sk,
++ int bufid, int sockid, unsigned long size)
++{
++ unsigned long wcharge_added;
++ struct sock_beancounter *skbc;
++ struct user_beancounter *ub;
++
++ if (!sock_has_ubc(sk))
++ goto out;
++
++ skbc = sock_bc(sk);
++ if (skbc->poll_reserv >= size) /* no work to be done */
++ goto out;
++
++ for (ub = skbc->ub; ub->parent != NULL; ub = ub->parent);
++ ub->ub_parms[bufid].held += size - skbc->poll_reserv;
++
++ wcharge_added = 0;
++ /*
++ * Logic:
++ * 1) when used memory hits barrier, we set wmem_pressure;
++ * wmem_pressure is reset under barrier/2;
++ * between barrier/2 and barrier we limit per-socket buffer growth;
++ * 2) each socket is guaranteed to get (limit-barrier)/maxsockets
++ * calculated on the base of memory eaten after the barrier is hit
++ */
++ skbc = sock_bc(sk);
++ if (!ub_hfbarrier_hit(ub, bufid)) {
++ if (ub->ub_wmem_pressure)
++ ub_debug(UBD_NET_SEND, "makewres: pressure -> 0 "
++ "sk %p sz %lu pr %lu hd %lu wc %lu sb %d.\n",
++ sk, size, skbc->poll_reserv,
++ ub->ub_parms[bufid].held,
++ skbc->ub_wcharged, sk->sk_sndbuf);
++ ub->ub_wmem_pressure = 0;
++ }
++ if (ub_barrier_hit(ub, bufid)) {
++ if (!ub->ub_wmem_pressure)
++ ub_debug(UBD_NET_SEND, "makewres: pressure -> 1 "
++ "sk %p sz %lu pr %lu hd %lu wc %lu sb %d.\n",
++ sk, size, skbc->poll_reserv,
++ ub->ub_parms[bufid].held,
++ skbc->ub_wcharged, sk->sk_sndbuf);
++ ub->ub_wmem_pressure = 1;
++ wcharge_added = size - skbc->poll_reserv;
++ skbc->ub_wcharged += wcharge_added;
++ if (skbc->ub_wcharged * ub->ub_parms[sockid].limit +
++ ub->ub_parms[bufid].barrier >
++ ub->ub_parms[bufid].limit)
++ goto unroll;
++ }
++ if (ub->ub_parms[bufid].held > ub->ub_parms[bufid].limit)
++ goto unroll;
++
++ ub_adjust_maxheld(ub, bufid);
++ skbc->poll_reserv = size;
++out:
++ return 0;
++
++unroll:
++ ub_debug(UBD_NET_SEND,
++ "makewres: deny "
++ "sk %p sz %lu pr %lu hd %lu wc %lu sb %d.\n",
++ sk, size, skbc->poll_reserv, ub->ub_parms[bufid].held,
++ skbc->ub_wcharged, sk->sk_sndbuf);
++ skbc->ub_wcharged -= wcharge_added;
++ ub->ub_parms[bufid].failcnt++;
++ ub->ub_parms[bufid].held -= size - skbc->poll_reserv;
++ return -ENOMEM;
++}
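To put numbers on the "Logic" comment above: with a TCPSNDBUF barrier of 1 MB, a limit of 4 MB and a NUMTCPSOCK limit of 64, the write-memory pressure flag is set once held memory passes 1 MB and cleared again below 512 KB, and while it is set each socket may accumulate at most (4 MB - 1 MB) / 64 = 48 KB of ub_wcharged before further reservations are denied.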
++
++int ub_sock_make_wreserv(struct sock *sk, int bufid, unsigned long size)
++{
++ struct sock_beancounter *skbc;
++ struct user_beancounter *ub;
++ unsigned long flags;
++ unsigned long added_reserv;
++ int err;
++
++ skbc = sock_bc(sk);
++
++ /*
++	 * This function guarantees a sufficient reserve upon return
++	 * only if sk has a single user. We can check poll_reserv without
++ * serialization and avoid locking if the reserve already exists.
++ */
++ if (!sock_has_ubc(sk) || skbc->poll_reserv >= size)
++ return 0;
++
++ for (ub = skbc->ub; ub->parent != NULL; ub = ub->parent);
++ spin_lock_irqsave(&ub->ub_lock, flags);
++ added_reserv = -skbc->poll_reserv;
++ err = ub_sock_makewreserv_locked(sk, bufid, bid2sid(bufid), size);
++ added_reserv += skbc->poll_reserv;
++ spin_unlock_irqrestore(&ub->ub_lock, flags);
++
++ if (added_reserv)
++ charge_beancounter_notop(skbc->ub, bufid, added_reserv);
++
++ return err;
++}
++
++int ub_sock_get_wreserv(struct sock *sk, int bufid, unsigned long size)
++{
++ struct sock_beancounter *skbc;
++ struct user_beancounter *ub;
++ unsigned long flags;
++ unsigned long added_reserv;
++ int err;
++
++ if (!sock_has_ubc(sk))
++ return 0;
++
++ skbc = sock_bc(sk);
++ for (ub = skbc->ub; ub->parent != NULL; ub = ub->parent);
++ spin_lock_irqsave(&ub->ub_lock, flags);
++ added_reserv = -skbc->poll_reserv;
++ err = ub_sock_makewreserv_locked(sk, bufid, bid2sid(bufid), size);
++ added_reserv += skbc->poll_reserv;
++ if (!err)
++ skbc->poll_reserv -= size;
++ spin_unlock_irqrestore(&ub->ub_lock, flags);
++
++ if (added_reserv)
++ charge_beancounter_notop(skbc->ub, bufid, added_reserv);
++
++ return err;
++}
++
++void ub_sock_ret_wreserv(struct sock *sk, int bufid,
++ unsigned long size, unsigned long ressize)
++{
++ struct sock_beancounter *skbc;
++ struct user_beancounter *ub;
++ unsigned long extra;
++ unsigned long flags;
++
++ if (!sock_has_ubc(sk))
++ return;
++
++ extra = 0;
++ skbc = sock_bc(sk);
++ for (ub = skbc->ub; ub->parent != NULL; ub = ub->parent);
++ spin_lock_irqsave(&ub->ub_lock, flags);
++ skbc->poll_reserv += size;
++ if (skbc->poll_reserv > ressize) {
++ extra = skbc->poll_reserv - ressize;
++ __uncharge_beancounter_locked(ub, bufid, extra);
++
++ if (skbc->ub_wcharged > skbc->poll_reserv - ressize)
++ skbc->ub_wcharged -= skbc->poll_reserv - ressize;
++ else
++ skbc->ub_wcharged = 0;
++ skbc->poll_reserv = ressize;
++ }
++
++ ub_tcp_snd_wakeup(ub);
++ spin_unlock_irqrestore(&ub->ub_lock, flags);
++
++ if (extra)
++ uncharge_beancounter_notop(skbc->ub, bufid, extra);
++}
++
++long ub_sock_wait_for_space(struct sock *sk, long timeo, unsigned long size)
++{
++ DECLARE_WAITQUEUE(wait, current);
++
++ add_wait_queue(sk->sk_sleep, &wait);
++ for (;;) {
++ if (signal_pending(current))
++ break;
++ set_current_state(TASK_INTERRUPTIBLE);
++ if (!ub_sock_make_wreserv(sk, UB_OTHERSOCKBUF, size))
++ break;
++
++ if (sk->sk_shutdown & SEND_SHUTDOWN)
++ break;
++ if (sk->sk_err)
++ break;
++ ub_sock_snd_queue_add(sk, UB_OTHERSOCKBUF, size);
++ timeo = schedule_timeout(timeo);
++ }
++ __set_current_state(TASK_RUNNING);
++ remove_wait_queue(sk->sk_sleep, &wait);
++ return timeo;
++}
++
++int ub_sock_makewres_other(struct sock *sk, unsigned long size)
++{
++ return ub_sock_make_wreserv(sk, UB_OTHERSOCKBUF, size);
++}
++
++int ub_sock_makewres_tcp(struct sock *sk, unsigned long size)
++{
++ return ub_sock_make_wreserv(sk, UB_TCPSNDBUF, size);
++}
++
++int ub_sock_getwres_other(struct sock *sk, unsigned long size)
++{
++ return ub_sock_get_wreserv(sk, UB_OTHERSOCKBUF, size);
++}
++
++int ub_sock_getwres_tcp(struct sock *sk, unsigned long size)
++{
++ return ub_sock_get_wreserv(sk, UB_TCPSNDBUF, size);
++}
++
++void ub_sock_retwres_other(struct sock *sk, unsigned long size,
++ unsigned long ressize)
++{
++ ub_sock_ret_wreserv(sk, UB_OTHERSOCKBUF, size, ressize);
++}
++
++void ub_sock_retwres_tcp(struct sock *sk, unsigned long size,
++ unsigned long ressize)
++{
++ ub_sock_ret_wreserv(sk, UB_TCPSNDBUF, size, ressize);
++}
++
++void ub_sock_sndqueueadd_other(struct sock *sk, unsigned long sz)
++{
++ ub_sock_snd_queue_add(sk, UB_OTHERSOCKBUF, sz);
++}
++
++void ub_sock_sndqueueadd_tcp(struct sock *sk, unsigned long sz)
++{
++ ub_sock_snd_queue_add(sk, UB_TCPSNDBUF, sz);
++}
++
++void ub_sock_sndqueuedel(struct sock *sk)
++{
++ struct sock_beancounter *skbc;
++ unsigned long flags;
++
++ if (!sock_has_ubc(sk))
++ return;
++ skbc = sock_bc(sk);
++
++ /* race with write_space callback of other socket */
++ spin_lock_irqsave(&skbc->ub->ub_lock, flags);
++ list_del_init(&skbc->ub_sock_list);
++ spin_unlock_irqrestore(&skbc->ub->ub_lock, flags);
++}
++
++/*
++ * UB_DGRAMRCVBUF
++ */
++
++int ub_sockrcvbuf_charge(struct sock *sk, struct sk_buff *skb)
++{
++ unsigned long chargesize;
++
++ if (!sock_has_ubc(sk))
++ return 0;
++
++ chargesize = skb_charge_fullsize(skb);
++ if (charge_beancounter(sock_bc(sk)->ub, UB_DGRAMRCVBUF,
++ chargesize, UB_HARD))
++ return -ENOMEM;
++
++ ub_skb_set_charge(skb, sk, chargesize, UB_DGRAMRCVBUF);
++ return 0;
++}
++
++EXPORT_SYMBOL(ub_sockrcvbuf_charge);
++
++static void ub_sockrcvbuf_uncharge(struct sk_buff *skb)
++{
++ uncharge_beancounter(skb_bc(skb)->ub, UB_DGRAMRCVBUF,
++ skb_bc(skb)->charged);
++ ub_skb_set_uncharge(skb);
++}
++
++/*
++ * UB_TCPRCVBUF
++ */
++static int charge_tcprcvbuf(struct sock *sk, struct sk_buff *skb,
++ enum severity strict)
++{
++ int retval;
++ unsigned long flags;
++ struct user_beancounter *ub;
++ unsigned long chargesize;
++
++ if (!sock_has_ubc(sk))
++ return 0;
++
++ /*
++ * Memory pressure reactions:
++ * 1) set UB_RMEM_KEEP (clearing UB_RMEM_EXPAND)
++ * 2) set UB_RMEM_SHRINK and tcp_clamp_window()
++ * tcp_collapse_queues() if rmem_alloc > rcvbuf
++ * 3) drop OFO, tcp_purge_ofo()
++ * 4) drop all.
++ * Currently, we do #2 and #3 at once (which means that current
++ * collapsing of OFO queue in tcp_collapse_queues() is a waste of time,
++ * for example...)
++ * On memory pressure we jump from #0 to #3, and when the pressure
++ * subsides, to #1.
++ */
++ retval = 0;
++ chargesize = skb_charge_fullsize(skb);
++
++ for (ub = sock_bc(sk)->ub; ub->parent != NULL; ub = ub->parent);
++ spin_lock_irqsave(&ub->ub_lock, flags);
++ ub->ub_parms[UB_TCPRCVBUF].held += chargesize;
++ if (ub->ub_parms[UB_TCPRCVBUF].held >
++ ub->ub_parms[UB_TCPRCVBUF].barrier &&
++ strict != UB_FORCE)
++ goto excess;
++ ub_adjust_maxheld(ub, UB_TCPRCVBUF);
++ spin_unlock_irqrestore(&ub->ub_lock, flags);
++
++out:
++ if (retval == 0) {
++ charge_beancounter_notop(sock_bc(sk)->ub, UB_TCPRCVBUF,
++ chargesize);
++ ub_skb_set_charge(skb, sk, chargesize, UB_TCPRCVBUF);
++ }
++ return retval;
++
++excess:
++ ub->ub_rmem_pressure = UB_RMEM_SHRINK;
++ if (strict == UB_HARD)
++ retval = -ENOMEM;
++ if (ub->ub_parms[UB_TCPRCVBUF].held > ub->ub_parms[UB_TCPRCVBUF].limit)
++ retval = -ENOMEM;
++ /*
++ * We try to leave numsock*maxadvmss as a reserve for sockets not
++ * queueing any data yet (if the difference between the barrier and the
++ * limit is enough for this reserve).
++ */
++ if (ub->ub_parms[UB_TCPRCVBUF].held +
++ ub->ub_parms[UB_NUMTCPSOCK].limit * ub->ub_maxadvmss
++ > ub->ub_parms[UB_TCPRCVBUF].limit &&
++ atomic_read(&sk->sk_rmem_alloc))
++ retval = -ENOMEM;
++ if (retval) {
++ ub->ub_parms[UB_TCPRCVBUF].held -= chargesize;
++ ub->ub_parms[UB_TCPRCVBUF].failcnt++;
++ }
++ ub_adjust_maxheld(ub, UB_TCPRCVBUF);
++ spin_unlock_irqrestore(&ub->ub_lock, flags);
++ goto out;
++}
++
++int ub_tcprcvbuf_charge(struct sock *sk, struct sk_buff *skb)
++{
++ return charge_tcprcvbuf(sk, skb, UB_HARD);
++}
++
++int ub_tcprcvbuf_charge_forced(struct sock *sk, struct sk_buff *skb)
++{
++ return charge_tcprcvbuf(sk, skb, UB_FORCE);
++}
++EXPORT_SYMBOL(ub_tcprcvbuf_charge_forced);
++
++static void ub_tcprcvbuf_uncharge(struct sk_buff *skb)
++{
++ unsigned long flags;
++ unsigned long held, bar;
++ int prev_pres;
++ struct user_beancounter *ub;
++
++ for (ub = skb_bc(skb)->ub; ub->parent != NULL; ub = ub->parent);
++ spin_lock_irqsave(&ub->ub_lock, flags);
++ if (ub->ub_parms[UB_TCPRCVBUF].held < skb_bc(skb)->charged) {
++ printk(KERN_ERR "Uncharging %d for tcprcvbuf of %p with %lu\n",
++ skb_bc(skb)->charged,
++ ub, ub->ub_parms[UB_TCPRCVBUF].held);
++		/* clamp to avoid counter underflow */
++ skb_bc(skb)->charged = ub->ub_parms[UB_TCPRCVBUF].held;
++ }
++ ub->ub_parms[UB_TCPRCVBUF].held -= skb_bc(skb)->charged;
++ held = ub->ub_parms[UB_TCPRCVBUF].held;
++ bar = ub->ub_parms[UB_TCPRCVBUF].barrier;
++ prev_pres = ub->ub_rmem_pressure;
++ if (held <= bar - (bar >> 2))
++ ub->ub_rmem_pressure = UB_RMEM_EXPAND;
++ else if (held <= bar)
++ ub->ub_rmem_pressure = UB_RMEM_KEEP;
++ spin_unlock_irqrestore(&ub->ub_lock, flags);
++
++ uncharge_beancounter_notop(skb_bc(skb)->ub, UB_TCPRCVBUF,
++ skb_bc(skb)->charged);
++ ub_skb_set_uncharge(skb);
++}
++
++
++/*
++ * UB_OTHERSOCKBUF
++ */
++
++static void ub_socksndbuf_uncharge(struct sk_buff *skb)
++{
++ unsigned long flags;
++ struct user_beancounter *ub, *cub;
++ struct sock_beancounter *sk_bc;
++
++ /* resource was set. no check for ub required */
++ cub = skb_bc(skb)->ub;
++ for (ub = cub; ub->parent != NULL; ub = ub->parent);
++ skb_bc(skb)->ub = NULL;
++ if (skb->sk != NULL)
++ sk_bc = sock_bc(skb->sk);
++ else
++ sk_bc = NULL;
++ spin_lock_irqsave(&ub->ub_lock, flags);
++ __uncharge_sockbuf(sk_bc, ub, UB_OTHERSOCKBUF,
++ skb_bc(skb)->charged);
++ ub_sock_snd_wakeup(ub);
++ spin_unlock_irqrestore(&ub->ub_lock, flags);
++
++ uncharge_beancounter_notop(cub, UB_OTHERSOCKBUF, skb_bc(skb)->charged);
++ ub_skb_set_uncharge(skb);
++}
++
++static void ub_tcpsndbuf_uncharge(struct sk_buff *skb)
++{
++ unsigned long flags;
++ struct user_beancounter *ub, *cub;
++
++ /* resource can be not set, called manually */
++ cub = skb_bc(skb)->ub;
++ if (cub == NULL)
++ return;
++ for (ub = cub; ub->parent != NULL; ub = ub->parent);
++ skb_bc(skb)->ub = NULL;
++ spin_lock_irqsave(&ub->ub_lock, flags);
++ __uncharge_sockbuf(sock_bc(skb->sk), ub, UB_TCPSNDBUF,
++ skb_bc(skb)->charged);
++ ub_tcp_snd_wakeup(ub);
++ spin_unlock_irqrestore(&ub->ub_lock, flags);
++
++ uncharge_beancounter_notop(cub, UB_TCPSNDBUF, skb_bc(skb)->charged);
++ ub_skb_set_uncharge(skb);
++}
++
++void ub_skb_uncharge(struct sk_buff *skb)
++{
++ switch (skb_bc(skb)->resource) {
++ case UB_TCPSNDBUF:
++ ub_tcpsndbuf_uncharge(skb);
++ break;
++ case UB_TCPRCVBUF:
++ ub_tcprcvbuf_uncharge(skb);
++ break;
++ case UB_DGRAMRCVBUF:
++ ub_sockrcvbuf_uncharge(skb);
++ break;
++ case UB_OTHERSOCKBUF:
++ ub_socksndbuf_uncharge(skb);
++ break;
++ }
++}
++
++EXPORT_SYMBOL(ub_skb_uncharge); /* due to skb_orphan()/conntracks */
++
++/*
++ * TCP send buffers accounting. Paged part
++ */
++int ub_sock_tcp_chargepage(struct sock *sk)
++{
++ struct sock_beancounter *skbc;
++ struct user_beancounter *ub;
++ unsigned long added;
++ unsigned long flags;
++ int err;
++
++ if (!sock_has_ubc(sk))
++ return 0;
++
++ skbc = sock_bc(sk);
++
++ for (ub = skbc->ub; ub->parent != NULL; ub = ub->parent);
++ spin_lock_irqsave(&ub->ub_lock, flags);
++ /* Try to charge full page */
++ err = ub_sock_makewreserv_locked(sk, UB_TCPSNDBUF, UB_NUMTCPSOCK,
++ PAGE_SIZE);
++ if (err == 0) {
++ skbc->poll_reserv -= PAGE_SIZE;
++ spin_unlock_irqrestore(&ub->ub_lock, flags);
++ charge_beancounter_notop(skbc->ub, UB_TCPSNDBUF, PAGE_SIZE);
++ return 0;
++ }
++
++	/* Try to charge enough of the page to satisfy sys_select.  The
++	   possible overdraft for the rest of the page is generally better
++	   than requesting the full page in tcp_poll.  This should not
++	   happen frequently. Den */
++ added = -skbc->poll_reserv;
++ err = ub_sock_makewreserv_locked(sk, UB_TCPSNDBUF, UB_NUMTCPSOCK,
++ SOCK_MIN_UBCSPACE);
++ if (err < 0) {
++ spin_unlock_irqrestore(&ub->ub_lock, flags);
++ return err;
++ }
++ __charge_beancounter_locked(ub, UB_TCPSNDBUF,
++ PAGE_SIZE - skbc->poll_reserv,
++ UB_FORCE);
++ added += PAGE_SIZE;
++ skbc->poll_reserv = 0;
++ spin_unlock_irqrestore(&ub->ub_lock, flags);
++
++ charge_beancounter_notop(skbc->ub, UB_TCPSNDBUF, added);
++
++ return 0;
++
++}
++
++void ub_sock_tcp_detachpage(struct sock *sk)
++{
++ struct sk_buff *skb;
++
++ if (!sock_has_ubc(sk))
++ return;
++
++	/* The page has just been detached from the socket.  The last skb
++	   in the queue with a paged part holds a reference to it */
++ skb = skb_peek_tail(&sk->sk_write_queue);
++ if (skb == NULL) {
++		/* If the queue is empty, all data has been sent and the page
++		   is about to be freed */
++ uncharge_beancounter(sock_bc(sk)->ub, UB_TCPSNDBUF, PAGE_SIZE);
++ return;
++ }
++	/* The last skb is a good approximation for the last skb with a paged part */
++ skb_bc(skb)->charged += PAGE_SIZE;
++}
++
++static int charge_tcpsndbuf(struct sock *sk, struct sk_buff *skb,
++ enum severity strict)
++{
++ int ret;
++ unsigned long chargesize;
++
++ if (!sock_has_ubc(sk))
++ return 0;
++
++ chargesize = skb_charge_fullsize(skb);
++ ret = charge_beancounter(sock_bc(sk)->ub, UB_TCPSNDBUF, chargesize,
++ strict);
++ if (ret < 0)
++ return ret;
++ ub_skb_set_charge(skb, sk, chargesize, UB_TCPSNDBUF);
++ sock_bc(sk)->ub_wcharged += chargesize;
++ return ret;
++}
++
++int ub_tcpsndbuf_charge(struct sock *sk, struct sk_buff *skb)
++{
++ return charge_tcpsndbuf(sk, skb, UB_HARD);
++}
++
++int ub_tcpsndbuf_charge_forced(struct sock *sk, struct sk_buff *skb)
++{
++ return charge_tcpsndbuf(sk, skb, UB_FORCE);
++}
++EXPORT_SYMBOL(ub_tcpsndbuf_charge_forced);
++
++/*
++ * Initialization stuff
++ */
++int __init skbc_cache_init(void)
++{
++ return 0;
++}
+diff -upr linux-2.6.16.orig/kernel/ub/ub_page_bc.c linux-2.6.16-026test015/kernel/ub/ub_page_bc.c
+--- linux-2.6.16.orig/kernel/ub/ub_page_bc.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/ub/ub_page_bc.c 2006-07-04 14:41:37.000000000 +0400
+@@ -0,0 +1,439 @@
++/*
++ * kernel/ub/ub_page_bc.c
++ *
++ * Copyright (C) 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/spinlock.h>
++#include <linux/slab.h>
++#include <linux/mm.h>
++#include <linux/gfp.h>
++#include <linux/vmalloc.h>
++
++#include <ub/beancounter.h>
++#include <ub/ub_hash.h>
++#include <ub/ub_vmpages.h>
++#include <ub/ub_page.h>
++
++static kmem_cache_t *pb_cachep;
++static spinlock_t pb_lock = SPIN_LOCK_UNLOCKED;
++static struct page_beancounter **pb_hash_table;
++static unsigned int pb_hash_mask;
++
++/*
++ * Auxiliary stuff
++ */
++
++static inline struct page_beancounter *next_page_pb(struct page_beancounter *p)
++{
++ return list_entry(p->page_list.next, struct page_beancounter,
++ page_list);
++}
++
++static inline struct page_beancounter *prev_page_pb(struct page_beancounter *p)
++{
++ return list_entry(p->page_list.prev, struct page_beancounter,
++ page_list);
++}
++
++/*
++ * Held pages manipulation
++ */
++static inline void set_held_pages(struct user_beancounter *bc)
++{
++ /* all three depend on ub_held_pages */
++ __ub_update_physpages(bc);
++ __ub_update_oomguarpages(bc);
++ __ub_update_privvm(bc);
++}
++
++static inline void do_dec_held_pages(struct user_beancounter *ub, int value)
++{
++ unsigned long flags;
++
++ spin_lock_irqsave(&ub->ub_lock, flags);
++ ub->ub_held_pages -= value;
++ set_held_pages(ub);
++ spin_unlock_irqrestore(&ub->ub_lock, flags);
++}
++
++static void dec_held_pages(struct user_beancounter *ub, int value)
++{
++ for (; ub != NULL; ub = ub->parent)
++ do_dec_held_pages(ub, value);
++}
++
++static inline void do_inc_held_pages(struct user_beancounter *ub, int value)
++{
++ unsigned long flags;
++
++ spin_lock_irqsave(&ub->ub_lock, flags);
++ ub->ub_held_pages += value;
++ set_held_pages(ub);
++ spin_unlock_irqrestore(&ub->ub_lock, flags);
++}
++
++static void inc_held_pages(struct user_beancounter *ub, int value)
++{
++ for (; ub != NULL; ub = ub->parent)
++ do_inc_held_pages(ub, value);
++}
++
++/*
++ * Alloc - free
++ */
++
++inline int pb_alloc(struct page_beancounter **pbc)
++{
++ *pbc = kmem_cache_alloc(pb_cachep, GFP_KERNEL);
++ if (*pbc != NULL) {
++ (*pbc)->next_hash = NULL;
++ (*pbc)->pb_magic = PB_MAGIC;
++ }
++ return (*pbc == NULL);
++}
++
++inline void pb_free(struct page_beancounter **pb)
++{
++ if (*pb != NULL) {
++ kmem_cache_free(pb_cachep, *pb);
++ *pb = NULL;
++ }
++}
++
++void pb_free_list(struct page_beancounter **p_pb)
++{
++ struct page_beancounter *list, *pb;
++
++ list = *p_pb;
++ if (list == PBC_COPY_SAME)
++ return;
++
++ while (list) {
++ pb = list;
++ list = list->next_hash;
++ pb_free(&pb);
++ }
++ *p_pb = NULL;
++}
++
++/*
++ * head -> <new objs> -> <old objs> -> ...
++ */
++static int __alloc_list(struct page_beancounter **head, int num)
++{
++ struct page_beancounter *pb;
++
++ while (num > 0) {
++ if (pb_alloc(&pb))
++ return -1;
++ pb->next_hash = *head;
++ *head = pb;
++ num--;
++ }
++
++ return num;
++}
++
++/*
++ * Ensure that the list contains at least num elements.
++ * p_pb points to an initialized list, which may be of zero length.
++ *
++ * mm->page_table_lock should be held
++ */
++int pb_alloc_list(struct page_beancounter **p_pb, int num)
++{
++ struct page_beancounter *list;
++
++ for (list = *p_pb; list != NULL && num; list = list->next_hash, num--);
++ if (!num)
++ return 0;
++
++ /*
++ * *p_pb(after) *p_pb (before)
++ * \ \
++ * <new objs> -...-> <old objs> -> ...
++ */
++ if (__alloc_list(p_pb, num) < 0)
++ goto nomem;
++ return 0;
++
++nomem:
++ pb_free_list(p_pb);
++ return -ENOMEM;
++}
++
++/*
++ * Allocates a page_beancounter for each
++ * user_beancounter in a hash
++ */
++int pb_alloc_all(struct page_beancounter **pbs)
++{
++ int i, need_alloc;
++ unsigned long flags;
++ struct user_beancounter *ub;
++
++ spin_lock_irqsave(&ub_hash_lock, flags);
++ need_alloc = 0;
++ for_each_beancounter(i, ub)
++ need_alloc++;
++ spin_unlock_irqrestore(&ub_hash_lock, flags);
++
++ if (!__alloc_list(pbs, need_alloc))
++ return 0;
++
++ pb_free_list(pbs);
++ return -ENOMEM;
++}
++
++/*
++ * Hash routines
++ */
++
++static inline int pb_hash(struct user_beancounter *ub, struct page *page)
++{
++ return (page_to_pfn(page) + (ub->ub_uid << 10)) & pb_hash_mask;
++}
++
++/* pb_lock should be held */
++static inline void insert_pb(struct page_beancounter *p, struct page *page,
++ struct user_beancounter *ub, int hash)
++{
++ p->page = page;
++ p->ub = get_beancounter(ub);
++ p->next_hash = pb_hash_table[hash];
++ pb_hash_table[hash] = p;
++#ifdef CONFIG_UBC_DEBUG_KMEM
++ ub->ub_stat[smp_processor_id()].pbcs++;
++#endif
++}
++
++/*
++ * Heart
++ */
++
++static int __pb_dup_ref(struct page *page, struct user_beancounter *bc,
++ int hash)
++{
++ struct page_beancounter *p;
++
++ for (p = pb_hash_table[hash];
++ p != NULL && (p->page != page || p->ub != bc);
++ p = p->next_hash);
++ if (p == NULL)
++ return -1;
++
++ PB_COUNT_INC(p->refcount);
++ return 0;
++}
++
++static void __pb_add_ref(struct page *page, struct user_beancounter *bc,
++ struct page_beancounter **ppb, int hash)
++{
++ struct page_beancounter *head, *p;
++ int shift;
++
++ p = *ppb;
++ *ppb = p->next_hash;
++
++ insert_pb(p, page, bc, hash);
++ head = page_pbc(page);
++
++ if (head != NULL) {
++ /*
++ * Move the first element to the end of the list.
++ * List head (pb_head) is set to the next entry.
++ * Note that this code works even if head is the only element
++ * on the list (because it's cyclic).
++ */
++ BUG_ON(head->pb_magic != PB_MAGIC);
++ page_pbc(page) = next_page_pb(head);
++ PB_SHIFT_INC(head->refcount);
++ shift = PB_SHIFT_GET(head->refcount);
++ /*
++ * Update user beancounter, the share of head has been changed.
++ * Note that the shift counter is taken after increment.
++ */
++ dec_held_pages(head->ub, UB_PAGE_WEIGHT >> shift);
++ /* add the new page beancounter to the end of the list */
++ list_add_tail(&p->page_list, &page_pbc(page)->page_list);
++ } else {
++ page_pbc(page) = p;
++ shift = 0;
++ INIT_LIST_HEAD(&p->page_list);
++ }
++
++ p->refcount = PB_REFCOUNT_MAKE(shift, 1);
++ /* update user beancounter for the new page beancounter */
++ inc_held_pages(bc, UB_PAGE_WEIGHT >> shift);
++}
++
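++/*
++ * Account a new mapping of @page by @mm: reuse the existing page
++ * beancounter for this (page, beancounter) pair if there is one,
++ * otherwise take a preallocated entry from @p_pb.
++ */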
++void pb_add_ref(struct page *page, struct mm_struct *mm,
++ struct page_beancounter **p_pb)
++{
++ int hash;
++ struct user_beancounter *bc;
++
++ bc = mm->mm_ub;
++ if (bc == NULL)
++ return;
++
++ if (!PageAnon(page) && is_shmem_mapping(page->mapping))
++ return;
++
++ hash = pb_hash(bc, page);
++
++ spin_lock(&pb_lock);
++ if (__pb_dup_ref(page, bc, hash))
++ __pb_add_ref(page, bc, p_pb, hash);
++ spin_unlock(&pb_lock);
++}
++
++void pb_dup_ref(struct page *page, struct mm_struct *mm,
++ struct page_beancounter **p_pb)
++{
++ int hash;
++ struct user_beancounter *bc;
++
++ bc = mm->mm_ub;
++ if (bc == NULL)
++ return;
++
++ if (!PageAnon(page) && is_shmem_mapping(page->mapping))
++ return;
++
++ hash = pb_hash(bc, page);
++
++ spin_lock(&pb_lock);
++ if (page_pbc(page) == NULL)
++ /*
++ * pages like ZERO_PAGE must not be accounted in pbc
++ * so on fork we just skip them
++ */
++ goto out_unlock;
++
++ if (unlikely(*p_pb != PBC_COPY_SAME))
++ __pb_add_ref(page, bc, p_pb, hash);
++ else if (unlikely(__pb_dup_ref(page, bc, hash)))
++ WARN_ON(1);
++out_unlock:
++ spin_unlock(&pb_lock);
++}
++
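++/*
++ * Drop one reference of @mm's beancounter on @page; when it was the
++ * last one, detach the page beancounter and redistribute the page
++ * weight among the remaining ones.
++ */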
++void pb_remove_ref(struct page *page, struct mm_struct *mm)
++{
++ int hash;
++ struct user_beancounter *bc;
++ struct page_beancounter *p, **q;
++ int shift, shiftt;
++
++ bc = mm->mm_ub;
++ if (bc == NULL)
++ return;
++
++ if (!PageAnon(page) && is_shmem_mapping(page->mapping))
++ return;
++
++ hash = pb_hash(bc, page);
++
++ spin_lock(&pb_lock);
++ BUG_ON(page_pbc(page) != NULL && page_pbc(page)->pb_magic != PB_MAGIC);
++ for (q = pb_hash_table + hash, p = *q;
++ p != NULL && (p->page != page || p->ub != bc);
++ q = &p->next_hash, p = *q);
++ if (p == NULL)
++ goto out_unlock;
++
++ PB_COUNT_DEC(p->refcount);
++ if (PB_COUNT_GET(p->refcount))
++ /*
++ * More references from the same user beancounter exist.
++ * Nothing needs to be done.
++ */
++ goto out_unlock;
++
++ /* remove from the hash list */
++ *q = p->next_hash;
++
++ shift = PB_SHIFT_GET(p->refcount);
++
++ dec_held_pages(p->ub, UB_PAGE_WEIGHT >> shift);
++
++ if (page_pbc(page) == p) {
++ if (list_empty(&p->page_list))
++ goto out_free;
++ page_pbc(page) = next_page_pb(p);
++ }
++ list_del(&p->page_list);
++ put_beancounter(p->ub);
++#ifdef CONFIG_UBC_DEBUG_KMEM
++ p->ub->ub_stat[smp_processor_id()].pbcs--;
++#endif
++ pb_free(&p);
++
++ /* Now balance the list. Move the tail and adjust its shift counter. */
++ p = prev_page_pb(page_pbc(page));
++ shiftt = PB_SHIFT_GET(p->refcount);
++ page_pbc(page) = p;
++ PB_SHIFT_DEC(p->refcount);
++
++ inc_held_pages(p->ub, UB_PAGE_WEIGHT >> shiftt);
++
++ /*
++ * If the shift counter of the moved beancounter is different from the
++ * removed one's, repeat the procedure for one more tail beancounter
++ */
++ if (shiftt > shift) {
++ p = prev_page_pb(page_pbc(page));
++ page_pbc(page) = p;
++ PB_SHIFT_DEC(p->refcount);
++ inc_held_pages(p->ub, UB_PAGE_WEIGHT >> shiftt);
++ }
++ spin_unlock(&pb_lock);
++ return;
++
++out_free:
++ page_pbc(page) = NULL;
++#ifdef CONFIG_UBC_DEBUG_KMEM
++ p->ub->ub_stat[smp_processor_id()].pbcs--;
++#endif
++ put_beancounter(p->ub);
++ pb_free(&p);
++out_unlock:
++ spin_unlock(&pb_lock);
++ return;
++}
++
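++/*
++ * Return the beancounter @page is currently accounted to (with a
++ * reference taken), or ERR_PTR(-EINVAL) if the page is not tracked.
++ */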
++struct user_beancounter *pb_grab_page_ub(struct page *page)
++{
++ struct page_beancounter *pb;
++ struct user_beancounter *ub;
++
++ spin_lock(&pb_lock);
++ pb = page_pbc(page);
++ ub = (pb == NULL ? ERR_PTR(-EINVAL) :
++ get_beancounter(pb->ub));
++ spin_unlock(&pb_lock);
++ return ub;
++}
++
++void __init ub_init_pbc(void)
++{
++ unsigned long hash_size;
++
++ pb_cachep = kmem_cache_create("page_beancounter",
++ sizeof(struct page_beancounter), 0,
++ SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL, NULL);
++ hash_size = num_physpages >> 2;
++ for (pb_hash_mask = 1;
++ (hash_size & pb_hash_mask) != hash_size;
++ pb_hash_mask = (pb_hash_mask << 1) + 1);
++ hash_size = pb_hash_mask + 1;
++ printk(KERN_INFO "Page beancounter hash is %lu entries.\n", hash_size);
++ pb_hash_table = vmalloc(hash_size * sizeof(struct page_beancounter *));
++ memset(pb_hash_table, 0, hash_size * sizeof(struct page_beancounter *));
++}
+diff -upr linux-2.6.16.orig/kernel/ub/ub_pages.c linux-2.6.16-026test015/kernel/ub/ub_pages.c
+--- linux-2.6.16.orig/kernel/ub/ub_pages.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/ub/ub_pages.c 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,530 @@
++/*
++ * kernel/ub/ub_pages.c
++ *
++ * Copyright (C) 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/mm.h>
++#include <linux/highmem.h>
++#include <linux/virtinfo.h>
++#include <linux/module.h>
++#include <linux/shmem_fs.h>
++#include <linux/vmalloc.h>
++
++#include <asm/pgtable.h>
++#include <asm/page.h>
++
++#include <ub/beancounter.h>
++#include <ub/ub_vmpages.h>
++
++void warn_bad_rss(struct vm_area_struct *vma, unsigned long freed)
++{
++ static struct ub_rate_info ri = {
++ .burst = 10,
++ .interval = 40 * HZ,
++ };
++ struct user_beancounter *ub;
++ char ubuid[64] = "No UB";
++ unsigned long vmrss;
++
++ if (!ub_ratelimit(&ri))
++ return;
++
++ ub = vma->vm_mm->mm_ub;
++ if (ub)
++ print_ub_uid(ub, ubuid, sizeof(ubuid));
++
++ vmrss = get_vma_rss(vma) + freed;
++ printk(KERN_WARNING
++ "%s vm_rss: process pid %d comm %.20s flags %lx\n"
++ "vma %p/%p rss %lu/%lu freed %lu\n"
++ "flags %lx, ub %s\n",
++ vmrss > freed ? "Positive" : "Negative",
++ current->pid, current->comm, current->flags,
++ vma, vma->vm_mm, vmrss, vma_pages(vma), freed,
++ vma->vm_flags, ubuid);
++ dump_stack();
++}
++
++static inline unsigned long pages_in_pte_range(struct vm_area_struct *vma,
++ pmd_t *pmd, unsigned long addr, unsigned long end,
++ unsigned long *ret)
++{
++ pte_t *pte;
++ spinlock_t *ptl;
++
++ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
++ do {
++ if (!pte_none(*pte) && pte_present(*pte))
++ (*ret)++;
++ } while (pte++, addr += PAGE_SIZE, (addr != end));
++ pte_unmap_unlock(pte - 1, ptl);
++
++ return addr;
++}
++
++static inline unsigned long pages_in_pmd_range(struct vm_area_struct *vma,
++ pud_t *pud, unsigned long addr, unsigned long end,
++ unsigned long *ret)
++{
++ pmd_t *pmd;
++ unsigned long next;
++
++ pmd = pmd_offset(pud, addr);
++ do {
++ next = pmd_addr_end(addr, end);
++ if (pmd_none_or_clear_bad(pmd))
++ continue;
++ next = pages_in_pte_range(vma, pmd, addr, next, ret);
++ } while (pmd++, addr = next, (addr != end));
++
++ return addr;
++}
++
++static inline unsigned long pages_in_pud_range(struct vm_area_struct *vma,
++ pgd_t *pgd, unsigned long addr, unsigned long end,
++ unsigned long *ret)
++{
++ pud_t *pud;
++ unsigned long next;
++
++ pud = pud_offset(pgd, addr);
++ do {
++ next = pud_addr_end(addr, end);
++ if (pud_none_or_clear_bad(pud))
++ continue;
++ next = pages_in_pmd_range(vma, pud, addr, next, ret);
++ } while (pud++, addr = next, (addr != end));
++
++ return addr;
++}
++
++unsigned long pages_in_vma_range(struct vm_area_struct *vma,
++ unsigned long addr, unsigned long end)
++{
++ pgd_t *pgd;
++ unsigned long next;
++ unsigned long ret;
++
++ ret = 0;
++ BUG_ON(addr >= end);
++ pgd = pgd_offset(vma->vm_mm, addr);
++ do {
++ next = pgd_addr_end(addr, end);
++ if (pgd_none_or_clear_bad(pgd))
++ continue;
++ next = pages_in_pud_range(vma, pgd, addr, next, &ret);
++ } while (pgd++, addr = next, (addr != end));
++ return ret;
++}
++
++void fastcall __ub_update_physpages(struct user_beancounter *ub)
++{
++ ub->ub_parms[UB_PHYSPAGES].held = ub->ub_tmpfs_respages
++ + (ub->ub_held_pages >> UB_PAGE_WEIGHT_SHIFT);
++ ub_adjust_maxheld(ub, UB_PHYSPAGES);
++}
++
++void fastcall __ub_update_oomguarpages(struct user_beancounter *ub)
++{
++ ub->ub_parms[UB_OOMGUARPAGES].held =
++ ub->ub_parms[UB_PHYSPAGES].held + ub->ub_swap_pages;
++ ub_adjust_maxheld(ub, UB_OOMGUARPAGES);
++}
++
++void fastcall __ub_update_privvm(struct user_beancounter *ub)
++{
++ ub->ub_parms[UB_PRIVVMPAGES].held =
++ (ub->ub_held_pages >> UB_PAGE_WEIGHT_SHIFT)
++ + ub->ub_unused_privvmpages
++ + ub->ub_parms[UB_SHMPAGES].held;
++ ub_adjust_maxheld(ub, UB_PRIVVMPAGES);
++}
++
++static inline int __charge_privvm_locked(struct user_beancounter *ub,
++ unsigned long s, enum severity strict)
++{
++ if (__charge_beancounter_locked(ub, UB_PRIVVMPAGES, s, strict) < 0)
++ return -ENOMEM;
++
++ ub->ub_unused_privvmpages += s;
++ return 0;
++}
++
++static void __unused_privvm_dec_locked(struct user_beancounter *ub,
++ long size)
++{
++ /* catch possible overflow */
++ if (ub->ub_unused_privvmpages < size) {
++ uncharge_warn(ub, UB_UNUSEDPRIVVM,
++ size, ub->ub_unused_privvmpages);
++ size = ub->ub_unused_privvmpages;
++ }
++ ub->ub_unused_privvmpages -= size;
++ __ub_update_privvm(ub);
++}
++
++void __ub_unused_privvm_dec(struct mm_struct *mm, long size)
++{
++ unsigned long flags;
++ struct user_beancounter *ub;
++
++ ub = mm->mm_ub;
++ if (ub == NULL)
++ return;
++
++ for (; ub->parent != NULL; ub = ub->parent);
++ spin_lock_irqsave(&ub->ub_lock, flags);
++ __unused_privvm_dec_locked(ub, size);
++ spin_unlock_irqrestore(&ub->ub_lock, flags);
++}
++
++void ub_unused_privvm_sub(struct mm_struct *mm,
++ struct vm_area_struct *vma, unsigned long count)
++{
++ if (VM_UB_PRIVATE(vma->vm_flags, vma->vm_file))
++ __ub_unused_privvm_dec(mm, count);
++}
++
++void ub_unused_privvm_add(struct mm_struct *mm,
++ struct vm_area_struct *vma, unsigned long size)
++{
++ unsigned long flags;
++ struct user_beancounter *ub;
++
++ ub = mm->mm_ub;
++ if (ub == NULL || !VM_UB_PRIVATE(vma->vm_flags, vma->vm_file))
++ return;
++
++ for (; ub->parent != NULL; ub = ub->parent);
++ spin_lock_irqsave(&ub->ub_lock, flags);
++ ub->ub_unused_privvmpages += size;
++ spin_unlock_irqrestore(&ub->ub_lock, flags);
++}
++
++int ub_protected_charge(struct mm_struct *mm, unsigned long size,
++ unsigned long newflags, struct vm_area_struct *vma)
++{
++ unsigned long flags;
++ struct file *file;
++ struct user_beancounter *ub;
++
++ ub = mm->mm_ub;
++ if (ub == NULL)
++ return PRIVVM_NO_CHARGE;
++
++ flags = vma->vm_flags;
++ if (!((newflags ^ flags) & VM_WRITE))
++ return PRIVVM_NO_CHARGE;
++
++ file = vma->vm_file;
++ if (!VM_UB_PRIVATE(newflags | VM_WRITE, file))
++ return PRIVVM_NO_CHARGE;
++
++ if (flags & VM_WRITE)
++ return PRIVVM_TO_SHARED;
++
++ for (; ub->parent != NULL; ub = ub->parent);
++ spin_lock_irqsave(&ub->ub_lock, flags);
++ if (__charge_privvm_locked(ub, size, UB_SOFT) < 0)
++ goto err;
++ spin_unlock_irqrestore(&ub->ub_lock, flags);
++ return PRIVVM_TO_PRIVATE;
++
++err:
++ spin_unlock_irqrestore(&ub->ub_lock, flags);
++ return PRIVVM_ERROR;
++}
++
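++/*
++ * Charge a new mapping of @size bytes: locked pages for VM_LOCKED
++ * mappings and privvmpages for private ones.  Returns 0 on success,
++ * -EINVAL for oversized requests or -ENOMEM on failure.
++ */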
++int ub_memory_charge(struct mm_struct *mm, unsigned long size,
++ unsigned vm_flags, struct file *vm_file, int sv)
++{
++ struct user_beancounter *ub, *ubl;
++ unsigned long flags;
++
++ ub = mm->mm_ub;
++ if (ub == NULL)
++ return 0;
++
++ size >>= PAGE_SHIFT;
++ if (size > UB_MAXVALUE)
++ return -EINVAL;
++
++ BUG_ON(sv != UB_SOFT && sv != UB_HARD);
++
++ if (vm_flags & VM_LOCKED) {
++ if (charge_beancounter(ub, UB_LOCKEDPAGES, size, sv))
++ goto out_err;
++ }
++ if (VM_UB_PRIVATE(vm_flags, vm_file)) {
++ for (ubl = ub; ubl->parent != NULL; ubl = ubl->parent);
++ spin_lock_irqsave(&ubl->ub_lock, flags);
++ if (__charge_privvm_locked(ubl, size, sv))
++ goto out_private;
++ spin_unlock_irqrestore(&ubl->ub_lock, flags);
++ }
++ return 0;
++
++out_private:
++ spin_unlock_irqrestore(&ubl->ub_lock, flags);
++ if (vm_flags & VM_LOCKED)
++ uncharge_beancounter(ub, UB_LOCKEDPAGES, size);
++out_err:
++ return -ENOMEM;
++}
++
++void ub_memory_uncharge(struct mm_struct *mm, unsigned long size,
++ unsigned vm_flags, struct file *vm_file)
++{
++ struct user_beancounter *ub;
++ unsigned long flags;
++
++ ub = mm->mm_ub;
++ if (ub == NULL)
++ return;
++
++ size >>= PAGE_SHIFT;
++
++ if (vm_flags & VM_LOCKED)
++ uncharge_beancounter(ub, UB_LOCKEDPAGES, size);
++ if (VM_UB_PRIVATE(vm_flags, vm_file)) {
++ for (; ub->parent != NULL; ub = ub->parent);
++ spin_lock_irqsave(&ub->ub_lock, flags);
++ __unused_privvm_dec_locked(ub, size);
++ spin_unlock_irqrestore(&ub->ub_lock, flags);
++ }
++}
++
++int ub_locked_charge(struct mm_struct *mm, unsigned long size)
++{
++ struct user_beancounter *ub;
++
++ ub = mm->mm_ub;
++ if (ub == NULL)
++ return 0;
++
++ return charge_beancounter(ub, UB_LOCKEDPAGES,
++ size >> PAGE_SHIFT, UB_HARD);
++}
++
++void ub_locked_uncharge(struct mm_struct *mm, unsigned long size)
++{
++ struct user_beancounter *ub;
++
++ ub = mm->mm_ub;
++ if (ub == NULL)
++ return;
++
++ uncharge_beancounter(ub, UB_LOCKEDPAGES, size >> PAGE_SHIFT);
++}
++
++int ub_lockedshm_charge(struct shmem_inode_info *shi, unsigned long size)
++{
++ struct user_beancounter *ub;
++
++ ub = shi->shmi_ub;
++ if (ub == NULL)
++ return 0;
++
++ return charge_beancounter(ub, UB_LOCKEDPAGES,
++ size >> PAGE_SHIFT, UB_HARD);
++}
++
++void ub_lockedshm_uncharge(struct shmem_inode_info *shi, unsigned long size)
++{
++ struct user_beancounter *ub;
++
++ ub = shi->shmi_ub;
++ if (ub == NULL)
++ return;
++
++ uncharge_beancounter(ub, UB_LOCKEDPAGES, size >> PAGE_SHIFT);
++}
++
++
++static inline void do_ub_tmpfs_respages_inc(struct user_beancounter *ub)
++{
++ unsigned long flags;
++
++ spin_lock_irqsave(&ub->ub_lock, flags);
++ ub->ub_tmpfs_respages++;
++ __ub_update_physpages(ub);
++ __ub_update_oomguarpages(ub);
++ spin_unlock_irqrestore(&ub->ub_lock, flags);
++}
++
++void ub_tmpfs_respages_inc(struct shmem_inode_info *shi)
++{
++ struct user_beancounter *ub;
++
++ for (ub = shi->shmi_ub; ub != NULL; ub = ub->parent)
++ do_ub_tmpfs_respages_inc(ub);
++}
++
++static inline void do_ub_tmpfs_respages_sub(struct user_beancounter *ub,
++ unsigned long size)
++{
++ unsigned long flags;
++
++ spin_lock_irqsave(&ub->ub_lock, flags);
++ /* catch possible overflow */
++ if (ub->ub_tmpfs_respages < size) {
++ uncharge_warn(ub, UB_TMPFSPAGES,
++ size, ub->ub_tmpfs_respages);
++ size = ub->ub_tmpfs_respages;
++ }
++ ub->ub_tmpfs_respages -= size;
++	/* update the derived values */
++ __ub_update_physpages(ub);
++ __ub_update_oomguarpages(ub);
++ spin_unlock_irqrestore(&ub->ub_lock, flags);
++}
++
++void ub_tmpfs_respages_sub(struct shmem_inode_info *shi,
++ unsigned long size)
++{
++ struct user_beancounter *ub;
++
++ for (ub = shi->shmi_ub; ub != NULL; ub = ub->parent)
++ do_ub_tmpfs_respages_sub(ub, size);
++}
++
++int ub_shmpages_charge(struct shmem_inode_info *shi, unsigned long size)
++{
++ int ret;
++ unsigned long flags;
++ struct user_beancounter *ub;
++
++ ub = shi->shmi_ub;
++ if (ub == NULL)
++ return 0;
++
++ for (; ub->parent != NULL; ub = ub->parent);
++ spin_lock_irqsave(&ub->ub_lock, flags);
++ ret = __charge_beancounter_locked(ub, UB_SHMPAGES, size, UB_HARD);
++ if (ret == 0)
++ __ub_update_privvm(ub);
++ spin_unlock_irqrestore(&ub->ub_lock, flags);
++ return ret;
++}
++
++void ub_shmpages_uncharge(struct shmem_inode_info *shi, unsigned long size)
++{
++ unsigned long flags;
++ struct user_beancounter *ub;
++
++ ub = shi->shmi_ub;
++ if (ub == NULL)
++ return;
++
++ for (; ub->parent != NULL; ub = ub->parent);
++ spin_lock_irqsave(&ub->ub_lock, flags);
++ __uncharge_beancounter_locked(ub, UB_SHMPAGES, size);
++ __ub_update_privvm(ub);
++ spin_unlock_irqrestore(&ub->ub_lock, flags);
++}
++
++#ifdef CONFIG_USER_SWAP_ACCOUNTING
++static inline void do_ub_swapentry_inc(struct user_beancounter *ub)
++{
++ unsigned long flags;
++
++ spin_lock_irqsave(&ub->ub_lock, flags);
++ ub->ub_swap_pages++;
++ __ub_update_oomguarpages(ub);
++ spin_unlock_irqrestore(&ub->ub_lock, flags);
++}
++
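++/*
++ * Remember that swap entry @num belongs to @ub and account the
++ * swapped-out page in @ub and all of its ancestors.
++ */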
++void ub_swapentry_inc(struct swap_info_struct *si, pgoff_t num,
++ struct user_beancounter *ub)
++{
++ si->swap_ubs[num] = get_beancounter(ub);
++ for (; ub != NULL; ub = ub->parent)
++ do_ub_swapentry_inc(ub);
++}
++EXPORT_SYMBOL(ub_swapentry_inc);
++
++static inline void do_ub_swapentry_dec(struct user_beancounter *ub)
++{
++ unsigned long flags;
++
++ spin_lock_irqsave(&ub->ub_lock, flags);
++ if (ub->ub_swap_pages <= 0)
++ uncharge_warn(ub, UB_SWAPPAGES, 1, ub->ub_swap_pages);
++ else
++ ub->ub_swap_pages--;
++ __ub_update_oomguarpages(ub);
++ spin_unlock_irqrestore(&ub->ub_lock, flags);
++}
++
++void ub_swapentry_dec(struct swap_info_struct *si, pgoff_t num)
++{
++ struct user_beancounter *ub, *ubp;
++
++ ub = si->swap_ubs[num];
++ si->swap_ubs[num] = NULL;
++ for (ubp = ub; ubp != NULL; ubp = ubp->parent)
++ do_ub_swapentry_dec(ubp);
++ put_beancounter(ub);
++}
++EXPORT_SYMBOL(ub_swapentry_dec);
++
++int ub_swap_init(struct swap_info_struct *si, pgoff_t num)
++{
++ struct user_beancounter **ubs;
++
++ ubs = vmalloc(num * sizeof(struct user_beancounter *));
++ if (ubs == NULL)
++ return -ENOMEM;
++
++ memset(ubs, 0, num * sizeof(struct user_beancounter *));
++ si->swap_ubs = ubs;
++ return 0;
++}
++
++void ub_swap_fini(struct swap_info_struct *si)
++{
++ if (si->swap_ubs) {
++ vfree(si->swap_ubs);
++ si->swap_ubs = NULL;
++ }
++}
++#endif
++
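++/*
++ * VIRTINFO_ENOUGHMEM notifier: allow the allocation outright while
++ * the top-level beancounter is still below its vmguarpages barrier,
++ * otherwise fall back to the default decision.
++ */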
++static int vmguar_enough_memory(struct vnotifier_block *self,
++ unsigned long event, void *arg, int old_ret)
++{
++ struct user_beancounter *ub;
++
++ if (event != VIRTINFO_ENOUGHMEM)
++ return old_ret;
++
++ for (ub = current->mm->mm_ub; ub->parent != NULL; ub = ub->parent);
++ if (ub->ub_parms[UB_PRIVVMPAGES].held >
++ ub->ub_parms[UB_VMGUARPAGES].barrier)
++ return old_ret;
++
++ return NOTIFY_OK;
++}
++
++static struct vnotifier_block vmguar_notifier_block = {
++ .notifier_call = vmguar_enough_memory
++};
++
++static int __init init_vmguar_notifier(void)
++{
++ virtinfo_notifier_register(VITYPE_GENERAL, &vmguar_notifier_block);
++ return 0;
++}
++
++static void __exit fini_vmguar_notifier(void)
++{
++ virtinfo_notifier_unregister(VITYPE_GENERAL, &vmguar_notifier_block);
++}
++
++module_init(init_vmguar_notifier);
++module_exit(fini_vmguar_notifier);
+diff -upr linux-2.6.16.orig/kernel/ub/ub_proc.c linux-2.6.16-026test015/kernel/ub/ub_proc.c
+--- linux-2.6.16.orig/kernel/ub/ub_proc.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/ub/ub_proc.c 2006-07-04 14:41:37.000000000 +0400
+@@ -0,0 +1,389 @@
++/*
++ * kernel/ub/ub_proc.c
++ *
++ * Copyright (C) 1998-2000 Andrey V. Savochkin <saw@saw.sw.com.sg>
++ * Copyright (C) 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ * TODO:
++ *
++ * Changes:
++ */
++
++#include <linux/errno.h>
++#include <linux/sched.h>
++#include <linux/kernel.h>
++#include <linux/mm.h>
++#include <linux/proc_fs.h>
++
++#include <ub/beancounter.h>
++#include <ub/ub_hash.h>
++#include <ub/ub_debug.h>
++#include <ub/ub_page.h>
++
++#include <asm/page.h>
++#include <asm/uaccess.h>
++
++/*
++ * we have 8 format strings depending on:
++ * 1. BITS_PER_LONG
++ * 2. CONFIG_UBC_KEEP_UNUSED
++ * 3. resource number (see out_proc_beancounter)
++ */
++
++#ifdef CONFIG_UBC_KEEP_UNUSED
++#define REF_FORMAT "%5.5s %4i: %-12s "
++#define UID_HEAD_STR "uid ref"
++#else
++#define REF_FORMAT "%10.10s: %-12s "
++#define UID_HEAD_STR "uid"
++#endif
++#define REF2_FORMAT "%10s %-12s "
++
++#if BITS_PER_LONG == 32
++#define RES_FORMAT "%10lu %10lu %10lu %10lu %10lu"
++#define HEAD_FORMAT "%10s %10s %10s %10s %10s"
++#define UB_PROC_LINE_TEXT (10+2+12+1+10+1+10+1+10+1+10+1+10)
++#else
++#define RES_FORMAT "%20lu %20lu %20lu %20lu %20lu"
++#define HEAD_FORMAT "%20s %20s %20s %20s %20s"
++#define UB_PROC_LINE_TEXT (10+2+12+1+20+1+20+1+20+1+20+1+20)
++#endif
++
++#define UB_PROC_LINE_LEN (UB_PROC_LINE_TEXT + 1)
++
++static void out_proc_version(char *buf)
++{
++ int len;
++
++ len = sprintf(buf, "Version: 2.5");
++ memset(buf + len, ' ', UB_PROC_LINE_TEXT - len);
++ buf[UB_PROC_LINE_TEXT] = '\n';
++}
++
++static void out_proc_head(char *buf)
++{
++ sprintf(buf, REF2_FORMAT HEAD_FORMAT,
++ UID_HEAD_STR, "resource", "held", "maxheld",
++ "barrier", "limit", "failcnt");
++ buf[UB_PROC_LINE_TEXT] = '\n';
++}
++
++static void out_proc_beancounter(char *buf, struct user_beancounter *ub, int r)
++{
++ if (r == 0) {
++ char tmpbuf[64];
++ print_ub_uid(ub, tmpbuf, sizeof(tmpbuf));
++ sprintf(buf, REF_FORMAT RES_FORMAT,
++ tmpbuf,
++#ifdef CONFIG_UBC_KEEP_UNUSED
++ atomic_read(&ub->ub_refcount),
++#endif
++ ub_rnames[r], ub->ub_parms[r].held,
++ ub->ub_parms[r].maxheld, ub->ub_parms[r].barrier,
++ ub->ub_parms[r].limit, ub->ub_parms[r].failcnt);
++ } else
++ sprintf(buf, REF2_FORMAT RES_FORMAT,
++ "", ub_rnames[r],
++ ub->ub_parms[r].held, ub->ub_parms[r].maxheld,
++ ub->ub_parms[r].barrier, ub->ub_parms[r].limit,
++ ub->ub_parms[r].failcnt);
++
++ buf[UB_PROC_LINE_TEXT] = '\n';
++}
++
++static int ub_accessible(struct user_beancounter *ub,
++ struct user_beancounter *exec_ub,
++ struct file *file)
++{
++ struct user_beancounter *p, *q;
++
++ for (p = exec_ub; p->parent != NULL; p = p->parent);
++ for (q = ub; q->parent != NULL; q = q->parent);
++ if (p != get_ub0() && q != p)
++ return 0;
++ if (ub->parent == NULL)
++ return 1;
++ return file->private_data == NULL ? 0 : 1;
++}
++
++static ssize_t ub_proc_read(struct file *file, char *usrbuf, size_t len,
++ loff_t *poff)
++{
++ ssize_t retval;
++ char *buf;
++ unsigned long flags;
++ int i, resource;
++ struct ub_hash_slot *slot;
++ struct user_beancounter *ub;
++ struct user_beancounter *exec_ub = get_exec_ub();
++ loff_t n, off;
++ int rem, produced, job, tocopy;
++ const int is_capable =
++ (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH));
++
++ retval = -ENOBUFS;
++ buf = (char *)__get_free_page(GFP_KERNEL);
++ if (buf == NULL)
++ goto out;
++
++ retval = 0;
++ if (!is_capable)
++ goto out_free;
++
++ off = *poff;
++ if (off < 0) /* can't happen, just in case */
++ goto inval;
++
++again:
++ i = 0;
++ slot = ub_hash;
++	n = off; /* The amount of data to skip */
++ produced = 0;
++ if (n < (UB_PROC_LINE_LEN * 2)) {
++ if (n < UB_PROC_LINE_LEN) {
++ out_proc_version(buf);
++ produced += UB_PROC_LINE_LEN;
++ n += UB_PROC_LINE_LEN;
++ }
++ out_proc_head(buf + produced);
++ produced += UB_PROC_LINE_LEN;
++ n += UB_PROC_LINE_LEN;
++ }
++ n -= (2 * UB_PROC_LINE_LEN);
++ spin_lock_irqsave(&ub_hash_lock, flags);
++ while (1) {
++ for (ub = slot->ubh_beans;
++ ub != NULL && n >= (UB_RESOURCES * UB_PROC_LINE_LEN);
++ ub = ub->ub_next)
++ if (is_capable && ub_accessible(ub, exec_ub, file))
++ n -= (UB_RESOURCES * UB_PROC_LINE_LEN);
++ if (ub != NULL || ++i >= UB_HASH_SIZE)
++ break;
++ ++slot;
++ }
++	rem = n; /* the amount of data in the buffer to skip */
++ job = PAGE_SIZE - UB_PROC_LINE_LEN + 1; /* end of buffer data */
++ if (len < job - rem)
++ job = rem + len;
++ while (ub != NULL && produced < job) {
++ if (is_capable && ub_accessible(ub, exec_ub, file))
++ for (resource = 0;
++ produced < job && resource < UB_RESOURCES;
++ resource++, produced += UB_PROC_LINE_LEN)
++ {
++ out_proc_beancounter(buf + produced,
++ ub, resource);
++ }
++ if (produced >= job)
++ break;
++ /* Find the next beancounter to produce more data. */
++ ub = ub->ub_next;
++ while (ub == NULL && ++i < UB_HASH_SIZE) {
++ ++slot;
++ ub = slot->ubh_beans;
++ }
++ }
++
++ spin_unlock_irqrestore(&ub_hash_lock, flags);
++ ub_debug(UBD_ALLOC, KERN_DEBUG "UB_PROC: produced %d, job %d, rem %d\n",
++ produced, job, rem);
++
++ /*
++ * Temporary buffer `buf' contains `produced' bytes.
++ * Extract no more than `len' bytes at offset `rem'.
++ */
++ if (produced <= rem)
++ goto out_free;
++ tocopy = produced - rem;
++ if (len < tocopy)
++ tocopy = len;
++ if (!tocopy)
++ goto out_free;
++ if (copy_to_user(usrbuf, buf + rem, tocopy))
++ goto fault;
++ off += tocopy; /* can't overflow */
++ *poff = off;
++ len -= tocopy;
++ retval += tocopy;
++ if (!len)
++ goto out_free;
++ usrbuf += tocopy;
++ goto again;
++
++fault:
++ retval = -EFAULT;
++out_free:
++ free_page((unsigned long)buf);
++out:
++ return retval;
++
++inval:
++ retval = -EINVAL;
++ goto out_free;
++}
++
++static int ub_proc_open(struct inode *inode, struct file *file)
++{
++ file->private_data = strcmp(file->f_dentry->d_name.name,
++ "user_beancounters") ?
++ (void *)-1 : NULL;
++ return 0;
++}
++
++static struct file_operations ub_file_operations = {
++ .read = &ub_proc_read,
++ .open = &ub_proc_open
++};
++
++#ifdef CONFIG_UBC_DEBUG_KMEM
++#include <linux/seq_file.h>
++#include <linux/kmem_cache.h>
++
++static void *ubd_start(struct seq_file *m, loff_t *pos)
++{
++ loff_t n = *pos;
++ struct user_beancounter *ub;
++ long slot;
++
++ spin_lock_irq(&ub_hash_lock);
++ for (slot = 0; slot < UB_HASH_SIZE; slot++)
++ for (ub = ub_hash[slot].ubh_beans; ub; ub = ub->ub_next) {
++ if (n == 0) {
++ m->private = (void *)slot;
++ return (void *)ub;
++ }
++ n--;
++ }
++ return NULL;
++}
++
++static void *ubd_next(struct seq_file *m, void *p, loff_t *pos)
++{
++ struct user_beancounter *ub;
++ long slot;
++
++ ub = (struct user_beancounter *)p;
++ slot = (long)m->private;
++
++ ++*pos;
++ ub = ub->ub_next;
++ while (1) {
++ for (; ub; ub = ub->ub_next) {
++ m->private = (void *)slot;
++ return (void *)ub;
++ }
++ slot++;
++ if (slot == UB_HASH_SIZE)
++ break;
++ ub = ub_hash[slot].ubh_beans;
++ }
++ return NULL;
++}
++
++static void ubd_stop(struct seq_file *m, void *p)
++{
++ spin_unlock_irq(&ub_hash_lock);
++}
++
++#define PROC_LINE_FMT "\t%-17s\t%5lu\t%5lu\n"
++
++static int ubd_show(struct seq_file *m, void *p)
++{
++ struct user_beancounter *ub;
++ struct ub_cache_counter *cc;
++ long pages, vmpages, pbc, swap, unmap;
++ int i;
++ char id[64];
++
++ ub = (struct user_beancounter *)p;
++ print_ub_uid(ub, id, sizeof(id));
++ seq_printf(m, "%s:%d\n", id, atomic_read(&ub->ub_refcount));
++
++ pages = vmpages = pbc = swap = unmap = 0;
++ for (i = 0; i < NR_CPUS; i++) {
++ pages += ub->ub_stat[i].pages_charged;
++ vmpages += ub->ub_stat[i].vmalloc_charged;
++ pbc += ub->ub_stat[i].pbcs;
++ swap += ub->ub_stat[i].swapin;
++ unmap += ub->ub_stat[i].unmap;
++ }
++ if (pages < 0)
++ pages = 0;
++ if (vmpages < 0)
++ vmpages = 0;
++ seq_printf(m, PROC_LINE_FMT, "pages", pages, PAGE_SIZE);
++ seq_printf(m, PROC_LINE_FMT, "vmalloced", vmpages, PAGE_SIZE);
++
++ seq_printf(m, PROC_LINE_FMT, ub_rnames[UB_UNUSEDPRIVVM],
++ ub->ub_unused_privvmpages, PAGE_SIZE);
++ seq_printf(m, PROC_LINE_FMT, ub_rnames[UB_TMPFSPAGES],
++ ub->ub_tmpfs_respages, PAGE_SIZE);
++ seq_printf(m, PROC_LINE_FMT, ub_rnames[UB_SWAPPAGES],
++ ub->ub_swap_pages, PAGE_SIZE);
++ seq_printf(m, PROC_LINE_FMT, "pbcs", pbc,
++ (unsigned long)sizeof(struct page_beancounter));
++
++ seq_printf(m, PROC_LINE_FMT, "swapin", swap, 0UL);
++ seq_printf(m, PROC_LINE_FMT, "unmap", unmap, 0UL);
++ /* interrupts are disabled by locking ub_hash_lock */
++ spin_lock(&cc_lock);
++ list_for_each_entry (cc, &ub->ub_cclist, ulist) {
++ kmem_cache_t *cachep;
++
++ cachep = cc->cachep;
++ seq_printf(m, PROC_LINE_FMT,
++ cachep->name,
++ cc->counter,
++ (unsigned long)cachep->objuse);
++ }
++ spin_unlock(&cc_lock);
++ return 0;
++}
++
++static struct seq_operations kmemdebug_op = {
++ .start = ubd_start,
++ .next = ubd_next,
++ .stop = ubd_stop,
++ .show = ubd_show,
++};
++
++static int kmem_debug_open(struct inode *inode, struct file *file)
++{
++ return seq_open(file, &kmemdebug_op);
++}
++
++static struct file_operations kmem_debug_ops = {
++ .open = kmem_debug_open,
++ .read = seq_read,
++ .llseek = seq_lseek,
++ .release = seq_release,
++};
++#endif
++
++void __init ub_init_proc(void)
++{
++ struct proc_dir_entry *entry;
++
++ entry = create_proc_entry("user_beancounters", S_IRUGO, NULL);
++ if (entry)
++ entry->proc_fops = &ub_file_operations;
++ else
++ panic("Can't create /proc/user_beancounters entry!\n");
++
++ entry = create_proc_entry("user_beancounters_sub", S_IRUGO, NULL);
++ if (entry)
++ entry->proc_fops = &ub_file_operations;
++ else
++		panic("Can't create /proc/user_beancounters_sub entry!\n");
++
++#ifdef CONFIG_UBC_DEBUG_KMEM
++ entry = create_proc_entry("user_beancounters_debug", S_IRUGO, NULL);
++ if (entry)
++ entry->proc_fops = &kmem_debug_ops;
++ else
++ panic("Can't create /proc/user_beancounters_debug entry!\n");
++#endif
++}
+diff -upr linux-2.6.16.orig/kernel/ub/ub_stat.c linux-2.6.16-026test015/kernel/ub/ub_stat.c
+--- linux-2.6.16.orig/kernel/ub/ub_stat.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/ub/ub_stat.c 2006-07-04 14:41:37.000000000 +0400
+@@ -0,0 +1,465 @@
++/*
++ * kernel/ub/ub_stat.c
++ *
++ * Copyright (C) 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/config.h>
++#include <linux/timer.h>
++#include <linux/sched.h>
++#include <linux/init.h>
++#include <linux/jiffies.h>
++#include <linux/list.h>
++#include <linux/errno.h>
++#include <linux/suspend.h>
++
++#include <asm/uaccess.h>
++#include <asm/param.h>
++
++#include <ub/beancounter.h>
++#include <ub/ub_hash.h>
++#include <ub/ub_stat.h>
++
++static spinlock_t ubs_notify_lock = SPIN_LOCK_UNLOCKED;
++static LIST_HEAD(ubs_notify_list);
++static long ubs_min_interval;
++static ubstattime_t ubs_start_time, ubs_end_time;
++static struct timer_list ubs_timer;
++
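++/*
++ * Copy the uids of all top-level beancounters into the user buffer
++ * @buf (at most @size bytes); returns the number of bytes written.
++ */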
++static int ubstat_get_list(void *buf, long size)
++{
++ int retval;
++ unsigned long flags;
++ int slotnr;
++ struct ub_hash_slot *slot;
++ struct user_beancounter *ub, *last_ub;
++ long *page, *ptr, *end;
++ int len;
++
++ page = (long *)__get_free_page(GFP_KERNEL);
++ if (page == NULL)
++ return -ENOMEM;
++
++ retval = 0;
++ slotnr = 0;
++ slot = ub_hash;
++ last_ub = NULL;
++ while (1) {
++ ptr = page;
++ end = page + PAGE_SIZE / sizeof(*ptr);
++
++ spin_lock_irqsave(&ub_hash_lock, flags);
++ if (last_ub == NULL)
++ ub = slot->ubh_beans;
++ else
++ ub = last_ub->ub_next;
++ while (1) {
++ for (; ub != NULL; ub = ub->ub_next) {
++ if (ub->parent != NULL)
++ continue;
++ *ptr++ = ub->ub_uid;
++ if (ptr == end)
++ break;
++ }
++ if (ptr == end)
++ break;
++ ++slot;
++ if (++slotnr >= UB_HASH_SIZE)
++ break;
++ ub = slot->ubh_beans;
++ }
++ if (ptr == page)
++ goto out_unlock;
++ if (ub != NULL)
++ get_beancounter(ub);
++ spin_unlock_irqrestore(&ub_hash_lock, flags);
++
++ if (last_ub != NULL)
++ put_beancounter(last_ub);
++ last_ub = ub; /* last visited beancounter in the slot */
++
++ len = min_t(long, (ptr - page) * sizeof(*ptr), size);
++ if (copy_to_user(buf, page, len)) {
++ retval = -EFAULT;
++ break;
++ }
++ retval += len;
++ if (len < PAGE_SIZE)
++ break;
++ buf += len;
++ size -= len;
++ }
++out:
++ if (last_ub != NULL)
++ put_beancounter(last_ub);
++ free_page((unsigned long)page);
++ return retval;
++
++out_unlock:
++ spin_unlock_irqrestore(&ub_hash_lock, flags);
++ goto out;
++}
++
++static int ubstat_gettime(void *buf, long size)
++{
++ ubgettime_t data;
++ int retval;
++
++ spin_lock(&ubs_notify_lock);
++ data.start_time = ubs_start_time;
++ data.end_time = ubs_end_time;
++ data.cur_time = ubs_start_time + (jiffies - ubs_start_time * HZ) / HZ;
++ spin_unlock(&ubs_notify_lock);
++
++ retval = min_t(long, sizeof(data), size);
++ if (copy_to_user(buf, &data, retval))
++ retval = -EFAULT;
++ return retval;
++}
++
++static int ubstat_do_read_one(struct user_beancounter *ub, int res, void *kbuf)
++{
++ struct {
++ ubstattime_t start_time;
++ ubstattime_t end_time;
++ ubstatparm_t param[1];
++ } *data;
++
++ data = kbuf;
++ data->start_time = ubs_start_time;
++ data->end_time = ubs_end_time;
++
++ data->param[0].maxheld = ub->ub_store[res].maxheld;
++ data->param[0].failcnt = ub->ub_store[res].failcnt;
++
++ return sizeof(*data);
++}
++
++static int ubstat_do_read_all(struct user_beancounter *ub, void *kbuf, int size)
++{
++ int wrote;
++ struct {
++ ubstattime_t start_time;
++ ubstattime_t end_time;
++ ubstatparm_t param[UB_RESOURCES];
++ } *data;
++ int resource;
++
++ data = kbuf;
++ data->start_time = ubs_start_time;
++ data->end_time = ubs_end_time;
++ wrote = sizeof(data->start_time) + sizeof(data->end_time);
++
++ for (resource = 0; resource < UB_RESOURCES; resource++) {
++ if (size < wrote + sizeof(data->param[resource]))
++ break;
++ data->param[resource].maxheld = ub->ub_store[resource].maxheld;
++ data->param[resource].failcnt = ub->ub_store[resource].failcnt;
++ wrote += sizeof(data->param[resource]);
++ }
++
++ return wrote;
++}
++
++static int ubstat_do_read_full(struct user_beancounter *ub, void *kbuf,
++ int size)
++{
++ int wrote;
++ struct {
++ ubstattime_t start_time;
++ ubstattime_t end_time;
++ ubstatparmf_t param[UB_RESOURCES];
++ } *data;
++ int resource;
++
++ data = kbuf;
++ data->start_time = ubs_start_time;
++ data->end_time = ubs_end_time;
++ wrote = sizeof(data->start_time) + sizeof(data->end_time);
++
++ for (resource = 0; resource < UB_RESOURCES; resource++) {
++ if (size < wrote + sizeof(data->param[resource]))
++ break;
++ /* The beginning of ubstatparmf_t matches struct ubparm. */
++ memcpy(&data->param[resource], &ub->ub_store[resource],
++ sizeof(ub->ub_store[resource]));
++ data->param[resource].__unused1 = 0;
++ data->param[resource].__unused2 = 0;
++ wrote += sizeof(data->param[resource]);
++ }
++ return wrote;
++}
++
++static int ubstat_get_stat(struct user_beancounter *ub, long cmd,
++ void *buf, long size)
++{
++ void *kbuf;
++ int retval;
++
++ kbuf = (void *)__get_free_page(GFP_KERNEL);
++ if (kbuf == NULL)
++ return -ENOMEM;
++
++ spin_lock(&ubs_notify_lock);
++ switch (UBSTAT_CMD(cmd)) {
++ case UBSTAT_READ_ONE:
++ retval = -EINVAL;
++ if (UBSTAT_PARMID(cmd) >= UB_RESOURCES)
++ break;
++ retval = ubstat_do_read_one(ub,
++ UBSTAT_PARMID(cmd), kbuf);
++ break;
++ case UBSTAT_READ_ALL:
++ retval = ubstat_do_read_all(ub, kbuf, PAGE_SIZE);
++ break;
++ case UBSTAT_READ_FULL:
++ retval = ubstat_do_read_full(ub, kbuf, PAGE_SIZE);
++ break;
++ default:
++ retval = -EINVAL;
++ }
++ spin_unlock(&ubs_notify_lock);
++
++ if (retval > 0) {
++ retval = min_t(long, retval, size);
++ if (copy_to_user(buf, kbuf, retval))
++ retval = -EFAULT;
++ }
++
++ free_page((unsigned long)kbuf);
++ return retval;
++}
++
++static int ubstat_handle_notifrq(ubnotifrq_t *req)
++{
++ int retval;
++ struct ub_stat_notify *new_notify;
++ struct list_head *entry;
++ struct task_struct *tsk_to_free;
++
++	new_notify = kmalloc(sizeof(*new_notify), GFP_KERNEL);
++ if (new_notify == NULL)
++ return -ENOMEM;
++
++ tsk_to_free = NULL;
++ INIT_LIST_HEAD(&new_notify->list);
++
++ spin_lock(&ubs_notify_lock);
++ list_for_each(entry, &ubs_notify_list) {
++ struct ub_stat_notify *notify;
++
++ notify = list_entry(entry, struct ub_stat_notify, list);
++ if (notify->task == current) {
++ kfree(new_notify);
++ new_notify = notify;
++ break;
++ }
++ }
++
++ retval = -EINVAL;
++ if (req->maxinterval < 1)
++ goto out_unlock;
++ if (req->maxinterval > TIME_MAX_SEC)
++ req->maxinterval = TIME_MAX_SEC;
++ if (req->maxinterval < ubs_min_interval) {
++ unsigned long dif;
++
++ ubs_min_interval = req->maxinterval;
++ dif = (ubs_timer.expires - jiffies + HZ - 1) / HZ;
++ if (dif > req->maxinterval)
++ mod_timer(&ubs_timer,
++ ubs_timer.expires -
++ (dif - req->maxinterval) * HZ);
++ }
++
++ if (entry != &ubs_notify_list) {
++ list_del(&new_notify->list);
++ tsk_to_free = new_notify->task;
++ }
++ if (req->signum) {
++ new_notify->task = current;
++ get_task_struct(new_notify->task);
++ new_notify->signum = req->signum;
++ list_add(&new_notify->list, &ubs_notify_list);
++ } else
++ kfree(new_notify);
++ retval = 0;
++out_unlock:
++ spin_unlock(&ubs_notify_lock);
++ if (tsk_to_free != NULL)
++ put_task_struct(tsk_to_free);
++ return retval;
++}
++
++/*
++ * former sys_ubstat
++ */
++long do_ubstat(int func, unsigned long arg1, unsigned long arg2, void *buf,
++ long size)
++{
++ int retval;
++ struct user_beancounter *ub;
++
++ if (func == UBSTAT_UBPARMNUM)
++ return UB_RESOURCES;
++ if (func == UBSTAT_UBLIST)
++ return ubstat_get_list(buf, size);
++ if (!(capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH)))
++ return -EPERM;
++
++ if (func == UBSTAT_GETTIME) {
++ retval = ubstat_gettime(buf, size);
++ goto notify;
++ }
++
++ ub = get_exec_ub();
++ if (ub != NULL && ub->ub_uid == arg1)
++ get_beancounter(ub);
++ else /* FIXME must be if (ve_is_super) */
++ ub = get_beancounter_byuid(arg1, 0);
++
++ if (ub == NULL)
++ return -ESRCH;
++
++ retval = ubstat_get_stat(ub, func, buf, size);
++ put_beancounter(ub);
++notify:
++ /* Handle request for notification */
++ if (retval >= 0) {
++ ubnotifrq_t notifrq;
++ int err;
++
++ err = -EFAULT;
++ if (!copy_from_user(&notifrq, (void *)arg2, sizeof(notifrq)))
++ err = ubstat_handle_notifrq(&notifrq);
++ if (err)
++ retval = err;
++ }
++
++ return retval;
++}
++
++static void ubstat_save_onestat(struct user_beancounter *ub)
++{
++ int resource;
++
++ /* called with local irq disabled */
++ spin_lock(&ub->ub_lock);
++ for (resource = 0; resource < UB_RESOURCES; resource++) {
++ memcpy(&ub->ub_store[resource], &ub->ub_parms[resource],
++ sizeof(struct ubparm));
++ ub->ub_parms[resource].minheld =
++ ub->ub_parms[resource].maxheld =
++ ub->ub_parms[resource].held;
++ }
++ spin_unlock(&ub->ub_lock);
++}
++
++static void ubstat_save_statistics(void)
++{
++ unsigned long flags;
++ int i;
++ struct user_beancounter *ub;
++
++ spin_lock_irqsave(&ub_hash_lock, flags);
++ for_each_beancounter(i, ub)
++ ubstat_save_onestat(ub);
++ spin_unlock_irqrestore(&ub_hash_lock, flags);
++}
++
++static void ubstatd_timeout(unsigned long __data)
++{
++ struct task_struct *p;
++
++ p = (struct task_struct *) __data;
++ wake_up_process(p);
++}
++
++/*
++ * Safe wrapper for send_sig. It prevents a race with release_task
++ * for sighand.
++ * Should be called under tasklist_lock.
++ */
++static void task_send_sig(struct ub_stat_notify *notify)
++{
++ if (likely(notify->task->sighand != NULL))
++ send_sig(notify->signum, notify->task, 1);
++}
++
++static inline void do_notifies(void)
++{
++ LIST_HEAD(notif_free_list);
++ struct ub_stat_notify *notify;
++ struct ub_stat_notify *tmp;
++
++ spin_lock(&ubs_notify_lock);
++ ubs_start_time = ubs_end_time;
++ /*
++ * the expression below relies on time being unsigned long and
++ * arithmetic promotion rules
++ */
++ ubs_end_time += (ubs_timer.expires - ubs_start_time * HZ) / HZ;
++ mod_timer(&ubs_timer, ubs_timer.expires + ubs_min_interval * HZ);
++ ubs_min_interval = TIME_MAX_SEC;
++ /* save statistics accumulated for the interval */
++ ubstat_save_statistics();
++ /* send signals */
++ read_lock(&tasklist_lock);
++ while (!list_empty(&ubs_notify_list)) {
++ notify = list_entry(ubs_notify_list.next,
++ struct ub_stat_notify, list);
++ task_send_sig(notify);
++ list_del(&notify->list);
++ list_add(&notify->list, &notif_free_list);
++ }
++ read_unlock(&tasklist_lock);
++ spin_unlock(&ubs_notify_lock);
++
++ list_for_each_entry_safe(notify, tmp, &notif_free_list, list) {
++ put_task_struct(notify->task);
++ kfree(notify);
++ }
++}
++
++/*
++ * Kernel thread
++ */
++static int ubstatd(void *unused)
++{
++ /* daemonize call will take care of signals */
++ daemonize("ubstatd");
++
++ ubs_timer.data = (unsigned long)current;
++ ubs_timer.function = ubstatd_timeout;
++ add_timer(&ubs_timer);
++
++ while (1) {
++ set_task_state(current, TASK_INTERRUPTIBLE);
++ if (time_after(ubs_timer.expires, jiffies)) {
++ schedule();
++ try_to_freeze();
++ continue;
++ }
++
++ __set_task_state(current, TASK_RUNNING);
++ do_notifies();
++ }
++ return 0;
++}
++
++static int __init ubstatd_init(void)
++{
++ init_timer(&ubs_timer);
++ ubs_timer.expires = TIME_MAX_JIF;
++ ubs_min_interval = TIME_MAX_SEC;
++ ubs_start_time = ubs_end_time = 0;
++
++ kernel_thread(ubstatd, NULL, 0);
++ return 0;
++}
++
++module_init(ubstatd_init);
+diff -upr linux-2.6.16.orig/kernel/ub/ub_sys.c linux-2.6.16-026test015/kernel/ub/ub_sys.c
+--- linux-2.6.16.orig/kernel/ub/ub_sys.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/ub/ub_sys.c 2006-07-04 14:41:38.000000000 +0400
+@@ -0,0 +1,154 @@
++/*
++ * kernel/ub/ub_sys.c
++ *
++ * Copyright (C) 2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/config.h>
++#include <asm/uaccess.h>
++
++#include <ub/beancounter.h>
++
++#ifndef CONFIG_USER_RESOURCE
++asmlinkage long sys_getluid(void)
++{
++ return -ENOSYS;
++}
++
++asmlinkage long sys_setluid(uid_t uid)
++{
++ return -ENOSYS;
++}
++
++asmlinkage long sys_setublimit(uid_t uid, unsigned long resource,
++ unsigned long *limits)
++{
++ return -ENOSYS;
++}
++
++asmlinkage long sys_ubstat(int func, unsigned long arg1, unsigned long arg2,
++ void *buf, long size)
++{
++ return -ENOSYS;
++}
++#else /* CONFIG_USER_RESOURCE */
++
++/*
++ * The (rather boring) getluid syscall
++ */
++asmlinkage long sys_getluid(void)
++{
++ struct user_beancounter *ub;
++
++ ub = get_exec_ub();
++ if (ub == NULL)
++ return -EINVAL;
++
++ return ub->ub_uid;
++}
++
++/*
++ * The setluid syscall
++ */
++asmlinkage long sys_setluid(uid_t uid)
++{
++ struct user_beancounter *ub;
++ struct task_beancounter *task_bc;
++ int error;
++
++ task_bc = &current->task_bc;
++
++ /* You may not disown a setluid */
++ error = -EINVAL;
++ if (uid == (uid_t)-1)
++ goto out;
++
++ /* You may only set an ub as root */
++ error = -EPERM;
++ if (!capable(CAP_SETUID))
++ goto out;
++
++ /* Ok - set up a beancounter entry for this user */
++ error = -ENOBUFS;
++ ub = get_beancounter_byuid(uid, 1);
++ if (ub == NULL)
++ goto out;
++
++ ub_debug(UBD_ALLOC | UBD_LIMIT, "setluid, bean %p (count %d) "
++ "for %.20s pid %d\n",
++ ub, atomic_read(&ub->ub_refcount),
++ current->comm, current->pid);
++ /* install bc */
++ put_beancounter(task_bc->exec_ub);
++ task_bc->exec_ub = ub;
++ put_beancounter(task_bc->fork_sub);
++ task_bc->fork_sub = get_beancounter(ub);
++ error = 0;
++out:
++ return error;
++}
++
++/*
++ * The setbeanlimit syscall
++ */
++asmlinkage long sys_setublimit(uid_t uid, unsigned long resource,
++ unsigned long *limits)
++{
++ int error;
++ unsigned long flags;
++ struct user_beancounter *ub;
++ unsigned long new_limits[2];
++
++ error = -EPERM;
++	if (!capable(CAP_SYS_RESOURCE))
++ goto out;
++
++ if (!ve_is_super(get_exec_env()))
++ goto out;
++
++ error = -EINVAL;
++ if (resource >= UB_RESOURCES)
++ goto out;
++
++ error = -EFAULT;
++ if (copy_from_user(&new_limits, limits, sizeof(new_limits)))
++ goto out;
++
++ error = -EINVAL;
++ if (new_limits[0] > UB_MAXVALUE || new_limits[1] > UB_MAXVALUE)
++ goto out;
++
++ error = -ENOENT;
++ ub = get_beancounter_byuid(uid, 0);
++ if (ub == NULL) {
++ ub_debug(UBD_LIMIT, "No login bc for uid %d\n", uid);
++ goto out;
++ }
++
++ spin_lock_irqsave(&ub->ub_lock, flags);
++ ub->ub_parms[resource].barrier = new_limits[0];
++ ub->ub_parms[resource].limit = new_limits[1];
++ spin_unlock_irqrestore(&ub->ub_lock, flags);
++
++ put_beancounter(ub);
++
++ error = 0;
++out:
++ return error;
++}
++
++extern long do_ubstat(int func, unsigned long arg1, unsigned long arg2,
++ void *buf, long size);
++asmlinkage long sys_ubstat(int func, unsigned long arg1, unsigned long arg2,
++ void *buf, long size)
++{
++ if (!ve_is_super(get_exec_env()))
++ return -EPERM;
++
++ return do_ubstat(func, arg1, arg2, buf, size);
++}
++#endif
+diff -upr linux-2.6.16.orig/kernel/uid16.c linux-2.6.16-026test015/kernel/uid16.c
+--- linux-2.6.16.orig/kernel/uid16.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/kernel/uid16.c 2006-07-04 14:41:36.000000000 +0400
+@@ -20,43 +20,67 @@
+
+ asmlinkage long sys_chown16(const char __user * filename, old_uid_t user, old_gid_t group)
+ {
+- return sys_chown(filename, low2highuid(user), low2highgid(group));
++ long ret = sys_chown(filename, low2highuid(user), low2highgid(group));
++ /* avoid REGPARM breakage on x86: */
++ prevent_tail_call(ret);
++ return ret;
+ }
+
+ asmlinkage long sys_lchown16(const char __user * filename, old_uid_t user, old_gid_t group)
+ {
+- return sys_lchown(filename, low2highuid(user), low2highgid(group));
++ long ret = sys_lchown(filename, low2highuid(user), low2highgid(group));
++ /* avoid REGPARM breakage on x86: */
++ prevent_tail_call(ret);
++ return ret;
+ }
+
+ asmlinkage long sys_fchown16(unsigned int fd, old_uid_t user, old_gid_t group)
+ {
+- return sys_fchown(fd, low2highuid(user), low2highgid(group));
++ long ret = sys_fchown(fd, low2highuid(user), low2highgid(group));
++ /* avoid REGPARM breakage on x86: */
++ prevent_tail_call(ret);
++ return ret;
+ }
+
+ asmlinkage long sys_setregid16(old_gid_t rgid, old_gid_t egid)
+ {
+- return sys_setregid(low2highgid(rgid), low2highgid(egid));
++ long ret = sys_setregid(low2highgid(rgid), low2highgid(egid));
++ /* avoid REGPARM breakage on x86: */
++ prevent_tail_call(ret);
++ return ret;
+ }
+
+ asmlinkage long sys_setgid16(old_gid_t gid)
+ {
+- return sys_setgid(low2highgid(gid));
++ long ret = sys_setgid(low2highgid(gid));
++ /* avoid REGPARM breakage on x86: */
++ prevent_tail_call(ret);
++ return ret;
+ }
+
+ asmlinkage long sys_setreuid16(old_uid_t ruid, old_uid_t euid)
+ {
+- return sys_setreuid(low2highuid(ruid), low2highuid(euid));
++ long ret = sys_setreuid(low2highuid(ruid), low2highuid(euid));
++ /* avoid REGPARM breakage on x86: */
++ prevent_tail_call(ret);
++ return ret;
+ }
+
+ asmlinkage long sys_setuid16(old_uid_t uid)
+ {
+- return sys_setuid(low2highuid(uid));
++ long ret = sys_setuid(low2highuid(uid));
++ /* avoid REGPARM breakage on x86: */
++ prevent_tail_call(ret);
++ return ret;
+ }
+
+ asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid)
+ {
+- return sys_setresuid(low2highuid(ruid), low2highuid(euid),
+- low2highuid(suid));
++ long ret = sys_setresuid(low2highuid(ruid), low2highuid(euid),
++ low2highuid(suid));
++ /* avoid REGPARM breakage on x86: */
++ prevent_tail_call(ret);
++ return ret;
+ }
+
+ asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid, old_uid_t __user *suid)
+@@ -72,8 +96,11 @@ asmlinkage long sys_getresuid16(old_uid_
+
+ asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid)
+ {
+- return sys_setresgid(low2highgid(rgid), low2highgid(egid),
+- low2highgid(sgid));
++ long ret = sys_setresgid(low2highgid(rgid), low2highgid(egid),
++ low2highgid(sgid));
++ /* avoid REGPARM breakage on x86: */
++ prevent_tail_call(ret);
++ return ret;
+ }
+
+ asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid, old_gid_t __user *sgid)
+@@ -89,12 +116,18 @@ asmlinkage long sys_getresgid16(old_gid_
+
+ asmlinkage long sys_setfsuid16(old_uid_t uid)
+ {
+- return sys_setfsuid(low2highuid(uid));
++ long ret = sys_setfsuid(low2highuid(uid));
++ /* avoid REGPARM breakage on x86: */
++ prevent_tail_call(ret);
++ return ret;
+ }
+
+ asmlinkage long sys_setfsgid16(old_gid_t gid)
+ {
+- return sys_setfsgid(low2highgid(gid));
++ long ret = sys_setfsgid(low2highgid(gid));
++ /* avoid REGPARM breakage on x86: */
++ prevent_tail_call(ret);
++ return ret;
+ }
+
+ static int groups16_to_user(old_gid_t __user *grouplist,
+diff -upr linux-2.6.16.orig/kernel/user.c linux-2.6.16-026test015/kernel/user.c
+--- linux-2.6.16.orig/kernel/user.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/kernel/user.c 2006-07-04 14:41:39.000000000 +0400
+@@ -14,6 +14,7 @@
+ #include <linux/bitops.h>
+ #include <linux/key.h>
+ #include <linux/interrupt.h>
++#include <linux/module.h>
+
+ /*
+ * UID task count cache, to get fast user lookup in "alloc_uid"
+@@ -24,7 +25,20 @@
+ #define UIDHASH_SZ (1 << UIDHASH_BITS)
+ #define UIDHASH_MASK (UIDHASH_SZ - 1)
+ #define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK)
+-#define uidhashentry(uid) (uidhash_table + __uidhashfn((uid)))
++#define __uidhashentry(uid) (uidhash_table + __uidhashfn((uid)))
++
++#ifdef CONFIG_VE
++#define UIDHASH_MASK_VE (UIDHASH_SZ_VE - 1)
++#define __uidhashfn_ve(uid) (((uid >> UIDHASH_BITS_VE) ^ uid) & \
++ UIDHASH_MASK_VE)
++#define __uidhashentry_ve(uid, envid) ((envid)->uidhash_table + \
++ __uidhashfn_ve(uid))
++#define uidhashentry_ve(uid) (ve_is_super(get_exec_env()) ? \
++ __uidhashentry(uid) : \
++ __uidhashentry_ve(uid, get_exec_env()))
++#else
++#define uidhashentry_ve(uid) __uidhashentry(uid)
++#endif
+
+ static kmem_cache_t *uid_cachep;
+ static struct list_head uidhash_table[UIDHASH_SZ];
+@@ -96,7 +110,7 @@ struct user_struct *find_user(uid_t uid)
+ unsigned long flags;
+
+ spin_lock_irqsave(&uidhash_lock, flags);
+- ret = uid_hash_find(uid, uidhashentry(uid));
++ ret = uid_hash_find(uid, uidhashentry_ve(uid));
+ spin_unlock_irqrestore(&uidhash_lock, flags);
+ return ret;
+ }
+@@ -115,10 +129,11 @@ void free_uid(struct user_struct *up)
+ }
+ local_irq_restore(flags);
+ }
++EXPORT_SYMBOL_GPL(free_uid);
+
+ struct user_struct * alloc_uid(uid_t uid)
+ {
+- struct list_head *hashent = uidhashentry(uid);
++ struct list_head *hashent = uidhashentry_ve(uid);
+ struct user_struct *up;
+
+ spin_lock_irq(&uidhash_lock);
+@@ -168,6 +183,7 @@ struct user_struct * alloc_uid(uid_t uid
+ }
+ return up;
+ }
++EXPORT_SYMBOL_GPL(alloc_uid);
+
+ void switch_uid(struct user_struct *new_user)
+ {
+@@ -186,21 +202,21 @@ void switch_uid(struct user_struct *new_
+ free_uid(old_user);
+ suid_keys(current);
+ }
+-
++EXPORT_SYMBOL_GPL(switch_uid);
+
+ static int __init uid_cache_init(void)
+ {
+ int n;
+
+ uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct),
+- 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
++ 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL, NULL);
+
+ for(n = 0; n < UIDHASH_SZ; ++n)
+ INIT_LIST_HEAD(uidhash_table + n);
+
+ /* Insert the root user immediately (init already runs as root) */
+ spin_lock_irq(&uidhash_lock);
+- uid_hash_insert(&root_user, uidhashentry(0));
++ uid_hash_insert(&root_user, __uidhashentry(0));
+ spin_unlock_irq(&uidhash_lock);
+
+ return 0;
+diff -upr linux-2.6.16.orig/kernel/ve.c linux-2.6.16-026test015/kernel/ve.c
+--- linux-2.6.16.orig/kernel/ve.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/ve.c 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,205 @@
++/*
++ * linux/kernel/ve.c
++ *
++ * Copyright (C) 2000-2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++/*
++ * 've.c' helper file performing VE sub-system initialization
++ */
++
++#include <linux/sched.h>
++#include <linux/delay.h>
++#include <linux/capability.h>
++#include <linux/ve.h>
++#include <linux/smp_lock.h>
++#include <linux/init.h>
++
++#include <linux/errno.h>
++#include <linux/unistd.h>
++#include <linux/slab.h>
++#include <linux/sys.h>
++#include <linux/kdev_t.h>
++#include <linux/termios.h>
++#include <linux/tty_driver.h>
++#include <linux/netdevice.h>
++#include <linux/utsname.h>
++#include <linux/proc_fs.h>
++#include <linux/kernel_stat.h>
++#include <linux/module.h>
++#include <linux/rcupdate.h>
++#include <linux/ve_proto.h>
++#include <linux/ve_owner.h>
++#include <linux/devpts_fs.h>
++
++#include <linux/nfcalls.h>
++
++unsigned long vz_rstamp = 0x37e0f59d;
++
++#ifdef CONFIG_MODULES
++struct module no_module = { .state = MODULE_STATE_GOING };
++EXPORT_SYMBOL(no_module);
++#endif
++
++#ifdef CONFIG_VE
++
++DCL_VE_OWNER(SKB, struct sk_buff, owner_env)
++DCL_VE_OWNER(SK, struct sock, sk_owner_env)
++DCL_VE_OWNER(TW, struct tcp_tw_bucket, tw_owner_env)
++DCL_VE_OWNER(FILP, struct file, owner_env)
++DCL_VE_OWNER(FSTYPE, struct file_system_type, owner_env)
++
++INIT_KSYM_MODULE(x_tables);
++INIT_KSYM_MODULE(xt_tcpudp);
++INIT_KSYM_MODULE(ip_tables);
++INIT_KSYM_MODULE(ip6_tables);
++INIT_KSYM_MODULE(iptable_filter);
++INIT_KSYM_MODULE(ip6table_filter);
++INIT_KSYM_MODULE(iptable_mangle);
++INIT_KSYM_MODULE(ip6table_mangle);
++INIT_KSYM_MODULE(xt_limit);
++INIT_KSYM_MODULE(ipt_multiport);
++INIT_KSYM_MODULE(ip6t_multiport);
++INIT_KSYM_MODULE(ipt_tos);
++INIT_KSYM_MODULE(ipt_TOS);
++INIT_KSYM_MODULE(ipt_REJECT);
++INIT_KSYM_MODULE(ip6t_REJECT);
++INIT_KSYM_MODULE(ipt_TCPMSS);
++INIT_KSYM_MODULE(xt_tcpmss);
++INIT_KSYM_MODULE(ipt_ttl);
++INIT_KSYM_MODULE(ipt_LOG);
++INIT_KSYM_MODULE(ip6t_LOG);
++INIT_KSYM_MODULE(xt_length);
++INIT_KSYM_MODULE(ip_conntrack);
++INIT_KSYM_MODULE(ip_conntrack_ftp);
++INIT_KSYM_MODULE(ip_conntrack_irc);
++INIT_KSYM_MODULE(xt_conntrack);
++INIT_KSYM_MODULE(xt_state);
++INIT_KSYM_MODULE(xt_helper);
++INIT_KSYM_MODULE(ip_nat);
++INIT_KSYM_MODULE(iptable_nat);
++INIT_KSYM_MODULE(ip_nat_ftp);
++INIT_KSYM_MODULE(ip_nat_irc);
++INIT_KSYM_MODULE(ipt_REDIRECT);
++
++INIT_KSYM_CALL(int, init_netfilter, (void));
++INIT_KSYM_CALL(int, init_xtables, (void));
++INIT_KSYM_CALL(int, init_xt_tcpudp, (void));
++INIT_KSYM_CALL(int, init_iptables, (void));
++INIT_KSYM_CALL(int, init_ip6tables, (void));
++INIT_KSYM_CALL(int, init_iptable_filter, (void));
++INIT_KSYM_CALL(int, init_ip6table_filter, (void));
++INIT_KSYM_CALL(int, init_iptable_mangle, (void));
++INIT_KSYM_CALL(int, init_ip6table_mangle, (void));
++INIT_KSYM_CALL(int, init_xt_limit, (void));
++INIT_KSYM_CALL(int, init_iptable_multiport, (void));
++INIT_KSYM_CALL(int, init_ip6table_multiport, (void));
++INIT_KSYM_CALL(int, init_iptable_tos, (void));
++INIT_KSYM_CALL(int, init_iptable_TOS, (void));
++INIT_KSYM_CALL(int, init_iptable_REJECT, (void));
++INIT_KSYM_CALL(int, init_ip6table_REJECT, (void));
++INIT_KSYM_CALL(int, init_iptable_TCPMSS, (void));
++INIT_KSYM_CALL(int, init_xt_tcpmss, (void));
++INIT_KSYM_CALL(int, init_iptable_ttl, (void));
++INIT_KSYM_CALL(int, init_iptable_LOG, (void));
++INIT_KSYM_CALL(int, init_ip6table_LOG, (void));
++INIT_KSYM_CALL(int, init_xt_length, (void));
++INIT_KSYM_CALL(int, init_iptable_conntrack, (void));
++INIT_KSYM_CALL(int, init_iptable_ftp, (void));
++INIT_KSYM_CALL(int, init_iptable_irc, (void));
++INIT_KSYM_CALL(int, init_xt_conntrack_match, (void));
++INIT_KSYM_CALL(int, init_xt_state, (void));
++INIT_KSYM_CALL(int, init_xt_helper, (void));
++INIT_KSYM_CALL(int, ip_nat_init, (void));
++INIT_KSYM_CALL(int, init_iptable_nat, (void));
++INIT_KSYM_CALL(int, init_iptable_nat_ftp, (void));
++INIT_KSYM_CALL(int, init_iptable_nat_irc, (void));
++INIT_KSYM_CALL(int, init_iptable_REDIRECT, (void));
++INIT_KSYM_CALL(void, fini_iptable_nat_irc, (void));
++INIT_KSYM_CALL(void, fini_iptable_nat_ftp, (void));
++INIT_KSYM_CALL(void, fini_iptable_nat, (void));
++INIT_KSYM_CALL(void, ip_nat_cleanup, (void));
++INIT_KSYM_CALL(void, fini_xt_helper, (void));
++INIT_KSYM_CALL(void, fini_xt_state, (void));
++INIT_KSYM_CALL(void, fini_xt_conntrack_match, (void));
++INIT_KSYM_CALL(void, fini_iptable_irc, (void));
++INIT_KSYM_CALL(void, fini_iptable_ftp, (void));
++INIT_KSYM_CALL(void, fini_iptable_conntrack, (void));
++INIT_KSYM_CALL(void, fini_xt_length, (void));
++INIT_KSYM_CALL(void, fini_ip6table_LOG, (void));
++INIT_KSYM_CALL(void, fini_iptable_LOG, (void));
++INIT_KSYM_CALL(void, fini_iptable_ttl, (void));
++INIT_KSYM_CALL(void, fini_xt_tcpmss, (void));
++INIT_KSYM_CALL(void, fini_iptable_TCPMSS, (void));
++INIT_KSYM_CALL(void, fini_ip6table_REJECT, (void));
++INIT_KSYM_CALL(void, fini_iptable_REJECT, (void));
++INIT_KSYM_CALL(void, fini_iptable_TOS, (void));
++INIT_KSYM_CALL(void, fini_iptable_tos, (void));
++INIT_KSYM_CALL(void, fini_ip6table_multiport, (void));
++INIT_KSYM_CALL(void, fini_iptable_multiport, (void));
++INIT_KSYM_CALL(void, fini_xt_limit, (void));
++INIT_KSYM_CALL(void, fini_iptable_filter, (void));
++INIT_KSYM_CALL(void, fini_ip6table_filter, (void));
++INIT_KSYM_CALL(void, fini_iptable_mangle, (void));
++INIT_KSYM_CALL(void, fini_ip6table_mangle, (void));
++INIT_KSYM_CALL(void, fini_ip6tables, (void));
++INIT_KSYM_CALL(void, fini_iptables, (void));
++INIT_KSYM_CALL(void, fini_xt_tcpudp, (void));
++INIT_KSYM_CALL(void, fini_xtables, (void));
++INIT_KSYM_CALL(void, fini_netfilter, (void));
++INIT_KSYM_CALL(void, fini_iptable_REDIRECT, (void));
++
++INIT_KSYM_CALL(void, ipt_flush_table, (struct xt_table *table));
++INIT_KSYM_CALL(void, ip6t_flush_table, (struct xt_table *table));
++
++#if defined(CONFIG_VE_CALLS_MODULE) || defined(CONFIG_VE_CALLS)
++INIT_KSYM_MODULE(vzmon);
++INIT_KSYM_CALL(int, real_get_device_perms_ve,
++ (int dev_type, dev_t dev, int access_mode));
++INIT_KSYM_CALL(void, real_do_env_cleanup, (struct ve_struct *env));
++INIT_KSYM_CALL(void, real_do_env_free, (struct ve_struct *env));
++INIT_KSYM_CALL(void, real_update_load_avg_ve, (void));
++
++int get_device_perms_ve(int dev_type, dev_t dev, int access_mode)
++{
++ return KSYMSAFECALL(int, vzmon, real_get_device_perms_ve,
++ (dev_type, dev, access_mode));
++}
++EXPORT_SYMBOL(get_device_perms_ve);
++
++void do_env_cleanup(struct ve_struct *env)
++{
++ KSYMSAFECALL_VOID(vzmon, real_do_env_cleanup, (env));
++}
++
++void do_env_free(struct ve_struct *env)
++{
++ KSYMSAFECALL_VOID(vzmon, real_do_env_free, (env));
++}
++EXPORT_SYMBOL(do_env_free);
++
++void do_update_load_avg_ve(void)
++{
++ KSYMSAFECALL_VOID(vzmon, real_update_load_avg_ve, ());
++}
++#endif
++
++struct ve_struct ve0 = {
++ .utsname = &system_utsname,
++ .vetask_lh = LIST_HEAD_INIT(ve0.vetask_lh),
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++ ._net_dev_tail = &ve0._net_dev_base,
++ .ifindex = -1,
++#endif
++#ifdef CONFIG_UNIX98_PTYS
++ .devpts_config = &devpts_config,
++#endif
++};
++
++EXPORT_SYMBOL(ve0);
++
++#endif /* CONFIG_VE */
+diff -upr linux-2.6.16.orig/kernel/vecalls.c linux-2.6.16-026test015/kernel/vecalls.c
+--- linux-2.6.16.orig/kernel/vecalls.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/vecalls.c 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,3547 @@
++/*
++ * linux/kernel/vecalls.c
++ *
++ * Copyright (C) 2000-2005 SWsoft
++ * All rights reserved.
++ *
++ */
++
++/*
++ * 'vecalls.c' is a file with basic VE support. It provides basic primitives
++ * along with an initialization script.
++ */
++
++#include <linux/sched.h>
++#include <linux/delay.h>
++#include <linux/capability.h>
++#include <linux/ve.h>
++#include <linux/smp_lock.h>
++#include <linux/init.h>
++#include <linux/list.h>
++#include <linux/ve_owner.h>
++#include <linux/errno.h>
++#include <linux/unistd.h>
++#include <linux/slab.h>
++#include <linux/vmalloc.h>
++#include <linux/sys.h>
++#include <linux/fs.h>
++#include <linux/namespace.h>
++#include <linux/termios.h>
++#include <linux/tty_driver.h>
++#include <linux/netdevice.h>
++#include <linux/wait.h>
++#include <linux/inetdevice.h>
++#include <net/addrconf.h>
++#include <linux/utsname.h>
++#include <linux/sysctl.h>
++#include <linux/proc_fs.h>
++#include <linux/seq_file.h>
++#include <linux/kernel_stat.h>
++#include <linux/module.h>
++#include <linux/suspend.h>
++#include <linux/rcupdate.h>
++#include <linux/in.h>
++#include <linux/major.h>
++#include <linux/kdev_t.h>
++#include <linux/idr.h>
++#include <linux/inetdevice.h>
++#include <net/pkt_sched.h>
++#include <linux/divert.h>
++#include <ub/beancounter.h>
++
++#include <net/route.h>
++#include <net/ip_fib.h>
++#include <net/ip6_route.h>
++#include <net/arp.h>
++#include <net/ipv6.h>
++
++#include <linux/ve_proto.h>
++#include <linux/venet.h>
++#include <linux/vzctl.h>
++#include <linux/vzcalluser.h>
++#ifdef CONFIG_FAIRSCHED
++#include <linux/fairsched.h>
++#endif
++
++#include <linux/nfcalls.h>
++#include <linux/virtinfo.h>
++
++struct ve_struct *ve_list_head = NULL;
++int nr_ve = 1; /* One VE always exists. Compatibility with vestat */
++rwlock_t ve_list_guard = RW_LOCK_UNLOCKED;
++static rwlock_t devperms_hash_guard = RW_LOCK_UNLOCKED;
++
++extern int glob_virt_pids;
++
++static int do_env_enter(struct ve_struct *ve, unsigned int flags);
++static void do_clean_devperms(envid_t veid);
++static int alloc_ve_tty_drivers(struct ve_struct* ve);
++static void free_ve_tty_drivers(struct ve_struct* ve);
++static int register_ve_tty_drivers(struct ve_struct* ve);
++static void unregister_ve_tty_drivers(struct ve_struct* ve);
++static int init_ve_tty_drivers(struct ve_struct *);
++static void fini_ve_tty_drivers(struct ve_struct *);
++static void clear_termios(struct tty_driver* driver );
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++static void ve_mapped_devs_cleanup(struct ve_struct *ve);
++#endif
++
++static int ve_get_cpu_stat(envid_t veid, struct vz_cpu_stat *buf);
++
++static void vecalls_exit(void);
++
++struct ve_struct *__find_ve_by_id(envid_t veid)
++{
++ struct ve_struct *ve;
++ for (ve = ve_list_head;
++ ve != NULL && ve->veid != veid;
++ ve = ve->next);
++ return ve;
++}
++
++struct ve_struct *get_ve_by_id(envid_t veid)
++{
++ struct ve_struct *ve;
++ read_lock(&ve_list_guard);
++ ve = __find_ve_by_id(veid);
++ get_ve(ve);
++ read_unlock(&ve_list_guard);
++ return ve;
++}
++
++/*
++ * real_put_ve() MUST be used instead of put_ve() inside vecalls.
++ */
++void real_do_env_free(struct ve_struct *ve);
++static inline void real_put_ve(struct ve_struct *ve)
++{
++ if (ve && atomic_dec_and_test(&ve->counter)) {
++ if (atomic_read(&ve->pcounter) > 0)
++ BUG();
++ if (ve->is_running)
++ BUG();
++ real_do_env_free(ve);
++ }
++}
++
++extern struct file_system_type devpts_fs_type;
++extern struct file_system_type sysfs_fs_type;
++extern struct file_system_type tmpfs_fs_type;
++extern struct file_system_type proc_fs_type;
++
++extern spinlock_t task_capability_lock;
++extern void ve_ipc_free(struct ve_struct * ve);
++extern void ip_fragment_cleanup(struct ve_struct *ve);
++
++static int ve_get_cpu_stat(envid_t veid, struct vz_cpu_stat *buf)
++{
++ struct ve_struct *ve;
++ struct vz_cpu_stat *vstat;
++ int retval;
++ int i, cpu;
++ unsigned long tmp;
++
++ if (!ve_is_super(get_exec_env()) && (veid != get_exec_env()->veid))
++ return -EPERM;
++ if (veid == 0)
++ return -ESRCH;
++
++ vstat = kmalloc(sizeof(*vstat), GFP_KERNEL);
++ if (!vstat)
++ return -ENOMEM;
++ memset(vstat, 0, sizeof(*vstat));
++
++ retval = -ESRCH;
++ read_lock(&ve_list_guard);
++ ve = __find_ve_by_id(veid);
++ if (ve == NULL)
++ goto out_unlock;
++ for (cpu = 0; cpu < NR_CPUS; cpu++) {
++ struct ve_cpu_stats *st;
++
++ st = VE_CPU_STATS(ve, cpu);
++ vstat->user_jif += st->user;
++ vstat->nice_jif += st->nice;
++ vstat->system_jif += st->system;
++ vstat->idle_clk += ve_sched_get_idle_time(ve, cpu);
++ }
++ vstat->uptime_clk = get_cycles() - ve->start_cycles;
++ vstat->uptime_jif = jiffies - ve->start_jiffies;
++ for (i = 0; i < 3; i++) {
++ tmp = ve->avenrun[i] + (FIXED_1/200);
++ vstat->avenrun[i].val_int = LOAD_INT(tmp);
++ vstat->avenrun[i].val_frac = LOAD_FRAC(tmp);
++ }
++ read_unlock(&ve_list_guard);
++
++ retval = 0;
++ if (copy_to_user(buf, vstat, sizeof(*vstat)))
++ retval = -EFAULT;
++out_free:
++ kfree(vstat);
++ return retval;
++
++out_unlock:
++ read_unlock(&ve_list_guard);
++ goto out_free;
++}
++
++/**********************************************************************
++ * Device permissions routines,
++ * character and block devices separately
++ **********************************************************************/
++
++/* Rules applied in the following order:
++ MAJOR!=0, MINOR!=0
++ MAJOR!=0, MINOR==0
++ MAJOR==0, MINOR==0
++*/
++struct devperms_struct
++{
++ dev_t dev; /* device id */
++ unsigned char mask;
++ unsigned type;
++ envid_t veid;
++
++ struct devperms_struct *devhash_next;
++ struct devperms_struct **devhash_pprev;
++};
++
++static struct devperms_struct original_perms[] =
++{{
++ MKDEV(0,0), /*device*/
++ S_IROTH | S_IWOTH,
++ S_IFCHR, /*type*/
++ 0, /*veid*/
++ NULL, NULL
++},
++{
++ MKDEV(0,0), /*device*/
++ S_IXGRP | S_IROTH | S_IWOTH,
++ S_IFBLK, /*type*/
++ 0, /*veid*/
++ NULL, NULL
++}};
++
++static struct devperms_struct default_major_perms[] = {
++ {MKDEV(UNIX98_PTY_MASTER_MAJOR, 0), S_IROTH | S_IWOTH, S_IFCHR},
++ {MKDEV(UNIX98_PTY_SLAVE_MAJOR, 0), S_IROTH | S_IWOTH, S_IFCHR},
++ {MKDEV(PTY_MASTER_MAJOR, 0), S_IROTH | S_IWOTH, S_IFCHR},
++ {MKDEV(PTY_SLAVE_MAJOR, 0), S_IROTH | S_IWOTH, S_IFCHR},
++};
++static struct devperms_struct default_minor_perms[] = {
++ {MKDEV(MEM_MAJOR, 3), S_IROTH | S_IWOTH, S_IFCHR}, /* null */
++ {MKDEV(MEM_MAJOR, 5), S_IROTH | S_IWOTH, S_IFCHR}, /* zero */
++ {MKDEV(MEM_MAJOR, 7), S_IROTH | S_IWOTH, S_IFCHR}, /* full */
++ {MKDEV(TTYAUX_MAJOR, 0), S_IROTH | S_IWOTH, S_IFCHR},/* tty */
++ {MKDEV(TTYAUX_MAJOR, 2), S_IROTH | S_IWOTH, S_IFCHR},/* ptmx */
++ {MKDEV(MEM_MAJOR, 8), S_IROTH, S_IFCHR}, /* random */
++ {MKDEV(MEM_MAJOR, 9), S_IROTH, S_IFCHR}, /* urandom */
++};
++
++static struct devperms_struct default_deny_perms = {
++ MKDEV(0, 0), 0, S_IFCHR
++};
++
++static inline struct devperms_struct *find_default_devperms(int type,
++ dev_t dev)
++{
++ int i;
++
++	/* XXX all default perms are S_IFCHR */
++ if (type != S_IFCHR)
++ return &default_deny_perms;
++
++ for (i = 0;
++ i < sizeof(default_minor_perms)/sizeof(struct devperms_struct);
++ i++)
++ if (MAJOR(dev) == MAJOR(default_minor_perms[i].dev) &&
++ MINOR(dev) == MINOR(default_minor_perms[i].dev))
++ return &default_minor_perms[i];
++ for (i = 0;
++ i < sizeof(default_major_perms)/sizeof(struct devperms_struct);
++ i++)
++ if (MAJOR(dev) == MAJOR(default_major_perms[i].dev))
++ return &default_major_perms[i];
++
++ return &default_deny_perms;
++}
++
++#define DEVPERMS_HASH_SZ 512
++struct devperms_struct *devperms_hash[DEVPERMS_HASH_SZ];
++
++#define devperms_hashfn(id,dev) \
++ ( (id << 5) ^ (id >> 5) ^ (MAJOR(dev)) ^ MINOR(dev) ) & \
++ (DEVPERMS_HASH_SZ - 1)
++
++static inline void hash_devperms(struct devperms_struct *p)
++{
++ struct devperms_struct **htable =
++ &devperms_hash[devperms_hashfn(p->veid,p->dev)];
++
++ if ((p->devhash_next = *htable) != NULL)
++ (*htable)->devhash_pprev = &p->devhash_next;
++ *htable = p;
++ p->devhash_pprev = htable;
++}
++
++static inline void unhash_devperms(struct devperms_struct *p)
++{
++ if (p->devhash_next)
++ p->devhash_next->devhash_pprev = p->devhash_pprev;
++ *p->devhash_pprev = p->devhash_next;
++}
++
++static int __init init_devperms_hash(void)
++{
++ write_lock_irq(&devperms_hash_guard);
++ memset(devperms_hash, 0, sizeof(devperms_hash));
++ hash_devperms(original_perms);
++ hash_devperms(original_perms+1);
++ write_unlock_irq(&devperms_hash_guard);
++ return 0;
++}
++
++static inline void fini_devperms_hash(void)
++{
++}
++
++static inline struct devperms_struct *find_devperms(envid_t veid,
++ int type,
++ dev_t dev)
++{
++ struct devperms_struct *p, **htable =
++ &devperms_hash[devperms_hashfn(veid,dev)];
++
++ for (p = *htable; p && !(p->type==type &&
++ MAJOR(dev)==MAJOR(p->dev) &&
++ MINOR(dev)==MINOR(p->dev) &&
++ p->veid==veid);
++ p = p->devhash_next)
++ ;
++ return p;
++}
++
++
++static void do_clean_devperms(envid_t veid)
++{
++ int i;
++ struct devperms_struct* ve;
++
++ write_lock_irq(&devperms_hash_guard);
++ for (i = 0; i < DEVPERMS_HASH_SZ; i++)
++ for (ve = devperms_hash[i]; ve;) {
++ struct devperms_struct *next = ve->devhash_next;
++ if (ve->veid == veid) {
++ unhash_devperms(ve);
++ kfree(ve);
++ }
++
++ ve = next;
++ }
++ write_unlock_irq(&devperms_hash_guard);
++}
++
++/*
++ * Mode is a mask of
++ * FMODE_READ for read access (configurable by S_IROTH)
++ * FMODE_WRITE for write access (configurable by S_IWOTH)
++ * FMODE_QUOTACTL for quotactl access (configurable by S_IXGRP)
++ */
++int real_get_device_perms_ve(int dev_type, dev_t dev, int access_mode)
++{
++ struct devperms_struct *perms;
++ struct ve_struct *ve;
++ envid_t veid;
++
++ perms = NULL;
++ ve = get_exec_env();
++ veid = ve->veid;
++
++ read_lock(&devperms_hash_guard);
++
++ perms = find_devperms(veid, dev_type|VE_USE_MINOR, dev);
++ if (perms)
++ goto end;
++
++ perms = find_devperms(veid, dev_type|VE_USE_MAJOR, MKDEV(MAJOR(dev),0));
++ if (perms)
++ goto end;
++
++ perms = find_devperms(veid, dev_type, MKDEV(0,0));
++ if (perms)
++ goto end;
++
++ perms = find_default_devperms(dev_type, dev);
++
++end:
++ read_unlock(&devperms_hash_guard);
++
++ access_mode = "\000\004\002\006\010\014\012\016"[access_mode];
++ return perms ?
++ (((perms->mask & access_mode) == access_mode) ? 0 : -EACCES) :
++ -ENODEV;
++}
++EXPORT_SYMBOL(real_get_device_perms_ve);
++
++int do_setdevperms(envid_t veid, unsigned type, dev_t dev, unsigned mask)
++{
++ struct devperms_struct *perms;
++
++ write_lock_irq(&devperms_hash_guard);
++ perms = find_devperms(veid, type, dev);
++ if (!perms) {
++ struct devperms_struct *perms_new;
++ write_unlock_irq(&devperms_hash_guard);
++
++ perms_new = kmalloc(sizeof(struct devperms_struct), GFP_KERNEL);
++ if (!perms_new)
++ return -ENOMEM;
++
++ write_lock_irq(&devperms_hash_guard);
++ perms = find_devperms(veid, type, dev);
++ if (perms) {
++ kfree(perms_new);
++ perms_new = perms;
++ }
++
++ switch (type & VE_USE_MASK) {
++ case 0:
++ dev = 0;
++ break;
++ case VE_USE_MAJOR:
++ dev = MKDEV(MAJOR(dev),0);
++ break;
++ }
++
++ perms_new->veid = veid;
++ perms_new->dev = dev;
++ perms_new->type = type;
++ perms_new->mask = mask & S_IALLUGO;
++ hash_devperms(perms_new);
++ } else
++ perms->mask = mask & S_IALLUGO;
++ write_unlock_irq(&devperms_hash_guard);
++ return 0;
++}
++EXPORT_SYMBOL(do_setdevperms);
++
++int real_setdevperms(envid_t veid, unsigned type, dev_t dev, unsigned mask)
++{
++ struct ve_struct *ve;
++ int err;
++
++ if (!capable(CAP_SETVEID) || veid == 0)
++ return -EPERM;
++
++ if ((ve = get_ve_by_id(veid)) == NULL)
++ return -ESRCH;
++
++ down_read(&ve->op_sem);
++ err = -ESRCH;
++ if (ve->is_running)
++ err = do_setdevperms(veid, type, dev, mask);
++ up_read(&ve->op_sem);
++ real_put_ve(ve);
++ return err;
++}
++
++void real_update_load_avg_ve(void)
++{
++ struct ve_struct *ve;
++ unsigned long nr_active;
++
++ read_lock(&ve_list_guard);
++ for (ve = ve_list_head; ve != NULL; ve = ve->next) {
++ nr_active = nr_running_ve(ve) + nr_uninterruptible_ve(ve);
++ nr_active *= FIXED_1;
++ CALC_LOAD(ve->avenrun[0], EXP_1, nr_active);
++ CALC_LOAD(ve->avenrun[1], EXP_5, nr_active);
++ CALC_LOAD(ve->avenrun[2], EXP_15, nr_active);
++ }
++ read_unlock(&ve_list_guard);
++}
++
++
++/**********************************************************************
++ **********************************************************************
++ *
++ * FS-related helpers to VE start/stop
++ *
++ **********************************************************************
++ **********************************************************************/
++
++/*
++ * DEVPTS needs virtualization: each environment should see its own list of
++ * pseudo-terminals.
++ * To implement it we need to have separate devpts superblocks for each
++ * VE, and each VE should mount its own one.
++ * Thus, separate vfsmount structures are required.
++ * To minimize intrusion into vfsmount lookup code, separate file_system_type
++ * structures are created.
++ *
++ * In addition to this, a patch for the character device itself is required, as the file
++ * system itself is used only for MINOR/MAJOR lookup.
++ */
++static int register_ve_fs_type(struct ve_struct *ve,
++ struct file_system_type *template,
++ struct file_system_type **p_fs_type, struct vfsmount **p_mnt)
++{
++ struct vfsmount *mnt;
++ struct file_system_type *local_fs_type;
++ int ret;
++
++ VZTRACE("register_ve_fs_type(\"%s\")\n", template->name);
++
++ local_fs_type = kmalloc(sizeof(*local_fs_type) + sizeof(void *),
++ GFP_KERNEL);
++ if (local_fs_type == NULL)
++ return -ENOMEM;
++
++ memset(local_fs_type, 0, sizeof(*local_fs_type));
++ local_fs_type->name = template->name;
++ local_fs_type->fs_flags = template->fs_flags;
++ local_fs_type->get_sb = template->get_sb;
++ local_fs_type->kill_sb = template->kill_sb;
++ local_fs_type->owner = template->owner;
++ /*
++ * 1. we do not have refcounter on fstype
++ * 2. fstype holds reference to ve using get_ve()/put_ve().
++ * so we free fstype when freeing ve and we are sure it's ok to free it
++ */
++ SET_VE_OWNER_FSTYPE(local_fs_type, ve);
++ get_filesystem(local_fs_type); /* get_ve() inside */
++
++ ret = register_filesystem(local_fs_type); /* does not get */
++ if (ret)
++ goto reg_err;
++
++ mnt = kern_mount(local_fs_type);
++ if (IS_ERR(mnt))
++ goto mnt_err;
++
++	/* Usage counters after successful execution of kern_mount:
++ * local_fs_type - +1 (get_fs_type,get_sb_single,put_filesystem)
++ * mnt - +1 == 1 (alloc_vfsmnt)
++ */
++
++ *p_fs_type = local_fs_type;
++ *p_mnt = mnt;
++ return 0;
++
++mnt_err:
++ ret = PTR_ERR(mnt);
++ unregister_filesystem(local_fs_type); /* does not put */
++
++reg_err:
++ put_filesystem(local_fs_type);
++ kfree(local_fs_type);
++ printk(KERN_DEBUG
++ "register_ve_fs_type(\"%s\") err=%d\n", template->name, ret);
++ return ret;
++}
++
++static void umount_ve_fs_type(struct file_system_type *local_fs_type)
++{
++ struct vfsmount *mnt;
++ struct list_head *p, *q;
++ LIST_HEAD(kill);
++ LIST_HEAD(umount_list);
++
++ down_write(&namespace_sem);
++ spin_lock(&vfsmount_lock);
++ list_for_each_safe(p, q, &current->namespace->list) {
++ mnt = list_entry(p, struct vfsmount, mnt_list);
++ if (mnt->mnt_sb->s_type != local_fs_type)
++ continue;
++ list_del(p);
++ list_add(p, &kill);
++ }
++
++ while (!list_empty(&kill)) {
++ mnt = list_entry(kill.next, struct vfsmount, mnt_list);
++ umount_tree(mnt, 1, &umount_list);
++ }
++ spin_unlock(&vfsmount_lock);
++ up_write(&namespace_sem);
++ release_mounts(&umount_list);
++}
++
++static void unregister_ve_fs_type(struct file_system_type *local_fs_type,
++ struct vfsmount *local_fs_mount)
++{
++ if (local_fs_mount == NULL ||
++ local_fs_type == NULL) {
++ if (local_fs_mount != NULL ||
++ local_fs_type != NULL)
++ BUG();
++ return;
++ }
++
++ VZTRACE("unregister_ve_fs_type(\"%s\")\n", local_fs_type->name);
++
++ unregister_filesystem(local_fs_type);
++ umount_ve_fs_type(local_fs_type);
++ kern_umount(local_fs_mount); /* alias to mntput, drop our ref */
++ put_filesystem(local_fs_type);
++}
++
++
++/**********************************************************************
++ **********************************************************************
++ *
++ * FS-related helpers to VE start/stop
++ *
++ **********************************************************************
++ **********************************************************************/
++
++#ifdef CONFIG_SYSCTL
++static ctl_table ve_sysctl_tables[] = {
++ /* kernel */
++ {
++ .ctl_name = CTL_KERN,
++ .procname = "kernel",
++ .mode = 0555,
++ .child = &ve_sysctl_tables[2],
++ },
++ { .ctl_name = 0 },
++ /* kernel/[vars] */
++ {
++ .ctl_name = KERN_NODENAME,
++ .procname = "hostname",
++ .maxlen = 64,
++ .mode = 0644,
++ .proc_handler = &proc_doutsstring,
++ .strategy = &sysctl_string,
++ },
++ {
++ .ctl_name = KERN_DOMAINNAME,
++ .procname = "domainname",
++ .maxlen = 64,
++ .mode = 0644,
++ .proc_handler = &proc_doutsstring,
++ .strategy = &sysctl_string,
++ },
++ {
++ .ctl_name = KERN_SHMMAX,
++ .procname = "shmmax",
++ .maxlen = sizeof(size_t),
++ .mode = 0644,
++ .proc_handler = &proc_doulongvec_minmax,
++ },
++ {
++ .ctl_name = KERN_SHMALL,
++ .procname = "shmall",
++ .maxlen = sizeof(size_t),
++ .mode = 0644,
++ .proc_handler = &proc_doulongvec_minmax,
++ },
++ {
++ .ctl_name = KERN_SHMMNI,
++ .procname = "shmmni",
++ .maxlen = sizeof(int),
++ .mode = 0644,
++ .proc_handler = &proc_dointvec,
++ },
++ {
++ .ctl_name = KERN_MSGMAX,
++ .procname = "msgmax",
++ .maxlen = sizeof(int),
++ .mode = 0644,
++ .proc_handler = &proc_dointvec,
++ },
++ {
++ .ctl_name = KERN_MSGMNI,
++ .procname = "msgmni",
++ .maxlen = sizeof(int),
++ .mode = 0644,
++ .proc_handler = &proc_dointvec,
++ },
++ {
++ .ctl_name = KERN_MSGMNB,
++ .procname = "msgmnb",
++ .maxlen = sizeof(int),
++ .mode = 0644,
++ .proc_handler = &proc_dointvec,
++ },
++ {
++ .ctl_name = KERN_SEM,
++ .procname = "sem",
++ .maxlen = 4 * sizeof(int),
++ .mode = 0644,
++ .proc_handler = &proc_dointvec
++ },
++ { .ctl_name = 0, }
++};
++
++static int register_ve_sysctltables(struct ve_struct *ve)
++{
++ struct ctl_table_header *header;
++ ctl_table *root, *table;
++
++ VZTRACE("register_ve_sysctltables\n");
++
++ root = clone_sysctl_template(ve_sysctl_tables,
++ sizeof(ve_sysctl_tables) / sizeof(ctl_table));
++ if (root == NULL)
++ goto out;
++
++ table = root->child;
++ table[0].data = &ve->utsname->nodename;
++ table[1].data = &ve->utsname->domainname;
++ table[2].data = &ve->_shm_ctlmax;
++ table[3].data = &ve->_shm_ctlall;
++ table[4].data = &ve->_shm_ctlmni;
++ table[5].data = &ve->_msg_ctlmax;
++ table[6].data = &ve->_msg_ctlmni;
++ table[7].data = &ve->_msg_ctlmnb;
++ table[8].data = &ve->_sem_ctls[0];
++
++ /* insert at head to override kern entries */
++ header = register_sysctl_table(root, 1);
++ if (header == NULL)
++ goto out_free;
++
++ ve->kern_header = header;
++ ve->kern_table = root;
++ return 0;
++
++out_free:
++ free_sysctl_clone(root);
++out:
++ return -ENOMEM;
++}
++
++static inline void unregister_ve_sysctltables(struct ve_struct *ve)
++{
++ unregister_sysctl_table(ve->kern_header);
++}
++
++static inline void free_ve_sysctltables(struct ve_struct *ve)
++{
++ free_sysctl_clone(ve->kern_table);
++}
++#endif
++
++
++/**********************************************************************
++ **********************************************************************
++ *
++ * VE start: subsystems
++ *
++ **********************************************************************
++ **********************************************************************/
++
++extern struct new_utsname virt_utsname;
++
++static int init_ve_utsname(struct ve_struct *ve)
++{
++ ve->utsname = kmalloc(sizeof(*ve->utsname), GFP_KERNEL);
++ if (ve->utsname == NULL)
++ return -ENOMEM;
++
++ down_read(&uts_sem); /* protect the source */
++ memcpy(ve->utsname, &system_utsname, sizeof(*ve->utsname));
++ memcpy(ve->utsname->release, virt_utsname.release,
++ sizeof(virt_utsname.release));
++ up_read(&uts_sem);
++
++ return 0;
++}
++
++static void free_ve_utsname(struct ve_struct *ve)
++{
++ kfree(ve->utsname);
++ ve->utsname = NULL;
++}
++
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++#include <net/ip.h>
++#include <net/tcp.h>
++#include <net/udp.h>
++#include <net/icmp.h>
++
++static int init_fini_ve_mibs(struct ve_struct *ve, int fini)
++{
++ if (fini)
++ goto fini;
++ if (!(ve->_net_statistics[0] = alloc_percpu(struct linux_mib)))
++ goto out1;
++ if (!(ve->_net_statistics[1] = alloc_percpu(struct linux_mib)))
++ goto out2;
++ if (!(ve->_ip_statistics[0] = alloc_percpu(struct ipstats_mib)))
++ goto out3;
++ if (!(ve->_ip_statistics[1] = alloc_percpu(struct ipstats_mib)))
++ goto out4;
++ if (!(ve->_icmp_statistics[0] = alloc_percpu(struct icmp_mib)))
++ goto out5;
++ if (!(ve->_icmp_statistics[1] = alloc_percpu(struct icmp_mib)))
++ goto out6;
++ if (!(ve->_tcp_statistics[0] = alloc_percpu(struct tcp_mib)))
++ goto out7;
++ if (!(ve->_tcp_statistics[1] = alloc_percpu(struct tcp_mib)))
++ goto out8;
++ if (!(ve->_udp_statistics[0] = alloc_percpu(struct udp_mib)))
++ goto out9;
++ if (!(ve->_udp_statistics[1] = alloc_percpu(struct udp_mib)))
++ goto out10;
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ if (!(ve->_ipv6_statistics[0] = alloc_percpu(struct ipstats_mib)))
++ goto out11;
++ if (!(ve->_ipv6_statistics[1] = alloc_percpu(struct ipstats_mib)))
++ goto out12;
++ if (!(ve->_icmpv6_statistics[0] = alloc_percpu(struct icmpv6_mib)))
++ goto out13;
++ if (!(ve->_icmpv6_statistics[1] = alloc_percpu(struct icmpv6_mib)))
++ goto out14;
++ if (!(ve->_udp_stats_in6[0] = alloc_percpu(struct udp_mib)))
++ goto out15;
++ if (!(ve->_udp_stats_in6[1] = alloc_percpu(struct udp_mib)))
++ goto out16;
++#endif
++ return 0;
++fini:
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ free_percpu(ve->_udp_stats_in6[1]);
++out16:
++ free_percpu(ve->_udp_stats_in6[0]);
++out15:
++ free_percpu(ve->_icmpv6_statistics[1]);
++out14:
++ free_percpu(ve->_icmpv6_statistics[0]);
++out13:
++ free_percpu(ve->_ipv6_statistics[1]);
++out12:
++ free_percpu(ve->_ipv6_statistics[0]);
++out11:
++#endif
++ free_percpu(ve->_udp_statistics[1]);
++out10:
++ free_percpu(ve->_udp_statistics[0]);
++out9:
++ free_percpu(ve->_tcp_statistics[1]);
++out8:
++ free_percpu(ve->_tcp_statistics[0]);
++out7:
++ free_percpu(ve->_icmp_statistics[1]);
++out6:
++ free_percpu(ve->_icmp_statistics[0]);
++out5:
++ free_percpu(ve->_ip_statistics[1]);
++out4:
++ free_percpu(ve->_ip_statistics[0]);
++out3:
++ free_percpu(ve->_net_statistics[1]);
++out2:
++ free_percpu(ve->_net_statistics[0]);
++out1:
++ return -ENOMEM;
++}
++
++static inline int init_ve_mibs(struct ve_struct *ve)
++{
++ return init_fini_ve_mibs(ve, 0);
++}
++
++static inline void fini_ve_mibs(struct ve_struct *ve)
++{
++ (void)init_fini_ve_mibs(ve, 1);
++}
++
++extern struct net_device templ_loopback_dev;
++static void veloop_setup(struct net_device *dev)
++{
++ int padded;
++ padded = dev->padded;
++ memcpy(dev, &templ_loopback_dev, sizeof(struct net_device));
++ dev->padded = padded;
++}
++
++static int init_ve_netdev(void)
++{
++ struct ve_struct *ve;
++ struct net_device_stats *stats;
++ int err;
++
++ ve = get_exec_env();
++ INIT_HLIST_HEAD(&ve->_net_dev_head);
++ ve->_net_dev_base = NULL;
++ ve->_net_dev_tail = &ve->_net_dev_base;
++
++ ve->_loopback_dev = alloc_netdev(0, templ_loopback_dev.name,
++ veloop_setup);
++ if (ve->_loopback_dev == NULL)
++ return -ENOMEM;
++ if (loopback_dev.get_stats != NULL) {
++ stats = kmalloc(sizeof(struct net_device_stats), GFP_KERNEL);
++ if (stats != NULL) {
++ memset(stats, 0, sizeof(struct net_device_stats));
++ ve->_loopback_dev->priv = stats;
++ ve->_loopback_dev->get_stats = loopback_dev.get_stats;
++ ve->_loopback_dev->destructor = loopback_dev.destructor;
++ }
++ }
++ err = register_netdev(ve->_loopback_dev);
++ if (err) {
++ if (ve->_loopback_dev->priv != NULL)
++ kfree(ve->_loopback_dev->priv);
++ free_netdev(ve->_loopback_dev);
++ }
++ return err;
++}
++
++static void fini_ve_netdev(void)
++{
++ struct ve_struct *ve;
++ struct net_device *dev;
++
++ ve = get_exec_env();
++ while (1) {
++ rtnl_lock();
++ /*
++ * loopback is special, it can be referenced in fib's,
++		 * so it must be freed last. Doing so is
++ * sufficient to guarantee absence of such references.
++ */
++ if (dev_base == ve->_loopback_dev)
++ dev = dev_base->next;
++ else
++ dev = dev_base;
++ if (dev == NULL)
++ break;
++ unregister_netdevice(dev);
++ rtnl_unlock();
++ free_netdev(dev);
++ }
++ unregister_netdevice(ve->_loopback_dev);
++ rtnl_unlock();
++ free_netdev(ve->_loopback_dev);
++ ve->_loopback_dev = NULL;
++}
++#else
++#define init_ve_mibs(ve) (0)
++#define fini_ve_mibs(ve) do { } while (0)
++#define init_ve_netdev() (0)
++#define fini_ve_netdev() do { } while (0)
++#endif
++
++static int prepare_proc_root(struct ve_struct *ve)
++{
++ struct proc_dir_entry *de;
++
++ de = kmalloc(sizeof(struct proc_dir_entry) + 6, GFP_KERNEL);
++ if (de == NULL)
++ return -ENOMEM;
++ memset(de, 0, sizeof(struct proc_dir_entry));
++ memcpy(de + 1, "/proc", 6);
++ de->name = (char *)(de + 1);
++ de->namelen = 5;
++ de->mode = S_IFDIR | S_IRUGO | S_IXUGO;
++ de->nlink = 2;
++ atomic_set(&de->count, 1);
++
++ ve->proc_root = de;
++ return 0;
++}
++
++#ifdef CONFIG_PROC_FS
++static int init_ve_proc(struct ve_struct *ve)
++{
++ int err;
++ struct proc_dir_entry *de;
++
++ err = prepare_proc_root(ve);
++ if (err)
++ goto out_root;
++
++ err = register_ve_fs_type(ve, &proc_fs_type,
++ &ve->proc_fstype, &ve->proc_mnt);
++ if (err)
++ goto out_reg;
++
++ /* create necessary /proc subdirs in VE local proc tree */
++ err = -ENOMEM;
++ de = create_proc_entry("vz", S_IFDIR|S_IRUGO|S_IXUGO, NULL);
++ if (!de)
++ goto out_vz;
++
++#ifdef CONFIG_VE_IPTABLES
++ proc_net = proc_mkdir("net", NULL);
++ if (!proc_net)
++ goto out_net;
++#endif
++
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ if (ve_snmp_proc_init())
++ goto out_snmp;
++#endif
++
++ return 0;
++
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++out_snmp:
++ remove_proc_entry("net", NULL);
++#endif
++#ifdef CONFIG_VE_IPTABLES
++out_net:
++ remove_proc_entry("vz", NULL);
++#endif
++out_vz:
++ unregister_ve_fs_type(ve->proc_fstype, ve->proc_mnt);
++ ve->proc_mnt = NULL;
++out_reg:
++ /* proc_fstype and proc_root are freed in real_put_ve -> free_ve_proc */
++ ;
++out_root:
++ return err;
++}
++
++static void fini_ve_proc(struct ve_struct *ve)
++{
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ ve_snmp_proc_fini();
++#endif
++#ifdef CONFIG_VE_IPTABLES
++ remove_proc_entry("net", NULL);
++ proc_net = NULL;
++#endif
++ remove_proc_entry("vz", NULL);
++ unregister_ve_fs_type(ve->proc_fstype, ve->proc_mnt);
++ ve->proc_mnt = NULL;
++}
++
++static void free_ve_proc(struct ve_struct *ve)
++{
++ /* proc filesystem frees proc_dir_entries on remove_proc_entry() only,
++ so we check that everything was removed and not lost */
++ if (ve->proc_root && ve->proc_root->subdir) {
++ struct proc_dir_entry *p = ve->proc_root;
++ printk(KERN_WARNING "VPS: %d: proc entry /proc", ve->veid);
++ while ((p = p->subdir) != NULL)
++ printk("/%s", p->name);
++ printk(" is not removed!\n");
++ }
++
++ kfree(ve->proc_root);
++ kfree(ve->proc_fstype);
++
++ ve->proc_fstype = NULL;
++ ve->proc_root = NULL;
++}
++#else
++#define init_ve_proc(ve) (0)
++#define fini_ve_proc(ve) do { } while (0)
++#define free_ve_proc(ve) do { } while (0)
++#endif
++
++#ifdef CONFIG_SYSCTL
++static int init_ve_sysctl(struct ve_struct *ve)
++{
++ int err;
++
++#ifdef CONFIG_PROC_FS
++ err = -ENOMEM;
++ ve->proc_sys_root = proc_mkdir("sys", 0);
++ if (ve->proc_sys_root == NULL)
++ goto out_proc;
++#endif
++ INIT_LIST_HEAD(&ve->sysctl_lh);
++ err = register_ve_sysctltables(ve);
++ if (err)
++ goto out_reg;
++
++ err = devinet_sysctl_init(ve);
++ if (err)
++ goto out_dev;
++
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ err = addrconf_sysctl_init(ve);
++ if (err)
++ goto out_dev6;
++#endif
++
++ return 0;
++
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++out_dev6:
++ devinet_sysctl_fini(ve);
++#endif
++out_dev:
++ unregister_ve_sysctltables(ve);
++ free_ve_sysctltables(ve);
++out_reg:
++#ifdef CONFIG_PROC_FS
++ remove_proc_entry("sys", NULL);
++out_proc:
++#endif
++ return err;
++}
++
++static void fini_ve_sysctl(struct ve_struct *ve)
++{
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ addrconf_sysctl_fini(ve);
++#endif
++ devinet_sysctl_fini(ve);
++ unregister_ve_sysctltables(ve);
++ remove_proc_entry("sys", NULL);
++}
++
++static void free_ve_sysctl(struct ve_struct *ve)
++{
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ addrconf_sysctl_free(ve);
++#endif
++ devinet_sysctl_free(ve);
++ free_ve_sysctltables(ve);
++}
++#else
++#define init_ve_sysctl(ve) (0)
++#define fini_ve_sysctl(ve) do { } while (0)
++#define free_ve_sysctl(ve) do { } while (0)
++#endif
++
++#ifdef CONFIG_UNIX98_PTYS
++#include <linux/devpts_fs.h>
++
++static int init_ve_devpts(struct ve_struct *ve)
++{
++ int err;
++
++ err = -ENOMEM;
++ ve->devpts_config = kmalloc(sizeof(struct devpts_config), GFP_KERNEL);
++ if (ve->devpts_config == NULL)
++ goto out;
++ memset(ve->devpts_config, 0, sizeof(struct devpts_config));
++ ve->devpts_config->mode = 0600;
++ err = register_ve_fs_type(ve, &devpts_fs_type,
++ &ve->devpts_fstype, &ve->devpts_mnt);
++ if (err) {
++ kfree(ve->devpts_config);
++ ve->devpts_config = NULL;
++ }
++out:
++ return err;
++}
++
++static void fini_ve_devpts(struct ve_struct *ve)
++{
++ unregister_ve_fs_type(ve->devpts_fstype, ve->devpts_mnt);
++ /* devpts_fstype is freed in real_put_ve -> free_ve_filesystems */
++ ve->devpts_mnt = NULL;
++ kfree(ve->devpts_config);
++ ve->devpts_config = NULL;
++}
++#else
++#define init_ve_devpts(ve) (0)
++#define fini_ve_devpts(ve) do { } while (0)
++#endif
++
++static int init_ve_shmem(struct ve_struct *ve)
++{
++ return register_ve_fs_type(ve,
++ &tmpfs_fs_type,
++ &ve->shmem_fstype,
++ &ve->shmem_mnt);
++}
++
++static void fini_ve_shmem(struct ve_struct *ve)
++{
++ unregister_ve_fs_type(ve->shmem_fstype, ve->shmem_mnt);
++ /* shmem_fstype is freed in real_put_ve -> free_ve_filesystems */
++ ve->shmem_mnt = NULL;
++}
++
++static inline int init_ve_sysfs_root(struct ve_struct *ve)
++{
++ struct sysfs_dirent *sysfs_root;
++
++ sysfs_root = kmalloc(sizeof(struct sysfs_dirent), GFP_KERNEL);
++ if (sysfs_root == NULL)
++ return -ENOMEM;
++
++ memset(sysfs_root, 0, sizeof(struct sysfs_dirent));
++ INIT_LIST_HEAD(&sysfs_root->s_sibling);
++ INIT_LIST_HEAD(&sysfs_root->s_children);
++ sysfs_root->s_type = SYSFS_ROOT;
++ ve->sysfs_root = sysfs_root;
++ return 0;
++}
++
++static int init_ve_sysfs(struct ve_struct *ve)
++{
++ struct subsystem *subsys;
++ struct class *nc;
++ int err;
++ extern struct subsystem class_obj_subsys;
++ extern struct subsystem class_subsys;
++ extern struct class net_class;
++
++#ifdef CONFIG_SYSFS
++ err = 0;
++ if (ve->features & VE_FEATURE_SYSFS) {
++ err = init_ve_sysfs_root(ve);
++ if (err != 0)
++ goto out;
++ err = register_ve_fs_type(ve,
++ &sysfs_fs_type,
++ &ve->sysfs_fstype,
++ &ve->sysfs_mnt);
++ }
++ if (err != 0)
++ goto out_fs_type;
++#endif
++ err = -ENOMEM;
++ subsys = kmalloc(sizeof(*subsys), GFP_KERNEL);
++ if (subsys == NULL)
++ goto out_class_obj;
++ /* ick, this is ugly, the things we go through to keep from showing up
++ * in sysfs... */
++ memset(subsys, 0, sizeof(*subsys));
++ memcpy(&subsys->kset.kobj.name, &class_obj_subsys.kset.kobj.name,
++ sizeof(subsys->kset.kobj.name));
++ subsys->kset.ktype = class_obj_subsys.kset.ktype;
++ subsys->kset.uevent_ops = class_obj_subsys.kset.uevent_ops;
++ subsystem_init(subsys);
++ if (!subsys->kset.subsys)
++ subsys->kset.subsys = subsys;
++ ve->class_obj_subsys = subsys;
++
++ err = -ENOMEM;
++ subsys = kmalloc(sizeof(*subsys), GFP_KERNEL);
++ if (subsys == NULL)
++ goto out_class_subsys;
++ /* ick, this is ugly, the things we go through to keep from showing up
++ * in sysfs... */
++ memset(subsys, 0, sizeof(*subsys));
++ memcpy(&subsys->kset.kobj.name, &class_subsys.kset.kobj.name,
++ sizeof(subsys->kset.kobj.name));
++ subsys->kset.ktype = class_subsys.kset.ktype;
++ subsys->kset.uevent_ops = class_subsys.kset.uevent_ops;
++ ve->class_subsys = subsys;
++ err = subsystem_register(subsys);
++ if (err != 0)
++ goto out_register;
++
++ err = -ENOMEM;
++ nc = kmalloc(sizeof(*nc), GFP_KERNEL);
++ if (nc == NULL)
++ goto out_nc;
++ memset(nc, 0, sizeof(*nc));
++ nc->name = net_class.name;
++ nc->release = net_class.release;
++ nc->uevent = net_class.uevent;
++ err = class_register(nc);
++ if (err != 0)
++ goto out_class_register;
++ ve->net_class = nc;
++
++ return err;
++
++out_class_register:
++ kfree(nc);
++out_nc:
++ subsystem_unregister(subsys);
++out_register:
++ kfree(ve->class_subsys);
++out_class_subsys:
++ kfree(ve->class_obj_subsys);
++out_class_obj:
++#ifdef CONFIG_SYSFS
++ unregister_ve_fs_type(ve->sysfs_fstype, ve->sysfs_mnt);
++ /* sysfs_fstype is freed in real_put_ve -> free_ve_filesystems */
++out_fs_type:
++ kfree(ve->sysfs_root);
++ ve->sysfs_root = NULL;
++#endif
++ ve->class_subsys = NULL;
++ ve->class_obj_subsys = NULL;
++out:
++ return err;
++}
++
++static void fini_ve_sysfs(struct ve_struct *ve)
++{
++ class_unregister(ve->net_class);
++ subsystem_unregister(ve->class_subsys);
++
++ kfree(ve->net_class);
++ kfree(ve->class_subsys);
++ kfree(ve->class_obj_subsys);
++
++ ve->net_class = NULL;
++ ve->class_subsys = NULL;
++ ve->class_obj_subsys = NULL;
++#ifdef CONFIG_SYSFS
++ unregister_ve_fs_type(ve->sysfs_fstype, ve->sysfs_mnt);
++ ve->sysfs_mnt = NULL;
++ kfree(ve->sysfs_root);
++ ve->sysfs_root = NULL;
++ /* sysfs_fstype is freed in real_put_ve -> free_ve_filesystems */
++#endif
++}
++
++static void free_ve_filesystems(struct ve_struct *ve)
++{
++#ifdef CONFIG_SYSFS
++ kfree(ve->sysfs_fstype);
++ ve->sysfs_fstype = NULL;
++#endif
++ kfree(ve->shmem_fstype);
++ ve->shmem_fstype = NULL;
++
++ kfree(ve->devpts_fstype);
++ ve->devpts_fstype = NULL;
++
++ free_ve_proc(ve);
++}
++
++static int init_printk(struct ve_struct *ve)
++{
++ struct ve_prep_printk {
++ wait_queue_head_t log_wait;
++ unsigned long log_start;
++ unsigned long log_end;
++ unsigned long logged_chars;
++ } *tmp;
++
++ tmp = kmalloc(sizeof(struct ve_prep_printk), GFP_KERNEL);
++ if (!tmp)
++ return -ENOMEM;
++ memset(tmp, 0, sizeof(struct ve_prep_printk));
++ init_waitqueue_head(&tmp->log_wait);
++ ve->_log_wait = &tmp->log_wait;
++ ve->_log_start = &tmp->log_start;
++ ve->_log_end = &tmp->log_end;
++ ve->_logged_chars = &tmp->logged_chars;
++ /* ve->log_buf will be initialized later by ve_log_init() */
++ return 0;
++}
++
++static void fini_printk(struct ve_struct *ve)
++{
++ /*
++ * there is no spinlock protection here because nobody can use
++	 * log_buf at the moment when this code is called.
++ */
++ kfree(ve->log_buf);
++ kfree(ve->_log_wait);
++}
++
++static void fini_venet(struct ve_struct *ve)
++{
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++ tcp_v4_kill_ve_sockets(ve);
++#endif
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++ ve_mapped_devs_cleanup(ve);
++#endif
++}
++
++static int init_ve_sched(struct ve_struct *ve)
++{
++#ifdef CONFIG_FAIRSCHED
++ int err;
++
++ /*
++ * We refuse to switch to an already existing node since nodes
++ * keep a pointer to their ve_struct...
++ */
++ err = sys_fairsched_mknod(0, 1, ve->veid);
++ if (err < 0) {
++ printk(KERN_WARNING "Can't create fairsched node %d\n",
++ ve->veid);
++ return err;
++ }
++ err = sys_fairsched_mvpr(current->pid, ve->veid);
++ if (err) {
++ printk(KERN_WARNING "Can't switch to fairsched node %d\n",
++ ve->veid);
++ if (sys_fairsched_rmnod(ve->veid))
++ printk(KERN_ERR "Can't clean fairsched node %d\n",
++ ve->veid);
++ return err;
++ }
++#endif
++ ve_sched_attach(ve);
++ return 0;
++}
++
++static void fini_ve_sched(struct ve_struct *ve)
++{
++#ifdef CONFIG_FAIRSCHED
++ if (task_vsched_id(current) == ve->veid)
++ if (sys_fairsched_mvpr(current->pid, fairsched_init_node.id))
++ printk(KERN_WARNING "Can't leave fairsched node %d\n",
++ ve->veid);
++ if (sys_fairsched_rmnod(ve->veid))
++ printk(KERN_ERR "Can't remove fairsched node %d\n",
++ ve->veid);
++#endif
++}
++
++static int init_ve_struct(struct ve_struct *ve, envid_t veid,
++ u32 class_id, env_create_param_t *data,
++ struct task_struct *init_tsk)
++{
++ int n;
++
++ memset(ve, 0, sizeof(*ve));
++ (void)get_ve(ve);
++ ve->veid = veid;
++ ve->class_id = class_id;
++ ve->init_entry = init_tsk;
++ ve->features = data->feature_mask;
++ INIT_LIST_HEAD(&ve->vetask_lh);
++ init_rwsem(&ve->op_sem);
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++ ve->ifindex = -1;
++#endif
++
++ for(n = 0; n < UIDHASH_SZ_VE; ++n)
++ INIT_LIST_HEAD(&ve->uidhash_table[n]);
++
++ do_posix_clock_monotonic_gettime(&ve->start_timespec);
++ ve->start_jiffies = jiffies;
++ ve->start_cycles = get_cycles();
++ ve->virt_pids = glob_virt_pids;
++
++ return 0;
++}
++
++/**********************************************************************
++ **********************************************************************
++ *
++ * /proc/meminfo virtualization
++ *
++ **********************************************************************
++ **********************************************************************/
++static int ve_set_meminfo(envid_t veid, unsigned long val)
++{
++ struct ve_struct *ve;
++
++ ve = get_ve_by_id(veid);
++ if (!ve)
++ return -EINVAL;
++
++ ve->meminfo_val = val;
++ real_put_ve(ve);
++ return 0;
++}
++
++static int init_ve_meminfo(struct ve_struct *ve)
++{
++ ve->meminfo_val = 0;
++ return 0;
++}
++
++static inline void fini_ve_meminfo(struct ve_struct *ve)
++{
++}
++
++static void set_ve_root(struct ve_struct *ve, struct task_struct *tsk)
++{
++ read_lock(&tsk->fs->lock);
++ ve->fs_rootmnt = tsk->fs->rootmnt;
++ ve->fs_root = tsk->fs->root;
++ read_unlock(&tsk->fs->lock);
++ mark_tree_virtual(ve->fs_rootmnt, ve->fs_root);
++}
++
++static void set_ve_caps(struct ve_struct *ve, struct task_struct *tsk)
++{
++ /* required for real_setdevperms from register_ve_<fs> above */
++ memcpy(&ve->cap_default, &tsk->cap_effective, sizeof(kernel_cap_t));
++ cap_lower(ve->cap_default, CAP_SETVEID);
++}
++
++static int ve_list_add(struct ve_struct *ve)
++{
++ write_lock_irq(&ve_list_guard);
++ if (__find_ve_by_id(ve->veid) != NULL)
++ goto err_exists;
++
++ ve->prev = NULL;
++ ve->next = ve_list_head;
++ if (ve_list_head)
++ ve_list_head->prev = ve;
++ ve_list_head = ve;
++ nr_ve++;
++ write_unlock_irq(&ve_list_guard);
++ return 0;
++
++err_exists:
++ write_unlock_irq(&ve_list_guard);
++ return -EEXIST;
++}
++
++static void ve_list_del(struct ve_struct *ve)
++{
++ write_lock_irq(&ve_list_guard);
++ if (ve->prev)
++ ve->prev->next = ve->next;
++ else
++ ve_list_head = ve->next;
++ if (ve->next)
++ ve->next->prev = ve->prev;
++ nr_ve--;
++ write_unlock_irq(&ve_list_guard);
++}
++
++static void set_task_ve_caps(struct task_struct *tsk, struct ve_struct *ve)
++{
++ spin_lock(&task_capability_lock);
++ cap_mask(tsk->cap_effective, ve->cap_default);
++ cap_mask(tsk->cap_inheritable, ve->cap_default);
++ cap_mask(tsk->cap_permitted, ve->cap_default);
++ spin_unlock(&task_capability_lock);
++}
++
++static void move_task(struct task_struct *tsk, struct ve_struct *new,
++ struct ve_struct *old)
++{
++	/* this prohibits ptracing of a task that entered the VPS from the host system */
++ tsk->mm->vps_dumpable = 0;
++ /* setup capabilities before enter */
++ set_task_ve_caps(tsk, new);
++
++ write_lock_irq(&tasklist_lock);
++ VE_TASK_INFO(tsk)->owner_env = new;
++ VE_TASK_INFO(tsk)->exec_env = new;
++ REMOVE_VE_LINKS(tsk);
++ SET_VE_LINKS(tsk);
++
++ atomic_dec(&old->pcounter);
++ atomic_inc(&new->pcounter);
++ real_put_ve(old);
++ get_ve(new);
++ write_unlock_irq(&tasklist_lock);
++}
++
++#ifdef CONFIG_VE_IPTABLES
++extern int init_netfilter(void);
++extern void fini_netfilter(void);
++#define init_ve_netfilter() init_netfilter()
++#define fini_ve_netfilter() fini_netfilter()
++
++#define KSYMIPTINIT(mask, ve, full_mask, mod, name, args) \
++({ \
++ int ret = 0; \
++ if (VE_IPT_CMP(mask, full_mask) && \
++ VE_IPT_CMP((ve)->_iptables_modules, \
++ full_mask & ~(full_mask##_MOD))) { \
++ ret = KSYMERRCALL(1, mod, name, args); \
++ if (ret == 0) \
++ (ve)->_iptables_modules |= \
++ full_mask##_MOD; \
++ if (ret == 1) \
++ ret = 0; \
++ } \
++ ret; \
++})
++
++#define KSYMIPTFINI(mask, full_mask, mod, name, args) \
++({ \
++ if (VE_IPT_CMP(mask, full_mask##_MOD)) \
++ KSYMSAFECALL_VOID(mod, name, args); \
++})
++
++
++static int do_ve_iptables(struct ve_struct *ve, __u64 init_mask,
++ int init_or_cleanup)
++{
++ int err;
++
++ err = 0;
++ if (!init_or_cleanup)
++ goto cleanup;
++
++ /* init part */
++#if defined(CONFIG_NETFILTER_XTABLES) || \
++ defined(CONFIG_NETFILTER_XTABLES_MODULE)
++ err = KSYMIPTINIT(init_mask, ve, VE_IP_IPTABLES,
++ x_tables, init_xtables, ());
++ if (err < 0)
++ goto err_xtables;
++ err = KSYMIPTINIT(init_mask, ve, VE_IP_IPTABLES,
++ xt_tcpudp, init_xt_tcpudp, ());
++ if (err < 0)
++ goto err_xt_tcpudp;
++#endif
++#if defined(CONFIG_IP_NF_IPTABLES) || \
++ defined(CONFIG_IP_NF_IPTABLES_MODULE)
++ err = KSYMIPTINIT(init_mask, ve, VE_IP_IPTABLES,
++ ip_tables, init_iptables, ());
++ if (err < 0)
++ goto err_iptables;
++#endif
++#if defined(CONFIG_IP6_NF_IPTABLES) || \
++ defined(CONFIG_IP6_NF_IPTABLES_MODULE)
++ err = KSYMIPTINIT(init_mask, ve, VE_IP_IPTABLES,
++ ip6_tables, init_ip6tables, ());
++ if (err < 0)
++ goto err_ip6tables;
++#endif
++#if defined(CONFIG_IP_NF_CONNTRACK) || \
++ defined(CONFIG_IP_NF_CONNTRACK_MODULE)
++ err = KSYMIPTINIT(init_mask, ve, VE_IP_CONNTRACK,
++ ip_conntrack, init_iptable_conntrack, ());
++ if (err < 0)
++ goto err_iptable_conntrack;
++#endif
++#if defined(CONFIG_IP_NF_FTP) || \
++ defined(CONFIG_IP_NF_FTP_MODULE)
++ err = KSYMIPTINIT(init_mask, ve, VE_IP_CONNTRACK_FTP,
++ ip_conntrack_ftp, init_iptable_ftp, ());
++ if (err < 0)
++ goto err_iptable_ftp;
++#endif
++#if defined(CONFIG_IP_NF_IRC) || \
++ defined(CONFIG_IP_NF_IRC_MODULE)
++ err = KSYMIPTINIT(init_mask, ve, VE_IP_CONNTRACK_IRC,
++ ip_conntrack_irc, init_iptable_irc, ());
++ if (err < 0)
++ goto err_iptable_irc;
++#endif
++#if defined(CONFIG_NETFILTER_XT_MATCH_CONNTRACK) || \
++ defined(CONFIG_NETFILTER_XT_MATCH_CONNTRACK_MODULE)
++ err = KSYMIPTINIT(init_mask, ve, VE_IP_MATCH_CONNTRACK,
++ xt_conntrack, init_xt_conntrack_match, ());
++ if (err < 0)
++ goto err_xt_conntrack_match;
++#endif
++#if defined(CONFIG_NETFILTER_XT_MATCH_STATE) || \
++ defined(CONFIG_NETFILTER_XT_MATCH_STATE_MODULE)
++ err = KSYMIPTINIT(init_mask, ve, VE_IP_MATCH_STATE,
++ xt_state, init_xt_state, ());
++ if (err < 0)
++ goto err_xt_state;
++#endif
++#if defined(CONFIG_NETFILTER_XT_MATCH_HELPER) || \
++ defined(CONFIG_NETFILTER_XT_MATCH_HELPER_MODULE)
++ err = KSYMIPTINIT(init_mask, ve, VE_IP_MATCH_HELPER,
++ xt_helper, init_xt_helper, ());
++ if (err < 0)
++ goto err_xt_helper;
++#endif
++#if defined(CONFIG_IP_NF_NAT) || \
++ defined(CONFIG_IP_NF_NAT_MODULE)
++ err = KSYMIPTINIT(init_mask, ve, VE_IP_NAT,
++ ip_nat, ip_nat_init, ());
++ if (err < 0)
++ goto err_iptable_nat;
++ err = KSYMIPTINIT(init_mask, ve, VE_IP_NAT,
++ iptable_nat, init_iptable_nat, ());
++ if (err < 0)
++ goto err_iptable_nat2;
++#endif
++#if defined(CONFIG_IP_NF_NAT_FTP) || \
++ defined(CONFIG_IP_NF_NAT_FTP_MODULE)
++ err = KSYMIPTINIT(init_mask, ve, VE_IP_NAT_FTP,
++ ip_nat_ftp, init_iptable_nat_ftp, ());
++ if (err < 0)
++ goto err_iptable_nat_ftp;
++#endif
++#if defined(CONFIG_IP_NF_NAT_IRC) || \
++ defined(CONFIG_IP_NF_NAT_IRC_MODULE)
++ err = KSYMIPTINIT(init_mask, ve, VE_IP_NAT_IRC,
++ ip_nat_irc, init_iptable_nat_irc, ());
++ if (err < 0)
++ goto err_iptable_nat_irc;
++#endif
++#if defined(CONFIG_IP_NF_FILTER) || \
++ defined(CONFIG_IP_NF_FILTER_MODULE)
++ err = KSYMIPTINIT(init_mask, ve, VE_IP_FILTER,
++ iptable_filter, init_iptable_filter, ());
++ if (err < 0)
++ goto err_iptable_filter;
++#endif
++#if defined(CONFIG_IP6_NF_FILTER) || \
++ defined(CONFIG_IP6_NF_FILTER_MODULE)
++ err = KSYMIPTINIT(init_mask, ve, VE_IP_FILTER,
++ ip6table_filter, init_ip6table_filter, ());
++ if (err < 0)
++ goto err_ip6table_filter;
++#endif
++#if defined(CONFIG_IP_NF_MANGLE) || \
++ defined(CONFIG_IP_NF_MANGLE_MODULE)
++ err = KSYMIPTINIT(init_mask, ve, VE_IP_MANGLE,
++ iptable_mangle, init_iptable_mangle, ());
++ if (err < 0)
++ goto err_iptable_mangle;
++#endif
++#if defined(CONFIG_IP6_NF_MANGLE) || \
++ defined(CONFIG_IP6_NF_MANGLE_MODULE)
++ err = KSYMIPTINIT(init_mask, ve, VE_IP_MANGLE,
++ ip6table_mangle, init_ip6table_mangle, ());
++ if (err < 0)
++ goto err_ip6table_mangle;
++#endif
++#if defined(CONFIG_NETFILTER_XT_MATCH_LIMIT) || \
++ defined(CONFIG_NETFILTER_XT_MATCH_LIMIT_MODULE)
++ err = KSYMIPTINIT(init_mask, ve, VE_IP_MATCH_LIMIT,
++ xt_limit, init_xt_limit, ());
++ if (err < 0)
++ goto err_xt_limit;
++#endif
++#if defined(CONFIG_IP_NF_MATCH_MULTIPORT) || \
++ defined(CONFIG_IP_NF_MATCH_MULTIPORT_MODULE)
++ err = KSYMIPTINIT(init_mask, ve, VE_IP_MATCH_MULTIPORT,
++ ipt_multiport, init_iptable_multiport, ());
++ if (err < 0)
++ goto err_iptable_multiport;
++#endif
++#if defined(CONFIG_IP6_NF_MATCH_MULTIPORT) || \
++ defined(CONFIG_IP6_NF_MATCH_MULTIPORT_MODULE)
++ err = KSYMIPTINIT(init_mask, ve, VE_IP_MATCH_MULTIPORT,
++ ip6t_multiport, init_ip6table_multiport, ());
++ if (err < 0)
++ goto err_ip6table_multiport;
++#endif
++#if defined(CONFIG_IP_NF_MATCH_TOS) || \
++ defined(CONFIG_IP_NF_MATCH_TOS_MODULE)
++ err = KSYMIPTINIT(init_mask, ve, VE_IP_MATCH_TOS,
++ ipt_tos, init_iptable_tos, ());
++ if (err < 0)
++ goto err_iptable_tos;
++#endif
++#if defined(CONFIG_IP_NF_TARGET_TOS) || \
++ defined(CONFIG_IP_NF_TARGET_TOS_MODULE)
++ err = KSYMIPTINIT(init_mask, ve, VE_IP_TARGET_TOS,
++ ipt_TOS, init_iptable_TOS, ());
++ if (err < 0)
++ goto err_iptable_TOS;
++#endif
++#if defined(CONFIG_IP_NF_TARGET_REJECT) || \
++ defined(CONFIG_IP_NF_TARGET_REJECT_MODULE)
++ err = KSYMIPTINIT(init_mask, ve, VE_IP_TARGET_REJECT,
++ ipt_REJECT, init_iptable_REJECT, ());
++ if (err < 0)
++ goto err_iptable_REJECT;
++#endif
++#if defined(CONFIG_IP6_NF_TARGET_REJECT) || \
++ defined(CONFIG_IP6_NF_TARGET_REJECT_MODULE)
++ err = KSYMIPTINIT(init_mask, ve, VE_IP_TARGET_REJECT,
++ ip6t_REJECT, init_ip6table_REJECT, ());
++ if (err < 0)
++ goto err_ip6table_REJECT;
++#endif
++#if defined(CONFIG_IP_NF_TARGET_TCPMSS) || \
++ defined(CONFIG_IP_NF_TARGET_TCPMSS_MODULE)
++ err = KSYMIPTINIT(init_mask, ve, VE_IP_TARGET_TCPMSS,
++ ipt_TCPMSS, init_iptable_TCPMSS, ());
++ if (err < 0)
++ goto err_iptable_TCPMSS;
++#endif
++#if defined(CONFIG_NETFILTER_XT_MATCH_TCPMSS) || \
++ defined(CONFIG_NETFILTER_XT_MATCH_TCPMSS_MODULE)
++ err = KSYMIPTINIT(init_mask, ve, VE_IP_MATCH_TCPMSS,
++ xt_tcpmss, init_xt_tcpmss, ());
++ if (err < 0)
++ goto err_xt_tcpmss;
++#endif
++#if defined(CONFIG_IP_NF_MATCH_TTL) || \
++ defined(CONFIG_IP_NF_MATCH_TTL_MODULE)
++ err = KSYMIPTINIT(init_mask, ve, VE_IP_MATCH_TTL,
++ ipt_ttl, init_iptable_ttl, ());
++ if (err < 0)
++ goto err_iptable_ttl;
++#endif
++#if defined(CONFIG_IP_NF_TARGET_LOG) || \
++ defined(CONFIG_IP_NF_TARGET_LOG_MODULE)
++ err = KSYMIPTINIT(init_mask, ve, VE_IP_TARGET_LOG,
++ ipt_LOG, init_iptable_LOG, ());
++ if (err < 0)
++ goto err_iptable_LOG;
++#endif
++#if defined(CONFIG_IP6_NF_TARGET_LOG) || \
++ defined(CONFIG_IP6_NF_TARGET_LOG_MODULE)
++ err = KSYMIPTINIT(init_mask, ve, VE_IP_TARGET_LOG,
++ ip6t_LOG, init_ip6table_LOG, ());
++ if (err < 0)
++ goto err_ip6table_LOG;
++#endif
++#if defined(CONFIG_NETFILTER_XT_MATCH_LENGTH) || \
++ defined(CONFIG_NETFILTER_XT_MATCH_LENGTH_MODULE)
++ err = KSYMIPTINIT(init_mask, ve, VE_IP_MATCH_LENGTH,
++ xt_length, init_xt_length, ());
++ if (err < 0)
++ goto err_xt_length;
++#endif
++#if defined(CONFIG_IP_NF_TARGET_REDIRECT) || \
++ defined(CONFIG_IP_NF_TARGET_REDIRECT_MODULE)
++ err = KSYMIPTINIT(init_mask, ve, VE_IP_TARGET_REDIRECT,
++ ipt_REDIRECT, init_iptable_REDIRECT, ());
++ if (err < 0)
++ goto err_iptable_REDIRECT;
++#endif
++ return 0;
++
++/* ------------------------------------------------------------------------- */
++
++cleanup:
++#if defined(CONFIG_IP_NF_TARGET_REDIRECT) || \
++ defined(CONFIG_IP_NF_TARGET_REDIRECT_MODULE)
++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_TARGET_REDIRECT,
++ ipt_REDIRECT, fini_iptable_REDIRECT, ());
++err_iptable_REDIRECT:
++#endif
++#if defined(CONFIG_NETFILTER_XT_MATCH_LENGTH) || \
++ defined(CONFIG_NETFILTER_XT_MATCH_LENGTH_MODULE)
++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_MATCH_LENGTH,
++ xt_length, fini_xt_length, ());
++err_xt_length:
++#endif
++#if defined(CONFIG_IP6_NF_TARGET_LOG) || \
++ defined(CONFIG_IP6_NF_TARGET_LOG_MODULE)
++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_TARGET_LOG,
++ ip6t_LOG, fini_ip6table_LOG, ());
++err_ip6table_LOG:
++#endif
++#if defined(CONFIG_IP_NF_TARGET_LOG) || \
++ defined(CONFIG_IP_NF_TARGET_LOG_MODULE)
++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_TARGET_LOG,
++ ipt_LOG, fini_iptable_LOG, ());
++err_iptable_LOG:
++#endif
++#if defined(CONFIG_IP_NF_MATCH_TTL) || \
++ defined(CONFIG_IP_NF_MATCH_TTL_MODULE)
++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_MATCH_TTL,
++ ipt_ttl, fini_iptable_ttl, ());
++err_iptable_ttl:
++#endif
++#if defined(CONFIG_NETFILTER_XT_MATCH_TCPMSS) || \
++ defined(CONFIG_NETFILTER_XT_MATCH_TCPMSS_MODULE)
++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_MATCH_TCPMSS,
++ xt_tcpmss, fini_xt_tcpmss, ());
++err_xt_tcpmss:
++#endif
++#if defined(CONFIG_IP_NF_TARGET_TCPMSS) || \
++ defined(CONFIG_IP_NF_TARGET_TCPMSS_MODULE)
++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_TARGET_TCPMSS,
++ ipt_TCPMSS, fini_iptable_TCPMSS, ());
++err_iptable_TCPMSS:
++#endif
++#if defined(CONFIG_IP6_NF_TARGET_REJECT) || \
++ defined(CONFIG_IP6_NF_TARGET_REJECT_MODULE)
++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_TARGET_REJECT,
++ ip6t_REJECT, fini_ip6table_REJECT, ());
++err_ip6table_REJECT:
++#endif
++#if defined(CONFIG_IP_NF_TARGET_REJECT) || \
++ defined(CONFIG_IP_NF_TARGET_REJECT_MODULE)
++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_TARGET_REJECT,
++ ipt_REJECT, fini_iptable_REJECT, ());
++err_iptable_REJECT:
++#endif
++#if defined(CONFIG_IP_NF_TARGET_TOS) || \
++ defined(CONFIG_IP_NF_TARGET_TOS_MODULE)
++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_TARGET_TOS,
++ ipt_TOS, fini_iptable_TOS, ());
++err_iptable_TOS:
++#endif
++#if defined(CONFIG_IP_NF_MATCH_TOS) || \
++ defined(CONFIG_IP_NF_MATCH_TOS_MODULE)
++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_MATCH_TOS,
++ ipt_tos, fini_iptable_tos, ());
++err_iptable_tos:
++#endif
++#if defined(CONFIG_IP6_NF_MATCH_MULTIPORT) || \
++ defined(CONFIG_IP6_NF_MATCH_MULTIPORT_MODULE)
++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_MATCH_MULTIPORT,
++ ip6t_multiport, fini_ip6table_multiport, ());
++err_ip6table_multiport:
++#endif
++#if defined(CONFIG_IP_NF_MATCH_MULTIPORT) || \
++ defined(CONFIG_IP_NF_MATCH_MULTIPORT_MODULE)
++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_MATCH_MULTIPORT,
++ ipt_multiport, fini_iptable_multiport, ());
++err_iptable_multiport:
++#endif
++#if defined(CONFIG_NETFILTER_XT_MATCH_LIMIT) || \
++ defined(CONFIG_NETFILTER_XT_MATCH_LIMIT_MODULE)
++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_MATCH_LIMIT,
++ xt_limit, fini_xt_limit, ());
++err_xt_limit:
++#endif
++#if defined(CONFIG_IP6_NF_MANGLE) || \
++ defined(CONFIG_IP6_NF_MANGLE_MODULE)
++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_MANGLE,
++ ip6table_mangle, fini_ip6table_mangle, ());
++err_ip6table_mangle:
++#endif
++#if defined(CONFIG_IP_NF_MANGLE) || \
++ defined(CONFIG_IP_NF_MANGLE_MODULE)
++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_MANGLE,
++ iptable_mangle, fini_iptable_mangle, ());
++err_iptable_mangle:
++#endif
++#if defined(CONFIG_IP6_NF_FILTER) || \
++ defined(CONFIG_IP6_NF_FILTER_MODULE)
++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_FILTER,
++ ip6table_filter, fini_ip6table_filter, ());
++err_ip6table_filter:
++#endif
++#if defined(CONFIG_IP_NF_FILTER) || \
++ defined(CONFIG_IP_NF_FILTER_MODULE)
++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_FILTER,
++ iptable_filter, fini_iptable_filter, ());
++err_iptable_filter:
++#endif
++#if defined(CONFIG_IP_NF_NAT_IRC) || \
++ defined(CONFIG_IP_NF_NAT_IRC_MODULE)
++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_NAT_IRC,
++ ip_nat_irc, fini_iptable_nat_irc, ());
++err_iptable_nat_irc:
++#endif
++#if defined(CONFIG_IP_NF_NAT_FTP) || \
++ defined(CONFIG_IP_NF_NAT_FTP_MODULE)
++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_NAT_FTP,
++ ip_nat_ftp, fini_iptable_nat_ftp, ());
++err_iptable_nat_ftp:
++#endif
++#if defined(CONFIG_IP_NF_NAT) || \
++ defined(CONFIG_IP_NF_NAT_MODULE)
++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_NAT,
++ iptable_nat, fini_iptable_nat, ());
++err_iptable_nat2:
++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_NAT,
++ ip_nat, ip_nat_cleanup, ());
++err_iptable_nat:
++#endif
++#if defined(CONFIG_NETFILTER_XT_MATCH_HELPER) || \
++ defined(CONFIG_NETFILTER_XT_MATCH_HELPER_MODULE)
++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_MATCH_HELPER,
++ xt_helper, fini_xt_helper, ());
++err_xt_helper:
++#endif
++#if defined(CONFIG_NETFILTER_XT_MATCH_STATE) || \
++ defined(CONFIG_NETFILTER_XT_MATCH_STATE_MODULE)
++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_MATCH_STATE,
++ xt_state, fini_xt_state, ());
++err_xt_state:
++#endif
++#if defined(CONFIG_NETFILTER_XT_MATCH_CONNTRACK) || \
++ defined(CONFIG_NETFILTER_XT_MATCH_CONNTRACK_MODULE)
++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_MATCH_CONNTRACK,
++ xt_conntrack, fini_xt_conntrack_match, ());
++err_xt_conntrack_match:
++#endif
++#if defined(CONFIG_IP_NF_IRC) || \
++ defined(CONFIG_IP_NF_IRC_MODULE)
++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_CONNTRACK_IRC,
++ ip_conntrack_irc, fini_iptable_irc, ());
++err_iptable_irc:
++#endif
++#if defined(CONFIG_IP_NF_FTP) || \
++ defined(CONFIG_IP_NF_FTP_MODULE)
++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_CONNTRACK_FTP,
++ ip_conntrack_ftp, fini_iptable_ftp, ());
++err_iptable_ftp:
++#endif
++#if defined(CONFIG_IP_NF_CONNTRACK) || \
++ defined(CONFIG_IP_NF_CONNTRACK_MODULE)
++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_CONNTRACK,
++ ip_conntrack, fini_iptable_conntrack, ());
++err_iptable_conntrack:
++#endif
++#if defined(CONFIG_IP6_NF_IPTABLES) || \
++ defined(CONFIG_IP6_NF_IPTABLES_MODULE)
++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_IPTABLES,
++ ip6_tables, fini_ip6tables, ());
++err_ip6tables:
++#endif
++#if defined(CONFIG_IP_NF_IPTABLES) || \
++ defined(CONFIG_IP_NF_IPTABLES_MODULE)
++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_IPTABLES,
++ ip_tables, fini_iptables, ());
++err_iptables:
++#endif
++#if defined(CONFIG_NETFILTER_XTABLES) || \
++ defined(CONFIG_NETFILTER_XTABLES_MODULE)
++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_IPTABLES,
++ xt_tcpudp, fini_xt_tcpudp, ());
++err_xt_tcpudp:
++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_IPTABLES,
++ x_tables, fini_xtables, ());
++err_xtables:
++#endif
++ ve->_iptables_modules = 0;
++
++ return err;
++}
++
++static inline int init_ve_iptables(struct ve_struct *ve, __u64 init_mask)
++{
++ return do_ve_iptables(ve, init_mask, 1);
++}
++
++static inline void fini_ve_iptables(struct ve_struct *ve, __u64 init_mask)
++{
++ (void)do_ve_iptables(ve, init_mask, 0);
++}
++
++static void flush_ve_iptables(struct ve_struct *ve)
++{
++ /*
++ * flush all rule tables first,
++ * this helps us to avoid refs to freed objs
++ */
++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_MANGLE, ip_tables,
++ ipt_flush_table, (ve->_ipt_mangle_table));
++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_MANGLE, ip6_tables,
++ ip6t_flush_table, (ve->_ip6t_mangle_table));
++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_FILTER, ip_tables,
++ ipt_flush_table, (ve->_ve_ipt_filter_pf));
++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_FILTER, ip6_tables,
++ ip6t_flush_table, (ve->_ve_ip6t_filter_pf));
++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_NAT, ip_tables,
++ ipt_flush_table, (ve->_ip_conntrack->_ip_nat_table));
++}
++#else
++#define init_ve_iptables(x, y) (0)
++#define fini_ve_iptables(x, y) do { } while (0)
++#define flush_ve_iptables(x) do { } while (0)
++#define init_ve_netfilter() (0)
++#define fini_ve_netfilter() do { } while (0)
++#endif
++
++static struct list_head ve_hooks[VE_MAX_HOOKS];
++static DECLARE_RWSEM(ve_hook_sem);
++
++int ve_hook_register(struct ve_hook *vh)
++{
++ struct list_head *lh;
++ struct ve_hook *tmp;
++
++ down_write(&ve_hook_sem);
++ list_for_each(lh, &ve_hooks[vh->hooknum]) {
++ tmp = list_entry(lh, struct ve_hook, list);
++ if (vh->priority < tmp->priority)
++ break;
++ }
++ list_add_tail(&vh->list, lh);
++ up_write(&ve_hook_sem);
++ return 0;
++}
++EXPORT_SYMBOL(ve_hook_register);
++
++void ve_hook_unregister(struct ve_hook *vh)
++{
++ down_write(&ve_hook_sem);
++ list_del(&vh->list);
++ up_write(&ve_hook_sem);
++}
++EXPORT_SYMBOL(ve_hook_unregister);
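ve_hook_register() inserts a hook before the first entry with a larger priority value, so hooks with lower priority values run earlier, and ve_hook_iterate() below undoes the hooks that already ran when a later one fails. A hedged usage sketch of a hypothetical consumer of this API; the field names are the ones these functions dereference, but the full struct ve_hook definition lives in a header outside this hunk:

#include <linux/module.h>

/* Hypothetical example; all names below are illustrative. */
static int my_ve_start(unsigned int hooknum, void *data)
{
	/* data is the struct ve_struct being started; set up per-VE state.
	 * A non-zero return aborts VE start and triggers the undo of hooks
	 * that already ran. */
	return 0;
}

static void my_ve_start_undo(unsigned int hooknum, void *data)
{
	/* roll back whatever my_ve_start() set up */
}

static struct ve_hook my_start_hook = {
	.hook		= my_ve_start,
	.undo		= my_ve_start_undo,
	.owner		= THIS_MODULE,
	.priority	= 100,		/* lower values run earlier */
	.hooknum	= VE_HOOK_INIT,	/* VE stop would use a separate
					 * hook registered on VE_HOOK_FINI */
};

/* module init:  ve_hook_register(&my_start_hook);
 * module exit:  ve_hook_unregister(&my_start_hook); */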
++
++static int ve_hook_iterate(unsigned int hooknum, void *data)
++{
++ struct ve_hook *vh;
++ int err;
++
++ err = 0;
++ down_read(&ve_hook_sem);
++ list_for_each_entry(vh, &ve_hooks[hooknum], list) {
++ if (!try_module_get(vh->owner))
++ continue;
++ err = vh->hook(hooknum, data);
++ module_put(vh->owner);
++ if (err)
++ break;
++ }
++
++ if (err) {
++ list_for_each_entry_continue_reverse(vh,
++ &ve_hooks[hooknum], list) {
++ if (!try_module_get(vh->owner))
++ continue;
++ if (vh->undo)
++ vh->undo(hooknum, data);
++ module_put(vh->owner);
++ }
++ }
++ up_read(&ve_hook_sem);
++ return err;
++}
++
++static void ve_hook_iterate_cleanup(unsigned int hooknum, void *data)
++{
++ struct ve_hook *vh;
++
++ down_read(&ve_hook_sem);
++ list_for_each_entry_reverse(vh, &ve_hooks[hooknum], list) {
++ if (!try_module_get(vh->owner))
++ continue;
++ (void)vh->hook(hooknum, data);
++ module_put(vh->owner);
++ }
++ up_read(&ve_hook_sem);
++}
++
++static int do_env_create(envid_t veid, unsigned int flags, u32 class_id,
++ env_create_param_t *data, int datalen)
++{
++ struct task_struct *tsk;
++ struct ve_struct *old;
++ struct ve_struct *old_exec;
++ struct ve_struct *ve;
++ __u64 init_mask;
++ int err;
++
++ tsk = current;
++ old = VE_TASK_INFO(tsk)->owner_env;
++
++ if (!thread_group_leader(tsk))
++ return -EINVAL;
++
++ if (tsk->signal->tty) {
++ printk("ERR: VE init has controlling terminal\n");
++ return -EINVAL;
++ }
++ if (tsk->signal->pgrp != tsk->pid || tsk->signal->session != tsk->pid) {
++ int may_setsid;
++ read_lock(&tasklist_lock);
++ may_setsid = (find_pid(PIDTYPE_PGID, tsk->pid) == NULL);
++ read_unlock(&tasklist_lock);
++ if (!may_setsid) {
++ printk("ERR: VE init is process group leader\n");
++ return -EINVAL;
++ }
++ }
++
++
++ VZTRACE("%s: veid=%d classid=%d pid=%d\n",
++ __FUNCTION__, veid, class_id, current->pid);
++
++ err = -ENOMEM;
++ ve = kmalloc(sizeof(struct ve_struct), GFP_KERNEL);
++ if (ve == NULL)
++ goto err_struct;
++
++ init_ve_struct(ve, veid, class_id, data, tsk);
++ __module_get(THIS_MODULE);
++ down_write(&ve->op_sem);
++ if (flags & VE_LOCK)
++ ve->is_locked = 1;
++ if ((err = ve_list_add(ve)) < 0)
++ goto err_exist;
++
++ /* this should be done before context switching */
++ if ((err = init_printk(ve)) < 0)
++ goto err_log_wait;
++
++ old_exec = set_exec_env(ve);
++
++ if ((err = init_ve_sched(ve)) < 0)
++ goto err_sched;
++
++ /* move user to VE */
++ if ((err = set_user(0, 0)) < 0)
++ goto err_set_user;
++
++ set_ve_root(ve, tsk);
++
++ if ((err = init_ve_utsname(ve)))
++ goto err_utsname;
++
++ if ((err = init_ve_mibs(ve)))
++ goto err_mibs;
++
++ if ((err = init_ve_proc(ve)))
++ goto err_proc;
++
++ if ((err = init_ve_sysctl(ve)))
++ goto err_sysctl;
++
++ if ((err = init_ve_sysfs(ve)))
++ goto err_sysfs;
++
++ if ((err = ve_arp_init(ve)) < 0)
++ goto err_route;
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ if ((err = ve_ndisc_init(ve)) < 0)
++ goto err_route;
++#endif
++
++ if ((err = init_ve_route(ve)) < 0)
++ goto err_route;
++
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ if ((err = init_ve_route6(ve)) < 0)
++ goto err_route;
++#endif
++
++ if ((err = init_ve_netdev()))
++ goto err_dev;
++
++ if ((err = init_ve_tty_drivers(ve)) < 0)
++ goto err_tty;
++
++ if ((err = init_ve_shmem(ve)))
++ goto err_shmem;
++
++ if ((err = init_ve_devpts(ve)))
++ goto err_devpts;
++
++ if((err = init_ve_meminfo(ve)))
++ goto err_meminf;
++
++ /* init SYSV IPC variables */
++ if ((err = init_ve_ipc(ve)) < 0)
++ goto err_ipc;
++
++ set_ve_caps(ve, tsk);
++
++ /* It is safe to initialize netfilter here as routing initialization and
++ interface setup will be done below. This means that NO skb can be
++ passed inside. Den */
++ /* iptables ve initialization for non ve0;
++ ve0 init is in module_init */
++ if ((err = init_ve_netfilter()) < 0)
++ goto err_netfilter;
++
++ init_mask = data ? data->iptables_mask : VE_IP_DEFAULT;
++ if ((err = init_ve_iptables(ve, init_mask)) < 0)
++ goto err_iptables;
++
++ if ((err = alloc_vpid(tsk->pid, 1)) < 0)
++ goto err_vpid;
++
++ if ((err = ve_hook_iterate(VE_HOOK_INIT, (void *)ve)) < 0)
++ goto err_ve_hook;
++
++ /* finally: set vpids and move inside */
++ move_task(tsk, ve, old);
++
++ set_virt_pid(tsk, 1);
++ set_virt_tgid(tsk, 1);
++
++ set_special_pids(tsk->pid, tsk->pid);
++ current->signal->tty_old_pgrp = 0;
++ set_virt_pgid(tsk, 1);
++ set_virt_sid(tsk, 1);
++
++ ve->is_running = 1;
++ up_write(&ve->op_sem);
++
++ printk(KERN_INFO "VPS: %d: started\n", veid);
++ return veid;
++
++err_ve_hook:
++ free_vpid(1, ve);
++err_vpid:
++ fini_venet(ve);
++ fini_ve_iptables(ve, init_mask);
++err_iptables:
++ fini_ve_netfilter();
++err_netfilter:
++ fini_ve_ipc(ve);
++err_ipc:
++ fini_ve_meminfo(ve);
++err_meminf:
++ fini_ve_devpts(ve);
++err_devpts:
++ fini_ve_shmem(ve);
++err_shmem:
++ fini_ve_tty_drivers(ve);
++err_tty:
++ fini_ve_netdev();
++err_dev:
++ fini_ve_route(ve);
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ fini_ve_route6(ve);
++#endif
++err_route:
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ ve_ndisc_fini(ve);
++#endif
++ ve_arp_fini(ve);
++ fini_ve_sysfs(ve);
++err_sysfs:
++ fini_ve_sysctl(ve);
++err_sysctl:
++ fini_ve_proc(ve);
++err_proc:
++ do_clean_devperms(ve->veid); /* register procfs adds devperms */
++ fini_ve_mibs(ve);
++err_mibs:
++ /* free_ve_utsname() is called inside real_put_ve() */ ;
++err_utsname:
++ /* It is safe to restore current->envid here because
++ * ve_fairsched_detach does not use current->envid. */
++	/* Actually, the fairsched code uses current->envid only in
++	 * sys_fairsched_mknod.  That is correct when sys_fairsched_mknod is
++	 * called from userspace.  If sys_fairsched_mknod is called from
++	 * ve_fairsched_attach, then node->envid and node->parent_node->envid
++	 * are explicitly set to valid values after the call. */
++ /* FIXME */
++ VE_TASK_INFO(tsk)->owner_env = old;
++ VE_TASK_INFO(tsk)->exec_env = old_exec;
++ /* move user back */
++ if (set_user(0, 0) < 0)
++ printk(KERN_WARNING"Can't restore UID\n");
++
++err_set_user:
++ fini_ve_sched(ve);
++err_sched:
++ (void)set_exec_env(old_exec);
++
++ /* we can jump here having incorrect envid */
++ VE_TASK_INFO(tsk)->owner_env = old;
++ fini_printk(ve);
++err_log_wait:
++ ve_list_del(ve);
++ up_write(&ve->op_sem);
++
++ real_put_ve(ve);
++err_struct:
++ printk(KERN_INFO "VPS: %d: failed to start with err=%d\n", veid, err);
++ return err;
++
++err_exist:
++ kfree(ve);
++ goto err_struct;
++}
++
++
++/**********************************************************************
++ **********************************************************************
++ *
++ * VE start/stop callbacks
++ *
++ **********************************************************************
++ **********************************************************************/
++
++int real_env_create(envid_t veid, unsigned flags, u32 class_id,
++ env_create_param_t *data, int datalen)
++{
++ int status;
++ struct ve_struct *ve;
++
++ if (!flags) {
++ status = get_exec_env()->veid;
++ goto out;
++ }
++
++ status = -EPERM;
++ if (!capable(CAP_SETVEID))
++ goto out;
++
++ status = -EINVAL;
++ if ((flags & VE_TEST) && (flags & (VE_ENTER|VE_CREATE)))
++ goto out;
++
++ status = -EINVAL;
++ ve = get_ve_by_id(veid);
++ if (ve) {
++ if (flags & VE_TEST) {
++ status = 0;
++ goto out_put;
++ }
++ if (flags & VE_EXCLUSIVE) {
++ status = -EACCES;
++ goto out_put;
++ }
++ if (flags & VE_CREATE) {
++ flags &= ~VE_CREATE;
++ flags |= VE_ENTER;
++ }
++ } else {
++ if (flags & (VE_TEST|VE_ENTER)) {
++ status = -ESRCH;
++ goto out;
++ }
++ }
++
++ if (flags & VE_CREATE) {
++ status = do_env_create(veid, flags, class_id, data, datalen);
++ goto out;
++ } else if (flags & VE_ENTER)
++ status = do_env_enter(ve, flags);
++
++ /* else: returning EINVAL */
++
++out_put:
++ real_put_ve(ve);
++out:
++ return status;
++}
++
++static int do_env_enter(struct ve_struct *ve, unsigned int flags)
++{
++ struct task_struct *tsk = current;
++ int err;
++
++ VZTRACE("%s: veid=%d\n", __FUNCTION__, ve->veid);
++
++ err = -EBUSY;
++ down_read(&ve->op_sem);
++ if (!ve->is_running)
++ goto out_up;
++ if (ve->is_locked && !(flags & VE_SKIPLOCK))
++ goto out_up;
++
++#ifdef CONFIG_FAIRSCHED
++ err = sys_fairsched_mvpr(current->pid, ve->veid);
++ if (err)
++ goto out_up;
++#endif
++
++ ve_sched_attach(ve);
++ move_task(current, ve, VE_TASK_INFO(tsk)->owner_env);
++ err = VE_TASK_INFO(tsk)->owner_env->veid;
++
++out_up:
++ up_read(&ve->op_sem);
++ return err;
++}
++
++static void env_cleanup(struct ve_struct *ve)
++{
++ struct ve_struct *old_ve;
++
++ VZTRACE("real_do_env_cleanup\n");
++
++ down_read(&ve->op_sem);
++ old_ve = set_exec_env(ve);
++
++ ve_hook_iterate_cleanup(VE_HOOK_FINI, (void *)ve);
++
++ fini_venet(ve);
++
++ /* no new packets in flight beyond this point */
++ synchronize_net();
++	/* skbs hold dst_entry references, and such skbs may sit in the IP fragment queue */
++ ip_fragment_cleanup(ve);
++
++ fini_ve_netdev();
++ fini_ve_route(ve);
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ fini_ve_route6(ve);
++#endif
++ ve_arp_fini(ve);
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ ve_ndisc_fini(ve);
++#endif
++
++ /* kill iptables */
++	/* No skb belonging to this VE can exist at this point, since
++	   unregister_netdev waits until ALL such skbs are gone */
++ flush_ve_iptables(ve);
++ fini_ve_iptables(ve, ve->_iptables_modules);
++ fini_ve_netfilter();
++
++ ve_ipc_cleanup();
++
++ fini_ve_sched(ve);
++ do_clean_devperms(ve->veid);
++
++ fini_ve_devpts(ve);
++ fini_ve_shmem(ve);
++ fini_ve_sysfs(ve);
++ unregister_ve_tty_drivers(ve);
++ fini_ve_sysctl(ve);
++ fini_ve_proc(ve);
++ fini_ve_meminfo(ve);
++
++ fini_ve_mibs(ve);
++
++ (void)set_exec_env(old_ve);
++ fini_printk(ve); /* no printk can happen in ve context anymore */
++
++ ve_list_del(ve);
++ up_read(&ve->op_sem);
++
++ real_put_ve(ve);
++}
++
++static struct list_head ve_cleanup_list;
++static spinlock_t ve_cleanup_lock;
++
++static DECLARE_COMPLETION(vzmond_complete);
++static struct task_struct *vzmond_thread;
++static volatile int stop_vzmond;
++
++void real_do_env_cleanup(struct ve_struct *ve)
++{
++ spin_lock(&ve_cleanup_lock);
++ list_add_tail(&ve->cleanup_list, &ve_cleanup_list);
++ spin_unlock(&ve_cleanup_lock);
++ wake_up_process(vzmond_thread);
++}
++
++static void do_pending_env_cleanups(void)
++{
++ struct ve_struct *ve;
++
++ spin_lock(&ve_cleanup_lock);
++ while (1) {
++ if (list_empty(&ve_cleanup_list) || need_resched())
++ break;
++ ve = list_entry(ve_cleanup_list.next, struct ve_struct,
++ cleanup_list);
++ list_del(&ve->cleanup_list);
++ spin_unlock(&ve_cleanup_lock);
++ env_cleanup(ve);
++ spin_lock(&ve_cleanup_lock);
++ }
++ spin_unlock(&ve_cleanup_lock);
++}
++
++static int have_pending_cleanups(void)
++{
++ return !list_empty(&ve_cleanup_list);
++}
++
++static int vzmond(void *arg)
++{
++ daemonize("vzmond");
++ vzmond_thread = current;
++ set_current_state(TASK_INTERRUPTIBLE);
++
++ while (!stop_vzmond) {
++ schedule();
++ try_to_freeze();
++ if (signal_pending(current))
++ flush_signals(current);
++
++ do_pending_env_cleanups();
++ set_current_state(TASK_INTERRUPTIBLE);
++ if (have_pending_cleanups())
++ __set_current_state(TASK_RUNNING);
++ }
++
++ __set_task_state(current, TASK_RUNNING);
++ complete_and_exit(&vzmond_complete, 0);
++}
++
++static int __init init_vzmond(void)
++{
++ INIT_LIST_HEAD(&ve_cleanup_list);
++ spin_lock_init(&ve_cleanup_lock);
++ stop_vzmond = 0;
++ return kernel_thread(vzmond, NULL, 0);
++}
++
++static void fini_vzmond(void)
++{
++ stop_vzmond = 1;
++ wake_up_process(vzmond_thread);
++ wait_for_completion(&vzmond_complete);
++ WARN_ON(!list_empty(&ve_cleanup_list));
++}
++
++void real_do_env_free(struct ve_struct *ve)
++{
++ VZTRACE("real_do_env_free\n");
++
++ ve_ipc_free(ve); /* free SYSV IPC resources */
++ free_ve_tty_drivers(ve);
++ free_ve_utsname(ve);
++ free_ve_sysctl(ve); /* free per ve sysctl data */
++ free_ve_filesystems(ve);
++ printk(KERN_INFO "VPS: %d: stopped\n", VEID(ve));
++ kfree(ve);
++
++ module_put(THIS_MODULE);
++}
++EXPORT_SYMBOL(real_do_env_free);
++
++
++/**********************************************************************
++ **********************************************************************
++ *
++ * VE TTY handling
++ *
++ **********************************************************************
++ **********************************************************************/
++
++DCL_VE_OWNER(TTYDRV, struct tty_driver, owner_env)
++
++static struct tty_driver *alloc_ve_tty_driver(struct tty_driver *base,
++ struct ve_struct *ve)
++{
++ size_t size;
++ struct tty_driver *driver;
++
++ driver = kmalloc(sizeof(struct tty_driver), GFP_KERNEL);
++ if (!driver)
++ goto out;
++
++ memcpy(driver, base, sizeof(struct tty_driver));
++
++ driver->driver_state = NULL;
++
++ size = base->num * 3 * sizeof(void *);
++ if (!(driver->flags & TTY_DRIVER_DEVPTS_MEM)) {
++ void **p;
++ p = kmalloc(size, GFP_KERNEL);
++ if (!p)
++ goto out_free;
++ memset(p, 0, size);
++ driver->ttys = (struct tty_struct **)p;
++ driver->termios = (struct termios **)(p + driver->num);
++ driver->termios_locked = (struct termios **)(p + driver->num * 2);
++ } else {
++ driver->ttys = NULL;
++ driver->termios = NULL;
++ driver->termios_locked = NULL;
++ }
++
++ SET_VE_OWNER_TTYDRV(driver, ve);
++ driver->flags |= TTY_DRIVER_INSTALLED;
++
++ return driver;
++
++out_free:
++ kfree(driver);
++out:
++ return NULL;
++}
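alloc_ve_tty_driver() clones the base driver and, unless the driver uses devpts memory, carves its ttys, termios and termios_locked tables out of a single allocation of num * 3 pointers. A small standalone sketch of that slicing, with userspace malloc standing in for kmalloc and illustrative names:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	int num = 4;				/* driver->num */
	size_t size = num * 3 * sizeof(void *);
	void **p = malloc(size);		/* kmalloc in the kernel code */

	if (p == NULL)
		return 1;
	memset(p, 0, size);

	/* one allocation, three consecutive pointer tables */
	void **ttys           = p;		/* struct tty_struct ** */
	void **termios        = p + num;	/* struct termios **    */
	void **termios_locked = p + num * 2;	/* struct termios **    */

	printf("ttys=%p termios=%p locked=%p\n",
	       (void *)ttys, (void *)termios, (void *)termios_locked);

	free(p);	/* a single free releases all three tables */
	return 0;
}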
++
++static void free_ve_tty_driver(struct tty_driver *driver)
++{
++ if (!driver)
++ return;
++
++ clear_termios(driver);
++ kfree(driver->ttys);
++ kfree(driver);
++}
++
++static int alloc_ve_tty_drivers(struct ve_struct* ve)
++{
++#ifdef CONFIG_LEGACY_PTYS
++ /* Traditional BSD devices */
++ ve->pty_driver = alloc_ve_tty_driver(pty_driver, ve);
++ if (!ve->pty_driver)
++ goto out_mem;
++
++ ve->pty_slave_driver = alloc_ve_tty_driver(pty_slave_driver, ve);
++ if (!ve->pty_slave_driver)
++ goto out_mem;
++
++ ve->pty_driver->other = ve->pty_slave_driver;
++ ve->pty_slave_driver->other = ve->pty_driver;
++#endif
++
++#ifdef CONFIG_UNIX98_PTYS
++ ve->ptm_driver = alloc_ve_tty_driver(ptm_driver, ve);
++ if (!ve->ptm_driver)
++ goto out_mem;
++
++ ve->pts_driver = alloc_ve_tty_driver(pts_driver, ve);
++ if (!ve->pts_driver)
++ goto out_mem;
++
++ ve->ptm_driver->other = ve->pts_driver;
++ ve->pts_driver->other = ve->ptm_driver;
++
++ ve->allocated_ptys = kmalloc(sizeof(*ve->allocated_ptys), GFP_KERNEL);
++ if (!ve->allocated_ptys)
++ goto out_mem;
++ idr_init(ve->allocated_ptys);
++#endif
++ return 0;
++
++out_mem:
++ free_ve_tty_drivers(ve);
++ return -ENOMEM;
++}
++
++static void free_ve_tty_drivers(struct ve_struct* ve)
++{
++#ifdef CONFIG_LEGACY_PTYS
++ free_ve_tty_driver(ve->pty_driver);
++ free_ve_tty_driver(ve->pty_slave_driver);
++ ve->pty_driver = ve->pty_slave_driver = NULL;
++#endif
++#ifdef CONFIG_UNIX98_PTYS
++ free_ve_tty_driver(ve->ptm_driver);
++ free_ve_tty_driver(ve->pts_driver);
++ kfree(ve->allocated_ptys);
++ ve->ptm_driver = ve->pts_driver = NULL;
++ ve->allocated_ptys = NULL;
++#endif
++}
++
++static inline void __register_tty_driver(struct tty_driver *driver)
++{
++ list_add(&driver->tty_drivers, &tty_drivers);
++}
++
++static inline void __unregister_tty_driver(struct tty_driver *driver)
++{
++ if (!driver)
++ return;
++ list_del(&driver->tty_drivers);
++}
++
++static int register_ve_tty_drivers(struct ve_struct* ve)
++{
++ write_lock_irq(&tty_driver_guard);
++#ifdef CONFIG_UNIX98_PTYS
++ __register_tty_driver(ve->ptm_driver);
++ __register_tty_driver(ve->pts_driver);
++#endif
++#ifdef CONFIG_LEGACY_PTYS
++ __register_tty_driver(ve->pty_driver);
++ __register_tty_driver(ve->pty_slave_driver);
++#endif
++ write_unlock_irq(&tty_driver_guard);
++
++ return 0;
++}
++
++static void unregister_ve_tty_drivers(struct ve_struct* ve)
++{
++ VZTRACE("unregister_ve_tty_drivers\n");
++
++ write_lock_irq(&tty_driver_guard);
++ __unregister_tty_driver(ve->pty_driver);
++ __unregister_tty_driver(ve->pty_slave_driver);
++#ifdef CONFIG_UNIX98_PTYS
++ __unregister_tty_driver(ve->ptm_driver);
++ __unregister_tty_driver(ve->pts_driver);
++#endif
++ write_unlock_irq(&tty_driver_guard);
++}
++
++static int init_ve_tty_drivers(struct ve_struct *ve)
++{
++ int err;
++
++ if ((err = alloc_ve_tty_drivers(ve)))
++ goto err_ttyalloc;
++ if ((err = register_ve_tty_drivers(ve)))
++ goto err_ttyreg;
++ return 0;
++
++err_ttyreg:
++ free_ve_tty_drivers(ve);
++err_ttyalloc:
++ return err;
++}
++
++static void fini_ve_tty_drivers(struct ve_struct *ve)
++{
++ unregister_ve_tty_drivers(ve);
++ free_ve_tty_drivers(ve);
++}
++
++/*
++ * Free the termios and termios_locked structures because
++ * we don't want to get memory leaks when modular tty
++ * drivers are removed from the kernel.
++ */
++static void clear_termios(struct tty_driver *driver)
++{
++ int i;
++ struct termios *tp;
++
++ if (driver->termios == NULL)
++ return;
++ for (i = 0; i < driver->num; i++) {
++ tp = driver->termios[i];
++ if (tp) {
++ driver->termios[i] = NULL;
++ kfree(tp);
++ }
++ tp = driver->termios_locked[i];
++ if (tp) {
++ driver->termios_locked[i] = NULL;
++ kfree(tp);
++ }
++ }
++}
++
++
++/**********************************************************************
++ **********************************************************************
++ *
++ * Pieces of VE network
++ *
++ **********************************************************************
++ **********************************************************************/
++
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++#include <asm/uaccess.h>
++#include <net/sock.h>
++#include <linux/netlink.h>
++#include <linux/rtnetlink.h>
++#include <net/route.h>
++#include <net/ip_fib.h>
++#endif
++
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++static void ve_del_ip_addrs(struct net_device *dev)
++{
++ struct in_device *in_dev;
++
++ in_dev = in_dev_get(dev);
++ if (in_dev == NULL)
++ return;
++
++ while (in_dev->ifa_list != NULL) {
++ inet_del_ifa(in_dev, &in_dev->ifa_list, 1);
++ }
++ in_dev_put(in_dev);
++}
++
++static void ve_del_ipv6_addrs(struct net_device *dev)
++{
++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
++ addrconf_ifdown(dev, 2);
++#endif
++}
++
++static int ve_netdev_cleanup(struct net_device *dev, int to_ve)
++{
++ int err;
++
++ err = 0;
++ ve_del_ip_addrs(dev);
++ ve_del_ipv6_addrs(dev);
++ if ((dev->flags & IFF_UP) != 0)
++ err = dev_close(dev);
++ synchronize_net();
++ dev_shutdown(dev);
++ dev_mc_discard(dev);
++ free_divert_blk(dev);
++ synchronize_net();
++
++ if (to_ve)
++ dev->orig_mtu = dev->mtu;
++ else {
++ int rc = dev_set_mtu(dev, dev->orig_mtu);
++ if (err == 0)
++ err = rc;
++ }
++
++ return err;
++}
++
++static void __ve_dev_move(struct net_device *dev, struct ve_struct *ve_src,
++ struct ve_struct *ve_dst, struct user_beancounter *exec_ub)
++{
++ struct net_device **dp, *d;
++ struct user_beancounter *ub;
++
++ for (d = ve_src->_net_dev_base, dp = NULL; d != NULL;
++ dp = &d->next, d = d->next) {
++ if (d == dev) {
++ hlist_del(&dev->name_hlist);
++ hlist_del(&dev->index_hlist);
++ if (ve_src->_net_dev_tail == &dev->next)
++ ve_src->_net_dev_tail = dp;
++ if (dp)
++ *dp = dev->next;
++ dev->next = NULL;
++ break;
++ }
++ }
++ *ve_dst->_net_dev_tail = dev;
++ ve_dst->_net_dev_tail = &dev->next;
++ hlist_add_head(&dev->name_hlist, dev_name_hash(dev->name, ve_dst));
++ hlist_add_head(&dev->index_hlist, dev_index_hash(dev->ifindex, ve_dst));
++ dev->owner_env = ve_dst;
++
++ ub = netdev_bc(dev)->exec_ub;
++ netdev_bc(dev)->exec_ub = get_beancounter(exec_ub);
++ put_beancounter(ub);
++}
++
++static int ve_dev_add(envid_t veid, char *dev_name)
++{
++ int err;
++ struct net_device *dev;
++ struct ve_struct *ve;
++ struct hlist_node *p;
++
++ dev = NULL;
++ err = -ESRCH;
++
++ ve = get_ve_by_id(veid);
++ if (ve == NULL)
++ goto out;
++
++ rtnl_lock();
++
++ read_lock(&dev_base_lock);
++ hlist_for_each(p, dev_name_hash(dev_name, get_ve0())) {
++ struct net_device *d = hlist_entry(p, struct net_device,
++ name_hlist);
++ if (strncmp(d->name, dev_name, IFNAMSIZ) == 0) {
++ dev = d;
++ break;
++ }
++ }
++ read_unlock(&dev_base_lock);
++ if (dev == NULL)
++ goto out_unlock;
++
++ err = -EPERM;
++ if (!ve_is_dev_movable(dev))
++ goto out_unlock;
++
++ err = -EINVAL;
++ if (dev->flags & (IFF_SLAVE|IFF_MASTER))
++ goto out_unlock;
++
++ ve_netdev_cleanup(dev, 1);
++
++ write_lock_bh(&dev_base_lock);
++ __ve_dev_move(dev, get_ve0(), ve, get_exec_ub());
++ write_unlock_bh(&dev_base_lock);
++
++ err = 0;
++
++out_unlock:
++ rtnl_unlock();
++ real_put_ve(ve);
++
++ if (dev == NULL)
++ printk(KERN_WARNING "Device %s not found\n", dev_name);
++
++out:
++ return err;
++}
++
++static int ve_dev_del(envid_t veid, char *dev_name)
++{
++ int err;
++ struct net_device *dev;
++ struct ve_struct *ve, *old_exec;
++ struct hlist_node *p;
++
++ dev = NULL;
++ err = -ESRCH;
++
++ ve = get_ve_by_id(veid);
++ if (ve == NULL)
++ goto out;
++
++ rtnl_lock();
++
++ read_lock(&dev_base_lock);
++ hlist_for_each(p, dev_name_hash(dev_name, ve)) {
++ struct net_device *d = hlist_entry(p, struct net_device,
++ name_hlist);
++ if (strncmp(d->name, dev_name, IFNAMSIZ) == 0) {
++ dev = d;
++ break;
++ }
++ }
++ read_unlock(&dev_base_lock);
++ if (dev == NULL)
++ goto out_unlock;
++
++ err = -EPERM;
++ if (!ve_is_dev_movable(dev))
++ goto out_unlock;
++
++ old_exec = set_exec_env(ve);
++ ve_netdev_cleanup(dev, 0);
++ (void)set_exec_env(old_exec);
++
++ write_lock_bh(&dev_base_lock);
++ __ve_dev_move(dev, ve, get_ve0(), netdev_bc(dev)->owner_ub);
++ write_unlock_bh(&dev_base_lock);
++
++ err = 0;
++
++out_unlock:
++ rtnl_unlock();
++ real_put_ve(ve);
++
++ if (dev == NULL)
++ printk(KERN_WARNING "Device %s not found\n", dev_name);
++
++out:
++ return err;
++}
++
++int real_ve_dev_map(envid_t veid, int op, char *dev_name)
++{
++ int err;
++ err = -EPERM;
++ if (!capable(CAP_SETVEID))
++ goto out;
++ switch (op)
++ {
++ case VE_NETDEV_ADD:
++ err = ve_dev_add(veid, dev_name);
++ break;
++ case VE_NETDEV_DEL:
++ err = ve_dev_del(veid, dev_name);
++ break;
++ default:
++ err = -EINVAL;
++ break;
++ }
++out:
++ return err;
++}
++
++static void ve_mapped_devs_cleanup(struct ve_struct *ve)
++{
++ struct net_device *dev;
++
++ rtnl_lock();
++ write_lock_bh(&dev_base_lock);
++restart:
++ for (dev = ve->_net_dev_base; dev != NULL; dev = dev->next)
++ {
++ if ((dev->features & NETIF_F_VENET) ||
++ (dev == ve->_loopback_dev)) /* Skip loopback dev */
++ continue;
++ write_unlock_bh(&dev_base_lock);
++ ve_netdev_cleanup(dev, 0);
++ write_lock_bh(&dev_base_lock);
++ __ve_dev_move(dev, ve, get_ve0(), netdev_bc(dev)->owner_ub);
++ goto restart;
++ }
++ write_unlock_bh(&dev_base_lock);
++ rtnl_unlock();
++}
++#endif
++
++
++/**********************************************************************
++ **********************************************************************
++ *
++ * VE information via /proc
++ *
++ **********************************************************************
++ **********************************************************************/
++#ifdef CONFIG_PROC_FS
++static int devperms_seq_show(struct seq_file *m, void *v)
++{
++ struct devperms_struct *dp;
++ char dev_s[32], type_c;
++ unsigned use, type;
++ dev_t dev;
++
++ dp = (struct devperms_struct *)v;
++ if (dp == (struct devperms_struct *)1L) {
++ seq_printf(m, "Version: 2.7\n");
++ return 0;
++ }
++
++ use = dp->type & VE_USE_MASK;
++ type = dp->type & S_IFMT;
++ dev = dp->dev;
++
++ if ((use | VE_USE_MINOR) == use)
++ snprintf(dev_s, sizeof(dev_s), "%d:%d", MAJOR(dev), MINOR(dev));
++ else if ((use | VE_USE_MAJOR) == use)
++ snprintf(dev_s, sizeof(dev_s), "%d:*", MAJOR(dp->dev));
++ else
++ snprintf(dev_s, sizeof(dev_s), "*:*");
++
++ if (type == S_IFCHR)
++ type_c = 'c';
++ else if (type == S_IFBLK)
++ type_c = 'b';
++ else
++ type_c = '?';
++
++ seq_printf(m, "%10u %c %03o %s\n", dp->veid, type_c, dp->mask, dev_s);
++ return 0;
++}
++
++static void *devperms_seq_start(struct seq_file *m, loff_t *pos)
++{
++ loff_t cpos;
++ long slot;
++ struct devperms_struct *dp;
++
++ cpos = *pos;
++ read_lock(&devperms_hash_guard);
++ if (cpos-- == 0)
++ return (void *)1L;
++
++ for (slot = 0; slot < DEVPERMS_HASH_SZ; slot++)
++ for (dp = devperms_hash[slot]; dp; dp = dp->devhash_next)
++ if (cpos-- == 0) {
++ m->private = (void *)slot;
++ return dp;
++ }
++ return NULL;
++}
++
++static void *devperms_seq_next(struct seq_file *m, void *v, loff_t *pos)
++{
++ long slot;
++ struct devperms_struct *dp;
++
++ dp = (struct devperms_struct *)v;
++
++ if (dp == (struct devperms_struct *)1L)
++ slot = 0;
++ else if (dp->devhash_next == NULL)
++ slot = (long)m->private + 1;
++ else {
++ (*pos)++;
++ return dp->devhash_next;
++ }
++
++ for (; slot < DEVPERMS_HASH_SZ; slot++)
++ if (devperms_hash[slot]) {
++ (*pos)++;
++ m->private = (void *)slot;
++ return devperms_hash[slot];
++ }
++ return NULL;
++}
++
++static void devperms_seq_stop(struct seq_file *m, void *v)
++{
++ read_unlock(&devperms_hash_guard);
++}
++
++static struct seq_operations devperms_seq_op = {
++ .start = devperms_seq_start,
++ .next = devperms_seq_next,
++ .stop = devperms_seq_stop,
++ .show = devperms_seq_show,
++};
++
++static int devperms_open(struct inode *inode, struct file *file)
++{
++ return seq_open(file, &devperms_seq_op);
++}
++
++static struct file_operations proc_devperms_ops = {
++ .open = devperms_open,
++ .read = seq_read,
++ .llseek = seq_lseek,
++ .release = seq_release,
++};
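devperms_seq_show() prints a version header and then one record per permission entry: the VEID in a 10-column field, 'c' or 'b' for the device type, the access mask in octal, and a major:minor string where '*' replaces the component the entry does not restrict (the `(use | VE_USE_MINOR) == use` test simply checks that the VE_USE_MINOR bit is set in use). Purely illustrative output — the VEIDs and device numbers below are invented:

Version: 2.7
       101 c 620 4:1
       101 b 660 8:*
         0 c 006 *:*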
++
++#if BITS_PER_LONG == 32
++#define VESTAT_LINE_WIDTH (6 * 11 + 6 * 21)
++#define VESTAT_LINE_FMT "%10u %10lu %10lu %10lu %10lu %20Lu %20Lu %20Lu %20Lu %20Lu %20Lu %10lu\n"
++#define VESTAT_HEAD_FMT "%10s %10s %10s %10s %10s %20s %20s %20s %20s %20s %20s %10s\n"
++#else
++#define VESTAT_LINE_WIDTH (12 * 21)
++#define VESTAT_LINE_FMT "%20u %20lu %20lu %20lu %20lu %20Lu %20Lu %20Lu %20Lu %20Lu %20Lu %20lu\n"
++#define VESTAT_HEAD_FMT "%20s %20s %20s %20s %20s %20s %20s %20s %20s %20s %20s %20s\n"
++#endif
++
++static int vestat_seq_show(struct seq_file *m, void *v)
++{
++ struct ve_struct *ve = (struct ve_struct *)v;
++ struct ve_struct *curve;
++ int cpu;
++ unsigned long user_ve, nice_ve, system_ve, uptime;
++ cycles_t uptime_cycles, idle_time, strv_time, used;
++
++ curve = get_exec_env();
++ if (ve == ve_list_head ||
++ (!ve_is_super(curve) && ve == curve)) {
++ /* print header */
++ seq_printf(m, "%-*s\n",
++ VESTAT_LINE_WIDTH - 1,
++ "Version: 2.2");
++ seq_printf(m, VESTAT_HEAD_FMT, "VEID",
++ "user", "nice", "system",
++ "uptime", "idle",
++ "strv", "uptime", "used",
++ "maxlat", "totlat", "numsched");
++ }
++
++ if (ve == get_ve0())
++ return 0;
++
++ user_ve = nice_ve = system_ve = 0;
++ idle_time = strv_time = used = 0;
++
++ for (cpu = 0; cpu < NR_CPUS; cpu++) {
++ struct ve_cpu_stats *st;
++
++ st = VE_CPU_STATS(ve, cpu);
++ user_ve += st->user;
++ nice_ve += st->nice;
++ system_ve += st->system;
++ used += VE_CPU_STATS(ve, cpu)->used_time;
++ idle_time += ve_sched_get_idle_time(ve, cpu);
++ }
++ uptime_cycles = get_cycles() - ve->start_cycles;
++ uptime = jiffies - ve->start_jiffies;
++
++ seq_printf(m, VESTAT_LINE_FMT, ve->veid,
++ user_ve, nice_ve, system_ve,
++ uptime, idle_time,
++ strv_time, uptime_cycles, used,
++ ve->sched_lat_ve.last.maxlat,
++ ve->sched_lat_ve.last.totlat,
++ ve->sched_lat_ve.last.count);
++ return 0;
++}
++
++static void *ve_seq_start(struct seq_file *m, loff_t *pos)
++{
++ struct ve_struct *ve, *curve;
++ loff_t l;
++
++ curve = get_exec_env();
++ read_lock(&ve_list_guard);
++ if (!ve_is_super(curve)) {
++ if (*pos != 0)
++ return NULL;
++ return curve;
++ }
++ for (ve = ve_list_head, l = *pos;
++ ve != NULL && l > 0;
++ ve = ve->next, l--);
++ return ve;
++}
++
++static void *ve_seq_next(struct seq_file *m, void *v, loff_t *pos)
++{
++ struct ve_struct *ve = (struct ve_struct *)v;
++
++ if (!ve_is_super(get_exec_env()))
++ return NULL;
++ (*pos)++;
++ return ve->next;
++}
++
++static void ve_seq_stop(struct seq_file *m, void *v)
++{
++ read_unlock(&ve_list_guard);
++}
++
++static struct seq_operations vestat_seq_op = {
++ start: ve_seq_start,
++ next: ve_seq_next,
++ stop: ve_seq_stop,
++ show: vestat_seq_show
++};
++
++static int vestat_open(struct inode *inode, struct file *file)
++{
++ return seq_open(file, &vestat_seq_op);
++}
++
++static struct file_operations proc_vestat_operations = {
++ open: vestat_open,
++ read: seq_read,
++ llseek: seq_lseek,
++ release: seq_release
++};
++
++static inline unsigned long ve_used_mem(struct user_beancounter *ub)
++{
++ return ub->ub_parms[UB_OOMGUARPAGES].held;
++}
++
++static inline void ve_mi_replace(struct meminfo *mi)
++{
++ struct user_beancounter *ub;
++ unsigned long meminfo_val;
++ unsigned long nodettram;
++ unsigned long usedmem;
++
++ meminfo_val = get_exec_env()->meminfo_val;
++
++ if(!meminfo_val)
++ return; /* No virtualization */
++
++ nodettram = mi->si.totalram;
++ ub = current->mm->mm_ub;
++ usedmem = ve_used_mem(ub);
++
++ memset(mi, 0, sizeof(*mi));
++
++ mi->si.totalram = (meminfo_val > nodettram) ?
++ nodettram : meminfo_val;
++ mi->si.freeram = (mi->si.totalram > usedmem) ?
++ (mi->si.totalram - usedmem) : 0;
++}
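ve_mi_replace() virtualizes /proc/meminfo for a VE: totalram is clamped to the smaller of the configured meminfo_val and the host's totalram, and freeram becomes totalram minus the beancounter's held OOM-guarantee pages, never going below zero. A tiny standalone illustration of that arithmetic; the page counts are invented:

#include <stdio.h>

int main(void)
{
	unsigned long nodettram   = 262144;	/* host totalram, pages (made up) */
	unsigned long meminfo_val = 65536;	/* per-VE limit, pages (made up)  */
	unsigned long usedmem     = 20000;	/* UB_OOMGUARPAGES held (made up) */

	unsigned long totalram = meminfo_val > nodettram ? nodettram : meminfo_val;
	unsigned long freeram  = totalram > usedmem ? totalram - usedmem : 0;

	printf("totalram=%lu freeram=%lu\n", totalram, freeram);	/* 65536 45536 */
	return 0;
}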
++
++static int meminfo_call(struct vnotifier_block *self,
++ unsigned long event, void *arg, int old_ret)
++{
++ if (event != VIRTINFO_MEMINFO)
++ return old_ret;
++
++ ve_mi_replace((struct meminfo *)arg);
++
++ return NOTIFY_OK;
++}
++
++
++static struct vnotifier_block meminfo_notifier_block = {
++ .notifier_call = meminfo_call
++};
++
++static int __init init_vecalls_proc(void)
++{
++ struct proc_dir_entry *de;
++
++ de = create_proc_glob_entry("vz/vestat",
++ S_IFREG|S_IRUSR, NULL);
++ if (de == NULL) {
++		/* create the "vz" subdirectory if it does not exist yet */
++ (void) create_proc_glob_entry("vz",
++ S_IFDIR|S_IRUGO|S_IXUGO, NULL);
++ de = create_proc_glob_entry("vz/vestat",
++ S_IFREG|S_IRUSR, NULL);
++ }
++ if (de)
++ de->proc_fops = &proc_vestat_operations;
++ else
++ printk(KERN_WARNING
++ "VZMON: can't make vestat proc entry\n");
++
++ de = create_proc_entry("vz/devperms", S_IFREG | S_IRUSR, NULL);
++ if (de)
++ de->proc_fops = &proc_devperms_ops;
++ else
++ printk(KERN_WARNING
++ "VZMON: can't make devperms proc entry\n");
++
++ virtinfo_notifier_register(VITYPE_GENERAL, &meminfo_notifier_block);
++
++ return 0;
++}
++
++static void fini_vecalls_proc(void)
++{
++ remove_proc_entry("vz/devperms", NULL);
++ remove_proc_entry("vz/vestat", NULL);
++ virtinfo_notifier_unregister(VITYPE_GENERAL, &meminfo_notifier_block);
++}
++#else
++#define init_vecalls_proc() (0)
++#define fini_vecalls_proc() do { } while (0)
++#endif /* CONFIG_PROC_FS */
++
++
++/**********************************************************************
++ **********************************************************************
++ *
++ * User ctl
++ *
++ **********************************************************************
++ **********************************************************************/
++
++int vzcalls_ioctl(struct inode *, struct file *, unsigned int, unsigned long);
++static struct vzioctlinfo vzcalls = {
++ type: VZCTLTYPE,
++ func: vzcalls_ioctl,
++ owner: THIS_MODULE,
++};
++
++int vzcalls_ioctl(struct inode *ino, struct file *file, unsigned int cmd,
++ unsigned long arg)
++{
++ int err;
++
++ err = -ENOTTY;
++ switch(cmd) {
++ case VZCTL_MARK_ENV_TO_DOWN: {
++ /* Compatibility issue */
++ err = 0;
++ }
++ break;
++ case VZCTL_SETDEVPERMS: {
++ /* Device type was mistakenly declared as dev_t
++ * in the old user-kernel interface.
++ * That's wrong, dev_t is a kernel internal type.
++ * I use `unsigned' not having anything better in mind.
++ * 2001/08/11 SAW */
++ struct vzctl_setdevperms s;
++ err = -EFAULT;
++ if (copy_from_user(&s, (void *)arg, sizeof(s)))
++ break;
++ err = real_setdevperms(s.veid, s.type,
++ new_decode_dev(s.dev), s.mask);
++ }
++ break;
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++ case VZCTL_VE_NETDEV: {
++ struct vzctl_ve_netdev d;
++ char *s;
++ err = -EFAULT;
++ if (copy_from_user(&d, (void *)arg, sizeof(d)))
++ break;
++ err = -ENOMEM;
++ s = kmalloc(IFNAMSIZ+1, GFP_KERNEL);
++ if (s == NULL)
++ break;
++ err = -EFAULT;
++ if (strncpy_from_user(s, d.dev_name, IFNAMSIZ) > 0) {
++ s[IFNAMSIZ] = 0;
++ err = real_ve_dev_map(d.veid, d.op, s);
++ }
++ kfree(s);
++ }
++ break;
++#endif
++ case VZCTL_ENV_CREATE: {
++ struct vzctl_env_create s;
++ err = -EFAULT;
++ if (copy_from_user(&s, (void *)arg, sizeof(s)))
++ break;
++ err = real_env_create(s.veid, s.flags, s.class_id,
++ NULL, 0);
++ }
++ break;
++ case VZCTL_ENV_CREATE_DATA: {
++ struct vzctl_env_create_data s;
++ env_create_param_t *data;
++ err = -EFAULT;
++ if (copy_from_user(&s, (void *)arg, sizeof(s)))
++ break;
++ err=-EINVAL;
++ if (s.datalen < VZCTL_ENV_CREATE_DATA_MINLEN ||
++ s.datalen > VZCTL_ENV_CREATE_DATA_MAXLEN ||
++ s.data == 0)
++ break;
++ err = -ENOMEM;
++ data = kmalloc(sizeof(*data), GFP_KERNEL);
++ if (!data)
++ break;
++ memset(data, 0, sizeof(*data));
++ err = -EFAULT;
++ if (copy_from_user(data, (void *)s.data, s.datalen))
++ goto free_data;
++ err = real_env_create(s.veid, s.flags, s.class_id,
++ data, s.datalen);
++free_data:
++ kfree(data);
++ }
++ break;
++ case VZCTL_GET_CPU_STAT: {
++ struct vzctl_cpustatctl s;
++ err = -EFAULT;
++ if (copy_from_user(&s, (void *)arg, sizeof(s)))
++ break;
++ err = ve_get_cpu_stat(s.veid, s.cpustat);
++ }
++ break;
++ case VZCTL_VE_MEMINFO: {
++ struct vzctl_ve_meminfo s;
++ err = -EFAULT;
++ if (copy_from_user(&s, (void *)arg, sizeof(s)))
++ break;
++ err = ve_set_meminfo(s.veid, s.val);
++ }
++ break;
++ }
++ return err;
++}
++EXPORT_SYMBOL(real_env_create);
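vzcalls_ioctl() is the dispatcher behind the vzctl control device registered further down via vzioctl_register(). A hedged userspace sketch of driving VZCTL_ENV_CREATE; the structure and flag names come from the code above, but the header path and the /dev/vzctl node are assumptions, not taken from this patch:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/vzcalluser.h>	/* assumed location of VZCTL_ENV_CREATE & co. */

int main(void)
{
	struct vzctl_env_create req = {
		.veid     = 101,			/* illustrative VE id */
		.flags    = VE_CREATE | VE_LOCK,	/* handled in real_env_create() */
		.class_id = 0,
	};
	int fd = open("/dev/vzctl", O_RDWR);		/* assumed device node */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* on success the ioctl returns the VEID of the started environment */
	if (ioctl(fd, VZCTL_ENV_CREATE, &req) < 0)
		perror("VZCTL_ENV_CREATE");
	close(fd);
	return 0;
}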
++
++
++/**********************************************************************
++ **********************************************************************
++ *
++ * Init/exit stuff
++ *
++ **********************************************************************
++ **********************************************************************/
++
++#ifdef CONFIG_VE_CALLS_MODULE
++static int __init init_vecalls_symbols(void)
++{
++ KSYMRESOLVE(real_get_device_perms_ve);
++ KSYMRESOLVE(real_do_env_cleanup);
++ KSYMRESOLVE(real_do_env_free);
++ KSYMRESOLVE(real_update_load_avg_ve);
++ KSYMMODRESOLVE(vzmon);
++ return 0;
++}
++
++static void fini_vecalls_symbols(void)
++{
++ KSYMMODUNRESOLVE(vzmon);
++ KSYMUNRESOLVE(real_get_device_perms_ve);
++ KSYMUNRESOLVE(real_do_env_cleanup);
++ KSYMUNRESOLVE(real_do_env_free);
++ KSYMUNRESOLVE(real_update_load_avg_ve);
++}
++#else
++#define init_vecalls_symbols() (0)
++#define fini_vecalls_symbols() do { } while (0)
++#endif
++
++static inline __init int init_vecalls_ioctls(void)
++{
++ vzioctl_register(&vzcalls);
++ return 0;
++}
++
++static inline void fini_vecalls_ioctls(void)
++{
++ vzioctl_unregister(&vzcalls);
++}
++
++static int __init vecalls_init(void)
++{
++ int err;
++ int i;
++
++ ve_list_head = get_ve0();
++
++ err = init_vzmond();
++ if (err < 0)
++ goto out_vzmond;
++
++ err = init_devperms_hash();
++ if (err < 0)
++ goto out_perms;
++
++ err = init_vecalls_symbols();
++ if (err < 0)
++ goto out_sym;
++
++ err = init_vecalls_proc();
++ if (err < 0)
++ goto out_proc;
++
++ err = init_vecalls_ioctls();
++ if (err < 0)
++ goto out_ioctls;
++
++ for (i = 0; i < VE_MAX_HOOKS; i++)
++ INIT_LIST_HEAD(&ve_hooks[i]);
++
++ return 0;
++
++out_ioctls:
++ fini_vecalls_proc();
++out_proc:
++ fini_vecalls_symbols();
++out_sym:
++ fini_devperms_hash();
++out_perms:
++ fini_vzmond();
++out_vzmond:
++ return err;
++}
++
++static void vecalls_exit(void)
++{
++ fini_vecalls_ioctls();
++ fini_vecalls_proc();
++ fini_vecalls_symbols();
++ fini_devperms_hash();
++ fini_vzmond();
++}
++
++EXPORT_SYMBOL(get_ve_by_id);
++EXPORT_SYMBOL(__find_ve_by_id);
++EXPORT_SYMBOL(ve_list_guard);
++EXPORT_SYMBOL(ve_list_head);
++EXPORT_SYMBOL(nr_ve);
++
++MODULE_AUTHOR("SWsoft <info@sw-soft.com>");
++MODULE_DESCRIPTION("Virtuozzo Control");
++MODULE_LICENSE("GPL v2");
++
++module_init(vecalls_init)
++module_exit(vecalls_exit)
+diff -upr linux-2.6.16.orig/kernel/veowner.c linux-2.6.16-026test015/kernel/veowner.c
+--- linux-2.6.16.orig/kernel/veowner.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/veowner.c 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,308 @@
++/*
++ * kernel/veowner.c
++ *
++ * Copyright (C) 2000-2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/sched.h>
++#include <linux/ve.h>
++#include <linux/ve_owner.h>
++#include <linux/ve_proto.h>
++#include <linux/ipc.h>
++#include <linux/fs.h>
++#include <linux/proc_fs.h>
++#include <linux/file.h>
++#include <linux/mm.h>
++#include <linux/delay.h>
++#include <linux/vmalloc.h>
++#include <linux/init.h>
++#include <linux/module.h>
++#include <linux/list.h>
++#include <linux/inetdevice.h>
++#include <asm/system.h>
++#include <asm/io.h>
++
++#include <net/tcp.h>
++
++void prepare_ve0_process(struct task_struct *tsk)
++{
++ set_virt_pid(tsk, tsk->pid);
++ set_virt_tgid(tsk, tsk->tgid);
++ if (tsk->signal) {
++ set_virt_pgid(tsk, tsk->signal->pgrp);
++ set_virt_sid(tsk, tsk->signal->session);
++ }
++ VE_TASK_INFO(tsk)->exec_env = get_ve0();
++ VE_TASK_INFO(tsk)->owner_env = get_ve0();
++ VE_TASK_INFO(tsk)->sleep_time = 0;
++ VE_TASK_INFO(tsk)->wakeup_stamp = 0;
++ VE_TASK_INFO(tsk)->sched_time = 0;
++ seqcount_init(&VE_TASK_INFO(tsk)->wakeup_lock);
++
++ if (tsk->pid) {
++ SET_VE_LINKS(tsk);
++ atomic_inc(&get_ve0()->pcounter);
++ }
++}
++
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++void prepare_ve0_loopback(void)
++{
++ get_ve0()->_loopback_dev = &loopback_dev;
++}
++#endif
++
++/*
++ * ------------------------------------------------------------------------
++ * proc entries
++ * ------------------------------------------------------------------------
++ */
++
++#ifdef CONFIG_PROC_FS
++static void proc_move(struct proc_dir_entry *ddir,
++ struct proc_dir_entry *sdir,
++ const char *name)
++{
++ struct proc_dir_entry **p, *q;
++ int len;
++
++ len = strlen(name);
++ for (p = &sdir->subdir, q = *p; q != NULL; p = &q->next, q = *p)
++ if (proc_match(len, name, q))
++ break;
++ if (q == NULL)
++ return;
++ *p = q->next;
++ q->parent = ddir;
++ q->next = ddir->subdir;
++ ddir->subdir = q;
++}
++static void prepare_proc_misc(void)
++{
++ static char *table[] = {
++ "loadavg",
++ "uptime",
++ "meminfo",
++ "version",
++ "stat",
++ "filesystems",
++ "locks",
++ "swaps",
++ "mounts",
++ "net",
++ "cpuinfo",
++ "sysvipc",
++ "sys",
++ "fs",
++ "vz",
++ "user_beancounters",
++ "cmdline",
++ "vmstat",
++ "modules",
++ "kmsg",
++ NULL,
++ };
++ char **p;
++
++ for (p = table; *p != NULL; p++)
++ proc_move(&proc_root, ve0.proc_root, *p);
++}
++int prepare_proc(void)
++{
++ struct ve_struct *envid;
++ struct proc_dir_entry *de;
++ struct proc_dir_entry *ve_root;
++
++ envid = set_exec_env(&ve0);
++ ve_root = ve0.proc_root->subdir;
++ /* move the whole tree to be visible in VE0 only */
++ ve0.proc_root->subdir = proc_root.subdir;
++ for (de = ve0.proc_root->subdir; de->next != NULL; de = de->next)
++ de->parent = ve0.proc_root;
++ de->parent = ve0.proc_root;
++ de->next = ve_root;
++
++ /* move back into the global scope some specific entries */
++ proc_root.subdir = NULL;
++ prepare_proc_misc();
++ proc_net = proc_mkdir("net", ve0.proc_root);
++ proc_net_stat = proc_mkdir("stat", proc_net);
++ proc_mkdir("vz", 0);
++#ifdef CONFIG_SYSVIPC
++ proc_mkdir("sysvipc", 0);
++#endif
++ proc_root_fs = proc_mkdir("fs", 0);
++ /* XXX proc_tty_init(); */
++
++ /* XXX process inodes */
++
++ (void)set_exec_env(envid);
++
++ (void)create_proc_glob_entry("vz", S_IFDIR|S_IRUGO|S_IXUGO, NULL);
++ return 0;
++}
++
++static struct proc_dir_entry ve0_proc_root = {
++ .name = "/proc",
++ .namelen = 5,
++ .mode = S_IFDIR | S_IRUGO | S_IXUGO,
++ .nlink = 2
++};
++
++void prepare_ve0_proc_root(void)
++{
++ ve0.proc_root = &ve0_proc_root;
++}
++#endif
++
++/*
++ * ------------------------------------------------------------------------
++ * Virtualized sysctl
++ * ------------------------------------------------------------------------
++ */
++
++static int semmin[4] = { 1, 1, 1, 1 };
++static int semmax[4] = { 8000, INT_MAX, 1000, IPCMNI };
++static ctl_table kern_table[] = {
++ {KERN_NODENAME, "hostname", system_utsname.nodename, 64,
++ 0644, NULL, &proc_doutsstring, &sysctl_string},
++ {KERN_DOMAINNAME, "domainname", system_utsname.domainname, 64,
++ 0644, NULL, &proc_doutsstring, &sysctl_string},
++#ifdef CONFIG_SYSVIPC
++#define get_ve0_field(fname) &ve0._##fname
++ {KERN_SHMMAX, "shmmax", get_ve0_field(shm_ctlmax), sizeof (size_t),
++ 0644, NULL, &proc_doulongvec_minmax },
++ {KERN_SHMALL, "shmall", get_ve0_field(shm_ctlall), sizeof (size_t),
++ 0644, NULL, &proc_doulongvec_minmax },
++ {KERN_SHMMNI, "shmmni", get_ve0_field(shm_ctlmni), sizeof (int),
++ 0644, NULL, &proc_dointvec_minmax, NULL,
++ NULL, &semmin[0], &semmax[3] },
++ {KERN_MSGMAX, "msgmax", get_ve0_field(msg_ctlmax), sizeof (int),
++ 0644, NULL, &proc_dointvec },
++ {KERN_MSGMNI, "msgmni", get_ve0_field(msg_ctlmni), sizeof (int),
++ 0644, NULL, &proc_dointvec_minmax, NULL,
++ NULL, &semmin[0], &semmax[3] },
++ {KERN_MSGMNB, "msgmnb", get_ve0_field(msg_ctlmnb), sizeof (int),
++ 0644, NULL, &proc_dointvec },
++ {KERN_SEM, "sem", get_ve0_field(sem_ctls), 4*sizeof (int),
++ 0644, NULL, &proc_dointvec },
++#endif
++ {0}
++};
++static ctl_table root_table[] = {
++ {CTL_KERN, "kernel", NULL, 0, 0555, kern_table},
++ {0}
++};
++extern int ip_rt_src_check;
++extern int ve_area_access_check;
++static ctl_table vz_ipv4_route_table[] = {
++ {
++ ctl_name: NET_IPV4_ROUTE_SRC_CHECK,
++ procname: "src_check",
++ data: &ip_rt_src_check,
++ maxlen: sizeof(int),
++ mode: 0644,
++ proc_handler: &proc_dointvec,
++ },
++ { 0 }
++};
++static ctl_table vz_ipv4_table[] = {
++ {NET_IPV4_ROUTE, "route", NULL, 0, 0555, vz_ipv4_route_table},
++ { 0 }
++};
++static ctl_table vz_net_table[] = {
++ {NET_IPV4, "ipv4", NULL, 0, 0555, vz_ipv4_table},
++ { 0 }
++};
++static ctl_table vz_fs_table[] = {
++ {
++ ctl_name: 226,
++ procname: "ve-area-access-check",
++ data: &ve_area_access_check,
++ maxlen: sizeof(int),
++ mode: 0644,
++ proc_handler: &proc_dointvec,
++ },
++ { 0 }
++};
++static ctl_table root_table2[] = {
++ {CTL_NET, "net", NULL, 0, 0555, vz_net_table},
++ {CTL_FS, "fs", NULL, 0, 0555, vz_fs_table},
++ { 0 }
++};
++int prepare_sysctl(void)
++{
++ struct ve_struct *envid;
++
++ envid = set_exec_env(&ve0);
++ ve0.kern_header = register_sysctl_table(root_table, 1);
++ register_sysctl_table(root_table2, 0);
++ (void)set_exec_env(envid);
++ return 0;
++}
++
++void prepare_ve0_sysctl(void)
++{
++ INIT_LIST_HEAD(&ve0.sysctl_lh);
++#ifdef CONFIG_SYSCTL
++ ve0.proc_sys_root = proc_mkdir("sys", 0);
++#endif
++}
++
++/*
++ * ------------------------------------------------------------------------
++ * XXX init_ve_system
++ * ------------------------------------------------------------------------
++ */
++
++void init_ve_system(void)
++{
++ struct task_struct *init_entry, *p, *tsk;
++ struct ve_struct *ptr;
++ unsigned long flags;
++ int i;
++
++ ptr = get_ve0();
++ (void)get_ve(ptr);
++ atomic_set(&ptr->pcounter, 1);
++
++ /* Don't forget about idle tasks */
++ write_lock_irqsave(&tasklist_lock, flags);
++ for (i = 0; i < NR_CPUS; i++) {
++ tsk = idle_task(i);
++ if (tsk == NULL)
++ continue;
++
++ prepare_ve0_process(tsk);
++ }
++ do_each_thread_all(p, tsk) {
++ prepare_ve0_process(tsk);
++ } while_each_thread_all(p, tsk);
++ write_unlock_irqrestore(&tasklist_lock, flags);
++
++ init_entry = child_reaper;
++ ptr->init_entry = init_entry;
++ /* XXX: why? */
++ cap_set_full(ptr->cap_default);
++
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++ ptr->_ipv4_devconf = &ipv4_devconf;
++ ptr->_ipv4_devconf_dflt = &ipv4_devconf_dflt;
++#endif
++
++ read_lock(&init_entry->fs->lock);
++ ptr->fs_rootmnt = init_entry->fs->rootmnt;
++ ptr->fs_root = init_entry->fs->root;
++ read_unlock(&init_entry->fs->lock);
++
++ /* common prepares */
++#ifdef CONFIG_PROC_FS
++ prepare_proc();
++#endif
++ prepare_sysctl();
++ prepare_ipc();
++}
+diff -upr linux-2.6.16.orig/kernel/vzdev.c linux-2.6.16-026test015/kernel/vzdev.c
+--- linux-2.6.16.orig/kernel/vzdev.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/vzdev.c 2006-07-04 14:41:39.000000000 +0400
+@@ -0,0 +1,129 @@
++/*
++ * kernel/vzdev.c
++ *
++ * Copyright (C) 2000-2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/fs.h>
++#include <linux/list.h>
++#include <linux/init.h>
++#include <linux/module.h>
++#include <linux/vzctl.h>
++#include <linux/slab.h>
++#include <linux/vmalloc.h>
++#include <linux/vzcalluser.h>
++#include <asm/uaccess.h>
++#include <asm/pgalloc.h>
++#include <linux/device.h>
++#include <linux/smp_lock.h>
++
++#define VZCTL_MAJOR 126
++#define VZCTL_NAME "vzctl"
++
++MODULE_AUTHOR("SWsoft <info@sw-soft.com>");
++MODULE_DESCRIPTION("Virtuozzo Interface");
++MODULE_LICENSE("GPL v2");
++
++static LIST_HEAD(ioctls);
++static spinlock_t ioctl_lock = SPIN_LOCK_UNLOCKED;
++
++int vzctl_ioctl(struct inode *ino, struct file *file, unsigned int cmd,
++ unsigned long arg)
++{
++ int err;
++ struct list_head *p;
++ struct vzioctlinfo *inf;
++
++ err = -ENOTTY;
++ spin_lock(&ioctl_lock);
++ list_for_each(p, &ioctls) {
++ inf = list_entry(p, struct vzioctlinfo, list);
++ if (inf->type != _IOC_TYPE(cmd))
++ continue;
++
++ err = try_module_get(inf->owner) ? 0 : -EBUSY;
++ spin_unlock(&ioctl_lock);
++ if (!err) {
++ unlock_kernel();
++ err = (*inf->func)(ino, file, cmd, arg);
++ lock_kernel();
++ module_put(inf->owner);
++ }
++ return err;
++ }
++ spin_unlock(&ioctl_lock);
++ return err;
++}
++
++void vzioctl_register(struct vzioctlinfo *inf)
++{
++ spin_lock(&ioctl_lock);
++ list_add(&inf->list, &ioctls);
++ spin_unlock(&ioctl_lock);
++}
++
++void vzioctl_unregister(struct vzioctlinfo *inf)
++{
++ spin_lock(&ioctl_lock);
++ list_del_init(&inf->list);
++ spin_unlock(&ioctl_lock);
++}
++
++EXPORT_SYMBOL(vzioctl_register);
++EXPORT_SYMBOL(vzioctl_unregister);
++
++/*
++ * Init/exit stuff.
++ */
++static struct file_operations vzctl_fops = {
++ .owner = THIS_MODULE,
++ .ioctl = vzctl_ioctl,
++};
++
++static struct class *vzctl_class;
++
++static void __exit vzctl_exit(void)
++{
++ class_device_destroy(vzctl_class, MKDEV(VZCTL_MAJOR, 0));
++ class_destroy(vzctl_class);
++ unregister_chrdev(VZCTL_MAJOR, VZCTL_NAME);
++}
++
++static int __init vzctl_init(void)
++{
++ int ret;
++ struct class_device *class_err;
++
++ ret = register_chrdev(VZCTL_MAJOR, VZCTL_NAME, &vzctl_fops);
++ if (ret < 0)
++ goto out;
++
++ vzctl_class = class_create(THIS_MODULE, "vzctl");
++ if (IS_ERR(vzctl_class)) {
++ ret = PTR_ERR(vzctl_class);
++ goto out_cleandev;
++ }
++
++ class_err = class_device_create(vzctl_class, NULL, MKDEV(VZCTL_MAJOR, 0),
++ NULL, VZCTL_NAME);
++ if (IS_ERR(class_err)) {
++ ret = PTR_ERR(class_err);
++ goto out_rmclass;
++ }
++
++ goto out;
++
++out_rmclass:
++ class_destroy(vzctl_class);
++out_cleandev:
++ unregister_chrdev(VZCTL_MAJOR, VZCTL_NAME);
++out:
++ return ret;
++}
++
++module_init(vzctl_init)
++module_exit(vzctl_exit);
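
For orientation, a consumer of the ioctl multiplexer above would register its handler through vzioctl_register(). The following is an illustrative sketch, not part of the patch: the .type, .func and .owner field names are inferred from their use in vzctl_ioctl(), the real struct vzioctlinfo definition lives in the vzctl headers, and the ioctl type value is purely hypothetical.

/*
 * Hypothetical client module of the vzctl ioctl multiplexer.
 */
static int my_vzctl_ioctl(struct inode *ino, struct file *file,
		unsigned int cmd, unsigned long arg)
{
	/* decode cmd and handle it; the return value goes back to the caller */
	return -ENOTTY;
}

static struct vzioctlinfo my_ioctl_info = {
	.type	= 0xAB,			/* hypothetical _IOC_TYPE value */
	.func	= my_vzctl_ioctl,
	.owner	= THIS_MODULE,
};

static int __init my_init(void)
{
	vzioctl_register(&my_ioctl_info);
	return 0;
}

static void __exit my_exit(void)
{
	vzioctl_unregister(&my_ioctl_info);
}

module_init(my_init);
module_exit(my_exit);
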
+diff -upr linux-2.6.16.orig/kernel/vzwdog.c linux-2.6.16-026test015/kernel/vzwdog.c
+--- linux-2.6.16.orig/kernel/vzwdog.c 2006-07-04 14:41:41.000000000 +0400
++++ linux-2.6.16-026test015/kernel/vzwdog.c 2006-07-04 14:41:38.000000000 +0400
+@@ -0,0 +1,278 @@
++/*
++ * kernel/vzwdog.c
++ *
++ * Copyright (C) 2000-2005 SWsoft
++ * All rights reserved.
++ *
++ * Licensing governed by "linux/COPYING.SWsoft" file.
++ *
++ */
++
++#include <linux/sched.h>
++#include <linux/fs.h>
++#include <linux/list.h>
++#include <linux/ctype.h>
++#include <linux/kobject.h>
++#include <linux/genhd.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/kernel.h>
++#include <linux/kernel_stat.h>
++#include <linux/smp_lock.h>
++#include <linux/errno.h>
++#include <linux/suspend.h>
++#include <linux/ve.h>
++#include <linux/vzstat.h>
++
++/* Stuff regarding the kernel thread polling VE validity */
++static int sleep_timeout = 60;
++static pid_t wdog_thread_pid;
++static int wdog_thread_continue = 1;
++static DECLARE_COMPLETION(license_thread_exited);
++
++extern void show_mem(void);
++extern struct ve_struct *ve_list_head;
++
++#if 0
++static char page[PAGE_SIZE];
++
++static void parse_irq_list(int len)
++{
++ int i, k, skip;
++ for (i = 0; i < len; ) {
++ k = i;
++ while (i < len && page[i] != '\n' && page[i] != ':')
++ i++;
++ skip = 0;
++ if (i < len && page[i] != '\n') {
++ i++; /* skip ':' */
++ while (i < len && (page[i] == ' ' || page[i] == '0'))
++ i++;
++ skip = (i < len && (page[i] < '0' || page[i] > '9'));
++ while (i < len && page[i] != '\n')
++ i++;
++ }
++ if (!skip)
++ printk("\n%.*s", i - k, page + k);
++ if (i < len)
++ i++; /* skip '\n' */
++ }
++}
++#endif
++
++static void show_irq_list(void)
++{
++#if 0
++ i = KSYMSAFECALL(int, get_irq_list, (page));
++ parse_irq_list(i); /* Safe, zero was returned if unassigned */
++#endif
++}
++
++static void show_alloc_latency(void)
++{
++ static const char *alloc_descr[KSTAT_ALLOCSTAT_NR] = {
++ "A0",
++ "L0",
++ "H0",
++ "L1",
++ "H1"
++ };
++ int i;
++
++ printk("lat: ");
++ for (i = 0; i < KSTAT_ALLOCSTAT_NR; i++) {
++ struct kstat_lat_struct *p;
++ cycles_t maxlat, avg0, avg1, avg2;
++
++ p = &kstat_glob.alloc_lat[i];
++ spin_lock_irq(&kstat_glb_lock);
++ maxlat = p->last.maxlat;
++ avg0 = p->avg[0];
++ avg1 = p->avg[1];
++ avg2 = p->avg[2];
++ spin_unlock_irq(&kstat_glb_lock);
++
++ printk("%s %Lu (%Lu %Lu %Lu)",
++ alloc_descr[i],
++ maxlat,
++ avg0,
++ avg1,
++ avg2);
++ }
++ printk("\n");
++}
++
++static void show_schedule_latency(void)
++{
++ struct kstat_lat_pcpu_struct *p;
++ cycles_t maxlat, totlat, avg0, avg1, avg2;
++ unsigned long count;
++
++ p = &kstat_glob.sched_lat;
++ spin_lock_irq(&kstat_glb_lock);
++ maxlat = p->last.maxlat;
++ totlat = p->last.totlat;
++ count = p->last.count;
++ avg0 = p->avg[0];
++ avg1 = p->avg[1];
++ avg2 = p->avg[2];
++ spin_unlock_irq(&kstat_glb_lock);
++
++ printk("sched lat: %Lu/%Lu/%lu (%Lu %Lu %Lu)\n",
++ maxlat,
++ totlat,
++ count,
++ avg0,
++ avg1,
++ avg2);
++}
++
++static void show_header(void)
++{
++ struct timeval tv;
++
++ do_gettimeofday(&tv);
++ printk("*** VZWDOG 1.14: time %lu.%06lu uptime %Lu CPU %d ***\n",
++ tv.tv_sec, tv.tv_usec,
++ get_jiffies_64(), smp_processor_id());
++#ifdef CONFIG_FAIRSCHED
++ printk("*** cycles_per_jiffy %lu jiffies_per_second %u ***\n",
++ cycles_per_jiffy, HZ);
++#else
++ printk("*** jiffies_per_second %u ***\n", HZ);
++#endif
++}
++
++static void show_pgdatinfo(void)
++{
++ pg_data_t *pgdat;
++
++ printk("pgdat:");
++ for_each_pgdat(pgdat) {
++ printk(" %d: %lu,%lu,%lu,%p",
++ pgdat->node_id,
++ pgdat->node_start_pfn,
++ pgdat->node_present_pages,
++ pgdat->node_spanned_pages,
++ pgdat->node_mem_map);
++ }
++ printk("\n");
++}
++
++static void show_diskio(void)
++{
++ struct gendisk *gd;
++ char buf[BDEVNAME_SIZE];
++
++ printk("disk_io: ");
++
++ down_read(&block_subsys.rwsem);
++ list_for_each_entry(gd, &block_subsys.kset.list, kobj.entry) {
++ char *name;
++ name = disk_name(gd, 0, buf);
++ if ((strlen(name) > 4) && (strncmp(name, "loop", 4) == 0) &&
++ isdigit(name[4]))
++ continue;
++ if ((strlen(name) > 3) && (strncmp(name, "ram", 3) == 0) &&
++ isdigit(name[3]))
++ continue;
++ printk("(%u,%u) %s r(%u %u %u) w(%u %u %u)\n",
++ gd->major, gd->first_minor,
++ name,
++ disk_stat_read(gd, ios[READ]),
++ disk_stat_read(gd, sectors[READ]),
++ disk_stat_read(gd, merges[READ]),
++ disk_stat_read(gd, ios[WRITE]),
++ disk_stat_read(gd, sectors[WRITE]),
++ disk_stat_read(gd, merges[WRITE]));
++ }
++ up_read(&block_subsys.rwsem);
++
++ printk("\n");
++}
++
++static void show_nrprocs(void)
++{
++ unsigned long _nr_running, _nr_sleeping,
++ _nr_unint, _nr_zombie, _nr_dead, _nr_stopped;
++
++ _nr_running = nr_running();
++ _nr_unint = nr_uninterruptible();
++ _nr_sleeping = nr_sleeping();
++ _nr_zombie = nr_zombie;
++ _nr_dead = atomic_read(&nr_dead);
++ _nr_stopped = nr_stopped();
++
++ printk("VEnum: %d, proc R %lu, S %lu, D %lu, "
++ "Z %lu, X %lu, T %lu (tot %d)\n",
++ nr_ve, _nr_running, _nr_sleeping, _nr_unint,
++ _nr_zombie, _nr_dead, _nr_stopped, nr_threads);
++}
++
++static void wdog_print(void)
++{
++ show_header();
++ show_irq_list();
++ show_pgdatinfo();
++ show_mem();
++ show_diskio();
++ show_schedule_latency();
++ show_alloc_latency();
++ show_nrprocs();
++}
++
++static int wdog_loop(void* data)
++{
++ struct task_struct *tsk = current;
++ DECLARE_WAIT_QUEUE_HEAD(thread_wait_queue);
++
++ /*
++ * This thread doesn't need any user-level access,
++ * so get rid of all our resources
++ */
++ daemonize("wdogd");
++
++ spin_lock_irq(&tsk->sighand->siglock);
++ sigfillset(&tsk->blocked);
++ sigdelset(&tsk->blocked, SIGHUP);
++ recalc_sigpending();
++ spin_unlock_irq(&tsk->sighand->siglock);
++
++ while (wdog_thread_continue) {
++ wdog_print();
++ interruptible_sleep_on_timeout(&thread_wait_queue,
++ sleep_timeout*HZ);
++ try_to_freeze();
++ /* clear all signals */
++ if (signal_pending(tsk))
++ flush_signals(tsk);
++ }
++
++ complete_and_exit(&license_thread_exited, 0);
++}
++
++static int __init wdog_init(void)
++{
++ wdog_thread_pid = kernel_thread(wdog_loop, NULL, 0);
++ if (wdog_thread_pid < 0)
++ return wdog_thread_pid;
++
++ return 0;
++}
++
++static void __exit wdog_exit(void)
++{
++ wdog_thread_continue = 0;
++ if (wdog_thread_pid > 0) {
++ kill_proc(wdog_thread_pid, SIGHUP, 1);
++ wait_for_completion(&license_thread_exited);
++ }
++}
++
++module_param(sleep_timeout, int, 0);
++MODULE_AUTHOR("SWsoft <info@sw-soft.com>");
++MODULE_DESCRIPTION("Virtuozzo WDOG");
++MODULE_LICENSE("GPL v2");
++
++module_init(wdog_init)
++module_exit(wdog_exit)
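
The latency figures that show_alloc_latency() and show_schedule_latency() print are fed by KSTAT_LAT_ADD() calls elsewhere in this patch (see the swap-in accounting added to mm/memory.c below). A minimal sketch of that producer side, assuming the kstat_glob and kstat_glb_lock symbols from linux/vzstat.h used throughout the patch:

#include <linux/vzstat.h>

/* Record how long one operation took so the watchdog can report it later. */
static void record_swapin_latency(cycles_t start)
{
	spin_lock_irq(&kstat_glb_lock);
	KSTAT_LAT_ADD(&kstat_glob.swap_in, get_cycles() - start);
	spin_unlock_irq(&kstat_glb_lock);
}
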
+diff -upr linux-2.6.16.orig/lib/Kconfig.debug linux-2.6.16-026test015/lib/Kconfig.debug
+--- linux-2.6.16.orig/lib/Kconfig.debug 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/lib/Kconfig.debug 2006-07-04 14:41:39.000000000 +0400
+@@ -48,7 +48,7 @@ config LOG_BUF_SHIFT
+
+ config DETECT_SOFTLOCKUP
+ bool "Detect Soft Lockups"
+- depends on DEBUG_KERNEL
++ depends on DEBUG_KERNEL && !SCHED_VCPU
+ default y
+ help
+ Say Y here to enable the kernel to detect "soft lockups",
+diff -upr linux-2.6.16.orig/lib/bust_spinlocks.c linux-2.6.16-026test015/lib/bust_spinlocks.c
+--- linux-2.6.16.orig/lib/bust_spinlocks.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/lib/bust_spinlocks.c 2006-07-04 14:41:37.000000000 +0400
+@@ -20,19 +20,11 @@ void bust_spinlocks(int yes)
+ if (yes) {
+ oops_in_progress = 1;
+ } else {
+- int loglevel_save = console_loglevel;
+ #ifdef CONFIG_VT
+ unblank_screen();
+ #endif
+ oops_in_progress = 0;
+- /*
+- * OK, the message is on the console. Now we call printk()
+- * without oops_in_progress set so that printk() will give klogd
+- * and the blanked console a poke. Hold onto your hats...
+- */
+- console_loglevel = 15; /* NMI oopser may have shut the console up */
+- printk(" ");
+- console_loglevel = loglevel_save;
++ wake_up_klogd();
+ }
+ }
+
+diff -upr linux-2.6.16.orig/mm/filemap_xip.c linux-2.6.16-026test015/mm/filemap_xip.c
+--- linux-2.6.16.orig/mm/filemap_xip.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/mm/filemap_xip.c 2006-07-04 14:41:37.000000000 +0400
+@@ -15,6 +15,7 @@
+ #include <linux/rmap.h>
+ #include <asm/tlbflush.h>
+ #include "filemap.h"
++#include <ub/ub_vmpages.h>
+
+ /*
+ * This is a file read routine for execute in place files, and uses
+@@ -190,7 +191,10 @@ __xip_unmap (struct address_space * mapp
+ flush_cache_page(vma, address, pte_pfn(*pte));
+ pteval = ptep_clear_flush(vma, address, pte);
+ page_remove_rmap(page);
++ pb_remove_ref(page, mm);
++ ub_unused_privvm_inc(mm, vma);
+ dec_mm_counter(mm, file_rss);
++ dec_vma_rss(vma);
+ BUG_ON(pte_dirty(pteval));
+ pte_unmap_unlock(pte, ptl);
+ page_cache_release(page);
+diff -upr linux-2.6.16.orig/mm/fremap.c linux-2.6.16-026test015/mm/fremap.c
+--- linux-2.6.16.orig/mm/fremap.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/mm/fremap.c 2006-07-04 14:41:39.000000000 +0400
+@@ -20,6 +20,8 @@
+ #include <asm/cacheflush.h>
+ #include <asm/tlbflush.h>
+
++#include <ub/ub_vmpages.h>
++
+ static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
+ {
+@@ -34,6 +36,7 @@ static int zap_pte(struct mm_struct *mm,
+ if (pte_dirty(pte))
+ set_page_dirty(page);
+ page_remove_rmap(page);
++ pb_remove_ref(page, mm);
+ page_cache_release(page);
+ }
+ } else {
+@@ -57,6 +60,10 @@ int install_page(struct mm_struct *mm, s
+ pte_t *pte;
+ pte_t pte_val;
+ spinlock_t *ptl;
++ struct page_beancounter *pbc;
++
++ if (unlikely(pb_alloc(&pbc)))
++ goto out_nopb;
+
+ pte = get_locked_pte(mm, addr, &ptl);
+ if (!pte)
+@@ -75,11 +82,15 @@ int install_page(struct mm_struct *mm, s
+ if (page_mapcount(page) > INT_MAX/2)
+ goto unlock;
+
+- if (pte_none(*pte) || !zap_pte(mm, vma, addr, pte))
++ if (pte_none(*pte) || !zap_pte(mm, vma, addr, pte)) {
++ ub_unused_privvm_dec(mm, vma);
+ inc_mm_counter(mm, file_rss);
++ inc_vma_rss(vma);
++ }
+
+ flush_icache_page(vma, page);
+ set_pte_at(mm, addr, pte, mk_pte(page, prot));
++ pb_add_ref(page, mm, &pbc);
+ page_add_file_rmap(page);
+ pte_val = *pte;
+ update_mmu_cache(vma, addr, pte_val);
+@@ -87,6 +98,8 @@ int install_page(struct mm_struct *mm, s
+ unlock:
+ pte_unmap_unlock(pte, ptl);
+ out:
++ pb_free(&pbc);
++out_nopb:
+ return err;
+ }
+ EXPORT_SYMBOL(install_page);
+@@ -109,7 +122,9 @@ int install_file_pte(struct mm_struct *m
+
+ if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) {
+ update_hiwater_rss(mm);
++ ub_unused_privvm_inc(mm, vma);
+ dec_mm_counter(mm, file_rss);
++ dec_vma_rss(vma);
+ }
+
+ set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
+@@ -220,4 +235,5 @@ asmlinkage long sys_remap_file_pages(uns
+
+ return err;
+ }
++EXPORT_SYMBOL_GPL(sys_remap_file_pages);
+
+diff -upr linux-2.6.16.orig/mm/madvise.c linux-2.6.16-026test015/mm/madvise.c
+--- linux-2.6.16.orig/mm/madvise.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/mm/madvise.c 2006-07-04 14:41:36.000000000 +0400
+@@ -168,6 +168,9 @@ static long madvise_remove(struct vm_are
+ return -EINVAL;
+ }
+
++ if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
++ return -EACCES;
++
+ mapping = vma->vm_file->f_mapping;
+
+ offset = (loff_t)(start - vma->vm_start)
+diff -upr linux-2.6.16.orig/mm/memory.c linux-2.6.16-026test015/mm/memory.c
+--- linux-2.6.16.orig/mm/memory.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/mm/memory.c 2006-07-04 14:41:39.000000000 +0400
+@@ -58,6 +58,8 @@
+ #include <linux/swapops.h>
+ #include <linux/elf.h>
+
++#include <ub/ub_vmpages.h>
++
+ #ifndef CONFIG_NEED_MULTIPLE_NODES
+ /* use the per-pgdat data instead for discontigmem - mbligh */
+ unsigned long max_mapnr;
+@@ -81,6 +83,7 @@ unsigned long vmalloc_earlyreserve;
+ EXPORT_SYMBOL(num_physpages);
+ EXPORT_SYMBOL(high_memory);
+ EXPORT_SYMBOL(vmalloc_earlyreserve);
++EXPORT_SYMBOL_GPL(empty_zero_page);
+
+ int randomize_va_space __read_mostly = 1;
+
+@@ -103,18 +106,21 @@ void pgd_clear_bad(pgd_t *pgd)
+ pgd_ERROR(*pgd);
+ pgd_clear(pgd);
+ }
++EXPORT_SYMBOL_GPL(pgd_clear_bad);
+
+ void pud_clear_bad(pud_t *pud)
+ {
+ pud_ERROR(*pud);
+ pud_clear(pud);
+ }
++EXPORT_SYMBOL_GPL(pud_clear_bad);
+
+ void pmd_clear_bad(pmd_t *pmd)
+ {
+ pmd_ERROR(*pmd);
+ pmd_clear(pmd);
+ }
++EXPORT_SYMBOL_GPL(pmd_clear_bad);
+
+ /*
+ * Note: this doesn't free the actual pages themselves. That
+@@ -318,6 +324,7 @@ int __pte_alloc(struct mm_struct *mm, pm
+ spin_unlock(&mm->page_table_lock);
+ return 0;
+ }
++EXPORT_SYMBOL_GPL(__pte_alloc);
+
+ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
+ {
+@@ -418,6 +425,7 @@ struct page *vm_normal_page(struct vm_ar
+ */
+ return pfn_to_page(pfn);
+ }
++EXPORT_SYMBOL_GPL(vm_normal_page);
+
+ /*
+ * copy one vm_area from one task to the other. Assumes the page tables
+@@ -428,7 +436,7 @@ struct page *vm_normal_page(struct vm_ar
+ static inline void
+ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+ pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
+- unsigned long addr, int *rss)
++ unsigned long addr, int *rss, struct page_beancounter **pbc)
+ {
+ unsigned long vm_flags = vma->vm_flags;
+ pte_t pte = *src_pte;
+@@ -471,6 +479,7 @@ copy_one_pte(struct mm_struct *dst_mm, s
+ if (page) {
+ get_page(page);
+ page_dup_rmap(page);
++ pb_dup_ref(page, dst_mm, pbc);
+ rss[!!PageAnon(page)]++;
+ }
+
+@@ -478,20 +487,36 @@ out_set_pte:
+ set_pte_at(dst_mm, addr, dst_pte, pte);
+ }
+
++#define pte_ptrs(a) (PTRS_PER_PTE - ((a >> PAGE_SHIFT)&(PTRS_PER_PTE - 1)))
++#ifdef CONFIG_USER_RESOURCE
++#define same_ub(mm1, mm2) ((mm1)->mm_ub == (mm2)->mm_ub)
++#else
++#define same_ub(mm1, mm2) (1)
++#endif
++
+ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+- pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
++ pmd_t *dst_pmd, pmd_t *src_pmd,
++ struct vm_area_struct *dst_vma,
++ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+ {
+ pte_t *src_pte, *dst_pte;
+ spinlock_t *src_ptl, *dst_ptl;
+ int progress = 0;
+- int rss[2];
++ int rss[2], rss_tot;
++ struct page_beancounter *pbc;
++ int err;
+
++ err = -ENOMEM;
++ pbc = same_ub(src_mm, dst_mm) ? PBC_COPY_SAME : NULL;
+ again:
++ if (pbc != PBC_COPY_SAME && pb_alloc_list(&pbc, pte_ptrs(addr)))
++ goto out;
+ rss[1] = rss[0] = 0;
+ dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
+ if (!dst_pte)
+- return -ENOMEM;
++ goto out;
++
+ src_pte = pte_offset_map_nested(src_pmd, addr);
+ src_ptl = pte_lockptr(src_mm, src_pmd);
+ spin_lock(src_ptl);
+@@ -512,22 +537,32 @@ again:
+ progress++;
+ continue;
+ }
+- copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
++ copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
++ vma, addr, rss, &pbc);
+ progress += 8;
+ } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
+
+ spin_unlock(src_ptl);
+ pte_unmap_nested(src_pte - 1);
++ rss_tot = rss[0] + rss[1];
++ add_vma_rss(dst_vma, rss_tot);
++ ub_unused_privvm_sub(dst_mm, dst_vma, rss_tot);
+ add_mm_rss(dst_mm, rss[0], rss[1]);
+ pte_unmap_unlock(dst_pte - 1, dst_ptl);
+ cond_resched();
+ if (addr != end)
+ goto again;
+- return 0;
++
++ err = 0;
++out:
++ pb_free_list(&pbc);
++ return err;
+ }
+
+ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+- pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
++ pud_t *dst_pud, pud_t *src_pud,
++ struct vm_area_struct *dst_vma,
++ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+ {
+ pmd_t *src_pmd, *dst_pmd;
+@@ -542,14 +577,16 @@ static inline int copy_pmd_range(struct
+ if (pmd_none_or_clear_bad(src_pmd))
+ continue;
+ if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
+- vma, addr, next))
++ dst_vma, vma, addr, next))
+ return -ENOMEM;
+ } while (dst_pmd++, src_pmd++, addr = next, addr != end);
+ return 0;
+ }
+
+ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+- pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
++ pgd_t *dst_pgd, pgd_t *src_pgd,
++ struct vm_area_struct *dst_vma,
++ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+ {
+ pud_t *src_pud, *dst_pud;
+@@ -564,19 +601,20 @@ static inline int copy_pud_range(struct
+ if (pud_none_or_clear_bad(src_pud))
+ continue;
+ if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
+- vma, addr, next))
++ dst_vma, vma, addr, next))
+ return -ENOMEM;
+ } while (dst_pud++, src_pud++, addr = next, addr != end);
+ return 0;
+ }
+
+-int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+- struct vm_area_struct *vma)
++int __copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *vma,
++ unsigned long addr, size_t size)
+ {
++ struct mm_struct *dst_mm = dst_vma->vm_mm;
++ struct mm_struct *src_mm = vma->vm_mm;
+ pgd_t *src_pgd, *dst_pgd;
+ unsigned long next;
+- unsigned long addr = vma->vm_start;
+- unsigned long end = vma->vm_end;
++ unsigned long end = addr + size;
+
+ /*
+ * Don't copy ptes where a page fault will fill them correctly.
+@@ -599,11 +637,22 @@ int copy_page_range(struct mm_struct *ds
+ if (pgd_none_or_clear_bad(src_pgd))
+ continue;
+ if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
+- vma, addr, next))
++ dst_vma, vma, addr, next))
+ return -ENOMEM;
+ } while (dst_pgd++, src_pgd++, addr = next, addr != end);
+ return 0;
+ }
++EXPORT_SYMBOL_GPL(__copy_page_range);
++
++int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
++ struct vm_area_struct *dst_vma, struct vm_area_struct *vma)
++{
++ if (dst_vma->vm_mm != dst)
++ BUG();
++ if (vma->vm_mm != src)
++ BUG();
++ return __copy_page_range(dst_vma, vma, vma->vm_start, vma->vm_end-vma->vm_start);
++}
+
+ static unsigned long zap_pte_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, pmd_t *pmd,
+@@ -615,6 +664,7 @@ static unsigned long zap_pte_range(struc
+ spinlock_t *ptl;
+ int file_rss = 0;
+ int anon_rss = 0;
++ int rss;
+
+ pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ do {
+@@ -668,6 +718,7 @@ static unsigned long zap_pte_range(struc
+ file_rss--;
+ }
+ page_remove_rmap(page);
++ pb_remove_ref(page, mm);
+ tlb_remove_page(tlb, page);
+ continue;
+ }
+@@ -682,6 +733,9 @@ static unsigned long zap_pte_range(struc
+ pte_clear_full(mm, addr, pte, tlb->fullmm);
+ } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
+
++ rss = -(file_rss + anon_rss);
++ ub_unused_privvm_add(mm, vma, rss);
++ sub_vma_rss(vma, rss);
+ add_mm_rss(mm, file_rss, anon_rss);
+ pte_unmap_unlock(pte - 1, ptl);
+
+@@ -1087,12 +1141,14 @@ int get_user_pages(struct task_struct *t
+ }
+ EXPORT_SYMBOL(get_user_pages);
+
+-static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
++static int zeromap_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long addr, unsigned long end, pgprot_t prot)
+ {
+ pte_t *pte;
+ spinlock_t *ptl;
++ struct mm_struct *mm;
+
++ mm = vma->vm_mm;
+ pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
+ if (!pte)
+ return -ENOMEM;
+@@ -1102,6 +1158,8 @@ static int zeromap_pte_range(struct mm_s
+ page_cache_get(page);
+ page_add_file_rmap(page);
+ inc_mm_counter(mm, file_rss);
++ inc_vma_rss(vma);
++ ub_unused_privvm_dec(mm, vma);
+ BUG_ON(!pte_none(*pte));
+ set_pte_at(mm, addr, pte, zero_pte);
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+@@ -1109,35 +1167,35 @@ static int zeromap_pte_range(struct mm_s
+ return 0;
+ }
+
+-static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud,
++static inline int zeromap_pmd_range(struct vm_area_struct *vma, pud_t *pud,
+ unsigned long addr, unsigned long end, pgprot_t prot)
+ {
+ pmd_t *pmd;
+ unsigned long next;
+
+- pmd = pmd_alloc(mm, pud, addr);
++ pmd = pmd_alloc(vma->vm_mm, pud, addr);
+ if (!pmd)
+ return -ENOMEM;
+ do {
+ next = pmd_addr_end(addr, end);
+- if (zeromap_pte_range(mm, pmd, addr, next, prot))
++ if (zeromap_pte_range(vma, pmd, addr, next, prot))
+ return -ENOMEM;
+ } while (pmd++, addr = next, addr != end);
+ return 0;
+ }
+
+-static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd,
++static inline int zeromap_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
+ unsigned long addr, unsigned long end, pgprot_t prot)
+ {
+ pud_t *pud;
+ unsigned long next;
+
+- pud = pud_alloc(mm, pgd, addr);
++ pud = pud_alloc(vma->vm_mm, pgd, addr);
+ if (!pud)
+ return -ENOMEM;
+ do {
+ next = pud_addr_end(addr, end);
+- if (zeromap_pmd_range(mm, pud, addr, next, prot))
++ if (zeromap_pmd_range(vma, pud, addr, next, prot))
+ return -ENOMEM;
+ } while (pud++, addr = next, addr != end);
+ return 0;
+@@ -1149,15 +1207,14 @@ int zeromap_page_range(struct vm_area_st
+ pgd_t *pgd;
+ unsigned long next;
+ unsigned long end = addr + size;
+- struct mm_struct *mm = vma->vm_mm;
+ int err;
+
+ BUG_ON(addr >= end);
+- pgd = pgd_offset(mm, addr);
++ pgd = pgd_offset(vma->vm_mm, addr);
+ flush_cache_range(vma, addr, end);
+ do {
+ next = pgd_addr_end(addr, end);
+- err = zeromap_pud_range(mm, pgd, addr, next, prot);
++ err = zeromap_pud_range(vma, pgd, addr, next, prot);
+ if (err)
+ break;
+ } while (pgd++, addr = next, addr != end);
+@@ -1183,11 +1240,14 @@ pte_t * fastcall get_locked_pte(struct m
+ * old drivers should use this, and they needed to mark their
+ * pages reserved for the old functions anyway.
+ */
+-static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *page, pgprot_t prot)
++static int insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot)
+ {
+ int retval;
+ pte_t *pte;
+- spinlock_t *ptl;
++ spinlock_t *ptl;
++ struct mm_struct *mm;
++
++ mm = vma->vm_mm;
+
+ retval = -EINVAL;
+ if (PageAnon(page))
+@@ -1204,6 +1264,7 @@ static int insert_page(struct mm_struct
+ /* Ok, finally just insert the thing.. */
+ get_page(page);
+ inc_mm_counter(mm, file_rss);
++ inc_vma_rss(vma);
+ page_add_file_rmap(page);
+ set_pte_at(mm, addr, pte, mk_pte(page, prot));
+
+@@ -1240,7 +1301,7 @@ int vm_insert_page(struct vm_area_struct
+ if (!page_count(page))
+ return -EINVAL;
+ vma->vm_flags |= VM_INSERTPAGE;
+- return insert_page(vma->vm_mm, addr, page, vma->vm_page_prot);
++ return insert_page(vma, addr, page, vma->vm_page_prot);
+ }
+ EXPORT_SYMBOL(vm_insert_page);
+
+@@ -1449,6 +1510,7 @@ static int do_wp_page(struct mm_struct *
+ struct page *old_page, *new_page;
+ pte_t entry;
+ int ret = VM_FAULT_MINOR;
++ struct page_beancounter *pbc;
+
+ old_page = vm_normal_page(vma, address, orig_pte);
+ if (!old_page)
+@@ -1476,6 +1538,9 @@ static int do_wp_page(struct mm_struct *
+ gotten:
+ pte_unmap_unlock(page_table, ptl);
+
++ if (unlikely(pb_alloc(&pbc)))
++ goto oom_nopb;
++
+ if (unlikely(anon_vma_prepare(vma)))
+ goto oom;
+ if (old_page == ZERO_PAGE(address)) {
+@@ -1496,12 +1561,16 @@ gotten:
+ if (likely(pte_same(*page_table, orig_pte))) {
+ if (old_page) {
+ page_remove_rmap(old_page);
++ pb_remove_ref(old_page, mm);
+ if (!PageAnon(old_page)) {
+ dec_mm_counter(mm, file_rss);
+ inc_mm_counter(mm, anon_rss);
+ }
+- } else
++ } else {
++ ub_unused_privvm_dec(mm, vma);
+ inc_mm_counter(mm, anon_rss);
++ inc_vma_rss(vma);
++ }
+ flush_cache_page(vma, address, pte_pfn(orig_pte));
+ entry = mk_pte(new_page, vma->vm_page_prot);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+@@ -1510,6 +1579,7 @@ gotten:
+ lazy_mmu_prot_update(entry);
+ lru_cache_add_active(new_page);
+ page_add_new_anon_rmap(new_page, vma, address);
++ pb_add_ref(new_page, mm, &pbc);
+
+ /* Free the old page.. */
+ new_page = old_page;
+@@ -1519,10 +1589,13 @@ gotten:
+ page_cache_release(new_page);
+ if (old_page)
+ page_cache_release(old_page);
++ pb_free(&pbc);
+ unlock:
+ pte_unmap_unlock(page_table, ptl);
+ return ret;
+ oom:
++ pb_free(&pbc);
++oom_nopb:
+ if (old_page)
+ page_cache_release(old_page);
+ return VM_FAULT_OOM;
+@@ -1877,10 +1950,16 @@ static int do_swap_page(struct mm_struct
+ swp_entry_t entry;
+ pte_t pte;
+ int ret = VM_FAULT_MINOR;
++ struct page_beancounter *pbc;
++ cycles_t start;
+
+ if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
+- goto out;
++ goto out_nostat;
++
++ if (unlikely(pb_alloc(&pbc)))
++ return VM_FAULT_OOM;
+
++ start = get_cycles();
+ entry = pte_to_swp_entry(orig_pte);
+ again:
+ page = lookup_swap_cache(entry);
+@@ -1928,6 +2007,8 @@ again:
+ /* The page isn't present yet, go ahead with the fault. */
+
+ inc_mm_counter(mm, anon_rss);
++ inc_vma_rss(vma);
++ ub_swapin_inc(mm);
+ pte = mk_pte(page, vma->vm_page_prot);
+ if (write_access && can_share_swap_page(page)) {
+ pte = maybe_mkwrite(pte_mkdirty(pte), vma);
+@@ -1937,6 +2018,8 @@ again:
+ flush_icache_page(vma, page);
+ set_pte_at(mm, address, page_table, pte);
+ page_add_anon_rmap(page, vma, address);
++ pb_add_ref(page, mm, &pbc);
++ ub_unused_privvm_dec(mm, vma);
+
+ swap_free(entry);
+ if (vm_swap_full())
+@@ -1947,7 +2030,7 @@ again:
+ if (do_wp_page(mm, vma, address,
+ page_table, pmd, ptl, pte) == VM_FAULT_OOM)
+ ret = VM_FAULT_OOM;
+- goto out;
++ goto out_wp;
+ }
+
+ /* No need to invalidate - it was non-present before */
+@@ -1955,10 +2038,16 @@ again:
+ lazy_mmu_prot_update(pte);
+ unlock:
+ pte_unmap_unlock(page_table, ptl);
+-out:
++out_wp:
++ pb_free(&pbc);
++ spin_lock_irq(&kstat_glb_lock);
++ KSTAT_LAT_ADD(&kstat_glob.swap_in, get_cycles() - start);
++ spin_unlock_irq(&kstat_glb_lock);
++out_nostat:
+ return ret;
+ out_nomap:
+ pte_unmap_unlock(page_table, ptl);
++ pb_free(&pbc);
+ unlock_page(page);
+ page_cache_release(page);
+ return ret;
+@@ -1976,11 +2065,15 @@ static int do_anonymous_page(struct mm_s
+ struct page *page;
+ spinlock_t *ptl;
+ pte_t entry;
++ struct page_beancounter *pbc;
+
+ if (write_access) {
+ /* Allocate our own private page. */
+ pte_unmap(page_table);
+
++ if (unlikely(pb_alloc(&pbc)))
++ goto oom_nopb;
++
+ if (unlikely(anon_vma_prepare(vma)))
+ goto oom;
+ page = alloc_zeroed_user_highpage(vma, address);
+@@ -1996,7 +2089,10 @@ static int do_anonymous_page(struct mm_s
+ inc_mm_counter(mm, anon_rss);
+ lru_cache_add_active(page);
+ page_add_new_anon_rmap(page, vma, address);
++ pb_add_ref(page, mm, &pbc);
+ } else {
++ pbc = NULL;
++
+ /* Map the ZERO_PAGE - vm_page_prot is readonly */
+ page = ZERO_PAGE(address);
+ page_cache_get(page);
+@@ -2010,18 +2106,23 @@ static int do_anonymous_page(struct mm_s
+ page_add_file_rmap(page);
+ }
+
++ inc_vma_rss(vma);
++ ub_unused_privvm_dec(mm, vma);
+ set_pte_at(mm, address, page_table, entry);
+
+ /* No need to invalidate - it was non-present before */
+ update_mmu_cache(vma, address, entry);
+ lazy_mmu_prot_update(entry);
+ unlock:
++ pb_free(&pbc);
+ pte_unmap_unlock(page_table, ptl);
+ return VM_FAULT_MINOR;
+ release:
+ page_cache_release(page);
+ goto unlock;
+ oom:
++ pb_free(&pbc);
++oom_nopb:
+ return VM_FAULT_OOM;
+ }
+
+@@ -2049,6 +2150,7 @@ static int do_no_page(struct mm_struct *
+ unsigned int sequence = 0;
+ int ret = VM_FAULT_MINOR;
+ int anon = 0;
++ struct page_beancounter *pbc;
+
+ pte_unmap(page_table);
+ BUG_ON(vma->vm_flags & VM_PFNMAP);
+@@ -2058,6 +2160,9 @@ static int do_no_page(struct mm_struct *
+ sequence = mapping->truncate_count;
+ smp_rmb(); /* serializes i_size against truncate_count */
+ }
++
++ if (unlikely(pb_alloc(&pbc)))
++ goto oom_nopb;
+ retry:
+ new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
+ /*
+@@ -2070,9 +2175,9 @@ retry:
+
+ /* no page was available -- either SIGBUS or OOM */
+ if (new_page == NOPAGE_SIGBUS)
+- return VM_FAULT_SIGBUS;
++ goto bus_nopg;
+ if (new_page == NOPAGE_OOM)
+- return VM_FAULT_OOM;
++ goto oom_nopg;
+
+ /*
+ * Should we do an early C-O-W break?
+@@ -2131,6 +2236,9 @@ retry:
+ inc_mm_counter(mm, file_rss);
+ page_add_file_rmap(new_page);
+ }
++ inc_vma_rss(vma);
++ pb_add_ref(new_page, mm, &pbc);
++ ub_unused_privvm_dec(mm, vma);
+ } else {
+ /* One of our sibling threads was faster, back out. */
+ page_cache_release(new_page);
+@@ -2142,10 +2250,18 @@ retry:
+ lazy_mmu_prot_update(entry);
+ unlock:
+ pte_unmap_unlock(page_table, ptl);
++ pb_free(&pbc);
+ return ret;
+ oom:
+ page_cache_release(new_page);
++oom_nopg:
++ pb_free(&pbc);
++oom_nopb:
+ return VM_FAULT_OOM;
++
++bus_nopg:
++ pb_free(&pbc);
++ return VM_FAULT_SIGBUS;
+ }
+
+ /*
+@@ -2314,6 +2430,8 @@ int __pud_alloc(struct mm_struct *mm, pg
+ }
+ #endif /* __PAGETABLE_PUD_FOLDED */
+
++EXPORT_SYMBOL_GPL(__pud_alloc);
++
+ #ifndef __PAGETABLE_PMD_FOLDED
+ /*
+ * Allocate page middle directory.
+@@ -2348,6 +2466,8 @@ int __pmd_alloc(struct mm_struct *mm, pu
+ }
+ #endif /* __PAGETABLE_PMD_FOLDED */
+
++EXPORT_SYMBOL_GPL(__pmd_alloc);
++
+ int make_pages_present(unsigned long addr, unsigned long end)
+ {
+ int ret, len, write;
+diff -upr linux-2.6.16.orig/mm/mempolicy.c linux-2.6.16-026test015/mm/mempolicy.c
+--- linux-2.6.16.orig/mm/mempolicy.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/mm/mempolicy.c 2006-07-04 14:41:38.000000000 +0400
+@@ -933,7 +933,7 @@ asmlinkage long sys_migrate_pages(pid_t
+
+ /* Find the mm_struct */
+ read_lock(&tasklist_lock);
+- task = pid ? find_task_by_pid(pid) : current;
++ task = pid ? find_task_by_pid_ve(pid) : current;
+ if (!task) {
+ read_unlock(&tasklist_lock);
+ return -ESRCH;
+@@ -1796,7 +1796,6 @@ static void gather_stats(struct page *pa
+ md->mapcount_max = count;
+
+ md->node[page_to_nid(page)]++;
+- cond_resched();
+ }
+
+ #ifdef CONFIG_HUGETLB_PAGE
+diff -upr linux-2.6.16.orig/mm/mempool.c linux-2.6.16-026test015/mm/mempool.c
+--- linux-2.6.16.orig/mm/mempool.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/mm/mempool.c 2006-07-04 14:41:37.000000000 +0400
+@@ -14,6 +14,7 @@
+ #include <linux/mempool.h>
+ #include <linux/blkdev.h>
+ #include <linux/writeback.h>
++#include <linux/kmem_cache.h>
+
+ static void add_element(mempool_t *pool, void *element)
+ {
+@@ -78,6 +79,8 @@ mempool_t *mempool_create_node(int min_n
+ init_waitqueue_head(&pool->wait);
+ pool->alloc = alloc_fn;
+ pool->free = free_fn;
++ if (alloc_fn == mempool_alloc_slab)
++ kmem_mark_nocharge((kmem_cache_t *)pool_data);
+
+ /*
+ * First pre-allocate the guaranteed number of buffers.
+@@ -119,6 +122,7 @@ int mempool_resize(mempool_t *pool, int
+ unsigned long flags;
+
+ BUG_ON(new_min_nr <= 0);
++ gfp_mask &= ~__GFP_UBC;
+
+ spin_lock_irqsave(&pool->lock, flags);
+ if (new_min_nr <= pool->min_nr) {
+@@ -212,6 +216,7 @@ void * mempool_alloc(mempool_t *pool, gf
+ gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */
+ gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */
+ gfp_mask |= __GFP_NOWARN; /* failures are OK */
++ gfp_mask &= ~__GFP_UBC;
+
+ gfp_temp = gfp_mask & ~(__GFP_WAIT|__GFP_IO);
+
+diff -upr linux-2.6.16.orig/mm/mlock.c linux-2.6.16-026test015/mm/mlock.c
+--- linux-2.6.16.orig/mm/mlock.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/mm/mlock.c 2006-07-04 14:41:39.000000000 +0400
+@@ -8,9 +8,11 @@
+ #include <linux/capability.h>
+ #include <linux/mman.h>
+ #include <linux/mm.h>
++#include <linux/module.h>
+ #include <linux/mempolicy.h>
+ #include <linux/syscalls.h>
+
++#include <ub/ub_vmpages.h>
+
+ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
+ unsigned long start, unsigned long end, unsigned int newflags)
+@@ -25,6 +27,14 @@ static int mlock_fixup(struct vm_area_st
+ goto out;
+ }
+
++ if (newflags & VM_LOCKED) {
++ ret = ub_locked_charge(mm, end - start);
++ if (ret < 0) {
++ *prev = vma;
++ goto out;
++ }
++ }
++
+ pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
+ *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
+ vma->vm_file, pgoff, vma_policy(vma));
+@@ -38,13 +48,13 @@ static int mlock_fixup(struct vm_area_st
+ if (start != vma->vm_start) {
+ ret = split_vma(mm, vma, start, 1);
+ if (ret)
+- goto out;
++ goto out_uncharge;
+ }
+
+ if (end != vma->vm_end) {
+ ret = split_vma(mm, vma, end, 0);
+ if (ret)
+- goto out;
++ goto out_uncharge;
+ }
+
+ success:
+@@ -63,13 +73,19 @@ success:
+ pages = -pages;
+ if (!(newflags & VM_IO))
+ ret = make_pages_present(start, end);
+- }
++ } else
++ ub_locked_uncharge(mm, end - start);
+
+ vma->vm_mm->locked_vm -= pages;
+ out:
+ if (ret == -ENOMEM)
+ ret = -EAGAIN;
+ return ret;
++
++out_uncharge:
++ if (newflags & VM_LOCKED)
++ ub_locked_uncharge(mm, end - start);
++ goto out;
+ }
+
+ static int do_mlock(unsigned long start, size_t len, int on)
+@@ -146,6 +162,7 @@ asmlinkage long sys_mlock(unsigned long
+ up_write(&current->mm->mmap_sem);
+ return error;
+ }
++EXPORT_SYMBOL_GPL(sys_mlock);
+
+ asmlinkage long sys_munlock(unsigned long start, size_t len)
+ {
+@@ -158,6 +175,7 @@ asmlinkage long sys_munlock(unsigned lon
+ up_write(&current->mm->mmap_sem);
+ return ret;
+ }
++EXPORT_SYMBOL_GPL(sys_munlock);
+
+ static int do_mlockall(int flags)
+ {
+diff -upr linux-2.6.16.orig/mm/mmap.c linux-2.6.16-026test015/mm/mmap.c
+--- linux-2.6.16.orig/mm/mmap.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/mm/mmap.c 2006-07-04 14:41:39.000000000 +0400
+@@ -25,14 +25,18 @@
+ #include <linux/mount.h>
+ #include <linux/mempolicy.h>
+ #include <linux/rmap.h>
++#include <linux/virtinfo.h>
+
+ #include <asm/uaccess.h>
+ #include <asm/cacheflush.h>
+ #include <asm/tlb.h>
+
++#include <ub/ub_vmpages.h>
++
+ static void unmap_region(struct mm_struct *mm,
+ struct vm_area_struct *vma, struct vm_area_struct *prev,
+ unsigned long start, unsigned long end);
++static unsigned long __do_brk(unsigned long addr, unsigned long len, int soft);
+
+ /*
+ * WARNING: the debugging will use recursive algorithms so never enable this
+@@ -87,6 +91,16 @@ int __vm_enough_memory(long pages, int c
+
+ vm_acct_memory(pages);
+
++ switch (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_ENOUGHMEM,
++ (void *)pages)
++ & (NOTIFY_OK | NOTIFY_FAIL)) {
++ case NOTIFY_OK:
++ return 0;
++ case NOTIFY_FAIL:
++ vm_unacct_memory(pages);
++ return -ENOMEM;
++ }
++
+ /*
+ * Sometimes we want to use more memory than we have
+ */
+@@ -201,11 +215,16 @@ static struct vm_area_struct *remove_vma
+ struct vm_area_struct *next = vma->vm_next;
+
+ might_sleep();
++
++ ub_memory_uncharge(vma->vm_mm, vma->vm_end - vma->vm_start,
++ vma->vm_flags, vma->vm_file);
+ if (vma->vm_ops && vma->vm_ops->close)
+ vma->vm_ops->close(vma);
+ if (vma->vm_file)
+ fput(vma->vm_file);
+ mpol_free(vma_policy(vma));
++ if (get_vma_rss(vma))
++ warn_bad_rss(vma, 0);
+ kmem_cache_free(vm_area_cachep, vma);
+ return next;
+ }
+@@ -242,7 +261,7 @@ asmlinkage unsigned long sys_brk(unsigne
+ goto out;
+
+ /* Ok, looks good - let it rip. */
+- if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
++ if (__do_brk(oldbrk, newbrk-oldbrk, UB_HARD) != oldbrk)
+ goto out;
+ set_brk:
+ mm->brk = brk;
+@@ -726,7 +745,7 @@ struct vm_area_struct *vma_merge(struct
+ else
+ next = mm->mmap;
+ area = next;
+- if (next && next->vm_end == end) /* cases 6, 7, 8 */
++ if (next && next->vm_end == end) /* cases 6, 7, 8 */
+ next = next->vm_next;
+
+ /*
+@@ -746,11 +765,22 @@ struct vm_area_struct *vma_merge(struct
+ is_mergeable_anon_vma(prev->anon_vma,
+ next->anon_vma)) {
+ /* cases 1, 6 */
++ add_vma_rss(prev, get_vma_rss(next));
++ if (area != next) /* case 6 */
++ add_vma_rss(prev, get_vma_rss(area));
+ vma_adjust(prev, prev->vm_start,
+ next->vm_end, prev->vm_pgoff, NULL);
+- } else /* cases 2, 5, 7 */
++ } else { /* cases 2, 5, 7 */
++ if (next && addr == next->vm_start) { /* case 5 */
++ unsigned long rss;
++ rss = pages_in_vma_range(next, addr, end);
++ sub_vma_rss(next, rss);
++ add_vma_rss(prev, rss);
++ } else if (area != next) /* case 7 */
++ add_vma_rss(prev, get_vma_rss(area));
+ vma_adjust(prev, prev->vm_start,
+ end, prev->vm_pgoff, NULL);
++ }
+ return prev;
+ }
+
+@@ -761,12 +791,19 @@ struct vm_area_struct *vma_merge(struct
+ mpol_equal(policy, vma_policy(next)) &&
+ can_vma_merge_before(next, vm_flags,
+ anon_vma, file, pgoff+pglen)) {
+- if (prev && addr < prev->vm_end) /* case 4 */
++ if (prev && addr < prev->vm_end) { /* case 4 */
++ unsigned long rss;
++ rss = pages_in_vma_range(prev, addr, end);
++ sub_vma_rss(prev, rss);
++ add_vma_rss(next, rss);
+ vma_adjust(prev, prev->vm_start,
+ addr, prev->vm_pgoff, NULL);
+- else /* cases 3, 8 */
++ } else { /* cases 3, 8 */
++ if (area != next) /* case 8 */
++ add_vma_rss(area, get_vma_rss(next));
+ vma_adjust(area, addr, next->vm_end,
+ next->vm_pgoff - pglen, NULL);
++ }
+ return area;
+ }
+
+@@ -1033,6 +1070,10 @@ munmap_back:
+ }
+ }
+
++ if (ub_memory_charge(mm, len, vm_flags, file,
++ (flags & MAP_EXECPRIO ? UB_SOFT : UB_HARD)))
++ goto charge_error;
++
+ /*
+ * Can we just expand an old private anonymous mapping?
+ * The VM_SHARED test is necessary because shmem_zero_setup
+@@ -1048,7 +1089,8 @@ munmap_back:
+ * specific mapper. the address has already been validated, but
+ * not unmapped, but the maps are removed from the list.
+ */
+- vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
++ vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL |
++ (flags & MAP_EXECPRIO ? __GFP_SOFT_UBC : 0));
+ if (!vma) {
+ error = -ENOMEM;
+ goto unacct_error;
+@@ -1107,6 +1149,19 @@ munmap_back:
+ if (correct_wcount)
+ atomic_inc(&inode->i_writecount);
+ } else {
++ unsigned long rss;
++
++ rss = get_vma_rss(vma);
++ if (rss > 0) {
++ if (prev->vm_next && prev->vm_next->vm_start == addr)
++ /* vma_merge expanded next vm_area */
++ add_vma_rss(prev->vm_next, rss);
++ else
++ /* vma_merge expanded prev vm_area
++				 * and probably split it with next
++ */
++ add_vma_rss(prev, rss);
++ }
+ if (file) {
+ if (correct_wcount)
+ atomic_inc(&inode->i_writecount);
+@@ -1142,6 +1197,8 @@ unmap_and_free_vma:
+ free_vma:
+ kmem_cache_free(vm_area_cachep, vma);
+ unacct_error:
++ ub_memory_uncharge(mm, len, vm_flags, file);
++charge_error:
+ if (charged)
+ vm_unacct_memory(charged);
+ return error;
+@@ -1471,12 +1528,16 @@ static int acct_stack_growth(struct vm_a
+ return -ENOMEM;
+ }
+
++ if (ub_memory_charge(mm, grow << PAGE_SHIFT, vma->vm_flags,
++ vma->vm_file, UB_SOFT))
++ goto fail_charge;
++
+ /*
+ * Overcommit.. This must be the final test, as it will
+ * update security statistics.
+ */
+ if (security_vm_enough_memory(grow))
+- return -ENOMEM;
++ goto fail_sec;
+
+ /* Ok, everything looks good - let it rip */
+ mm->total_vm += grow;
+@@ -1484,6 +1545,11 @@ static int acct_stack_growth(struct vm_a
+ mm->locked_vm += grow;
+ vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
+ return 0;
++
++fail_sec:
++ ub_memory_uncharge(mm, grow << PAGE_SHIFT, vma->vm_flags, vma->vm_file);
++fail_charge:
++ return -ENOMEM;
+ }
+
+ #if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64)
+@@ -1744,8 +1810,13 @@ int split_vma(struct mm_struct * mm, str
+ else
+ vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
+
++ /* protected with mmap sem */
++ set_vma_rss(vma, pages_in_vma(vma));
++ set_vma_rss(new, pages_in_vma(new));
++
+ return 0;
+ }
++EXPORT_SYMBOL_GPL(split_vma);
+
+ /* Munmap is split into 2 main parts -- this part which finds
+ * what needs doing, and the areas themselves, which do the
+@@ -1839,7 +1910,7 @@ static inline void verify_mm_writelocked
+ * anonymous maps. eventually we may be able to do some
+ * brk-specific accounting here.
+ */
+-unsigned long do_brk(unsigned long addr, unsigned long len)
++static unsigned long __do_brk(unsigned long addr, unsigned long len, int soft)
+ {
+ struct mm_struct * mm = current->mm;
+ struct vm_area_struct * vma, * prev;
+@@ -1891,11 +1962,14 @@ unsigned long do_brk(unsigned long addr,
+ if (mm->map_count > sysctl_max_map_count)
+ return -ENOMEM;
+
+- if (security_vm_enough_memory(len >> PAGE_SHIFT))
+- return -ENOMEM;
+-
+ flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
+
++ if (ub_memory_charge(mm, len, flags, NULL, soft))
++ goto fail_charge;
++
++ if (security_vm_enough_memory(len >> PAGE_SHIFT))
++ goto fail_sec;
++
+ /* Can we just expand an old private anonymous mapping? */
+ if (vma_merge(mm, prev, addr, addr + len, flags,
+ NULL, NULL, pgoff, NULL))
+@@ -1904,11 +1978,11 @@ unsigned long do_brk(unsigned long addr,
+ /*
+ * create a vma struct for an anonymous mapping
+ */
+- vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+- if (!vma) {
+- vm_unacct_memory(len >> PAGE_SHIFT);
+- return -ENOMEM;
+- }
++ vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL |
++ (soft == UB_SOFT ? __GFP_SOFT_UBC : 0));
++ if (!vma)
++ goto fail_alloc;
++
+ memset(vma, 0, sizeof(*vma));
+
+ vma->vm_mm = mm;
+@@ -1925,8 +1999,19 @@ out:
+ make_pages_present(addr, addr + len);
+ }
+ return addr;
++
++fail_alloc:
++ vm_unacct_memory(len >> PAGE_SHIFT);
++fail_sec:
++ ub_memory_uncharge(mm, len, flags, NULL);
++fail_charge:
++ return -ENOMEM;
+ }
+
++unsigned long do_brk(unsigned long addr, unsigned long len)
++{
++ return __do_brk(addr, len, UB_SOFT);
++}
+ EXPORT_SYMBOL(do_brk);
+
+ /* Release all mmaps. */
+@@ -2036,6 +2121,7 @@ struct vm_area_struct *copy_vma(struct v
+ new_vma->vm_start = addr;
+ new_vma->vm_end = addr + len;
+ new_vma->vm_pgoff = pgoff;
++ set_vma_rss(new_vma, 0);
+ if (new_vma->vm_file)
+ get_file(new_vma->vm_file);
+ if (new_vma->vm_ops && new_vma->vm_ops->open)
+diff -upr linux-2.6.16.orig/mm/mprotect.c linux-2.6.16-026test015/mm/mprotect.c
+--- linux-2.6.16.orig/mm/mprotect.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/mm/mprotect.c 2006-07-04 14:41:39.000000000 +0400
+@@ -9,6 +9,7 @@
+ */
+
+ #include <linux/mm.h>
++#include <linux/module.h>
+ #include <linux/hugetlb.h>
+ #include <linux/slab.h>
+ #include <linux/shm.h>
+@@ -25,6 +26,8 @@
+ #include <asm/cacheflush.h>
+ #include <asm/tlbflush.h>
+
++#include <ub/ub_vmpages.h>
++
+ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long addr, unsigned long end, pgprot_t newprot)
+ {
+@@ -109,12 +112,20 @@ mprotect_fixup(struct vm_area_struct *vm
+ pgprot_t newprot;
+ pgoff_t pgoff;
+ int error;
++ unsigned long ch_size;
++ int ch_dir;
+
+ if (newflags == oldflags) {
+ *pprev = vma;
+ return 0;
+ }
+
++ error = -ENOMEM;
++ ch_size = nrpages - pages_in_vma_range(vma, start, end);
++ ch_dir = ub_protected_charge(mm, ch_size, newflags, vma);
++ if (ch_dir == PRIVVM_ERROR)
++ goto fail_ch;
++
+ /*
+ * If we make a private mapping writable we increase our commit;
+ * but (without finer accounting) cannot reduce our commit if we
+@@ -127,7 +138,7 @@ mprotect_fixup(struct vm_area_struct *vm
+ if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) {
+ charged = nrpages;
+ if (security_vm_enough_memory(charged))
+- return -ENOMEM;
++ goto fail_sec;
+ newflags |= VM_ACCOUNT;
+ }
+ }
+@@ -169,10 +180,16 @@ success:
+ change_protection(vma, start, end, newprot);
+ vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
+ vm_stat_account(mm, newflags, vma->vm_file, nrpages);
++ if (ch_dir == PRIVVM_TO_SHARED)
++ __ub_unused_privvm_dec(mm, ch_size);
+ return 0;
+
+ fail:
+ vm_unacct_memory(charged);
++fail_sec:
++ if (ch_dir == PRIVVM_TO_PRIVATE)
++ __ub_unused_privvm_dec(mm, ch_size);
++fail_ch:
+ return error;
+ }
+
+@@ -280,3 +297,4 @@ out:
+ up_write(&current->mm->mmap_sem);
+ return error;
+ }
++EXPORT_SYMBOL_GPL(sys_mprotect);
+diff -upr linux-2.6.16.orig/mm/mremap.c linux-2.6.16-026test015/mm/mremap.c
+--- linux-2.6.16.orig/mm/mremap.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/mm/mremap.c 2006-07-04 14:41:37.000000000 +0400
+@@ -23,6 +23,8 @@
+ #include <asm/cacheflush.h>
+ #include <asm/tlbflush.h>
+
++#include <ub/ub_vmpages.h>
++
+ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
+ {
+ pgd_t *pgd;
+@@ -106,6 +108,8 @@ static void move_ptes(struct vm_area_str
+ pte = ptep_clear_flush(vma, old_addr, old_pte);
+ /* ZERO_PAGE can be dependant on virtual addr */
+ pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
++ dec_vma_rss(vma);
++ inc_vma_rss(new_vma);
+ set_pte_at(mm, new_addr, new_pte, pte);
+ }
+
+@@ -166,17 +170,21 @@ static unsigned long move_vma(struct vm_
+ unsigned long hiwater_vm;
+ int split = 0;
+
++ if (ub_memory_charge(mm, new_len, vm_flags,
++ vma->vm_file, UB_HARD))
++ goto err;
++
+ /*
+ * We'd prefer to avoid failure later on in do_munmap:
+ * which may split one vma into three before unmapping.
+ */
+ if (mm->map_count >= sysctl_max_map_count - 3)
+- return -ENOMEM;
++ goto err_nomem;
+
+ new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
+ new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff);
+ if (!new_vma)
+- return -ENOMEM;
++ goto err_nomem;
+
+ moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len);
+ if (moved_len < old_len) {
+@@ -235,7 +243,13 @@ static unsigned long move_vma(struct vm_
+ new_addr + new_len);
+ }
+
+- return new_addr;
++ if (new_addr != -ENOMEM)
++ return new_addr;
++
++err_nomem:
++ ub_memory_uncharge(mm, new_len, vm_flags, vma->vm_file);
++err:
++ return -ENOMEM;
+ }
+
+ /*
+@@ -359,7 +373,15 @@ unsigned long do_mremap(unsigned long ad
+ max_addr = vma->vm_next->vm_start;
+ /* can we just expand the current mapping? */
+ if (max_addr - addr >= new_len) {
+- int pages = (new_len - old_len) >> PAGE_SHIFT;
++ int len;
++ int pages;
++
++ len = new_len - old_len;
++ pages = len >> PAGE_SHIFT;
++ ret = -ENOMEM;
++ if (ub_memory_charge(mm, len, vma->vm_flags,
++ vma->vm_file, UB_HARD))
++ goto out;
+
+ vma_adjust(vma, vma->vm_start,
+ addr + new_len, vma->vm_pgoff, NULL);
+diff -upr linux-2.6.16.orig/mm/oom_kill.c linux-2.6.16-026test015/mm/oom_kill.c
+--- linux-2.6.16.orig/mm/oom_kill.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/mm/oom_kill.c 2006-07-04 14:41:38.000000000 +0400
+@@ -176,7 +176,7 @@ static struct task_struct *select_bad_pr
+ *ppoints = 0;
+
+ do_posix_clock_monotonic_gettime(&uptime);
+- do_each_thread(g, p) {
++ do_each_thread_all(g, p) {
+ unsigned long points;
+ int releasing;
+
+@@ -205,7 +205,7 @@ static struct task_struct *select_bad_pr
+ chosen = p;
+ *ppoints = points;
+ }
+- } while_each_thread(g, p);
++ } while_each_thread_all(g, p);
+ return chosen;
+ }
+
+@@ -261,10 +261,10 @@ static struct mm_struct *oom_kill_task(t
+ * kill all processes that share the ->mm (i.e. all threads),
+ * but are in a different thread group
+ */
+- do_each_thread(g, q)
++ do_each_thread_all(g, q) {
+ if (q->mm == mm && q->tgid != p->tgid)
+ __oom_kill_task(q, message);
+- while_each_thread(g, q);
++ } while_each_thread_all(g, q);
+
+ return mm;
+ }
+diff -upr linux-2.6.16.orig/mm/page_alloc.c linux-2.6.16-026test015/mm/page_alloc.c
+--- linux-2.6.16.orig/mm/page_alloc.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/mm/page_alloc.c 2006-07-04 14:41:38.000000000 +0400
+@@ -41,6 +41,8 @@
+ #include <asm/tlbflush.h>
+ #include "internal.h"
+
++#include <ub/ub_mem.h>
++
+ /*
+ * MCD - HACK: Find somewhere to initialize this EARLY, or make this
+ * initializer cleaner
+@@ -50,6 +52,7 @@ EXPORT_SYMBOL(node_online_map);
+ nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL;
+ EXPORT_SYMBOL(node_possible_map);
+ struct pglist_data *pgdat_list __read_mostly;
++EXPORT_SYMBOL(pgdat_list);
+ unsigned long totalram_pages __read_mostly;
+ unsigned long totalhigh_pages __read_mostly;
+ long nr_swap_pages;
+@@ -153,7 +156,8 @@ static void bad_page(struct page *page)
+ 1 << PG_reclaim |
+ 1 << PG_slab |
+ 1 << PG_swapcache |
+- 1 << PG_writeback );
++ 1 << PG_writeback |
++ 1 << PG_buddy );
+ set_page_count(page, 0);
+ reset_page_mapcount(page);
+ page->mapping = NULL;
+@@ -224,12 +228,12 @@ static inline unsigned long page_order(s
+
+ static inline void set_page_order(struct page *page, int order) {
+ set_page_private(page, order);
+- __SetPagePrivate(page);
++ __SetPageBuddy(page);
+ }
+
+ static inline void rmv_page_order(struct page *page)
+ {
+- __ClearPagePrivate(page);
++ __ClearPageBuddy(page);
+ set_page_private(page, 0);
+ }
+
+@@ -268,11 +272,13 @@ __find_combined_index(unsigned long page
+ * This function checks whether a page is free && is the buddy
+ * we can do coalesce a page and its buddy if
+ * (a) the buddy is not in a hole &&
+- * (b) the buddy is free &&
+- * (c) the buddy is on the buddy system &&
+- * (d) a page and its buddy have the same order.
+- * for recording page's order, we use page_private(page) and PG_private.
++ * (b) the buddy is in the buddy system &&
++ * (c) a page and its buddy have the same order.
+ *
++ * For recording whether a page is in the buddy system, we use PG_buddy.
++ * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
++ *
++ * For recording page's order, we use page_private(page).
+ */
+ static inline int page_is_buddy(struct page *page, int order)
+ {
+@@ -281,10 +287,10 @@ static inline int page_is_buddy(struct p
+ return 0;
+ #endif
+
+- if (PagePrivate(page) &&
+- (page_order(page) == order) &&
+- page_count(page) == 0)
++ if (PageBuddy(page) && page_order(page) == order) {
++ BUG_ON(page_count(page) != 0);
+ return 1;
++ }
+ return 0;
+ }
+
+@@ -301,7 +307,7 @@ static inline int page_is_buddy(struct p
+ * as necessary, plus some accounting needed to play nicely with other
+ * parts of the VM system.
+ * At each level, we keep a list of pages, which are heads of continuous
+- * free pages of length of (1 << order) and marked with PG_Private.Page's
++ * free pages of length of (1 << order) and marked with PG_buddy. Page's
+ * order is recorded in page_private(page) field.
+ * So when we are allocating or freeing one, we can derive the state of the
+ * other. That is, if we allocate a small block, and both were
+@@ -364,7 +370,8 @@ static inline int free_pages_check(struc
+ 1 << PG_slab |
+ 1 << PG_swapcache |
+ 1 << PG_writeback |
+- 1 << PG_reserved ))))
++ 1 << PG_reserved |
++ 1 << PG_buddy ))))
+ bad_page(page);
+ if (PageDirty(page))
+ __ClearPageDirty(page);
+@@ -434,6 +441,7 @@ static void __free_pages_ok(struct page
+ return;
+
+ kernel_map_pages(page, 1 << order, 0);
++ ub_page_uncharge(page, order);
+ local_irq_save(flags);
+ __mod_page_state(pgfree, 1 << order);
+ free_one_page(page_zone(page), page, order);
+@@ -522,7 +530,8 @@ static int prep_new_page(struct page *pa
+ 1 << PG_slab |
+ 1 << PG_swapcache |
+ 1 << PG_writeback |
+- 1 << PG_reserved ))))
++ 1 << PG_reserved |
++ 1 << PG_buddy ))))
+ bad_page(page);
+
+ /*
+@@ -721,6 +730,7 @@ static void fastcall free_hot_cold_page(
+ kernel_map_pages(page, 1, 0);
+
+ pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
++ ub_page_uncharge(page, 0);
+ local_irq_save(flags);
+ __inc_page_state(pgfree);
+ list_add(&page->lru, &pcp->list);
+@@ -894,6 +904,28 @@ get_page_from_freelist(gfp_t gfp_mask, u
+ return page;
+ }
+
++static void __alloc_collect_stats(unsigned int gfp_mask,
++ unsigned int order, struct page *page, cycles_t time)
++{
++ int ind;
++ unsigned long flags;
++
++ time = get_cycles() - time;
++ if (!(gfp_mask & __GFP_WAIT))
++ ind = 0;
++ else if (!(gfp_mask & __GFP_HIGHMEM))
++ ind = (order > 0 ? 2 : 1);
++ else
++ ind = (order > 0 ? 4 : 3);
++ spin_lock_irqsave(&kstat_glb_lock, flags);
++ KSTAT_LAT_ADD(&kstat_glob.alloc_lat[ind], time);
++ if (!page)
++ kstat_glob.alloc_fails[ind]++;
++ spin_unlock_irqrestore(&kstat_glb_lock, flags);
++}
++
++int alloc_fail_warn;
++
+ /*
+ * This is the 'heart' of the zoned buddy allocator.
+ */
+@@ -909,6 +941,7 @@ __alloc_pages(gfp_t gfp_mask, unsigned i
+ int do_retry;
+ int alloc_flags;
+ int did_some_progress;
++ cycles_t start;
+
+ might_sleep_if(wait);
+
+@@ -920,6 +953,7 @@ restart:
+ return NULL;
+ }
+
++ start = get_cycles();
+ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
+ zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET);
+ if (page)
+@@ -944,7 +978,8 @@ restart:
+ alloc_flags |= ALLOC_HARDER;
+ if (gfp_mask & __GFP_HIGH)
+ alloc_flags |= ALLOC_HIGH;
+- alloc_flags |= ALLOC_CPUSET;
++ if (wait)
++ alloc_flags |= ALLOC_CPUSET;
+
+ /*
+ * Go through the zonelist again. Let __GFP_HIGH and allocations
+@@ -1038,14 +1073,22 @@ rebalance:
+ }
+
+ nopage:
+- if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
++ __alloc_collect_stats(gfp_mask, order, page, start);
++ if (alloc_fail_warn && !(gfp_mask & __GFP_NOWARN) &&
++ printk_ratelimit()) {
+ printk(KERN_WARNING "%s: page allocation failure."
+ " order:%d, mode:0x%x\n",
+ p->comm, order, gfp_mask);
+ dump_stack();
+ show_mem();
+ }
++ return NULL;
++
+ got_pg:
++ if (ub_page_charge(page, order, gfp_mask)) {
++ __free_pages(page, order);
++ page = NULL;
++ }
+ return page;
+ }
+
+@@ -2378,7 +2421,10 @@ static void *vmstat_start(struct seq_fil
+ m->private = ps;
+ if (!ps)
+ return ERR_PTR(-ENOMEM);
+- get_full_page_state(ps);
++ if (ve_is_super(get_exec_env()))
++ get_full_page_state(ps);
++ else
++ memset(ps, 0, sizeof(*ps));
+ ps->pgpgin /= 2; /* sectors -> kbytes */
+ ps->pgpgout /= 2;
+ return (unsigned long *)ps + *pos;
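
The page_alloc.c hunks above stop overloading PG_private for free pages and introduce a dedicated PG_buddy flag: a page sitting in the buddy allocator is marked with PG_buddy and keeps its block order in page_private(page), which lets page_is_buddy() drop the page_count() test and turn it into a BUG_ON() sanity check. Below is a minimal user-space sketch of that bookkeeping; the struct, flag bit and field names are simplified stand-ins, not the kernel's real definitions.

#include <assert.h>
#include <stdio.h>

#define PG_BUDDY (1u << 0)              /* stand-in for the kernel's PG_buddy bit */

struct page {
        unsigned int flags;             /* page flag bits */
        unsigned long private;          /* holds the order while the page is free */
};

static void set_page_order(struct page *p, unsigned long order)
{
        p->private = order;             /* record the order of the free block */
        p->flags |= PG_BUDDY;           /* page now belongs to the buddy allocator */
}

static void rmv_page_order(struct page *p)
{
        p->flags &= ~PG_BUDDY;
        p->private = 0;
}

/* Coalescing is allowed only if the buddy really is in the buddy system
 * and is a free block of the same order. */
static int page_is_buddy(const struct page *p, unsigned long order)
{
        return (p->flags & PG_BUDDY) && p->private == order;
}

int main(void)
{
        struct page buddy = { 0, 0 };

        set_page_order(&buddy, 3);
        assert(page_is_buddy(&buddy, 3));
        assert(!page_is_buddy(&buddy, 2));      /* wrong order: no coalescing */

        rmv_page_order(&buddy);
        assert(!page_is_buddy(&buddy, 3));      /* no longer in the buddy system */

        puts("ok");
        return 0;
}
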
+diff -upr linux-2.6.16.orig/mm/rmap.c linux-2.6.16-026test015/mm/rmap.c
+--- linux-2.6.16.orig/mm/rmap.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/mm/rmap.c 2006-07-04 14:41:39.000000000 +0400
+@@ -56,6 +56,8 @@
+
+ #include <asm/tlbflush.h>
+
++#include <ub/ub_vmpages.h>
++
+ //#define RMAP_DEBUG /* can be enabled only for debugging */
+
+ kmem_cache_t *anon_vma_cachep;
+@@ -117,6 +119,7 @@ int anon_vma_prepare(struct vm_area_stru
+ }
+ return 0;
+ }
++EXPORT_SYMBOL_GPL(anon_vma_prepare);
+
+ void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next)
+ {
+@@ -145,6 +148,7 @@ void anon_vma_link(struct vm_area_struct
+ spin_unlock(&anon_vma->lock);
+ }
+ }
++EXPORT_SYMBOL_GPL(anon_vma_link);
+
+ void anon_vma_unlink(struct vm_area_struct *vma)
+ {
+@@ -180,14 +184,15 @@ static void anon_vma_ctor(void *data, km
+ void __init anon_vma_init(void)
+ {
+ anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
+- 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor, NULL);
++ 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC|SLAB_UBC,
++ anon_vma_ctor, NULL);
+ }
+
+ /*
+ * Getting a lock on a stable anon_vma from a page off the LRU is
+ * tricky: page_lock_anon_vma rely on RCU to guard against the races.
+ */
+-static struct anon_vma *page_lock_anon_vma(struct page *page)
++struct anon_vma *page_lock_anon_vma(struct page *page)
+ {
+ struct anon_vma *anon_vma = NULL;
+ unsigned long anon_mapping;
+@@ -205,6 +210,7 @@ out:
+ rcu_read_unlock();
+ return anon_vma;
+ }
++EXPORT_SYMBOL_GPL(page_lock_anon_vma);
+
+ #ifdef CONFIG_MIGRATION
+ /*
+@@ -220,6 +226,7 @@ void remove_from_swap(struct page *page)
+ struct anon_vma *anon_vma;
+ struct vm_area_struct *vma;
+ unsigned long mapping;
++ struct page_beancounter *pb;
+
+ if (!PageSwapCache(page))
+ return;
+@@ -229,6 +236,10 @@ void remove_from_swap(struct page *page)
+ if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0)
+ return;
+
++ pb = NULL;
++ if (pb_alloc_all(&pb))
++ return;
++
+ /*
+ * We hold the mmap_sem lock. So no need to call page_lock_anon_vma.
+ */
+@@ -236,10 +247,12 @@ void remove_from_swap(struct page *page)
+ spin_lock(&anon_vma->lock);
+
+ list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
+- remove_vma_swap(vma, page);
++ remove_vma_swap(vma, page, &pb);
+
+ spin_unlock(&anon_vma->lock);
+ delete_from_swap_cache(page);
++
++ pb_free_list(&pb);
+ }
+ EXPORT_SYMBOL(remove_from_swap);
+ #endif
+@@ -638,7 +651,11 @@ static int try_to_unmap_one(struct page
+ } else
+ dec_mm_counter(mm, file_rss);
+
++ dec_vma_rss(vma);
+ page_remove_rmap(page);
++ ub_unused_privvm_inc(mm, vma);
++ ub_unmap_inc(mm);
++ pb_remove_ref(page, mm);
+ page_cache_release(page);
+
+ out_unmap:
+@@ -729,8 +746,12 @@ static void try_to_unmap_cluster(unsigne
+ set_page_dirty(page);
+
+ page_remove_rmap(page);
++ ub_unmap_inc(mm);
++ pb_remove_ref(page, mm);
++ ub_unused_privvm_inc(mm, vma);
+ page_cache_release(page);
+ dec_mm_counter(mm, file_rss);
++ dec_vma_rss(vma);
+ (*mapcount)--;
+ }
+ pte_unmap_unlock(pte - 1, ptl);
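
In the rmap.c hunks above, remove_from_swap() now walks the anon_vma list with a list of struct page_beancounter nodes pre-allocated by pb_alloc_all() before anon_vma->lock is taken, since no allocation may happen under that spinlock; whatever is left unused is released afterwards with pb_free_list(). The following is a sketch of the same allocate-outside-the-lock pattern, with invented node types and a plain pthread mutex standing in for the kernel spinlock.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node { struct node *next; };

/* Build a pool of n nodes up front, before any lock is held. */
static int prealloc(struct node **list, int n)
{
        while (n--) {
                struct node *nd = malloc(sizeof(*nd));
                if (!nd)
                        return -1;              /* caller frees whatever was built */
                nd->next = *list;
                *list = nd;
        }
        return 0;
}

static struct node *take(struct node **list)
{
        struct node *nd = *list;
        if (nd)
                *list = nd->next;
        return nd;
}

static void free_list(struct node **list)
{
        struct node *nd;
        while ((nd = take(list)) != NULL)
                free(nd);
}

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

int main(void)
{
        struct node *pool = NULL, *used = NULL;
        int i;

        if (prealloc(&pool, 8)) {               /* allocate before taking the lock */
                free_list(&pool);
                return 1;
        }

        pthread_mutex_lock(&lock);              /* no allocation allowed in here */
        for (i = 0; i < 5; i++) {
                struct node *nd = take(&pool);  /* pool is large enough by construction */
                nd->next = used;
                used = nd;
        }
        pthread_mutex_unlock(&lock);

        free_list(&pool);                       /* return the unused leftovers */
        free_list(&used);
        printf("done\n");
        return 0;
}
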
+diff -upr linux-2.6.16.orig/mm/shmem.c linux-2.6.16-026test015/mm/shmem.c
+--- linux-2.6.16.orig/mm/shmem.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/mm/shmem.c 2006-07-04 14:41:39.000000000 +0400
+@@ -50,6 +50,8 @@
+ #include <asm/div64.h>
+ #include <asm/pgtable.h>
+
++#include <ub/ub_vmpages.h>
++
+ /* This magic number is used in glibc for posix shared memory */
+ #define TMPFS_MAGIC 0x01021994
+
+@@ -211,7 +213,7 @@ static void shmem_free_blocks(struct ino
+ *
+ * It has to be called with the spinlock held.
+ */
+-static void shmem_recalc_inode(struct inode *inode)
++static void shmem_recalc_inode(struct inode *inode, long swp_freed)
+ {
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ long freed;
+@@ -221,6 +223,8 @@ static void shmem_recalc_inode(struct in
+ info->alloced -= freed;
+ shmem_unacct_blocks(info->flags, freed);
+ shmem_free_blocks(inode, freed);
++ if (freed > swp_freed)
++ ub_tmpfs_respages_sub(info, freed - swp_freed);
+ }
+ }
+
+@@ -326,6 +330,11 @@ static void shmem_swp_set(struct shmem_i
+ struct page *page = kmap_atomic_to_page(entry);
+ set_page_private(page, page_private(page) + incdec);
+ }
++
++ if (incdec == 1)
++ ub_tmpfs_respages_dec(info);
++ else
++ ub_tmpfs_respages_inc(info);
+ }
+
+ /*
+@@ -342,14 +351,24 @@ static swp_entry_t *shmem_swp_alloc(stru
+ struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
+ struct page *page = NULL;
+ swp_entry_t *entry;
++ unsigned long ub_val;
+
+ if (sgp != SGP_WRITE &&
+ ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode))
+ return ERR_PTR(-EINVAL);
+
++ ub_val = 0;
++ if (info->next_index <= index) {
++ ub_val = index + 1 - info->next_index;
++ if (ub_shmpages_charge(info, ub_val))
++ return ERR_PTR(-ENOSPC);
++ }
++
+ while (!(entry = shmem_swp_entry(info, index, &page))) {
+- if (sgp == SGP_READ)
+- return shmem_swp_map(ZERO_PAGE(0));
++ if (sgp == SGP_READ) {
++ entry = shmem_swp_map(ZERO_PAGE(0));
++ goto out;
++ }
+ /*
+ * Test free_blocks against 1 not 0, since we have 1 data
+ * page (and perhaps indirect index pages) yet to allocate:
+@@ -359,7 +378,8 @@ static swp_entry_t *shmem_swp_alloc(stru
+ spin_lock(&sbinfo->stat_lock);
+ if (sbinfo->free_blocks <= 1) {
+ spin_unlock(&sbinfo->stat_lock);
+- return ERR_PTR(-ENOSPC);
++ entry = ERR_PTR(-ENOSPC);
++ goto out;
+ }
+ sbinfo->free_blocks--;
+ inode->i_blocks += BLOCKS_PER_PAGE;
+@@ -367,31 +387,43 @@ static swp_entry_t *shmem_swp_alloc(stru
+ }
+
+ spin_unlock(&info->lock);
+- page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | __GFP_ZERO);
++ page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) |
++ __GFP_ZERO | __GFP_UBC);
+ if (page)
+ set_page_private(page, 0);
+ spin_lock(&info->lock);
+
+ if (!page) {
+- shmem_free_blocks(inode, 1);
+- return ERR_PTR(-ENOMEM);
++ entry = ERR_PTR(-ENOMEM);
++ goto out_block;
+ }
+ if (sgp != SGP_WRITE &&
+ ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
+ entry = ERR_PTR(-EINVAL);
+- break;
++ goto out_dir;
+ }
+- if (info->next_index <= index)
++ if (info->next_index <= index) {
++ ub_val = 0;
+ info->next_index = index + 1;
++ }
+ }
+ if (page) {
+ /* another task gave its page, or truncated the file */
+ shmem_free_blocks(inode, 1);
+ shmem_dir_free(page);
+ }
+- if (info->next_index <= index && !IS_ERR(entry))
++ if (info->next_index <= index)
+ info->next_index = index + 1;
+ return entry;
++
++out_dir:
++ shmem_dir_free(page);
++out_block:
++ shmem_free_blocks(inode, 1);
++out:
++ if (ub_val)
++ ub_shmpages_uncharge(info, ub_val);
++ return entry;
+ }
+
+ /*
+@@ -484,6 +516,7 @@ static void shmem_truncate_range(struct
+ return;
+
+ spin_lock(&info->lock);
++ ub_shmpages_uncharge(info, info->next_index - idx);
+ info->flags |= SHMEM_TRUNCATE;
+ if (likely(end == (loff_t) -1)) {
+ limit = info->next_index;
+@@ -613,7 +646,7 @@ done2:
+ info->swapped -= nr_swaps_freed;
+ if (nr_pages_to_free)
+ shmem_free_blocks(inode, nr_pages_to_free);
+- shmem_recalc_inode(inode);
++ shmem_recalc_inode(inode, nr_swaps_freed);
+ spin_unlock(&info->lock);
+
+ /*
+@@ -696,6 +729,7 @@ static void shmem_delete_inode(struct in
+ sbinfo->free_inodes++;
+ spin_unlock(&sbinfo->stat_lock);
+ }
++ shmi_ub_put(info);
+ clear_inode(inode);
+ }
+
+@@ -817,6 +851,12 @@ int shmem_unuse(swp_entry_t entry, struc
+ return found;
+ }
+
++#ifdef CONFIG_USER_RESOURCE
++#define shm_get_swap_page(info) (get_swap_page((info)->shmi_ub))
++#else
++#define shm_get_swap_page(info) (get_swap_page(NULL))
++#endif
++
+ /*
+ * Move the page from the page cache to the swap cache.
+ */
+@@ -837,12 +877,12 @@ static int shmem_writepage(struct page *
+ info = SHMEM_I(inode);
+ if (info->flags & VM_LOCKED)
+ goto redirty;
+- swap = get_swap_page();
++ swap = shm_get_swap_page(info);
+ if (!swap.val)
+ goto redirty;
+
+ spin_lock(&info->lock);
+- shmem_recalc_inode(inode);
++ shmem_recalc_inode(inode, 0);
+ if (index >= info->next_index) {
+ BUG_ON(!(info->flags & SHMEM_TRUNCATE));
+ goto unlock;
+@@ -1030,7 +1070,7 @@ repeat:
+ goto failed;
+
+ spin_lock(&info->lock);
+- shmem_recalc_inode(inode);
++ shmem_recalc_inode(inode, 0);
+ entry = shmem_swp_alloc(info, idx, sgp);
+ if (IS_ERR(entry)) {
+ spin_unlock(&info->lock);
+@@ -1206,6 +1246,7 @@ repeat:
+ spin_unlock(&info->lock);
+ flush_dcache_page(filepage);
+ SetPageUptodate(filepage);
++ ub_tmpfs_respages_inc(info);
+ }
+ done:
+ if (*pagep != filepage) {
+@@ -1307,28 +1348,6 @@ shmem_get_policy(struct vm_area_struct *
+ }
+ #endif
+
+-int shmem_lock(struct file *file, int lock, struct user_struct *user)
+-{
+- struct inode *inode = file->f_dentry->d_inode;
+- struct shmem_inode_info *info = SHMEM_I(inode);
+- int retval = -ENOMEM;
+-
+- spin_lock(&info->lock);
+- if (lock && !(info->flags & VM_LOCKED)) {
+- if (!user_shm_lock(inode->i_size, user))
+- goto out_nomem;
+- info->flags |= VM_LOCKED;
+- }
+- if (!lock && (info->flags & VM_LOCKED) && user) {
+- user_shm_unlock(inode->i_size, user);
+- info->flags &= ~VM_LOCKED;
+- }
+- retval = 0;
+-out_nomem:
+- spin_unlock(&info->lock);
+- return retval;
+-}
+-
+ int shmem_mmap(struct file *file, struct vm_area_struct *vma)
+ {
+ file_accessed(file);
+@@ -1365,6 +1384,7 @@ shmem_get_inode(struct super_block *sb,
+ inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ info = SHMEM_I(inode);
+ memset(info, 0, (char *)inode - (char *)info);
++ shmi_ub_set(info, get_exec_ub());
+ spin_lock_init(&info->lock);
+ INIT_LIST_HEAD(&info->swaplist);
+
+@@ -2100,6 +2120,7 @@ static int shmem_fill_super(struct super
+ sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+ sb->s_magic = TMPFS_MAGIC;
+ sb->s_op = &shmem_ops;
++ sb->s_time_gran = 1;
+
+ inode = shmem_get_inode(sb, S_IFDIR | mode, 0);
+ if (!inode)
+@@ -2172,6 +2193,7 @@ static struct address_space_operations s
+ .prepare_write = shmem_prepare_write,
+ .commit_write = simple_commit_write,
+ #endif
++ .migratepage = migrate_page,
+ };
+
+ static struct file_operations shmem_file_operations = {
+@@ -2226,6 +2248,10 @@ static struct vm_operations_struct shmem
+ #endif
+ };
+
++int is_shmem_mapping(struct address_space *map)
++{
++ return (map != NULL && map->a_ops == &shmem_aops);
++}
+
+ static struct super_block *shmem_get_sb(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *data)
+@@ -2233,13 +2259,19 @@ static struct super_block *shmem_get_sb(
+ return get_sb_nodev(fs_type, flags, data, shmem_fill_super);
+ }
+
+-static struct file_system_type tmpfs_fs_type = {
++struct file_system_type tmpfs_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "tmpfs",
+ .get_sb = shmem_get_sb,
+ .kill_sb = kill_litter_super,
+ };
++EXPORT_SYMBOL(tmpfs_fs_type);
++
++#ifdef CONFIG_VE
++#define shm_mnt (get_exec_env()->shmem_mnt)
++#else
+ static struct vfsmount *shm_mnt;
++#endif
+
+ static int __init init_tmpfs(void)
+ {
+@@ -2276,6 +2308,36 @@ out3:
+ }
+ module_init(init_tmpfs)
+
++static inline int shm_charge_ahead(struct inode *inode)
++{
++#ifdef CONFIG_USER_RESOURCE
++ struct shmem_inode_info *info = SHMEM_I(inode);
++ unsigned long idx;
++ swp_entry_t *entry;
++
++ if (!inode->i_size)
++ return 0;
++ idx = (inode->i_size - 1) >> PAGE_CACHE_SHIFT;
++ /*
++ * Just touch info to allocate space for entry and
++	 * Just touch info to allocate space for the entry and
++	 * perform all the UBC checks
++ spin_lock(&info->lock);
++ entry = shmem_swp_alloc(info, idx, SGP_CACHE);
++ if (IS_ERR(entry))
++ goto err;
++ shmem_swp_unmap(entry);
++ spin_unlock(&info->lock);
++ return 0;
++
++err:
++ spin_unlock(&info->lock);
++ return PTR_ERR(entry);
++#else
++ return 0;
++#endif
++}
++
+ /*
+ * shmem_file_setup - get an unlinked file living in tmpfs
+ *
+@@ -2323,6 +2385,10 @@ struct file *shmem_file_setup(char *name
+ d_instantiate(dentry, inode);
+ inode->i_size = size;
+ inode->i_nlink = 0; /* It is unlinked */
++ error = shm_charge_ahead(inode);
++ if (error)
++ goto close_file;
++
+ file->f_vfsmnt = mntget(shm_mnt);
+ file->f_dentry = dentry;
+ file->f_mapping = inode->i_mapping;
+@@ -2338,6 +2404,7 @@ put_memory:
+ shmem_unacct_size(flags, size);
+ return ERR_PTR(error);
+ }
++EXPORT_SYMBOL_GPL(shmem_file_setup);
+
+ /*
+ * shmem_zero_setup - setup a shared anonymous mapping
+@@ -2355,6 +2422,8 @@ int shmem_zero_setup(struct vm_area_stru
+
+ if (vma->vm_file)
+ fput(vma->vm_file);
++ else if (vma->vm_flags & VM_WRITE)
++ __ub_unused_privvm_dec(vma->vm_mm, size >> PAGE_SHIFT);
+ vma->vm_file = file;
+ vma->vm_ops = &shmem_vm_ops;
+ return 0;
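
In the shmem.c hunks above, shmem_swp_alloc() charges the grown index range to the owning beancounter up front (ub_shmpages_charge) and unwinds that charge on every failure path through the new out_dir/out_block/out labels, while shmem_get_inode() ties each tmpfs inode to the beancounter of the creating task. The toy function below mimics only the charge-then-unwind shape of that code; the counters, the malloc'ed "directory page" and the error handling are stand-ins.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static long charged_pages;               /* stand-in for the owner's shmpages charge */

static int charge(long pages)    { charged_pages += pages; return 0; }
static void uncharge(long pages) { charged_pages -= pages; }

/* Grow the index up to 'index': charge the owner for the new pages first,
 * then try to allocate the "directory page"; every failure path unwinds
 * the charge through the out_uncharge label, mirroring the new
 * out_dir/out_block/out exits in shmem_swp_alloc(). */
static int grow_index(long *next_index, long index, int simulate_oom)
{
        long charged = 0;
        void *dir;
        int err = 0;

        if (*next_index <= index) {
                charged = index + 1 - *next_index;
                if (charge(charged))
                        return -ENOSPC;
        }

        dir = simulate_oom ? NULL : malloc(64);   /* the "directory page" */
        if (!dir) {
                err = -ENOMEM;
                goto out_uncharge;
        }

        *next_index = index + 1;
        charged = 0;                    /* success: the charge now belongs to the file */
        free(dir);                      /* the toy does not keep the page around */
        return 0;

out_uncharge:
        if (charged)
                uncharge(charged);
        return err;
}

int main(void)
{
        long next = 0;

        grow_index(&next, 4, 1);        /* failure path: the charge is rolled back */
        printf("after failure: next_index=%ld charged=%ld\n", next, charged_pages);
        grow_index(&next, 4, 0);        /* success path: the charge is kept */
        printf("after success: next_index=%ld charged=%ld\n", next, charged_pages);
        return 0;
}
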
+diff -upr linux-2.6.16.orig/mm/slab.c linux-2.6.16-026test015/mm/slab.c
+--- linux-2.6.16.orig/mm/slab.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/mm/slab.c 2006-07-04 14:41:38.000000000 +0400
+@@ -105,32 +105,19 @@
+ #include <linux/nodemask.h>
+ #include <linux/mempolicy.h>
+ #include <linux/mutex.h>
++#include <linux/kmem_slab.h>
++#include <linux/kmem_cache.h>
+
+ #include <asm/uaccess.h>
+ #include <asm/cacheflush.h>
+ #include <asm/tlbflush.h>
+ #include <asm/page.h>
+
+-/*
+- * DEBUG - 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL,
+- * SLAB_RED_ZONE & SLAB_POISON.
+- * 0 for faster, smaller code (especially in the critical paths).
+- *
+- * STATS - 1 to collect stats for /proc/slabinfo.
+- * 0 for faster, smaller code (especially in the critical paths).
+- *
+- * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
+- */
++#include <ub/ub_mem.h>
+
+-#ifdef CONFIG_DEBUG_SLAB
+-#define DEBUG 1
+-#define STATS 1
+-#define FORCED_DEBUG 1
+-#else
+-#define DEBUG 0
+-#define STATS 0
+-#define FORCED_DEBUG 0
+-#endif
++#define DEBUG SLAB_DEBUG
++#define STATS SLAB_STATS
++#define FORCED_DEBUG SLAB_FORCED_DEBUG
+
+ /* Shouldn't this be in a header file somewhere? */
+ #define BYTES_PER_WORD sizeof(void *)
+@@ -173,134 +160,20 @@
+ SLAB_NO_REAP | SLAB_CACHE_DMA | \
+ SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \
+ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
+- SLAB_DESTROY_BY_RCU)
++ SLAB_DESTROY_BY_RCU | SLAB_UBC | SLAB_NO_CHARGE)
+ #else
+ # define CREATE_MASK (SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \
+ SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \
+ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
+- SLAB_DESTROY_BY_RCU)
++ SLAB_DESTROY_BY_RCU | SLAB_UBC | SLAB_NO_CHARGE)
+ #endif
+
+-/*
+- * kmem_bufctl_t:
+- *
+- * Bufctl's are used for linking objs within a slab
+- * linked offsets.
+- *
+- * This implementation relies on "struct page" for locating the cache &
+- * slab an object belongs to.
+- * This allows the bufctl structure to be small (one int), but limits
+- * the number of objects a slab (not a cache) can contain when off-slab
+- * bufctls are used. The limit is the size of the largest general cache
+- * that does not use off-slab slabs.
+- * For 32bit archs with 4 kB pages, is this 56.
+- * This is not serious, as it is only for large objects, when it is unwise
+- * to have too many per slab.
+- * Note: This limit can be raised by introducing a general cache whose size
+- * is less than 512 (PAGE_SIZE<<3), but greater than 256.
+- */
+-
+-typedef unsigned int kmem_bufctl_t;
+-#define BUFCTL_END (((kmem_bufctl_t)(~0U))-0)
+-#define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1)
+-#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-2)
+-
+ /* Max number of objs-per-slab for caches which use off-slab slabs.
+ * Needed to avoid a possible looping condition in cache_grow().
+ */
+ static unsigned long offslab_limit;
+
+ /*
+- * struct slab
+- *
+- * Manages the objs in a slab. Placed either at the beginning of mem allocated
+- * for a slab, or allocated from an general cache.
+- * Slabs are chained into three list: fully used, partial, fully free slabs.
+- */
+-struct slab {
+- struct list_head list;
+- unsigned long colouroff;
+- void *s_mem; /* including colour offset */
+- unsigned int inuse; /* num of objs active in slab */
+- kmem_bufctl_t free;
+- unsigned short nodeid;
+-};
+-
+-/*
+- * struct slab_rcu
+- *
+- * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
+- * arrange for kmem_freepages to be called via RCU. This is useful if
+- * we need to approach a kernel structure obliquely, from its address
+- * obtained without the usual locking. We can lock the structure to
+- * stabilize it and check it's still at the given address, only if we
+- * can be sure that the memory has not been meanwhile reused for some
+- * other kind of object (which our subsystem's lock might corrupt).
+- *
+- * rcu_read_lock before reading the address, then rcu_read_unlock after
+- * taking the spinlock within the structure expected at that address.
+- *
+- * We assume struct slab_rcu can overlay struct slab when destroying.
+- */
+-struct slab_rcu {
+- struct rcu_head head;
+- struct kmem_cache *cachep;
+- void *addr;
+-};
+-
+-/*
+- * struct array_cache
+- *
+- * Purpose:
+- * - LIFO ordering, to hand out cache-warm objects from _alloc
+- * - reduce the number of linked list operations
+- * - reduce spinlock operations
+- *
+- * The limit is stored in the per-cpu structure to reduce the data cache
+- * footprint.
+- *
+- */
+-struct array_cache {
+- unsigned int avail;
+- unsigned int limit;
+- unsigned int batchcount;
+- unsigned int touched;
+- spinlock_t lock;
+- void *entry[0]; /*
+- * Must have this definition in here for the proper
+- * alignment of array_cache. Also simplifies accessing
+- * the entries.
+- * [0] is for gcc 2.95. It should really be [].
+- */
+-};
+-
+-/* bootstrap: The caches do not work without cpuarrays anymore,
+- * but the cpuarrays are allocated from the generic caches...
+- */
+-#define BOOT_CPUCACHE_ENTRIES 1
+-struct arraycache_init {
+- struct array_cache cache;
+- void *entries[BOOT_CPUCACHE_ENTRIES];
+-};
+-
+-/*
+- * The slab lists for all objects.
+- */
+-struct kmem_list3 {
+- struct list_head slabs_partial; /* partial list first, better asm code */
+- struct list_head slabs_full;
+- struct list_head slabs_free;
+- unsigned long free_objects;
+- unsigned long next_reap;
+- int free_touched;
+- unsigned int free_limit;
+- unsigned int colour_next; /* Per-node cache coloring */
+- spinlock_t list_lock;
+- struct array_cache *shared; /* shared per node */
+- struct array_cache **alien; /* on other nodes */
+-};
+-
+-/*
+ * Need this for bootstrapping a per node allocator.
+ */
+ #define NUM_INIT_LISTS (2 * MAX_NUMNODES + 1)
+@@ -364,79 +237,6 @@ static void kmem_list3_init(struct kmem_
+ MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \
+ } while (0)
+
+-/*
+- * struct kmem_cache
+- *
+- * manages a cache.
+- */
+-
+-struct kmem_cache {
+-/* 1) per-cpu data, touched during every alloc/free */
+- struct array_cache *array[NR_CPUS];
+- unsigned int batchcount;
+- unsigned int limit;
+- unsigned int shared;
+- unsigned int buffer_size;
+-/* 2) touched by every alloc & free from the backend */
+- struct kmem_list3 *nodelists[MAX_NUMNODES];
+- unsigned int flags; /* constant flags */
+- unsigned int num; /* # of objs per slab */
+- spinlock_t spinlock;
+-
+-/* 3) cache_grow/shrink */
+- /* order of pgs per slab (2^n) */
+- unsigned int gfporder;
+-
+- /* force GFP flags, e.g. GFP_DMA */
+- gfp_t gfpflags;
+-
+- size_t colour; /* cache colouring range */
+- unsigned int colour_off; /* colour offset */
+- struct kmem_cache *slabp_cache;
+- unsigned int slab_size;
+- unsigned int dflags; /* dynamic flags */
+-
+- /* constructor func */
+- void (*ctor) (void *, struct kmem_cache *, unsigned long);
+-
+- /* de-constructor func */
+- void (*dtor) (void *, struct kmem_cache *, unsigned long);
+-
+-/* 4) cache creation/removal */
+- const char *name;
+- struct list_head next;
+-
+-/* 5) statistics */
+-#if STATS
+- unsigned long num_active;
+- unsigned long num_allocations;
+- unsigned long high_mark;
+- unsigned long grown;
+- unsigned long reaped;
+- unsigned long errors;
+- unsigned long max_freeable;
+- unsigned long node_allocs;
+- unsigned long node_frees;
+- atomic_t allochit;
+- atomic_t allocmiss;
+- atomic_t freehit;
+- atomic_t freemiss;
+-#endif
+-#if DEBUG
+- /*
+- * If debugging is enabled, then the allocator can add additional
+- * fields and/or padding to every object. buffer_size contains the total
+- * object size including these internal fields, the following two
+- * variables contain the offset to the user object and its size.
+- */
+- int obj_offset;
+- int obj_size;
+-#endif
+-};
+-
+-#define CFLGS_OFF_SLAB (0x80000000UL)
+-#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB)
+-
+ #define BATCHREFILL_LIMIT 16
+ /* Optimization question: fewer reaps means less
+ * probability for unnessary cpucache drain/refill cycles.
+@@ -573,42 +373,6 @@ static void **dbg_userword(struct kmem_c
+ #define BREAK_GFP_ORDER_LO 0
+ static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
+
+-/* Functions for storing/retrieving the cachep and or slab from the
+- * global 'mem_map'. These are used to find the slab an obj belongs to.
+- * With kfree(), these are used to find the cache which an obj belongs to.
+- */
+-static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
+-{
+- page->lru.next = (struct list_head *)cache;
+-}
+-
+-static inline struct kmem_cache *page_get_cache(struct page *page)
+-{
+- return (struct kmem_cache *)page->lru.next;
+-}
+-
+-static inline void page_set_slab(struct page *page, struct slab *slab)
+-{
+- page->lru.prev = (struct list_head *)slab;
+-}
+-
+-static inline struct slab *page_get_slab(struct page *page)
+-{
+- return (struct slab *)page->lru.prev;
+-}
+-
+-static inline struct kmem_cache *virt_to_cache(const void *obj)
+-{
+- struct page *page = virt_to_page(obj);
+- return page_get_cache(page);
+-}
+-
+-static inline struct slab *virt_to_slab(const void *obj)
+-{
+- struct page *page = virt_to_page(obj);
+- return page_get_slab(page);
+-}
+-
+ /* These are the default caches for kmalloc. Custom caches can have other sizes. */
+ struct cache_sizes malloc_sizes[] = {
+ #define CACHE(x) { .cs_size = (x) },
+@@ -715,9 +479,17 @@ struct kmem_cache *kmem_find_general_cac
+ }
+ EXPORT_SYMBOL(kmem_find_general_cachep);
+
+-static size_t slab_mgmt_size(size_t nr_objs, size_t align)
++static size_t slab_mgmt_size_noalign(size_t nr_objs, int flags)
+ {
+- return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align);
++ size_t size_noub;
++
++ size_noub = sizeof(struct slab) + nr_objs * sizeof(kmem_bufctl_t);
++ return ALIGN(size_noub, UB_ALIGN(flags)) + nr_objs * UB_EXTRA(flags);
++}
++
++static size_t slab_mgmt_size(size_t nr_objs, size_t align, int flags)
++{
++ return ALIGN(slab_mgmt_size_noalign(nr_objs, flags), align);
+ }
+
+ /* Calculate the number of objects and left-over bytes for a given
+@@ -761,20 +533,23 @@ static void cache_estimate(unsigned long
+ * into account.
+ */
+ nr_objs = (slab_size - sizeof(struct slab)) /
+- (buffer_size + sizeof(kmem_bufctl_t));
++ (buffer_size + sizeof(kmem_bufctl_t) +
++ UB_EXTRA(flags));
+
+ /*
+ * This calculated number will be either the right
+ * amount, or one greater than what we want.
+ */
+- if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size
+- > slab_size)
++ if (slab_mgmt_size(nr_objs, align, flags) +
++ nr_objs * buffer_size > slab_size)
+ nr_objs--;
++ BUG_ON(slab_mgmt_size(nr_objs, align, flags) +
++ nr_objs * buffer_size > slab_size);
+
+ if (nr_objs > SLAB_LIMIT)
+ nr_objs = SLAB_LIMIT;
+
+- mgmt_size = slab_mgmt_size(nr_objs, align);
++ mgmt_size = slab_mgmt_size(nr_objs, align, flags);
+ }
+ *num = nr_objs;
+ *left_over = slab_size - nr_objs*buffer_size - mgmt_size;
+@@ -1254,6 +1029,7 @@ void __init kmem_cache_init(void)
+ sizes[INDEX_AC].cs_size,
+ ARCH_KMALLOC_MINALIGN,
+ (ARCH_KMALLOC_FLAGS |
++ SLAB_UBC|SLAB_NO_CHARGE |
+ SLAB_PANIC), NULL, NULL);
+
+ if (INDEX_AC != INDEX_L3)
+@@ -1261,8 +1037,9 @@ void __init kmem_cache_init(void)
+ kmem_cache_create(names[INDEX_L3].name,
+ sizes[INDEX_L3].cs_size,
+ ARCH_KMALLOC_MINALIGN,
+- (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL,
+- NULL);
++ (ARCH_KMALLOC_FLAGS |
++ SLAB_UBC | SLAB_NO_CHARGE |
++ SLAB_PANIC), NULL, NULL);
+
+ while (sizes->cs_size != ULONG_MAX) {
+ /*
+@@ -1277,14 +1054,14 @@ void __init kmem_cache_init(void)
+ sizes->cs_size,
+ ARCH_KMALLOC_MINALIGN,
+ (ARCH_KMALLOC_FLAGS
++ | SLAB_UBC
++ | SLAB_NO_CHARGE
+ | SLAB_PANIC),
+ NULL, NULL);
+
+ /* Inc off-slab bufctl limit until the ceiling is hit. */
+- if (!(OFF_SLAB(sizes->cs_cachep))) {
+- offslab_limit = sizes->cs_size - sizeof(struct slab);
+- offslab_limit /= sizeof(kmem_bufctl_t);
+- }
++ if (!(OFF_SLAB(sizes->cs_cachep)))
++ offslab_limit = sizes->cs_size;
+
+ sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
+ sizes->cs_size,
+@@ -1704,8 +1481,13 @@ static inline size_t calculate_slab_orde
+ continue;
+
+ /* More than offslab_limit objects will cause problems */
+- if ((flags & CFLGS_OFF_SLAB) && num > offslab_limit)
+- break;
++ if (flags & CFLGS_OFF_SLAB) {
++ unsigned long slab_size;
++
++ slab_size = slab_mgmt_size_noalign(num, flags);
++ if (slab_size > offslab_limit)
++ break;
++ }
+
+ /* Found something acceptable - save it away */
+ cachep->num = num;
+@@ -1950,8 +1732,7 @@ kmem_cache_create (const char *name, siz
+ cachep = NULL;
+ goto oops;
+ }
+- slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
+- + sizeof(struct slab), align);
++ slab_size = slab_mgmt_size(cachep->num, align, flags);
+
+ /*
+ * If the slab has been placed off-slab, and we have enough space then
+@@ -1964,8 +1745,7 @@ kmem_cache_create (const char *name, siz
+
+ if (flags & CFLGS_OFF_SLAB) {
+ /* really off slab. No need for manual alignment */
+- slab_size =
+- cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
++ slab_size = slab_mgmt_size_noalign(cachep->num, flags);
+ }
+
+ cachep->colour_off = cache_line_size();
+@@ -2045,6 +1825,7 @@ kmem_cache_create (const char *name, siz
+
+ /* cache setup completed, link it into the list */
+ list_add(&cachep->next, &cache_chain);
++ set_cache_objuse(cachep);
+ oops:
+ if (!cachep && (flags & SLAB_PANIC))
+ panic("kmem_cache_create(): failed to create slab `%s'\n",
+@@ -2266,6 +2047,8 @@ int kmem_cache_destroy(struct kmem_cache
+ kfree(l3);
+ }
+ }
++
++ ub_kmemcache_free(cachep);
+ kmem_cache_free(&cache_cache, cachep);
+
+ unlock_cpu_hotplug();
+@@ -2282,7 +2065,8 @@ static struct slab *alloc_slabmgmt(struc
+
+ if (OFF_SLAB(cachep)) {
+ /* Slab management obj is off-slab. */
+- slabp = kmem_cache_alloc(cachep->slabp_cache, local_flags);
++ slabp = kmem_cache_alloc(cachep->slabp_cache,
++ local_flags & (~__GFP_UBC));
+ if (!slabp)
+ return NULL;
+ } else {
+@@ -2292,15 +2076,11 @@ static struct slab *alloc_slabmgmt(struc
+ slabp->inuse = 0;
+ slabp->colouroff = colour_off;
+ slabp->s_mem = objp + colour_off;
++ init_slab_ubps(cachep, slabp);
+
+ return slabp;
+ }
+
+-static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
+-{
+- return (kmem_bufctl_t *) (slabp + 1);
+-}
+-
+ static void cache_init_objs(struct kmem_cache *cachep,
+ struct slab *slabp, unsigned long ctor_flags)
+ {
+@@ -2470,7 +2250,7 @@ static int cache_grow(struct kmem_cache
+ /* Get mem for the objs.
+ * Attempt to allocate a physical page from 'nodeid',
+ */
+- if (!(objp = kmem_getpages(cachep, flags, nodeid)))
++ if (!(objp = kmem_getpages(cachep, flags & (~__GFP_UBC), nodeid)))
+ goto failed;
+
+ /* Get slab management. */
+@@ -2823,6 +2603,11 @@ __cache_alloc(struct kmem_cache *cachep,
+ objp = cache_alloc_debugcheck_after(cachep, flags, objp,
+ caller);
+ prefetchw(objp);
++
++ if (objp && ub_slab_charge(objp, flags)) {
++ kmem_cache_free(cachep, objp);
++ objp = NULL;
++ }
+ return objp;
+ }
+
+@@ -2997,6 +2782,8 @@ static inline void __cache_free(struct k
+ check_irq_off();
+ objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
+
++ ub_slab_uncharge(objp);
++
+ /* Make sure we are not freeing a object from another
+ * node to the array cache on this cpu.
+ */
+@@ -3128,6 +2915,10 @@ void *kmem_cache_alloc_node(struct kmem_
+ ptr = cache_alloc_debugcheck_after(cachep, flags, ptr,
+ __builtin_return_address(0));
+
++ if (ptr && ub_slab_charge(ptr, flags)) {
++ kmem_cache_free(cachep, ptr);
++ ptr = NULL;
++ }
+ return ptr;
+ }
+ EXPORT_SYMBOL(kmem_cache_alloc_node);
+@@ -3543,6 +3334,7 @@ static void cache_reap(void *unused)
+ return;
+ }
+
++ {KSTAT_PERF_ENTER(cache_reap)
+ list_for_each(walk, &cache_chain) {
+ struct kmem_cache *searchp;
+ struct list_head *p;
+@@ -3608,6 +3400,7 @@ static void cache_reap(void *unused)
+ check_irq_on();
+ mutex_unlock(&cache_chain_mutex);
+ next_reap_node();
++ KSTAT_PERF_LEAVE(cache_reap)}
+ /* Setup the next iteration */
+ schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
+ }
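
With SLAB_UBC caches, the slab.c hunks above reserve one extra per-object slot for the owning beancounter, so the on-slab management area is now computed by slab_mgmt_size_noalign()/slab_mgmt_size(): slab header plus one bufctl per object, aligned, plus nr_objs * UB_EXTRA, and the off-slab limit is compared against that size instead of a raw object count. Here is a standalone sketch of the arithmetic; the header size, alignment and per-object extra are made-up constants, whereas in the patch UB_ALIGN()/UB_EXTRA() depend on the cache flags.

#include <stdio.h>

#define WORD            sizeof(void *)
#define SLAB_HDR        32              /* pretend sizeof(struct slab) */
#define BUFCTL_SIZE     sizeof(unsigned int)
#define UB_ALIGN        WORD            /* alignment of the per-object UB pointers */
#define UB_EXTRA        WORD            /* one owner pointer per object */

static size_t align_up(size_t n, size_t a) { return (n + a - 1) & ~(a - 1); }

/* Management area: slab header, one bufctl per object, then an aligned
 * array of per-object beancounter pointers (the UB_EXTRA part added by
 * the patch). */
static size_t slab_mgmt_size_noalign(size_t nr_objs)
{
        size_t size_noub = SLAB_HDR + nr_objs * BUFCTL_SIZE;
        return align_up(size_noub, UB_ALIGN) + nr_objs * UB_EXTRA;
}

static size_t slab_mgmt_size(size_t nr_objs, size_t align)
{
        return align_up(slab_mgmt_size_noalign(nr_objs), align);
}

int main(void)
{
        size_t slab_size = 4096, obj_size = 128, align = 64;
        size_t n = 0;

        /* How many objects fit once the management overhead is included? */
        while (slab_mgmt_size(n + 1, align) + (n + 1) * obj_size <= slab_size)
                n++;

        printf("%zu objects per 4 KB slab, %zu bytes of management\n",
               n, slab_mgmt_size(n, align));
        return 0;
}
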
+diff -upr linux-2.6.16.orig/mm/swap_state.c linux-2.6.16-026test015/mm/swap_state.c
+--- linux-2.6.16.orig/mm/swap_state.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/mm/swap_state.c 2006-07-04 14:41:38.000000000 +0400
+@@ -18,6 +18,8 @@
+
+ #include <asm/pgtable.h>
+
++#include <ub/ub_vmpages.h>
++
+ /*
+ * swapper_space is a fiction, retained to simplify the path through
+ * vmscan's shrink_list, to make sync_page look nicer, and to allow
+@@ -52,14 +54,18 @@ static struct {
+ unsigned long find_total;
+ unsigned long noent_race;
+ unsigned long exist_race;
++ unsigned long remove_race;
+ } swap_cache_info;
++EXPORT_SYMBOL(swap_cache_info);
+
+ void show_swap_cache_info(void)
+ {
+- printk("Swap cache: add %lu, delete %lu, find %lu/%lu, race %lu+%lu\n",
++ printk("Swap cache: add %lu, delete %lu, find %lu/%lu, "
++ "race %lu+%lu+%lu\n",
+ swap_cache_info.add_total, swap_cache_info.del_total,
+ swap_cache_info.find_success, swap_cache_info.find_total,
+- swap_cache_info.noent_race, swap_cache_info.exist_race);
++ swap_cache_info.noent_race, swap_cache_info.exist_race,
++ swap_cache_info.remove_race);
+ printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10));
+ printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
+ }
+@@ -151,7 +157,14 @@ int add_to_swap(struct page * page, gfp_
+ BUG();
+
+ for (;;) {
+- entry = get_swap_page();
++ struct user_beancounter *ub;
++
++ ub = pb_grab_page_ub(page);
++ if (IS_ERR(ub))
++ return 0;
++
++ entry = get_swap_page(ub);
++ put_beancounter(ub);
+ if (!entry.val)
+ return 0;
+
+@@ -252,10 +265,13 @@ int move_from_swap_cache(struct page *pa
+ */
+ static inline void free_swap_cache(struct page *page)
+ {
+- if (PageSwapCache(page) && !TestSetPageLocked(page)) {
++ if (!PageSwapCache(page))
++ return;
++ if (!TestSetPageLocked(page)) {
+ remove_exclusive_swap_page(page);
+ unlock_page(page);
+- }
++ } else
++ INC_CACHE_INFO(remove_race);
+ }
+
+ /*
+diff -upr linux-2.6.16.orig/mm/swapfile.c linux-2.6.16-026test015/mm/swapfile.c
+--- linux-2.6.16.orig/mm/swapfile.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/mm/swapfile.c 2006-07-04 14:41:39.000000000 +0400
+@@ -33,6 +33,8 @@
+ #include <asm/tlbflush.h>
+ #include <linux/swapops.h>
+
++#include <ub/ub_vmpages.h>
++
+ DEFINE_SPINLOCK(swap_lock);
+ unsigned int nr_swapfiles;
+ long total_swap_pages;
+@@ -172,7 +174,7 @@ no_page:
+ return 0;
+ }
+
+-swp_entry_t get_swap_page(void)
++swp_entry_t get_swap_page(struct user_beancounter *ub)
+ {
+ struct swap_info_struct *si;
+ pgoff_t offset;
+@@ -202,6 +204,7 @@ swp_entry_t get_swap_page(void)
+ offset = scan_swap_map(si);
+ if (offset) {
+ spin_unlock(&swap_lock);
++ ub_swapentry_inc(si, offset, ub);
+ return swp_entry(type, offset);
+ }
+ next = swap_list.next;
+@@ -277,6 +280,7 @@ static int swap_entry_free(struct swap_i
+ count--;
+ p->swap_map[offset] = count;
+ if (!count) {
++ ub_swapentry_dec(p, offset);
+ if (offset < p->lowest_bit)
+ p->lowest_bit = offset;
+ if (offset > p->highest_bit)
+@@ -423,11 +427,18 @@ void free_swap_and_cache(swp_entry_t ent
+ * force COW, vm_page_prot omits write permission from any private vma.
+ */
+ static void unuse_pte(struct vm_area_struct *vma, pte_t *pte,
+- unsigned long addr, swp_entry_t entry, struct page *page)
++ unsigned long addr, swp_entry_t entry, struct page *page,
++ struct page_beancounter **pb)
+ {
+- inc_mm_counter(vma->vm_mm, anon_rss);
++ struct mm_struct *mm;
++
++ mm = vma->vm_mm;
++ inc_mm_counter(mm, anon_rss);
++ inc_vma_rss(vma);
++ ub_unused_privvm_dec(mm, vma);
++ pb_add_ref(page, mm, pb);
+ get_page(page);
+- set_pte_at(vma->vm_mm, addr, pte,
++ set_pte_at(mm, addr, pte,
+ pte_mkold(mk_pte(page, vma->vm_page_prot)));
+ page_add_anon_rmap(page, vma, addr);
+ swap_free(entry);
+@@ -440,7 +451,8 @@ static void unuse_pte(struct vm_area_str
+
+ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+- swp_entry_t entry, struct page *page)
++ swp_entry_t entry, struct page *page,
++ struct page_beancounter **pb)
+ {
+ pte_t swp_pte = swp_entry_to_pte(entry);
+ pte_t *pte;
+@@ -454,7 +466,7 @@ static int unuse_pte_range(struct vm_are
+ * Test inline before going to call unuse_pte.
+ */
+ if (unlikely(pte_same(*pte, swp_pte))) {
+- unuse_pte(vma, pte++, addr, entry, page);
++ unuse_pte(vma, pte++, addr, entry, page, pb);
+ found = 1;
+ break;
+ }
+@@ -465,7 +477,8 @@ static int unuse_pte_range(struct vm_are
+
+ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
+ unsigned long addr, unsigned long end,
+- swp_entry_t entry, struct page *page)
++ swp_entry_t entry, struct page *page,
++ struct page_beancounter **pb)
+ {
+ pmd_t *pmd;
+ unsigned long next;
+@@ -475,7 +488,7 @@ static inline int unuse_pmd_range(struct
+ next = pmd_addr_end(addr, end);
+ if (pmd_none_or_clear_bad(pmd))
+ continue;
+- if (unuse_pte_range(vma, pmd, addr, next, entry, page))
++ if (unuse_pte_range(vma, pmd, addr, next, entry, page, pb))
+ return 1;
+ } while (pmd++, addr = next, addr != end);
+ return 0;
+@@ -483,7 +496,8 @@ static inline int unuse_pmd_range(struct
+
+ static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
+ unsigned long addr, unsigned long end,
+- swp_entry_t entry, struct page *page)
++ swp_entry_t entry, struct page *page,
++ struct page_beancounter **pb)
+ {
+ pud_t *pud;
+ unsigned long next;
+@@ -493,14 +507,15 @@ static inline int unuse_pud_range(struct
+ next = pud_addr_end(addr, end);
+ if (pud_none_or_clear_bad(pud))
+ continue;
+- if (unuse_pmd_range(vma, pud, addr, next, entry, page))
++ if (unuse_pmd_range(vma, pud, addr, next, entry, page, pb))
+ return 1;
+ } while (pud++, addr = next, addr != end);
+ return 0;
+ }
+
+ static int unuse_vma(struct vm_area_struct *vma,
+- swp_entry_t entry, struct page *page)
++ swp_entry_t entry, struct page *page,
++ struct page_beancounter **pb)
+ {
+ pgd_t *pgd;
+ unsigned long addr, end, next;
+@@ -521,14 +536,15 @@ static int unuse_vma(struct vm_area_stru
+ next = pgd_addr_end(addr, end);
+ if (pgd_none_or_clear_bad(pgd))
+ continue;
+- if (unuse_pud_range(vma, pgd, addr, next, entry, page))
++ if (unuse_pud_range(vma, pgd, addr, next, entry, page, pb))
+ return 1;
+ } while (pgd++, addr = next, addr != end);
+ return 0;
+ }
+
+ static int unuse_mm(struct mm_struct *mm,
+- swp_entry_t entry, struct page *page)
++ swp_entry_t entry, struct page *page,
++ struct page_beancounter **pb)
+ {
+ struct vm_area_struct *vma;
+
+@@ -543,7 +559,7 @@ static int unuse_mm(struct mm_struct *mm
+ lock_page(page);
+ }
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+- if (vma->anon_vma && unuse_vma(vma, entry, page))
++ if (vma->anon_vma && unuse_vma(vma, entry, page, pb))
+ break;
+ }
+ up_read(&mm->mmap_sem);
+@@ -555,11 +571,12 @@ static int unuse_mm(struct mm_struct *mm
+ }
+
+ #ifdef CONFIG_MIGRATION
+-int remove_vma_swap(struct vm_area_struct *vma, struct page *page)
++int remove_vma_swap(struct vm_area_struct *vma, struct page *page,
++ struct page_beancounter **pb)
+ {
+ swp_entry_t entry = { .val = page_private(page) };
+
+- return unuse_vma(vma, entry, page);
++ return unuse_vma(vma, entry, page, pb);
+ }
+ #endif
+
+@@ -618,6 +635,7 @@ static int try_to_unuse(unsigned int typ
+ int retval = 0;
+ int reset_overflow = 0;
+ int shmem;
++ struct page_beancounter *pb;
+
+ /*
+ * When searching mms for an entry, a good strategy is to
+@@ -670,6 +688,13 @@ again:
+ break;
+ }
+
++ pb = NULL;
++ if (pb_alloc_all(&pb)) {
++ page_cache_release(page);
++ retval = -ENOMEM;
++ break;
++ }
++
+ /*
+ * Don't hold on to start_mm if it looks like exiting.
+ */
+@@ -698,6 +723,20 @@ again:
+ }
+ wait_on_page_writeback(page);
+
++		/* If the read failed we cannot map a not-uptodate page into
++		 * user space. At this point we are in serious trouble: we do
++		 * not even know which process to kill. The only option left
++		 * is to stop swapoff() and let someone kill processes to zap
++		 * the invalid pages.
++		 */
++ if (unlikely(!PageUptodate(page))) {
++ pb_free_list(&pb);
++ unlock_page(page);
++ page_cache_release(page);
++ retval = -EIO;
++ break;
++ }
++
+ /*
+ * Remove all references to entry.
+ * Whenever we reach init_mm, there's no address space
+@@ -709,7 +748,7 @@ again:
+ if (start_mm == &init_mm)
+ shmem = shmem_unuse(entry, page);
+ else
+- retval = unuse_mm(start_mm, entry, page);
++ retval = unuse_mm(start_mm, entry, page, &pb);
+ }
+ if (*swap_map > 1) {
+ int set_start_mm = (*swap_map >= swcount);
+@@ -741,7 +780,7 @@ again:
+ set_start_mm = 1;
+ shmem = shmem_unuse(entry, page);
+ } else
+- retval = unuse_mm(mm, entry, page);
++ retval = unuse_mm(mm, entry, page, &pb);
+ if (set_start_mm && *swap_map < swcount) {
+ mmput(new_start_mm);
+ atomic_inc(&mm->mm_users);
+@@ -755,6 +794,8 @@ again:
+ mmput(start_mm);
+ start_mm = new_start_mm;
+ }
++
++ pb_free_list(&pb);
+ if (retval) {
+ unlock_page(page);
+ page_cache_release(page);
+@@ -1100,6 +1141,10 @@ asmlinkage long sys_swapoff(const char _
+ int i, type, prev;
+ int err;
+
++	/* The VE admin check is just to be on the safe side: the admin can
++	 * affect swaps only if he has access to the special file, i.e. if he
++	 * has been granted access to the block device or if the swap file
++	 * lies in an area visible to him. */
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+@@ -1199,6 +1244,7 @@ asmlinkage long sys_swapoff(const char _
+ spin_unlock(&swap_lock);
+ mutex_unlock(&swapon_mutex);
+ vfree(swap_map);
++ ub_swap_fini(p);
+ inode = mapping->host;
+ if (S_ISBLK(inode->i_mode)) {
+ struct block_device *bdev = I_BDEV(inode);
+@@ -1557,6 +1603,11 @@ asmlinkage long sys_swapon(const char __
+ goto bad_swap;
+ }
+
++ if (ub_swap_init(p, maxpages)) {
++ error = -ENOMEM;
++ goto bad_swap;
++ }
++
+ mutex_lock(&swapon_mutex);
+ spin_lock(&swap_lock);
+ p->flags = SWP_ACTIVE;
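
In the swapfile.c hunks above, get_swap_page() now takes a user_beancounter and charges every allocated swap slot to it (ub_swapentry_inc), swap_entry_free() drops that charge once the slot's use count reaches zero (ub_swapentry_dec), and try_to_unuse() bails out with -EIO if a swapped-in page never became uptodate instead of mapping garbage. The toy model below shows only the per-owner slot accounting; the fixed-size map and integer owner ids are inventions of this sketch.

#include <stdio.h>

#define SLOTS  8
#define OWNERS 2

static unsigned char swap_map[SLOTS];    /* use count per slot, 0 = free */
static int slot_owner[SLOTS];            /* which owner the slot is charged to */
static unsigned long owner_pages[OWNERS];

static int get_swap_page(int owner)
{
        int i;
        for (i = 0; i < SLOTS; i++) {
                if (swap_map[i] == 0) {
                        swap_map[i] = 1;
                        slot_owner[i] = owner;
                        owner_pages[owner]++;    /* ub_swapentry_inc() analogue */
                        return i;
                }
        }
        return -1;                               /* swap is full */
}

static void swap_entry_free(int slot)
{
        if (--swap_map[slot] == 0)
                owner_pages[slot_owner[slot]]--; /* ub_swapentry_dec() analogue */
}

int main(void)
{
        int a = get_swap_page(0);
        int b = get_swap_page(1);

        swap_map[a]++;                  /* a second reference to slot a */
        swap_entry_free(a);             /* still in use, owner stays charged */
        swap_entry_free(a);             /* last reference gone, owner uncharged */
        swap_entry_free(b);

        printf("owner0=%lu owner1=%lu\n", owner_pages[0], owner_pages[1]);
        return 0;
}
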
+diff -upr linux-2.6.16.orig/mm/vmalloc.c linux-2.6.16-026test015/mm/vmalloc.c
+--- linux-2.6.16.orig/mm/vmalloc.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/mm/vmalloc.c 2006-07-04 14:41:37.000000000 +0400
+@@ -20,6 +20,8 @@
+ #include <asm/uaccess.h>
+ #include <asm/tlbflush.h>
+
++#include <ub/ub_debug.h>
++
+
+ DEFINE_RWLOCK(vmlist_lock);
+ struct vm_struct *vmlist;
+@@ -256,6 +258,68 @@ struct vm_struct *get_vm_area_node(unsig
+ return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node);
+ }
+
++struct vm_struct * get_vm_area_best(unsigned long size, unsigned long flags)
++{
++ unsigned long addr, best_addr, delta, best_delta;
++ struct vm_struct **p, **best_p, *tmp, *area;
++
++ area = (struct vm_struct *) kmalloc(sizeof(*area), GFP_KERNEL);
++ if (!area)
++ return NULL;
++
++ size += PAGE_SIZE; /* one-page gap at the end */
++ addr = VMALLOC_START;
++ best_addr = 0UL;
++ best_p = NULL;
++ best_delta = PAGE_ALIGN(VMALLOC_END) - VMALLOC_START;
++
++ write_lock(&vmlist_lock);
++ for (p = &vmlist; (tmp = *p) &&
++ (tmp->addr <= (void *)PAGE_ALIGN(VMALLOC_END));
++ p = &tmp->next) {
++ if ((size + addr) < addr)
++ break;
++ delta = (unsigned long) tmp->addr - (size + addr);
++ if (delta < best_delta) {
++ best_delta = delta;
++ best_addr = addr;
++ best_p = p;
++ }
++ addr = tmp->size + (unsigned long) tmp->addr;
++ if (addr > VMALLOC_END-size)
++ break;
++ }
++
++ if (!tmp || (tmp->addr > (void *)PAGE_ALIGN(VMALLOC_END))) {
++ /* check free area after list end */
++ delta = (unsigned long) PAGE_ALIGN(VMALLOC_END) - (size + addr);
++ if (delta < best_delta) {
++ best_delta = delta;
++ best_addr = addr;
++ best_p = p;
++ }
++ }
++ if (best_addr) {
++ area->flags = flags;
++ /* allocate at the end of this area */
++ area->addr = (void *)(best_addr + best_delta);
++ area->size = size;
++ area->next = *best_p;
++ area->pages = NULL;
++ area->nr_pages = 0;
++ area->phys_addr = 0;
++ *best_p = area;
++ /* check like in __vunmap */
++ WARN_ON((PAGE_SIZE - 1) & (unsigned long)area->addr);
++ } else {
++ kfree(area);
++ area = NULL;
++ }
++ write_unlock(&vmlist_lock);
++
++ return area;
++}
++
+ /* Caller must hold vmlist_lock */
+ struct vm_struct *__remove_vm_area(void *addr)
+ {
+@@ -296,7 +360,7 @@ struct vm_struct *remove_vm_area(void *a
+ return v;
+ }
+
+-void __vunmap(void *addr, int deallocate_pages)
++void __vunmap(void *addr, int deallocate_pages, int uncharge)
+ {
+ struct vm_struct *area;
+
+@@ -320,6 +384,8 @@ void __vunmap(void *addr, int deallocate
+ if (deallocate_pages) {
+ int i;
+
++ if (uncharge)
++ dec_vmalloc_charged(area);
+ for (i = 0; i < area->nr_pages; i++) {
+ if (unlikely(!area->pages[i]))
+ BUG();
+@@ -350,7 +416,7 @@ void __vunmap(void *addr, int deallocate
+ void vfree(void *addr)
+ {
+ BUG_ON(in_interrupt());
+- __vunmap(addr, 1);
++ __vunmap(addr, 1, 1);
+ }
+ EXPORT_SYMBOL(vfree);
+
+@@ -367,7 +433,7 @@ EXPORT_SYMBOL(vfree);
+ void vunmap(void *addr)
+ {
+ BUG_ON(in_interrupt());
+- __vunmap(addr, 0);
++ __vunmap(addr, 0, 0);
+ }
+ EXPORT_SYMBOL(vunmap);
+
+@@ -439,10 +505,12 @@ void *__vmalloc_area_node(struct vm_stru
+
+ if (map_vm_area(area, prot, &pages))
+ goto fail;
++
++ inc_vmalloc_charged(area, gfp_mask);
+ return area->addr;
+
+ fail:
+- vfree(area->addr);
++ __vunmap(area->addr, 1, 0);
+ return NULL;
+ }
+
+@@ -486,6 +554,21 @@ void *__vmalloc(unsigned long size, gfp_
+ }
+ EXPORT_SYMBOL(__vmalloc);
+
++static void *____vmalloc(unsigned long size, gfp_t mask, pgprot_t prot)
++{
++ struct vm_struct *area;
++
++ size = PAGE_ALIGN(size);
++ if (!size || (size >> PAGE_SHIFT) > num_physpages)
++ return NULL;
++
++ area = get_vm_area_best(size, VM_ALLOC);
++ if (!area)
++ return NULL;
++
++ return __vmalloc_area_node(area, mask, prot, -1);
++}
++
+ /**
+ * vmalloc - allocate virtually contiguous memory
+ *
+@@ -503,6 +586,26 @@ void *vmalloc(unsigned long size)
+ }
+ EXPORT_SYMBOL(vmalloc);
+
++void *ub_vmalloc(unsigned long size)
++{
++ return __vmalloc(size, GFP_KERNEL_UBC | __GFP_HIGHMEM, PAGE_KERNEL);
++}
++EXPORT_SYMBOL(ub_vmalloc);
++
++void *vmalloc_best(unsigned long size)
++{
++ return ____vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
++}
++
++EXPORT_SYMBOL(vmalloc_best);
++
++void *ub_vmalloc_best(unsigned long size)
++{
++ return ____vmalloc(size, GFP_KERNEL_UBC | __GFP_HIGHMEM, PAGE_KERNEL);
++}
++
++EXPORT_SYMBOL(ub_vmalloc_best);
++
+ /**
+ * vmalloc_node - allocate memory on a specific node
+ *
+@@ -521,6 +624,12 @@ void *vmalloc_node(unsigned long size, i
+ }
+ EXPORT_SYMBOL(vmalloc_node);
+
++void *ub_vmalloc_node(unsigned long size, int node)
++{
++ return __vmalloc_node(size, GFP_KERNEL_UBC | __GFP_HIGHMEM, PAGE_KERNEL, node);
++}
++EXPORT_SYMBOL(ub_vmalloc_node);
++
+ #ifndef PAGE_KERNEL_EXEC
+ # define PAGE_KERNEL_EXEC PAGE_KERNEL
+ #endif
+@@ -631,3 +740,37 @@ finished:
+ read_unlock(&vmlist_lock);
+ return buf - buf_start;
+ }
++
++void vprintstat(void)
++{
++ struct vm_struct *p, *last_p = NULL;
++ unsigned long addr, size, free_size, max_free_size;
++ int num;
++
++ addr = VMALLOC_START;
++ size = max_free_size = 0;
++ num = 0;
++
++ read_lock(&vmlist_lock);
++ for (p = vmlist; p; p = p->next) {
++ free_size = (unsigned long)p->addr - addr;
++ if (free_size > max_free_size)
++ max_free_size = free_size;
++ addr = (unsigned long)p->addr + p->size;
++ size += p->size;
++ ++num;
++ last_p = p;
++ }
++ if (last_p) {
++ free_size = VMALLOC_END -
++ ((unsigned long)last_p->addr + last_p->size);
++ if (free_size > max_free_size)
++ max_free_size = free_size;
++ }
++ read_unlock(&vmlist_lock);
++
++ printk("VMALLOC Used: %luKB Total: %luKB Entries: %d\n"
++ " Max_Free: %luKB Start: %lx End: %lx\n",
++ size/1024, (VMALLOC_END - VMALLOC_START)/1024, num,
++ max_free_size/1024, VMALLOC_START, VMALLOC_END);
++}
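
The new get_vm_area_best() above walks the address-sorted vmlist and, rather than taking the first hole that fits as __get_vm_area_node() does, remembers the hole that leaves the smallest leftover and places the new area at the end of it; vmalloc_best()/ub_vmalloc_best() build on that to reduce fragmentation of the vmalloc arena. The following is a simplified user-space version of the best-fit scan; it omits the kernel's one-page guard gap, vmlist_lock locking and overflow checks, and the names are this sketch's own.

#include <stdio.h>
#include <stdlib.h>

struct area {
        unsigned long addr, size;
        struct area *next;
};

/* Best-fit placement over an address-sorted singly linked list of areas:
 * scan every hole, remember the one that leaves the smallest leftover,
 * then allocate at the end of that hole. Returns the chosen address or
 * 0 if nothing fits. */
static unsigned long place_best(struct area **list, unsigned long start,
                                unsigned long end, unsigned long size)
{
        unsigned long addr = start, best_addr = 0, best_delta = end - start;
        struct area **p, **best_p = NULL, *tmp, *a;

        for (p = list; (tmp = *p) != NULL; p = &tmp->next) {
                if (tmp->addr >= addr + size) {          /* the hole before tmp fits */
                        unsigned long delta = tmp->addr - (addr + size);
                        if (delta < best_delta) {
                                best_delta = delta;
                                best_addr = addr;
                                best_p = p;
                        }
                }
                addr = tmp->addr + tmp->size;            /* next hole starts here */
        }
        if (addr + size <= end && end - (addr + size) < best_delta) {
                best_delta = end - (addr + size);        /* hole after the last area */
                best_addr = addr;
                best_p = p;
        }
        if (!best_p)
                return 0;

        a = malloc(sizeof(*a));
        if (!a)
                return 0;
        a->addr = best_addr + best_delta;                /* end of the chosen hole */
        a->size = size;
        a->next = *best_p;                               /* keep the list sorted */
        *best_p = a;
        return a->addr;
}

int main(void)
{
        struct area *list = NULL;

        printf("%lu\n", place_best(&list, 0, 1000, 100)); /* 900: end of the only hole */
        printf("%lu\n", place_best(&list, 0, 1000, 50));  /* 850: packed right below 900 */
        printf("%lu\n", place_best(&list, 0, 1000, 900)); /* 0: no hole is large enough */
        return 0;
}

Placing the area at the far end of the tightest hole mirrors the original code's choice and keeps the larger holes untouched for future allocations.
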
+diff -upr linux-2.6.16.orig/mm/vmscan.c linux-2.6.16-026test015/mm/vmscan.c
+--- linux-2.6.16.orig/mm/vmscan.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/mm/vmscan.c 2006-07-04 14:41:38.000000000 +0400
+@@ -949,6 +949,17 @@ redo:
+ goto unlock_both;
+ }
+
++ /* Make sure the dirty bit is up to date */
++ if (try_to_unmap(page, 1) == SWAP_FAIL) {
++ rc = -EPERM;
++ goto unlock_both;
++ }
++
++ if (page_mapcount(page)) {
++ rc = -EAGAIN;
++ goto unlock_both;
++ }
++
+ /*
+ * Default handling if a filesystem does not provide
+ * a migration function. We can only migrate clean
+@@ -1243,6 +1254,7 @@ refill_inactive_zone(struct zone *zone,
+ reclaim_mapped = 1;
+ }
+
++ {KSTAT_PERF_ENTER(refill_inact)
+ lru_add_drain();
+ spin_lock_irq(&zone->lru_lock);
+ pgmoved = isolate_lru_pages(nr_pages, &zone->active_list,
+@@ -1322,6 +1334,7 @@ refill_inactive_zone(struct zone *zone,
+ local_irq_enable();
+
+ pagevec_release(&pvec);
++ KSTAT_PERF_LEAVE(refill_inact)}
+ }
+
+ /*
+@@ -1438,6 +1451,7 @@ int try_to_free_pages(struct zone **zone
+ unsigned long lru_pages = 0;
+ int i;
+
++ KSTAT_PERF_ENTER(ttfp);
+ sc.gfp_mask = gfp_mask;
+ sc.may_writepage = !laptop_mode;
+ sc.may_swap = 1;
+@@ -1500,6 +1514,7 @@ out:
+
+ zone->prev_priority = zone->temp_priority;
+ }
++ KSTAT_PERF_LEAVE(ttfp);
+ return ret;
+ }
+
+@@ -1832,7 +1847,8 @@ static int __init kswapd_init(void)
+ swap_setup();
+ for_each_pgdat(pgdat)
+ pgdat->kswapd
+- = find_task_by_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL));
++ = find_task_by_pid_all(kernel_thread(kswapd,
++ pgdat, CLONE_KERNEL));
+ total_memory = nr_free_pagecache_pages();
+ hotcpu_notifier(cpu_callback, 0);
+ return 0;
+diff -upr linux-2.6.16.orig/net/atm/clip.c linux-2.6.16-026test015/net/atm/clip.c
+--- linux-2.6.16.orig/net/atm/clip.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/atm/clip.c 2006-07-04 14:41:36.000000000 +0400
+@@ -613,12 +613,19 @@ static int clip_create(int number)
+
+
+ static int clip_device_event(struct notifier_block *this,unsigned long event,
+- void *dev)
++ void *arg)
+ {
++ struct net_device *dev = arg;
++
++ if (event == NETDEV_UNREGISTER) {
++ neigh_ifdown(&clip_tbl, dev);
++ return NOTIFY_DONE;
++ }
++
+ /* ignore non-CLIP devices */
+- if (((struct net_device *) dev)->type != ARPHRD_ATM ||
+- ((struct net_device *) dev)->hard_start_xmit != clip_start_xmit)
++ if (dev->type != ARPHRD_ATM || dev->hard_start_xmit != clip_start_xmit)
+ return NOTIFY_DONE;
++
+ switch (event) {
+ case NETDEV_UP:
+ DPRINTK("clip_device_event NETDEV_UP\n");
+@@ -686,14 +693,12 @@ static struct notifier_block clip_inet_n
+ static void atmarpd_close(struct atm_vcc *vcc)
+ {
+ DPRINTK("atmarpd_close\n");
+- atmarpd = NULL; /* assumed to be atomic */
+- barrier();
+- unregister_inetaddr_notifier(&clip_inet_notifier);
+- unregister_netdevice_notifier(&clip_dev_notifier);
+- if (skb_peek(&sk_atm(vcc)->sk_receive_queue))
+- printk(KERN_ERR "atmarpd_close: closing with requests "
+- "pending\n");
++
++ rtnl_lock();
++ atmarpd = NULL;
+ skb_queue_purge(&sk_atm(vcc)->sk_receive_queue);
++ rtnl_unlock();
++
+ DPRINTK("(done)\n");
+ module_put(THIS_MODULE);
+ }
+@@ -714,7 +719,12 @@ static struct atm_dev atmarpd_dev = {
+
+ static int atm_init_atmarp(struct atm_vcc *vcc)
+ {
+- if (atmarpd) return -EADDRINUSE;
++ rtnl_lock();
++ if (atmarpd) {
++ rtnl_unlock();
++ return -EADDRINUSE;
++ }
++
+ if (start_timer) {
+ start_timer = 0;
+ init_timer(&idle_timer);
+@@ -731,10 +741,7 @@ static int atm_init_atmarp(struct atm_vc
+ vcc->push = NULL;
+ vcc->pop = NULL; /* crash */
+ vcc->push_oam = NULL; /* crash */
+- if (register_netdevice_notifier(&clip_dev_notifier))
+- printk(KERN_ERR "register_netdevice_notifier failed\n");
+- if (register_inetaddr_notifier(&clip_inet_notifier))
+- printk(KERN_ERR "register_inetaddr_notifier failed\n");
++ rtnl_unlock();
+ return 0;
+ }
+
+@@ -992,6 +999,8 @@ static int __init atm_clip_init(void)
+
+ clip_tbl_hook = &clip_tbl;
+ register_atm_ioctl(&clip_ioctl_ops);
++ register_netdevice_notifier(&clip_dev_notifier);
++ register_inetaddr_notifier(&clip_inet_notifier);
+
+ #ifdef CONFIG_PROC_FS
+ {
+@@ -1012,6 +1021,9 @@ static void __exit atm_clip_exit(void)
+
+ remove_proc_entry("arp", atm_proc_root);
+
++ unregister_inetaddr_notifier(&clip_inet_notifier);
++ unregister_netdevice_notifier(&clip_dev_notifier);
++
+ deregister_atm_ioctl(&clip_ioctl_ops);
+
+ /* First, stop the idle timer, so it stops banging
+diff -upr linux-2.6.16.orig/net/bridge/br_netfilter.c linux-2.6.16-026test015/net/bridge/br_netfilter.c
+--- linux-2.6.16.orig/net/bridge/br_netfilter.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/bridge/br_netfilter.c 2006-07-04 14:41:36.000000000 +0400
+@@ -739,6 +739,15 @@ out:
+ return NF_STOLEN;
+ }
+
++static int br_nf_dev_queue_xmit(struct sk_buff *skb)
++{
++ if (skb->protocol == htons(ETH_P_IP) &&
++ skb->len > skb->dev->mtu &&
++ !(skb_shinfo(skb)->ufo_size || skb_shinfo(skb)->tso_size))
++ return ip_fragment(skb, br_dev_queue_push_xmit);
++ else
++ return br_dev_queue_push_xmit(skb);
++}
+
+ /* PF_BRIDGE/POST_ROUTING ********************************************/
+ static unsigned int br_nf_post_routing(unsigned int hook, struct sk_buff **pskb,
+@@ -798,7 +807,7 @@ static unsigned int br_nf_post_routing(u
+ realoutdev = nf_bridge->netoutdev;
+ #endif
+ NF_HOOK(pf, NF_IP_POST_ROUTING, skb, NULL, realoutdev,
+- br_dev_queue_push_xmit);
++ br_nf_dev_queue_xmit);
+
+ return NF_STOLEN;
+
+@@ -843,7 +852,7 @@ static unsigned int ip_sabotage_out(unsi
+ if ((out->hard_start_xmit == br_dev_xmit &&
+ okfn != br_nf_forward_finish &&
+ okfn != br_nf_local_out_finish &&
+- okfn != br_dev_queue_push_xmit)
++ okfn != br_nf_dev_queue_xmit)
+ #if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE)
+ || ((out->priv_flags & IFF_802_1Q_VLAN) &&
+ VLAN_DEV_INFO(out)->real_dev->hard_start_xmit == br_dev_xmit)
+diff -upr linux-2.6.16.orig/net/compat.c linux-2.6.16-026test015/net/compat.c
+--- linux-2.6.16.orig/net/compat.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/compat.c 2006-07-04 14:41:36.000000000 +0400
+@@ -308,107 +308,6 @@ void scm_detach_fds_compat(struct msghdr
+ }
+
+ /*
+- * For now, we assume that the compatibility and native version
+- * of struct ipt_entry are the same - sfr. FIXME
+- */
+-struct compat_ipt_replace {
+- char name[IPT_TABLE_MAXNAMELEN];
+- u32 valid_hooks;
+- u32 num_entries;
+- u32 size;
+- u32 hook_entry[NF_IP_NUMHOOKS];
+- u32 underflow[NF_IP_NUMHOOKS];
+- u32 num_counters;
+- compat_uptr_t counters; /* struct ipt_counters * */
+- struct ipt_entry entries[0];
+-};
+-
+-static int do_netfilter_replace(int fd, int level, int optname,
+- char __user *optval, int optlen)
+-{
+- struct compat_ipt_replace __user *urepl;
+- struct ipt_replace __user *repl_nat;
+- char name[IPT_TABLE_MAXNAMELEN];
+- u32 origsize, tmp32, num_counters;
+- unsigned int repl_nat_size;
+- int ret;
+- int i;
+- compat_uptr_t ucntrs;
+-
+- urepl = (struct compat_ipt_replace __user *)optval;
+- if (get_user(origsize, &urepl->size))
+- return -EFAULT;
+-
+- /* Hack: Causes ipchains to give correct error msg --RR */
+- if (optlen != sizeof(*urepl) + origsize)
+- return -ENOPROTOOPT;
+-
+- /* XXX Assumes that size of ipt_entry is the same both in
+- * native and compat environments.
+- */
+- repl_nat_size = sizeof(*repl_nat) + origsize;
+- repl_nat = compat_alloc_user_space(repl_nat_size);
+-
+- ret = -EFAULT;
+- if (put_user(origsize, &repl_nat->size))
+- goto out;
+-
+- if (!access_ok(VERIFY_READ, urepl, optlen) ||
+- !access_ok(VERIFY_WRITE, repl_nat, optlen))
+- goto out;
+-
+- if (__copy_from_user(name, urepl->name, sizeof(urepl->name)) ||
+- __copy_to_user(repl_nat->name, name, sizeof(repl_nat->name)))
+- goto out;
+-
+- if (__get_user(tmp32, &urepl->valid_hooks) ||
+- __put_user(tmp32, &repl_nat->valid_hooks))
+- goto out;
+-
+- if (__get_user(tmp32, &urepl->num_entries) ||
+- __put_user(tmp32, &repl_nat->num_entries))
+- goto out;
+-
+- if (__get_user(num_counters, &urepl->num_counters) ||
+- __put_user(num_counters, &repl_nat->num_counters))
+- goto out;
+-
+- if (__get_user(ucntrs, &urepl->counters) ||
+- __put_user(compat_ptr(ucntrs), &repl_nat->counters))
+- goto out;
+-
+- if (__copy_in_user(&repl_nat->entries[0],
+- &urepl->entries[0],
+- origsize))
+- goto out;
+-
+- for (i = 0; i < NF_IP_NUMHOOKS; i++) {
+- if (__get_user(tmp32, &urepl->hook_entry[i]) ||
+- __put_user(tmp32, &repl_nat->hook_entry[i]) ||
+- __get_user(tmp32, &urepl->underflow[i]) ||
+- __put_user(tmp32, &repl_nat->underflow[i]))
+- goto out;
+- }
+-
+- /*
+- * Since struct ipt_counters just contains two u_int64_t members
+- * we can just do the access_ok check here and pass the (converted)
+- * pointer into the standard syscall. We hope that the pointer is
+- * not misaligned ...
+- */
+- if (!access_ok(VERIFY_WRITE, compat_ptr(ucntrs),
+- num_counters * sizeof(struct ipt_counters)))
+- goto out;
+-
+-
+- ret = sys_setsockopt(fd, level, optname,
+- (char __user *)repl_nat, repl_nat_size);
+-
+-out:
+- return ret;
+-}
+-
+-/*
+ * A struct sock_filter is architecture independent.
+ */
+ struct compat_sock_fprog {
+@@ -460,10 +359,6 @@ static int do_set_sock_timeout(int fd, i
+ asmlinkage long compat_sys_setsockopt(int fd, int level, int optname,
+ char __user *optval, int optlen)
+ {
+- /* SO_SET_REPLACE seems to be the same in all levels */
+- if (optname == IPT_SO_SET_REPLACE)
+- return do_netfilter_replace(fd, level, optname,
+- optval, optlen);
+ if (level == SOL_SOCKET && optname == SO_ATTACH_FILTER)
+ return do_set_attach_filter(fd, level, optname,
+ optval, optlen);
+diff -upr linux-2.6.16.orig/net/core/datagram.c linux-2.6.16-026test015/net/core/datagram.c
+--- linux-2.6.16.orig/net/core/datagram.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/core/datagram.c 2006-07-04 14:41:37.000000000 +0400
+@@ -56,6 +56,8 @@
+ #include <net/sock.h>
+ #include <net/tcp_states.h>
+
++#include <ub/ub_net.h>
++
+ /*
+ * Is a socket 'connection oriented' ?
+ */
+@@ -493,6 +495,7 @@ unsigned int datagram_poll(struct file *
+ {
+ struct sock *sk = sock->sk;
+ unsigned int mask;
++ int no_ubc_space;
+
+ poll_wait(file, sk->sk_sleep, wait);
+ mask = 0;
+@@ -500,8 +503,14 @@ unsigned int datagram_poll(struct file *
+ /* exceptional events? */
+ if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
+ mask |= POLLERR;
+- if (sk->sk_shutdown == SHUTDOWN_MASK)
++ if (sk->sk_shutdown == SHUTDOWN_MASK) {
++ no_ubc_space = 0;
+ mask |= POLLHUP;
++ } else {
++ no_ubc_space = ub_sock_makewres_other(sk, SOCK_MIN_UBCSPACE_CH);
++ if (no_ubc_space)
++ ub_sock_sndqueueadd_other(sk, SOCK_MIN_UBCSPACE_CH);
++ }
+
+ /* readable? */
+ if (!skb_queue_empty(&sk->sk_receive_queue) ||
+@@ -518,7 +527,7 @@ unsigned int datagram_poll(struct file *
+ }
+
+ /* writable? */
+- if (sock_writeable(sk))
++ if (!no_ubc_space && sock_writeable(sk))
+ mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+ else
+ set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+diff -upr linux-2.6.16.orig/net/core/dev.c linux-2.6.16-026test015/net/core/dev.c
+--- linux-2.6.16.orig/net/core/dev.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/core/dev.c 2006-07-04 14:41:39.000000000 +0400
+@@ -115,6 +115,8 @@
+ #include <net/iw_handler.h>
+ #endif /* CONFIG_NET_RADIO */
+ #include <asm/current.h>
++#include <ub/beancounter.h>
++#include <ub/ub_mem.h>
+
+ /*
+ * The list of packet types we will receive (as opposed to discard)
+@@ -167,25 +171,40 @@ static struct list_head ptype_all; /* T
+ * unregister_netdevice(), which must be called with the rtnl
+ * semaphore held.
+ */
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++#define dev_tail (get_exec_env()->_net_dev_tail)
++#else
+ struct net_device *dev_base;
+ static struct net_device **dev_tail = &dev_base;
++EXPORT_SYMBOL(dev_base);
++#endif
+ DEFINE_RWLOCK(dev_base_lock);
+
+-EXPORT_SYMBOL(dev_base);
+ EXPORT_SYMBOL(dev_base_lock);
+
++#ifdef CONFIG_VE
++#define MAX_UNMOVABLE_NETDEVICES (8*4096)
++static uint8_t unmovable_ifindex_list[MAX_UNMOVABLE_NETDEVICES/8];
++static LIST_HEAD(dev_global_list);
++#endif
++
+ #define NETDEV_HASHBITS 8
+ static struct hlist_head dev_name_head[1<<NETDEV_HASHBITS];
+ static struct hlist_head dev_index_head[1<<NETDEV_HASHBITS];
+
+-static inline struct hlist_head *dev_name_hash(const char *name)
++struct hlist_head *dev_name_hash(const char *name, struct ve_struct *env)
+ {
+- unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
++ unsigned hash;
++ if (!ve_is_super(env))
++ return visible_dev_head(env);
++ hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
+ return &dev_name_head[hash & ((1<<NETDEV_HASHBITS)-1)];
+ }
+
+-static inline struct hlist_head *dev_index_hash(int ifindex)
++struct hlist_head *dev_index_hash(int ifindex, struct ve_struct *env)
+ {
++ if (!ve_is_super(env))
++ return visible_dev_index_head(env);
+ return &dev_index_head[ifindex & ((1<<NETDEV_HASHBITS)-1)];
+ }
+
+@@ -469,7 +488,7 @@ struct net_device *__dev_get_by_name(con
+ {
+ struct hlist_node *p;
+
+- hlist_for_each(p, dev_name_hash(name)) {
++ hlist_for_each(p, dev_name_hash(name, get_exec_env())) {
+ struct net_device *dev
+ = hlist_entry(p, struct net_device, name_hlist);
+ if (!strncmp(dev->name, name, IFNAMSIZ))
+@@ -502,6 +521,32 @@ struct net_device *dev_get_by_name(const
+ }
+
+ /**
++ * __dev_global_get_by_name - find a device by its name in dev_global_list
++ * @name: name to find
++ *
++ * Find an interface by name. Must be called under the RTNL semaphore.
++ * If the name is found, a pointer to the device is returned; if it is
++ * not found, %NULL is returned. The reference counters are not
++ * incremented, so the caller must be careful with locks.
++ */
++
++#ifdef CONFIG_VE
++struct net_device *__dev_global_get_by_name(const char *name)
++{
++ struct net_device *dev;
++ /* It's called relatively rarely */
++ list_for_each_entry(dev, &dev_global_list, dev_global_list_entry) {
++ if (strncmp(dev->name, name, IFNAMSIZ) == 0)
++ return dev;
++ }
++ return NULL;
++}
++#else /* CONFIG_VE */
++#define __dev_global_get_by_name(name) __dev_get_by_name(name)
++#endif /* CONFIG_VE */
++
++/**
+ * __dev_get_by_index - find a device by its ifindex
+ * @ifindex: index of device
+ *
+@@ -516,7 +561,7 @@ struct net_device *__dev_get_by_index(in
+ {
+ struct hlist_node *p;
+
+- hlist_for_each(p, dev_index_hash(ifindex)) {
++ hlist_for_each(p, dev_index_hash(ifindex, get_exec_env())) {
+ struct net_device *dev
+ = hlist_entry(p, struct net_device, index_hlist);
+ if (dev->ifindex == ifindex)
+@@ -635,6 +680,23 @@ int dev_valid_name(const char *name)
+ || strchr(name, '/'));
+ }
+
++static inline void __dev_check_name(const char *dev_name, const char *name,
++ long *inuse, const int max_netdevices)
++{
++ int i = 0;
++ char buf[IFNAMSIZ];
++
++ if (!sscanf(dev_name, name, &i))
++ return;
++ if (i < 0 || i >= max_netdevices)
++ return;
++
++ /* avoid cases where sscanf is not exact inverse of printf */
++ snprintf(buf, sizeof(buf), name, i);
++ if (!strncmp(buf, dev_name, IFNAMSIZ))
++ set_bit(i, inuse);
++}
++
+ /**
+ * dev_alloc_name - allocate a name for a device
+ * @dev: device
+@@ -671,16 +733,20 @@ int dev_alloc_name(struct net_device *de
+ if (!inuse)
+ return -ENOMEM;
+
+- for (d = dev_base; d; d = d->next) {
+- if (!sscanf(d->name, name, &i))
+- continue;
+- if (i < 0 || i >= max_netdevices)
+- continue;
+-
+- /* avoid cases where sscanf is not exact inverse of printf */
+- snprintf(buf, sizeof(buf), name, i);
+- if (!strncmp(buf, d->name, IFNAMSIZ))
+- set_bit(i, inuse);
++#ifdef CONFIG_VE
++ if (ve_is_super(get_exec_env())) {
++ list_for_each_entry(d, &dev_global_list,
++ dev_global_list_entry) {
++ __dev_check_name(d->name, name, inuse,
++ max_netdevices);
++ }
++ } else
++#endif
++ {
++ for (d = dev_base; d; d = d->next) {
++ __dev_check_name(d->name, name, inuse,
++ max_netdevices);
++ }
+ }
+
+ i = find_first_zero_bit(inuse, max_netdevices);
+@@ -688,7 +754,11 @@ int dev_alloc_name(struct net_device *de
+ }
+
+ snprintf(buf, sizeof(buf), name, i);
+- if (!__dev_get_by_name(buf)) {
++ if (ve_is_super(get_exec_env()))
++ d = __dev_global_get_by_name(buf);
++ else
++ d = __dev_get_by_name(buf);
++ if (d == NULL) {
+ strlcpy(dev->name, buf, IFNAMSIZ);
+ return i;
+ }
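
The two hunks above factor the name-probing loop of dev_alloc_name() into __dev_check_name() and make the host walk dev_global_list instead of dev_base. The underlying allocation scheme is unchanged and easy to model in plain C: parse every existing name with the format string, confirm the printf round trip, mark the index in a bitmap and take the first free bit. A self-contained sketch (assumptions: a fixed-size array replaces the kernel's device list, and MAX_NETDEVICES is picked for the example):

#include <stdio.h>
#include <string.h>

#define IFNAMSIZ	16
#define MAX_NETDEVICES	32

static void check_name(const char *existing, const char *fmt, unsigned long *inuse)
{
	int i;
	char buf[IFNAMSIZ];

	if (sscanf(existing, fmt, &i) != 1)
		return;
	if (i < 0 || i >= MAX_NETDEVICES)
		return;
	/* avoid cases where sscanf is not an exact inverse of printf */
	snprintf(buf, sizeof(buf), fmt, i);
	if (strncmp(buf, existing, IFNAMSIZ) == 0)
		*inuse |= 1UL << i;
}

int main(void)
{
	const char *existing[] = { "eth0", "eth1", "eth3", "lo" };
	unsigned long inuse = 0;
	unsigned int n;
	int i;

	for (n = 0; n < sizeof(existing) / sizeof(existing[0]); n++)
		check_name(existing[n], "eth%d", &inuse);

	for (i = 0; i < MAX_NETDEVICES; i++)
		if (!(inuse & (1UL << i)))
			break;
	printf("next free name: eth%d\n", i);	/* prints eth2 */
	return 0;
}
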
+@@ -721,13 +791,14 @@ int dev_change_name(struct net_device *d
+ if (!dev_valid_name(newname))
+ return -EINVAL;
+
++	/* Renaming devices inside a VE is prohibited; it requires CAP_NET_ADMIN */
+ if (strchr(newname, '%')) {
+ err = dev_alloc_name(dev, newname);
+ if (err < 0)
+ return err;
+ strcpy(newname, dev->name);
+ }
+- else if (__dev_get_by_name(newname))
++ else if (__dev_global_get_by_name(newname))
+ return -EEXIST;
+ else
+ strlcpy(dev->name, newname, IFNAMSIZ);
+@@ -735,7 +806,8 @@ int dev_change_name(struct net_device *d
+ err = class_device_rename(&dev->class_dev, dev->name);
+ if (!err) {
+ hlist_del(&dev->name_hlist);
+- hlist_add_head(&dev->name_hlist, dev_name_hash(dev->name));
++ hlist_add_head(&dev->name_hlist, dev_name_hash(dev->name,
++ get_exec_env()));
+ notifier_call_chain(&netdev_chain, NETDEV_CHANGENAME, dev);
+ }
+
+@@ -1294,6 +1366,25 @@ int dev_queue_xmit(struct sk_buff *skb)
+ skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
+ #endif
+ if (q->enqueue) {
++ struct user_beancounter *ub;
++
++ ub = netdev_bc(dev)->exec_ub;
++		/* the skb may already be charged if it was transmitted
++		 * via something like a bonding device */
++ if (ub && (skb_bc(skb)->resource == 0)) {
++ unsigned long chargesize;
++ chargesize = skb_charge_fullsize(skb);
++ if (charge_beancounter(ub, UB_OTHERSOCKBUF,
++ chargesize, UB_SOFT)) {
++ rcu_read_unlock();
++ rc = -ENOMEM;
++ goto out_kfree_skb;
++ }
++ skb_bc(skb)->ub = ub;
++ skb_bc(skb)->charged = chargesize;
++ skb_bc(skb)->resource = UB_OTHERSOCKBUF;
++ }
++
+ /* Grab device queue */
+ spin_lock(&dev->queue_lock);
+
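
The new block in dev_queue_xmit() charges each outgoing skb against the owning beancounter's UB_OTHERSOCKBUF resource before it is queued, unless it already carries a charge, and drops the packet with -ENOMEM if the charge fails. A toy model of that charge-or-drop decision, with the beancounter reduced to a held/limit pair (names are illustrative, not the kernel API):

#include <errno.h>
#include <stdio.h>

struct beancounter { unsigned long held, limit; };
struct skb_charge  { struct beancounter *ub; unsigned long charged; };

static int charge(struct beancounter *ub, unsigned long size)
{
	if (ub->held + size > ub->limit)
		return -ENOMEM;
	ub->held += size;
	return 0;
}

static int queue_xmit(struct beancounter *ub, struct skb_charge *skb,
		      unsigned long size)
{
	if (ub && skb->charged == 0) {		/* skip skbs charged earlier */
		if (charge(ub, size))
			return -ENOMEM;		/* drop instead of queueing */
		skb->ub = ub;
		skb->charged = size;
	}
	return 0;				/* the real code enqueues here */
}

int main(void)
{
	struct beancounter ub = { 0, 4096 };
	struct skb_charge skb = { NULL, 0 };

	printf("first xmit: %d, held=%lu\n", queue_xmit(&ub, &skb, 2048), ub.held);
	printf("re-xmit:    %d, held=%lu\n", queue_xmit(&ub, &skb, 2048), ub.held);
	return 0;
}
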
+@@ -1580,6 +1671,7 @@ int netif_receive_skb(struct sk_buff *sk
+ struct net_device *orig_dev;
+ int ret = NET_RX_DROP;
+ unsigned short type;
++ struct ve_struct *old_env;
+
+ /* if we've gotten here through NAPI, check netpoll */
+ if (skb->dev->poll && netpoll_rx(skb))
+@@ -1598,6 +1690,17 @@ int netif_receive_skb(struct sk_buff *sk
+ skb->h.raw = skb->nh.raw = skb->data;
+ skb->mac_len = skb->nh.raw - skb->mac.raw;
+
++#ifdef CONFIG_VE
++ /*
++	 * The skb might have been allocated in a different VE context than
++	 * the one its device belongs to, so set the correct owner_env.
++ */
++ skb->owner_env = skb->dev->owner_env;
++ BUG_ON(skb->owner_env == NULL);
++#endif
++
++ old_env = set_exec_env(VE_OWNER_SKB(skb));
++
+ pt_prev = NULL;
+
+ rcu_read_lock();
+@@ -1663,6 +1766,7 @@ ncls:
+
+ out:
+ rcu_read_unlock();
++ (void)set_exec_env(old_env);
+ return ret;
+ }
+
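
netif_receive_skb() now stamps the skb with its device's owner_env and runs the protocol handlers under that VE via the save/restore pair set_exec_env(new) / set_exec_env(old); the same idiom appears in the timer handlers later in this patch. A stripped-down userspace model (assumptions: ve_struct is reduced to a name and a single global replaces the per-CPU state):

#include <stdio.h>

struct ve_struct { const char *name; };

static struct ve_struct ve0 = { "ve0" };
static struct ve_struct *exec_env = &ve0;

static struct ve_struct *set_exec_env(struct ve_struct *new_env)
{
	struct ve_struct *old = exec_env;

	exec_env = new_env;
	return old;
}

static void receive_one(struct ve_struct *skb_owner)
{
	struct ve_struct *old_env = set_exec_env(skb_owner);

	printf("handlers run in %s\n", exec_env->name);
	(void)set_exec_env(old_env);		/* restored on every exit path */
}

int main(void)
{
	struct ve_struct ve101 = { "ve101" };

	receive_one(&ve101);
	printf("back in %s\n", exec_env->name);
	return 0;
}
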
+@@ -2038,7 +2142,7 @@ static int __init dev_proc_init(void)
+ {
+ int rc = -ENOMEM;
+
+- if (!proc_net_fops_create("dev", S_IRUGO, &dev_seq_fops))
++ if (!proc_glob_fops_create("net/dev", S_IRUGO, &dev_seq_fops))
+ goto out;
+ if (!proc_net_fops_create("softnet_stat", S_IRUGO, &softnet_seq_fops))
+ goto out_dev;
+@@ -2050,7 +2154,7 @@ out:
+ out_softnet:
+ proc_net_remove("softnet_stat");
+ out_dev:
+- proc_net_remove("dev");
++ remove_proc_glob_entry("net/dev", NULL);
+ goto out;
+ }
+ #else
+@@ -2115,6 +2219,9 @@ void dev_set_promiscuity(struct net_devi
+ dev->flags &= ~IFF_PROMISC;
+ else
+ dev->flags |= IFF_PROMISC;
++	/* Promiscuous mode on these devices does not mean anything */
++ if (dev->flags & (IFF_LOOPBACK|IFF_POINTOPOINT))
++ return;
+ if (dev->flags != old_flags) {
+ dev_mc_upload(dev);
+ printk(KERN_INFO "device %s %s promiscuous mode\n",
+@@ -2529,9 +2636,28 @@ int dev_ioctl(unsigned int cmd, void __u
+ * - require strict serialization.
+ * - do not return a value
+ */
++ case SIOCSIFMTU:
++ if (!capable(CAP_NET_ADMIN) &&
++ !capable(CAP_VE_NET_ADMIN))
++ return -EPERM;
++ dev_load(ifr.ifr_name);
++ rtnl_lock();
++ if (!ve_is_super(get_exec_env())) {
++ struct net_device *dev;
++ ret = -ENODEV;
++ if ((dev = __dev_get_by_name(ifr.ifr_name)) == NULL)
++ goto out_set_mtu_unlock;
++ ret = -EPERM;
++ if (ifr.ifr_mtu > dev->orig_mtu)
++ goto out_set_mtu_unlock;
++ }
++ ret = dev_ifsioc(&ifr, cmd);
++out_set_mtu_unlock:
++ rtnl_unlock();
++ return ret;
++
+ case SIOCSIFFLAGS:
+ case SIOCSIFMETRIC:
+- case SIOCSIFMTU:
+ case SIOCSIFMAP:
+ case SIOCSIFHWADDR:
+ case SIOCSIFSLAVE:
+@@ -2613,20 +2739,73 @@ int dev_ioctl(unsigned int cmd, void __u
+ * dev_new_index - allocate an ifindex
+ *
+ * Returns a suitable unique value for a new device interface
+- * number. The caller must hold the rtnl semaphore or the
++ * number. The caller must hold the rtnl semaphore or the
+ * dev_base_lock to be sure it remains unique.
++ *
++ * Note: dev->name must be valid on entry
+ */
+-static int dev_new_index(void)
++static int dev_ve_new_index(void)
+ {
+- static int ifindex;
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++ int *ifindex = &get_exec_env()->ifindex;
++ int delta = 2;
++#else
++ static int s_ifindex;
++ int *ifindex = &s_ifindex;
++ int delta = 1;
++#endif
+ for (;;) {
+- if (++ifindex <= 0)
+- ifindex = 1;
+- if (!__dev_get_by_index(ifindex))
+- return ifindex;
++ *ifindex += delta;
++ if (*ifindex <= 0)
++ *ifindex = 1;
++ if (!__dev_get_by_index(*ifindex))
++ return *ifindex;
+ }
+ }
+
++#ifdef CONFIG_VE
++static int dev_glb_new_index(void)
++{
++ int i;
++
++ i = find_first_zero_bit((long*)unmovable_ifindex_list,
++ MAX_UNMOVABLE_NETDEVICES);
++
++ if (i == MAX_UNMOVABLE_NETDEVICES)
++ return -EMFILE;
++
++ __set_bit(i, (long*)unmovable_ifindex_list);
++ return (i + 1) * 2;
++}
++#endif
++
++static void dev_glb_free_index(struct net_device *dev)
++{
++#ifdef CONFIG_VE
++ int bit;
++
++ bit = dev->ifindex / 2 - 1;
++ BUG_ON(bit >= MAX_UNMOVABLE_NETDEVICES);
++ __clear_bit(bit, (long*)unmovable_ifindex_list);
++#endif
++}
++
++static int dev_new_index(struct net_device *dev)
++{
++#ifdef CONFIG_VE
++ if (ve_is_super(get_exec_env()) && ve_is_dev_movable(dev))
++ return dev_glb_new_index();
++#endif
++
++ return dev_ve_new_index();
++}
++
++static void dev_free_index(struct net_device *dev)
++{
++ if ((dev->ifindex % 2) == 0)
++ dev_glb_free_index(dev);
++}
++
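
dev_new_index() is split in two: devices that are "movable" and created in the host context draw even ifindexes ((bit + 1) * 2) from a fixed-size bitmap, while other devices advance a counter by 2 so they keep their parity, and dev_free_index() only returns even indexes to the bitmap. Assuming the per-VE counter starts at an odd value (not visible in this hunk), the scheme can be modelled as follows; MAX_GLOBAL is an assumption for the example:

#include <stdio.h>

#define MAX_GLOBAL 32
static unsigned long global_bitmap;

static int glb_new_index(void)
{
	int bit;

	for (bit = 0; bit < MAX_GLOBAL; bit++)
		if (!(global_bitmap & (1UL << bit))) {
			global_bitmap |= 1UL << bit;
			return (bit + 1) * 2;	/* always even */
		}
	return -1;				/* table full (-EMFILE in the kernel) */
}

static void free_index(int ifindex)
{
	if ((ifindex % 2) == 0)			/* only even indexes are recycled */
		global_bitmap &= ~(1UL << (ifindex / 2 - 1));
}

static int ve_new_index(int *counter)
{
	*counter += 2;				/* keeps its parity */
	return *counter;
}

int main(void)
{
	int ve_counter = 1;			/* assumed odd starting point */
	int a, b, c, d;

	a = glb_new_index();
	b = glb_new_index();
	c = ve_new_index(&ve_counter);
	d = ve_new_index(&ve_counter);
	printf("global: %d %d\n", a, b);	/* 2 4 */
	printf("ve:     %d %d\n", c, d);	/* 3 5 */
	free_index(2);
	printf("global after free: %d\n", glb_new_index());	/* 2 again */
	return 0;
}
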
+ static int dev_boot_phase = 1;
+
+ /* Delayed registration/unregistration */
+@@ -2669,6 +2848,10 @@ int register_netdevice(struct net_device
+ /* When net_device's are persistent, this will be fatal. */
+ BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
+
++ ret = -EPERM;
++ if (!ve_is_super(get_exec_env()) && ve_is_dev_movable(dev))
++ goto out;
++
+ spin_lock_init(&dev->queue_lock);
+ spin_lock_init(&dev->xmit_lock);
+ dev->xmit_lock_owner = -1;
+@@ -2688,27 +2871,32 @@ int register_netdevice(struct net_device
+ if (ret) {
+ if (ret > 0)
+ ret = -EIO;
+- goto out_err;
++ goto out_free_div;
+ }
+ }
+
+ if (!dev_valid_name(dev->name)) {
+ ret = -EINVAL;
+- goto out_err;
++ goto out_free_div;
++ }
++
++ dev->ifindex = dev_new_index(dev);
++ if (dev->ifindex < 0) {
++ ret = dev->ifindex;
++ goto out_free_div;
+ }
+
+- dev->ifindex = dev_new_index();
+ if (dev->iflink == -1)
+ dev->iflink = dev->ifindex;
+
+ /* Check for existence of name */
+- head = dev_name_hash(dev->name);
++ head = dev_name_hash(dev->name, get_exec_env());
+ hlist_for_each(p, head) {
+ struct net_device *d
+ = hlist_entry(p, struct net_device, name_hlist);
+ if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
+ ret = -EEXIST;
+- goto out_err;
++ goto out_free_ind;
+ }
+ }
+
+@@ -2760,12 +2948,21 @@ int register_netdevice(struct net_device
+ set_bit(__LINK_STATE_PRESENT, &dev->state);
+
+ dev->next = NULL;
++ dev->owner_env = get_exec_env();
++ dev->orig_mtu = dev->mtu;
++ netdev_bc(dev)->owner_ub = get_beancounter(get_exec_ub());
++ netdev_bc(dev)->exec_ub = get_beancounter(get_exec_ub());
+ dev_init_scheduler(dev);
++#ifdef CONFIG_VE
++ if (ve_is_super(get_exec_env()))
++ list_add_tail(&dev->dev_global_list_entry, &dev_global_list);
++#endif
+ write_lock_bh(&dev_base_lock);
+ *dev_tail = dev;
+ dev_tail = &dev->next;
+ hlist_add_head(&dev->name_hlist, head);
+- hlist_add_head(&dev->index_hlist, dev_index_hash(dev->ifindex));
++ hlist_add_head(&dev->index_hlist, dev_index_hash(dev->ifindex,
++ get_exec_env()));
+ dev_hold(dev);
+ dev->reg_state = NETREG_REGISTERING;
+ write_unlock_bh(&dev_base_lock);
+@@ -2779,7 +2976,9 @@ int register_netdevice(struct net_device
+
+ out:
+ return ret;
+-out_err:
++out_free_ind:
++ dev_free_index(dev);
++out_free_div:
+ free_divert_blk(dev);
+ goto out;
+ }
+@@ -2825,6 +3024,10 @@ int register_netdev(struct net_device *d
+ err = register_netdevice(dev);
+ out:
+ rtnl_unlock();
++ if (err == 0 && dev->reg_state != NETREG_REGISTERED) {
++ unregister_netdev(dev);
++ err = -ENOMEM;
++ }
+ return err;
+ }
+ EXPORT_SYMBOL(register_netdev);
+@@ -2907,6 +3110,7 @@ void netdev_run_todo(void)
+ {
+ struct list_head list = LIST_HEAD_INIT(list);
+ int err;
++ struct ve_struct *current_env;
+
+
+ /* Need to guard against multiple cpu's getting out of order. */
+@@ -2925,22 +3129,30 @@ void netdev_run_todo(void)
+ list_splice_init(&net_todo_list, &list);
+ spin_unlock(&net_todo_list_lock);
+
++ current_env = get_exec_env();
+ while (!list_empty(&list)) {
+ struct net_device *dev
+ = list_entry(list.next, struct net_device, todo_list);
+ list_del(&dev->todo_list);
+
++ (void)set_exec_env(dev->owner_env);
+ switch(dev->reg_state) {
+ case NETREG_REGISTERING:
++ dev->reg_state = NETREG_REGISTERED;
+ err = netdev_register_sysfs(dev);
+- if (err)
++ if (err) {
+ printk(KERN_ERR "%s: failed sysfs registration (%d)\n",
+ dev->name, err);
+- dev->reg_state = NETREG_REGISTERED;
++ dev->reg_state = NETREG_REGISTER_ERR;
++ break;
++ }
+ break;
+
+ case NETREG_UNREGISTERING:
+ netdev_unregister_sysfs(dev);
++ /* fall through */
++
++ case NETREG_REGISTER_ERR:
+ dev->reg_state = NETREG_UNREGISTERED;
+
+ netdev_wait_allrefs(dev);
+@@ -2951,6 +3163,10 @@ void netdev_run_todo(void)
+ BUG_TRAP(!dev->ip6_ptr);
+ BUG_TRAP(!dev->dn_ptr);
+
++ put_beancounter(netdev_bc(dev)->exec_ub);
++ put_beancounter(netdev_bc(dev)->owner_ub);
++ netdev_bc(dev)->exec_ub = NULL;
++ netdev_bc(dev)->owner_ub = NULL;
+
+ /* It must be the very last action,
+ * after this 'dev' may point to freed up memory.
+@@ -2965,6 +3181,7 @@ void netdev_run_todo(void)
+ break;
+ }
+ }
++ (void)set_exec_env(current_env);
+
+ out:
+ up(&net_todo_run_mutex);
+@@ -2990,7 +3207,7 @@ struct net_device *alloc_netdev(int size
+ alloc_size = (sizeof(*dev) + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST;
+ alloc_size += sizeof_priv + NETDEV_ALIGN_CONST;
+
+- p = kmalloc(alloc_size, GFP_KERNEL);
++ p = ub_kmalloc(alloc_size, GFP_KERNEL);
+ if (!p) {
+ printk(KERN_ERR "alloc_dev: Unable to allocate device.\n");
+ return NULL;
+@@ -3070,7 +3287,8 @@ int unregister_netdevice(struct net_devi
+ return -ENODEV;
+ }
+
+- BUG_ON(dev->reg_state != NETREG_REGISTERED);
++ BUG_ON(dev->reg_state != NETREG_REGISTERED &&
++ dev->reg_state != NETREG_REGISTER_ERR);
+
+ /* If device is running, close it first. */
+ if (dev->flags & IFF_UP)
+@@ -3086,6 +3304,10 @@ int unregister_netdevice(struct net_devi
+ dev_tail = dp;
+ *dp = d->next;
+ write_unlock_bh(&dev_base_lock);
++#ifdef CONFIG_VE
++ if (ve_is_super(get_exec_env()))
++ list_del(&dev->dev_global_list_entry);
++#endif
+ break;
+ }
+ }
+@@ -3095,7 +3317,8 @@ int unregister_netdevice(struct net_devi
+ return -ENODEV;
+ }
+
+- dev->reg_state = NETREG_UNREGISTERING;
++ if (dev->reg_state != NETREG_REGISTER_ERR)
++ dev->reg_state = NETREG_UNREGISTERING;
+
+ synchronize_net();
+
+@@ -3119,6 +3342,8 @@ int unregister_netdevice(struct net_devi
+ /* Notifier chain MUST detach us from master device. */
+ BUG_TRAP(!dev->master);
+
++ dev_free_index(dev);
++
+ free_divert_blk(dev);
+
+ /* Finish processing unregister after unlock */
+@@ -3276,6 +3501,8 @@ EXPORT_SYMBOL(dev_close);
+ EXPORT_SYMBOL(dev_get_by_flags);
+ EXPORT_SYMBOL(dev_get_by_index);
+ EXPORT_SYMBOL(dev_get_by_name);
++EXPORT_SYMBOL(dev_name_hash);
++EXPORT_SYMBOL(dev_index_hash);
+ EXPORT_SYMBOL(dev_open);
+ EXPORT_SYMBOL(dev_queue_xmit);
+ EXPORT_SYMBOL(dev_remove_pack);
+diff -upr linux-2.6.16.orig/net/core/dev_mcast.c linux-2.6.16-026test015/net/core/dev_mcast.c
+--- linux-2.6.16.orig/net/core/dev_mcast.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/core/dev_mcast.c 2006-07-04 14:41:38.000000000 +0400
+@@ -290,9 +290,10 @@ static struct file_operations dev_mc_seq
+
+ void __init dev_mcast_init(void)
+ {
+- proc_net_fops_create("dev_mcast", 0, &dev_mc_seq_fops);
++ proc_glob_fops_create("net/dev_mcast", 0, &dev_mc_seq_fops);
+ }
+
+ EXPORT_SYMBOL(dev_mc_add);
+ EXPORT_SYMBOL(dev_mc_delete);
+ EXPORT_SYMBOL(dev_mc_upload);
++EXPORT_SYMBOL(dev_mc_discard);
+diff -upr linux-2.6.16.orig/net/core/dst.c linux-2.6.16-026test015/net/core/dst.c
+--- linux-2.6.16.orig/net/core/dst.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/core/dst.c 2006-07-04 14:41:39.000000000 +0400
+@@ -95,12 +95,11 @@ static void dst_run_gc(unsigned long dum
+ dst_gc_timer_inc = DST_GC_INC;
+ dst_gc_timer_expires = DST_GC_MIN;
+ }
+- dst_gc_timer.expires = jiffies + dst_gc_timer_expires;
+ #if RT_CACHE_DEBUG >= 2
+ printk("dst_total: %d/%d %ld\n",
+ atomic_read(&dst_total), delayed, dst_gc_timer_expires);
+ #endif
+- add_timer(&dst_gc_timer);
++ mod_timer(&dst_gc_timer, jiffies + dst_gc_timer_expires);
+
+ out:
+ spin_unlock(&dst_lock);
+@@ -260,11 +259,14 @@ static int dst_dev_event(struct notifier
+ switch (event) {
+ case NETDEV_UNREGISTER:
+ case NETDEV_DOWN:
+- spin_lock_bh(&dst_lock);
++ local_bh_disable();
++ dst_run_gc(0);
++ spin_lock(&dst_lock);
+ for (dst = dst_garbage_list; dst; dst = dst->next) {
+ dst_ifdown(dst, dev, event != NETDEV_DOWN);
+ }
+- spin_unlock_bh(&dst_lock);
++ spin_unlock(&dst_lock);
++ local_bh_enable();
+ break;
+ }
+ return NOTIFY_DONE;
+diff -upr linux-2.6.16.orig/net/core/dv.c linux-2.6.16-026test015/net/core/dv.c
+--- linux-2.6.16.orig/net/core/dv.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/core/dv.c 2006-07-04 14:41:37.000000000 +0400
+@@ -547,3 +547,5 @@ void divert_frame(struct sk_buff *skb)
+ break;
+ }
+ }
++
++EXPORT_SYMBOL(free_divert_blk);
+diff -upr linux-2.6.16.orig/net/core/filter.c linux-2.6.16-026test015/net/core/filter.c
+--- linux-2.6.16.orig/net/core/filter.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/core/filter.c 2006-07-04 14:41:37.000000000 +0400
+@@ -34,6 +34,7 @@
+ #include <linux/timer.h>
+ #include <asm/system.h>
+ #include <asm/uaccess.h>
++#include <asm/unaligned.h>
+ #include <linux/filter.h>
+
+ /* No hurry in this branch */
+@@ -177,7 +178,7 @@ unsigned int sk_run_filter(struct sk_buf
+ load_w:
+ ptr = load_pointer(skb, k, 4, &tmp);
+ if (ptr != NULL) {
+- A = ntohl(*(u32 *)ptr);
++ A = ntohl(get_unaligned((u32 *)ptr));
+ continue;
+ }
+ break;
+@@ -186,7 +187,7 @@ load_w:
+ load_h:
+ ptr = load_pointer(skb, k, 2, &tmp);
+ if (ptr != NULL) {
+- A = ntohs(*(u16 *)ptr);
++ A = ntohs(get_unaligned((u16 *)ptr));
+ continue;
+ }
+ break;
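
The sk_run_filter() hunks replace direct u32/u16 dereferences with get_unaligned(): a BPF load can point anywhere inside the packet, and on strict-alignment architectures dereferencing an unaligned word faults. In portable C the equivalent is to assemble the value byte by byte, e.g.:

/* Alignment-safe big-endian load, the portable-C counterpart of
 * ntohl(get_unaligned(...)) in the hunk above. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t load_be32_unaligned(const unsigned char *p)
{
	/* byte-wise assembly never requires the pointer to be aligned */
	return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
	       ((uint32_t)p[2] << 8)  |  (uint32_t)p[3];
}

int main(void)
{
	unsigned char pkt[] = { 0xff, 0x12, 0x34, 0x56, 0x78 };

	printf("0x%08" PRIx32 "\n", load_be32_unaligned(pkt + 1));	/* 0x12345678 */
	return 0;
}
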
+@@ -406,7 +407,7 @@ int sk_attach_filter(struct sock_fprog *
+ if (fprog->filter == NULL)
+ return -EINVAL;
+
+- fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL);
++ fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL_UBC);
+ if (!fp)
+ return -ENOMEM;
+ if (copy_from_user(fp->insns, fprog->filter, fsize)) {
+diff -upr linux-2.6.16.orig/net/core/neighbour.c linux-2.6.16-026test015/net/core/neighbour.c
+--- linux-2.6.16.orig/net/core/neighbour.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/core/neighbour.c 2006-07-04 14:41:39.000000000 +0400
+@@ -33,6 +33,7 @@
+ #include <linux/rtnetlink.h>
+ #include <linux/random.h>
+ #include <linux/string.h>
++#include <ub/beancounter.h>
+
+ #define NEIGH_DEBUG 1
+
+@@ -639,6 +640,8 @@ static void neigh_periodic_timer(unsigne
+ struct neigh_table *tbl = (struct neigh_table *)arg;
+ struct neighbour *n, **np;
+ unsigned long expire, now = jiffies;
++ struct ve_struct *env = set_exec_env(tbl->owner_env);
++ struct user_beancounter *ub = set_exec_ub(tbl->owner_ub);
+
+ NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs);
+
+@@ -700,6 +703,8 @@ next_elt:
+ mod_timer(&tbl->gc_timer, now + expire);
+
+ write_unlock(&tbl->lock);
++ set_exec_ub(ub);
++ set_exec_env(env);
+ }
+
+ static __inline__ int neigh_max_probes(struct neighbour *n)
+@@ -727,6 +732,11 @@ static void neigh_timer_handler(unsigned
+ struct neighbour *neigh = (struct neighbour *)arg;
+ unsigned state;
+ int notify = 0;
++ struct ve_struct *env;
++ struct user_beancounter *ub;
++
++ env = set_exec_env(neigh->dev->owner_env);
++ ub = set_exec_ub(netdev_bc(neigh->dev)->exec_ub);
+
+ write_lock(&neigh->lock);
+
+@@ -824,6 +834,8 @@ out:
+ neigh_app_notify(neigh);
+ #endif
+ neigh_release(neigh);
++ (void)set_exec_ub(ub);
++ (void)set_exec_env(env);
+ }
+
+ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
+@@ -1202,6 +1214,9 @@ static void neigh_proxy_process(unsigned
+ unsigned long now = jiffies;
+ struct sk_buff *skb;
+
++ struct ve_struct *env = set_exec_env(tbl->owner_env);
++ struct user_beancounter *ub = set_exec_ub(tbl->owner_ub);
++
+ spin_lock(&tbl->proxy_queue.lock);
+
+ skb = tbl->proxy_queue.next;
+@@ -1213,6 +1228,7 @@ static void neigh_proxy_process(unsigned
+ skb = skb->next;
+ if (tdif <= 0) {
+ struct net_device *dev = back->dev;
++
+ __skb_unlink(back, &tbl->proxy_queue);
+ if (tbl->proxy_redo && netif_running(dev))
+ tbl->proxy_redo(back);
+@@ -1220,6 +1236,7 @@ static void neigh_proxy_process(unsigned
+ kfree_skb(back);
+
+ dev_put(dev);
++
+ } else if (!sched_next || tdif < sched_next)
+ sched_next = tdif;
+ }
+@@ -1227,6 +1244,8 @@ static void neigh_proxy_process(unsigned
+ if (sched_next)
+ mod_timer(&tbl->proxy_timer, jiffies + sched_next);
+ spin_unlock(&tbl->proxy_queue.lock);
++ (void)set_exec_ub(ub);
++ (void)set_exec_env(env);
+ }
+
+ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p,
+@@ -1323,12 +1342,15 @@ void neigh_parms_destroy(struct neigh_pa
+ }
+
+
+-void neigh_table_init(struct neigh_table *tbl)
++int neigh_table_init(struct neigh_table *tbl)
+ {
+ unsigned long now = jiffies;
+ unsigned long phsize;
+
+ atomic_set(&tbl->parms.refcnt, 1);
++ atomic_set(&tbl->entries, 0);
++ tbl->hash_chain_gc = 0;
++ tbl->parms.next = NULL;
+ INIT_RCU_HEAD(&tbl->parms.rcu_head);
+ tbl->parms.reachable_time =
+ neigh_rand_reach_time(tbl->parms.base_reachable_time);
+@@ -1336,22 +1358,30 @@ void neigh_table_init(struct neigh_table
+ if (!tbl->kmem_cachep)
+ tbl->kmem_cachep = kmem_cache_create(tbl->id,
+ tbl->entry_size,
+- 0, SLAB_HWCACHE_ALIGN,
++ 0, SLAB_HWCACHE_ALIGN | SLAB_UBC,
+ NULL, NULL);
+
+ if (!tbl->kmem_cachep)
+- panic("cannot create neighbour cache");
++ return -ENOMEM;
++
++ tbl->owner_env = get_ve(get_exec_env());
++ tbl->owner_ub = get_beancounter(get_exec_ub());
+
+ tbl->stats = alloc_percpu(struct neigh_statistics);
+ if (!tbl->stats)
+- panic("cannot create neighbour cache statistics");
++ return -ENOMEM;
+
+ #ifdef CONFIG_PROC_FS
+- tbl->pde = create_proc_entry(tbl->id, 0, proc_net_stat);
+- if (!tbl->pde)
+- panic("cannot create neighbour proc dir entry");
+- tbl->pde->proc_fops = &neigh_stat_seq_fops;
+- tbl->pde->data = tbl;
++ if (ve_is_super(get_exec_env())) {
++ char name[strlen(tbl->id) + sizeof("net/stat/")];
++ strcpy(name, "net/stat/");
++ strcat(name, tbl->id);
++ tbl->pde = create_proc_glob_entry(name, S_IRUGO, NULL);
++ if (tbl->pde) {
++ tbl->pde->proc_fops = &neigh_stat_seq_fops;
++ tbl->pde->data = tbl;
++ }
++ }
+ #endif
+
+ tbl->hash_mask = 1;
+@@ -1361,7 +1391,7 @@ void neigh_table_init(struct neigh_table
+ tbl->phash_buckets = kmalloc(phsize, GFP_KERNEL);
+
+ if (!tbl->hash_buckets || !tbl->phash_buckets)
+- panic("cannot allocate neighbour cache hashes");
++ goto nomem;
+
+ memset(tbl->phash_buckets, 0, phsize);
+
+@@ -1385,6 +1415,24 @@ void neigh_table_init(struct neigh_table
+ tbl->next = neigh_tables;
+ neigh_tables = tbl;
+ write_unlock(&neigh_tbl_lock);
++ return 0;
++
++nomem:
++ if (tbl->hash_buckets) {
++ neigh_hash_free(tbl->hash_buckets, tbl->hash_mask + 1);
++ tbl->hash_buckets = NULL;
++ }
++ if (tbl->phash_buckets) {
++ kfree(tbl->phash_buckets);
++ tbl->phash_buckets = NULL;
++ }
++ if (tbl->stats) {
++ free_percpu(tbl->stats);
++ tbl->stats = NULL;
++ }
++ put_beancounter(tbl->owner_ub);
++ put_ve(tbl->owner_env);
++ return -ENOMEM;
+ }
+
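
neigh_table_init() stops panicking on allocation failure and instead returns an error, unwinding whatever was already set up under a single nomem label (callers such as arp_init() decide whether to panic). The shape of that error path, reduced to plain malloc()/free() with the VE and beancounter reference drops omitted:

#include <errno.h>
#include <stdlib.h>

struct table {
	void *hash_buckets;
	void *phash_buckets;
	void *stats;
};

static int table_init(struct table *tbl)
{
	tbl->hash_buckets  = malloc(64);
	tbl->phash_buckets = malloc(64);
	tbl->stats         = malloc(64);
	if (!tbl->hash_buckets || !tbl->phash_buckets || !tbl->stats)
		goto nomem;
	return 0;

nomem:
	free(tbl->hash_buckets);  tbl->hash_buckets  = NULL;
	free(tbl->phash_buckets); tbl->phash_buckets = NULL;
	free(tbl->stats);         tbl->stats         = NULL;
	return -ENOMEM;
}

int main(void)
{
	struct table tbl = { NULL, NULL, NULL };

	return table_init(&tbl) ? 1 : 0;
}
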
+ int neigh_table_clear(struct neigh_table *tbl)
+@@ -1398,6 +1446,15 @@ int neigh_table_clear(struct neigh_table
+ neigh_ifdown(tbl, NULL);
+ if (atomic_read(&tbl->entries))
+ printk(KERN_CRIT "neighbour leakage\n");
++#ifdef CONFIG_PROC_FS
++ if (ve_is_super(get_exec_env())) {
++ char name[strlen(tbl->id) + sizeof("net/stat/")];
++ strcpy(name, "net/stat/");
++ strcat(name, tbl->id);
++ remove_proc_glob_entry(name, NULL);
++ }
++#endif
++
+ write_lock(&neigh_tbl_lock);
+ for (tp = &neigh_tables; *tp; tp = &(*tp)->next) {
+ if (*tp == tbl) {
+@@ -1413,6 +1470,9 @@ int neigh_table_clear(struct neigh_table
+ kfree(tbl->phash_buckets);
+ tbl->phash_buckets = NULL;
+
++ put_beancounter(tbl->owner_ub);
++ put_ve(tbl->owner_env);
++
+ return 0;
+ }
+
+@@ -1435,6 +1495,8 @@ int neigh_delete(struct sk_buff *skb, st
+
+ if (tbl->family != ndm->ndm_family)
+ continue;
++ if (!ve_accessible_strict(tbl->owner_env, get_exec_env()))
++ continue;
+ read_unlock(&neigh_tbl_lock);
+
+ err = -EINVAL;
+@@ -1488,6 +1550,8 @@ int neigh_add(struct sk_buff *skb, struc
+
+ if (tbl->family != ndm->ndm_family)
+ continue;
++ if (!ve_accessible_strict(tbl->owner_env, get_exec_env()))
++ continue;
+ read_unlock(&neigh_tbl_lock);
+
+ err = -EINVAL;
+@@ -1720,6 +1784,9 @@ int neightbl_set(struct sk_buff *skb, st
+ if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family)
+ continue;
+
++ if (!ve_accessible_strict(tbl->owner_env, get_exec_env()))
++ continue;
++
+ if (!rtattr_strcmp(tb[NDTA_NAME - 1], tbl->id))
+ break;
+ }
+@@ -1941,6 +2008,8 @@ int neigh_dump_info(struct sk_buff *skb,
+ s_t = cb->args[0];
+
+ for (tbl = neigh_tables, t = 0; tbl; tbl = tbl->next, t++) {
++ if (!ve_accessible_strict(tbl->owner_env, get_exec_env()))
++ continue;
+ if (t < s_t || (family && tbl->family != family))
+ continue;
+ if (t > s_t)
+@@ -2530,11 +2599,12 @@ int neigh_sysctl_register(struct net_dev
+ int p_id, int pdev_id, char *p_name,
+ proc_handler *handler, ctl_handler *strategy)
+ {
+- struct neigh_sysctl_table *t = kmalloc(sizeof(*t), GFP_KERNEL);
++ struct neigh_sysctl_table *t;
+ const char *dev_name_source = NULL;
+ char *dev_name = NULL;
+ int err = 0;
+
++ t = kmalloc(sizeof(*t), GFP_KERNEL);
+ if (!t)
+ return -ENOBUFS;
+ memcpy(t, &neigh_sysctl_template, sizeof(*t));
+diff -upr linux-2.6.16.orig/net/core/net-sysfs.c linux-2.6.16-026test015/net/core/net-sysfs.c
+--- linux-2.6.16.orig/net/core/net-sysfs.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/core/net-sysfs.c 2006-07-04 14:41:38.000000000 +0400
+@@ -388,12 +388,13 @@ static void netdev_release(struct class_
+ struct net_device *dev
+ = container_of(cd, struct net_device, class_dev);
+
+- BUG_ON(dev->reg_state != NETREG_RELEASED);
++ BUG_ON(dev->reg_state != NETREG_RELEASED &&
++ dev->reg_state != NETREG_REGISTERING);
+
+ kfree((char *)dev - dev->padded);
+ }
+
+-static struct class net_class = {
++struct class net_class = {
+ .name = "net",
+ .release = netdev_release,
+ .class_dev_attrs = net_class_attributes,
+@@ -401,6 +402,13 @@ static struct class net_class = {
+ .uevent = netdev_uevent,
+ #endif
+ };
++EXPORT_SYMBOL(net_class);
++
++#ifndef CONFIG_VE
++#define visible_net_class net_class
++#else
++#define visible_net_class (*get_exec_env()->net_class)
++#endif
+
+ void netdev_unregister_sysfs(struct net_device * net)
+ {
+@@ -424,7 +432,7 @@ int netdev_register_sysfs(struct net_dev
+ struct class_device *class_dev = &(net->class_dev);
+ int ret;
+
+- class_dev->class = &net_class;
++ class_dev->class = &visible_net_class;
+ class_dev->class_data = net;
+
+ strlcpy(class_dev->class_id, net->name, BUS_ID_SIZE);
+@@ -453,12 +461,21 @@ out_cleanup:
+ out_unreg:
+ printk(KERN_WARNING "%s: sysfs attribute registration failed %d\n",
+ net->name, ret);
+- class_device_unregister(class_dev);
++ /* put is called in free_netdev() */
++ class_device_del(class_dev);
+ out:
+ return ret;
+ }
+
++void prepare_sysfs_netdev(void)
++{
++#ifdef CONFIG_VE
++ get_ve0()->net_class = &net_class;
++#endif
++}
++
+ int netdev_sysfs_init(void)
+ {
++ prepare_sysfs_netdev();
+ return class_register(&net_class);
+ }
+diff -upr linux-2.6.16.orig/net/core/rtnetlink.c linux-2.6.16-026test015/net/core/rtnetlink.c
+--- linux-2.6.16.orig/net/core/rtnetlink.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/core/rtnetlink.c 2006-07-04 14:41:38.000000000 +0400
+@@ -434,6 +434,8 @@ static int rtnetlink_dump_all(struct sk_
+ if (rtnetlink_links[idx] == NULL ||
+ rtnetlink_links[idx][type].dumpit == NULL)
+ continue;
++ if (vz_security_proto_check(idx, 0, 0))
++ continue;
+ if (idx > s_idx)
+ memset(&cb->args[0], 0, sizeof(cb->args));
+ if (rtnetlink_links[idx][type].dumpit(skb, cb))
+@@ -501,7 +503,7 @@ rtnetlink_rcv_msg(struct sk_buff *skb, s
+ return 0;
+
+ family = ((struct rtgenmsg*)NLMSG_DATA(nlh))->rtgen_family;
+- if (family >= NPROTO) {
++ if (family >= NPROTO || vz_security_proto_check(family, 0, 0)) {
+ *errp = -EAFNOSUPPORT;
+ return -1;
+ }
+diff -upr linux-2.6.16.orig/net/core/scm.c linux-2.6.16-026test015/net/core/scm.c
+--- linux-2.6.16.orig/net/core/scm.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/core/scm.c 2006-07-04 14:41:38.000000000 +0400
+@@ -34,6 +34,7 @@
+ #include <net/compat.h>
+ #include <net/scm.h>
+
++#include <ub/ub_mem.h>
+
+ /*
+ * Only allow a user to send credentials, that they could set with
+@@ -42,7 +43,9 @@
+
+ static __inline__ int scm_check_creds(struct ucred *creds)
+ {
+- if ((creds->pid == current->tgid || capable(CAP_SYS_ADMIN)) &&
++ if ((creds->pid == virt_tgid(current) ||
++ creds->pid == current->tgid ||
++ capable(CAP_VE_SYS_ADMIN)) &&
+ ((creds->uid == current->uid || creds->uid == current->euid ||
+ creds->uid == current->suid) || capable(CAP_SETUID)) &&
+ ((creds->gid == current->gid || creds->gid == current->egid ||
+@@ -69,7 +72,7 @@ static int scm_fp_copy(struct cmsghdr *c
+
+ if (!fpl)
+ {
+- fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL);
++ fpl = ub_kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL);
+ if (!fpl)
+ return -ENOMEM;
+ *fplp = fpl;
+@@ -275,7 +278,7 @@ struct scm_fp_list *scm_fp_dup(struct sc
+ if (!fpl)
+ return NULL;
+
+- new_fpl = kmalloc(sizeof(*fpl), GFP_KERNEL);
++ new_fpl = ub_kmalloc(sizeof(*fpl), GFP_KERNEL);
+ if (new_fpl) {
+ for (i=fpl->count-1; i>=0; i--)
+ get_file(fpl->fp[i]);
+diff -upr linux-2.6.16.orig/net/core/skbuff.c linux-2.6.16-026test015/net/core/skbuff.c
+--- linux-2.6.16.orig/net/core/skbuff.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/core/skbuff.c 2006-07-04 14:41:38.000000000 +0400
+@@ -48,6 +48,7 @@
+ #include <linux/in.h>
+ #include <linux/inet.h>
+ #include <linux/slab.h>
++#include <linux/kmem_cache.h>
+ #include <linux/netdevice.h>
+ #ifdef CONFIG_NET_CLS_ACT
+ #include <net/pkt_sched.h>
+@@ -68,6 +69,8 @@
+ #include <asm/uaccess.h>
+ #include <asm/system.h>
+
++#include <ub/ub_net.h>
++
+ static kmem_cache_t *skbuff_head_cache __read_mostly;
+ static kmem_cache_t *skbuff_fclone_cache __read_mostly;
+
+@@ -147,6 +150,9 @@ struct sk_buff *__alloc_skb(unsigned int
+ if (!skb)
+ goto out;
+
++ if (ub_skb_alloc_bc(skb, gfp_mask & ~__GFP_DMA))
++ goto nobc;
++
+ /* Get the DATA. Size must match skb_add_mtu(). */
+ size = SKB_DATA_ALIGN(size);
+ data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
+@@ -160,6 +166,7 @@ struct sk_buff *__alloc_skb(unsigned int
+ skb->data = data;
+ skb->tail = data;
+ skb->end = data + size;
++ SET_VE_OWNER_SKB(skb, get_exec_env());
+ /* make sure we initialize shinfo sequentially */
+ shinfo = skb_shinfo(skb);
+ atomic_set(&shinfo->dataref, 1);
+@@ -182,6 +189,8 @@ struct sk_buff *__alloc_skb(unsigned int
+ out:
+ return skb;
+ nodata:
++ ub_skb_free_bc(skb);
++nobc:
+ kmem_cache_free(cache, skb);
+ skb = NULL;
+ goto out;
+@@ -214,6 +223,9 @@ struct sk_buff *alloc_skb_from_cache(kme
+ if (!skb)
+ goto out;
+
++ if (ub_skb_alloc_bc(skb, gfp_mask & ~__GFP_DMA))
++ goto nobc;
++
+ /* Get the DATA. */
+ size = SKB_DATA_ALIGN(size);
+ data = kmem_cache_alloc(cp, gfp_mask);
+@@ -227,6 +239,7 @@ struct sk_buff *alloc_skb_from_cache(kme
+ skb->data = data;
+ skb->tail = data;
+ skb->end = data + size;
++ SET_VE_OWNER_SKB(skb, get_exec_env());
+
+ atomic_set(&(skb_shinfo(skb)->dataref), 1);
+ skb_shinfo(skb)->nr_frags = 0;
+@@ -236,6 +249,8 @@ struct sk_buff *alloc_skb_from_cache(kme
+ out:
+ return skb;
+ nodata:
++ ub_skb_free_bc(skb);
++nobc:
+ kmem_cache_free(skbuff_head_cache, skb);
+ skb = NULL;
+ goto out;
+@@ -290,6 +305,7 @@ void kfree_skbmem(struct sk_buff *skb)
+ atomic_t *fclone_ref;
+
+ skb_release_data(skb);
++ ub_skb_free_bc(skb);
+ switch (skb->fclone) {
+ case SKB_FCLONE_UNAVAILABLE:
+ kmem_cache_free(skbuff_head_cache, skb);
+@@ -331,6 +347,7 @@ void __kfree_skb(struct sk_buff *skb)
+ #ifdef CONFIG_XFRM
+ secpath_put(skb->sp);
+ #endif
++ ub_skb_uncharge(skb);
+ if (skb->destructor) {
+ WARN_ON(in_irq());
+ skb->destructor(skb);
+@@ -386,6 +403,11 @@ struct sk_buff *skb_clone(struct sk_buff
+ n->fclone = SKB_FCLONE_UNAVAILABLE;
+ }
+
++ if (ub_skb_alloc_bc(n, gfp_mask)) {
++ kmem_cache_free(skbuff_head_cache, n);
++ return NULL;
++ }
++
+ #define C(x) n->x = skb->x
+
+ n->next = n->prev = NULL;
+@@ -415,6 +437,7 @@ struct sk_buff *skb_clone(struct sk_buff
+ C(ipvs_property);
+ #endif
+ C(protocol);
++ SET_VE_OWNER_SKB(n, VE_OWNER_SKB(skb));
+ n->destructor = NULL;
+ #ifdef CONFIG_NETFILTER
+ C(nfmark);
+diff -upr linux-2.6.16.orig/net/core/sock.c linux-2.6.16-026test015/net/core/sock.c
+--- linux-2.6.16.orig/net/core/sock.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/core/sock.c 2006-07-04 14:41:38.000000000 +0400
+@@ -108,6 +108,7 @@
+ #include <linux/net.h>
+ #include <linux/mm.h>
+ #include <linux/slab.h>
++#include <linux/kmem_cache.h>
+ #include <linux/interrupt.h>
+ #include <linux/poll.h>
+ #include <linux/tcp.h>
+@@ -124,6 +125,9 @@
+ #include <net/xfrm.h>
+ #include <linux/ipsec.h>
+
++#include <ub/ub_net.h>
++#include <ub/beancounter.h>
++
+ #include <linux/filter.h>
+
+ #ifdef CONFIG_INET
+@@ -172,7 +176,7 @@ static void sock_warn_obsolete_bsdism(co
+ static char warncomm[TASK_COMM_LEN];
+ if (strcmp(warncomm, current->comm) && warned < 5) {
+ strcpy(warncomm, current->comm);
+- printk(KERN_WARNING "process `%s' is using obsolete "
++ ve_printk(VE_LOG, KERN_WARNING "process `%s' is using obsolete "
+ "%s SO_BSDCOMPAT\n", warncomm, name);
+ warned++;
+ }
+@@ -404,8 +408,9 @@ set_rcvbuf:
+ if (!valbool) {
+ sk->sk_bound_dev_if = 0;
+ } else {
+- if (optlen > IFNAMSIZ)
+- optlen = IFNAMSIZ;
++ if (optlen > IFNAMSIZ - 1)
++ optlen = IFNAMSIZ - 1;
++ memset(devname, 0, sizeof(devname));
+ if (copy_from_user(devname, optval, optlen)) {
+ ret = -EFAULT;
+ break;
+@@ -659,6 +664,7 @@ struct sock *sk_alloc(int family, gfp_t
+ */
+ sk->sk_prot = sk->sk_prot_creator = prot;
+ sock_lock_init(sk);
++ SET_VE_OWNER_SK(sk, get_exec_env());
+ }
+
+ if (security_sk_alloc(sk, family, priority))
+@@ -698,6 +704,7 @@ void sk_free(struct sock *sk)
+ __FUNCTION__, atomic_read(&sk->sk_omem_alloc));
+
+ security_sk_free(sk);
++ ub_sock_uncharge(sk);
+ if (sk->sk_prot_creator->slab != NULL)
+ kmem_cache_free(sk->sk_prot_creator->slab, sk);
+ else
+@@ -742,14 +749,11 @@ struct sock *sk_clone(const struct sock
+ if (filter != NULL)
+ sk_filter_charge(newsk, filter);
+
+- if (unlikely(xfrm_sk_clone_policy(newsk))) {
+- /* It is still raw copy of parent, so invalidate
+- * destructor and make plain sk_free() */
+- newsk->sk_destruct = NULL;
+- sk_free(newsk);
+- newsk = NULL;
+- goto out;
+- }
++ if (ub_sock_charge(newsk, newsk->sk_family, newsk->sk_type) < 0)
++ goto out_err;
++
++ if (unlikely(xfrm_sk_clone_policy(newsk)))
++ goto out_err;
+
+ newsk->sk_err = 0;
+ newsk->sk_priority = 0;
+@@ -773,8 +777,15 @@ struct sock *sk_clone(const struct sock
+ if (newsk->sk_prot->sockets_allocated)
+ atomic_inc(newsk->sk_prot->sockets_allocated);
+ }
+-out:
+ return newsk;
++
++out_err:
++ /* It is still raw copy of parent, so invalidate
++ * destructor and make plain sk_free() */
++ sock_reset_flag(newsk, SOCK_TIMESTAMP);
++ newsk->sk_destruct = NULL;
++ sk_free(newsk);
++ return NULL;
+ }
+
+ EXPORT_SYMBOL_GPL(sk_clone);
+@@ -934,14 +945,12 @@ static long sock_wait_for_wmem(struct so
+ /*
+ * Generic send/receive buffer handlers
+ */
+-
+-static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
+- unsigned long header_len,
+- unsigned long data_len,
+- int noblock, int *errcode)
++struct sk_buff *sock_alloc_send_skb2(struct sock *sk, unsigned long size,
++ unsigned long size2, int noblock,
++ int *errcode)
+ {
+ struct sk_buff *skb;
+- gfp_t gfp_mask;
++ unsigned int gfp_mask;
+ long timeo;
+ int err;
+
+@@ -959,46 +968,35 @@ static struct sk_buff *sock_alloc_send_p
+ if (sk->sk_shutdown & SEND_SHUTDOWN)
+ goto failure;
+
+- if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
+- skb = alloc_skb(header_len, sk->sk_allocation);
+- if (skb) {
+- int npages;
+- int i;
+-
+- /* No pages, we're done... */
+- if (!data_len)
+- break;
+-
+- npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
+- skb->truesize += data_len;
+- skb_shinfo(skb)->nr_frags = npages;
+- for (i = 0; i < npages; i++) {
+- struct page *page;
+- skb_frag_t *frag;
+-
+- page = alloc_pages(sk->sk_allocation, 0);
+- if (!page) {
+- err = -ENOBUFS;
+- skb_shinfo(skb)->nr_frags = i;
+- kfree_skb(skb);
+- goto failure;
+- }
+-
+- frag = &skb_shinfo(skb)->frags[i];
+- frag->page = page;
+- frag->page_offset = 0;
+- frag->size = (data_len >= PAGE_SIZE ?
+- PAGE_SIZE :
+- data_len);
+- data_len -= PAGE_SIZE;
+- }
++ if (ub_sock_getwres_other(sk, skb_charge_size(size))) {
++ if (size2 < size) {
++ size = size2;
++ continue;
++ }
++ set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
++ err = -EAGAIN;
++ if (!timeo)
++ goto failure;
++ if (signal_pending(current))
++ goto interrupted;
++ timeo = ub_sock_wait_for_space(sk, timeo,
++ skb_charge_size(size));
++ continue;
++ }
+
++ if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
++ skb = alloc_skb(size, sk->sk_allocation);
++ if (skb)
+ /* Full success... */
+ break;
+- }
++ ub_sock_retwres_other(sk, skb_charge_size(size),
++ SOCK_MIN_UBCSPACE_CH);
+ err = -ENOBUFS;
+ goto failure;
+ }
++ ub_sock_retwres_other(sk,
++ skb_charge_size(size),
++ SOCK_MIN_UBCSPACE_CH);
+ set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ err = -EAGAIN;
+@@ -1009,6 +1007,7 @@ static struct sk_buff *sock_alloc_send_p
+ timeo = sock_wait_for_wmem(sk, timeo);
+ }
+
++ ub_skb_set_charge(skb, sk, skb_charge_size(size), UB_OTHERSOCKBUF);
+ skb_set_owner_w(skb, sk);
+ return skb;
+
+@@ -1022,7 +1021,7 @@ failure:
+ struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
+ int noblock, int *errcode)
+ {
+- return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
++ return sock_alloc_send_skb2(sk, size, size, noblock, errcode);
+ }
+
+ static void __lock_sock(struct sock *sk)
+@@ -1462,7 +1461,8 @@ int proto_register(struct proto *prot, i
+
+ if (alloc_slab) {
+ prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
+- SLAB_HWCACHE_ALIGN, NULL, NULL);
++ SLAB_HWCACHE_ALIGN | SLAB_UBC,
++ NULL, NULL);
+
+ if (prot->slab == NULL) {
+ printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
+@@ -1478,9 +1478,11 @@ int proto_register(struct proto *prot, i
+ goto out_free_sock_slab;
+
+ sprintf(request_sock_slab_name, mask, prot->name);
+- prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
+- prot->rsk_prot->obj_size, 0,
+- SLAB_HWCACHE_ALIGN, NULL, NULL);
++ prot->rsk_prot->slab =
++ kmem_cache_create(request_sock_slab_name,
++ prot->rsk_prot->obj_size, 0,
++ SLAB_HWCACHE_ALIGN | SLAB_UBC,
++ NULL, NULL);
+
+ if (prot->rsk_prot->slab == NULL) {
+ printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
+@@ -1501,7 +1503,7 @@ int proto_register(struct proto *prot, i
+ prot->twsk_prot->twsk_slab =
+ kmem_cache_create(timewait_sock_slab_name,
+ prot->twsk_prot->twsk_obj_size,
+- 0, SLAB_HWCACHE_ALIGN,
++ 0, SLAB_HWCACHE_ALIGN | SLAB_UBC,
+ NULL, NULL);
+ if (prot->twsk_prot->twsk_slab == NULL)
+ goto out_free_timewait_sock_slab_name;
+diff -upr linux-2.6.16.orig/net/core/stream.c linux-2.6.16-026test015/net/core/stream.c
+--- linux-2.6.16.orig/net/core/stream.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/core/stream.c 2006-07-04 14:41:37.000000000 +0400
+@@ -111,8 +111,9 @@ EXPORT_SYMBOL(sk_stream_wait_close);
+ * sk_stream_wait_memory - Wait for more memory for a socket
+ * @sk: socket to wait for memory
+ * @timeo_p: for how long
++ * @amount: amount of memory to wait for (in UB space)
+ */
+-int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
++int sk_stream_wait_memory(struct sock *sk, long *timeo_p, unsigned long amount)
+ {
+ int err = 0;
+ long vm_wait = 0;
+@@ -134,8 +135,11 @@ int sk_stream_wait_memory(struct sock *s
+ if (signal_pending(current))
+ goto do_interrupted;
+ clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+- if (sk_stream_memory_free(sk) && !vm_wait)
+- break;
++ if (amount == 0) {
++ if (sk_stream_memory_free(sk) && !vm_wait)
++ break;
++ } else
++ ub_sock_sndqueueadd_tcp(sk, amount);
+
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ sk->sk_write_pending++;
+@@ -144,6 +148,8 @@ int sk_stream_wait_memory(struct sock *s
+ sk_stream_memory_free(sk) &&
+ vm_wait);
+ sk->sk_write_pending--;
++ if (amount > 0)
++ ub_sock_sndqueuedel(sk);
+
+ if (vm_wait) {
+ vm_wait -= current_timeo;
+diff -upr linux-2.6.16.orig/net/dccp/ipv6.c linux-2.6.16-026test015/net/dccp/ipv6.c
+--- linux-2.6.16.orig/net/dccp/ipv6.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/dccp/ipv6.c 2006-07-04 14:41:37.000000000 +0400
+@@ -872,6 +872,8 @@ static struct sock *dccp_v6_request_recv
+ ip6_dst_store(newsk, dst, NULL);
+ newsk->sk_route_caps = dst->dev->features &
+ ~(NETIF_F_IP_CSUM | NETIF_F_TSO);
++ if (!sysctl_tcp_use_sg)
++ newsk->sk_route_caps &= ~NETIF_F_SG;
+
+ newdp6 = (struct dccp6_sock *)newsk;
+ newinet = inet_sk(newsk);
+diff -upr linux-2.6.16.orig/net/ipv4/af_inet.c linux-2.6.16-026test015/net/ipv4/af_inet.c
+--- linux-2.6.16.orig/net/ipv4/af_inet.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/af_inet.c 2006-07-04 14:41:38.000000000 +0400
+@@ -114,6 +114,7 @@
+ #ifdef CONFIG_IP_MROUTE
+ #include <linux/mroute.h>
+ #endif
++#include <ub/ub_net.h>
+
+ DEFINE_SNMP_STAT(struct linux_mib, net_statistics) __read_mostly;
+
+@@ -298,6 +299,13 @@ lookup_protocol:
+ if (sk == NULL)
+ goto out;
+
++ err = -ENOBUFS;
++ if (ub_sock_charge(sk, PF_INET, sock->type))
++ goto out_sk_free;
++	/* If the charge was successful, sock_init_data() MUST be called to
++	 * set sk->sk_type; otherwise the sock will be uncharged against the
++	 * wrong resource.
++ */
++
+ err = 0;
+ sk->sk_no_check = answer_no_check;
+ if (INET_PROTOSW_REUSE & answer_flags)
+@@ -355,6 +363,9 @@ out:
+ out_rcu_unlock:
+ rcu_read_unlock();
+ goto out;
++out_sk_free:
++ sk_free(sk);
++ return err;
+ }
+
+
+@@ -369,6 +380,9 @@ int inet_release(struct socket *sock)
+
+ if (sk) {
+ long timeout;
++ struct ve_struct *saved_env;
++
++ saved_env = set_exec_env(VE_OWNER_SK(sk));
+
+ /* Applications forget to leave groups before exiting */
+ ip_mc_drop_socket(sk);
+@@ -386,6 +400,8 @@ int inet_release(struct socket *sock)
+ timeout = sk->sk_lingertime;
+ sock->sk = NULL;
+ sk->sk_prot->close(sk, timeout);
++
++ (void)set_exec_env(saved_env);
+ }
+ return 0;
+ }
+@@ -1108,20 +1124,20 @@ static struct net_protocol icmp_protocol
+
+ static int __init init_ipv4_mibs(void)
+ {
+- net_statistics[0] = alloc_percpu(struct linux_mib);
+- net_statistics[1] = alloc_percpu(struct linux_mib);
+- ip_statistics[0] = alloc_percpu(struct ipstats_mib);
+- ip_statistics[1] = alloc_percpu(struct ipstats_mib);
+- icmp_statistics[0] = alloc_percpu(struct icmp_mib);
+- icmp_statistics[1] = alloc_percpu(struct icmp_mib);
+- tcp_statistics[0] = alloc_percpu(struct tcp_mib);
+- tcp_statistics[1] = alloc_percpu(struct tcp_mib);
+- udp_statistics[0] = alloc_percpu(struct udp_mib);
+- udp_statistics[1] = alloc_percpu(struct udp_mib);
++ ve_net_statistics[0] = alloc_percpu(struct linux_mib);
++ ve_net_statistics[1] = alloc_percpu(struct linux_mib);
++ ve_ip_statistics[0] = alloc_percpu(struct ipstats_mib);
++ ve_ip_statistics[1] = alloc_percpu(struct ipstats_mib);
++ ve_icmp_statistics[0] = alloc_percpu(struct icmp_mib);
++ ve_icmp_statistics[1] = alloc_percpu(struct icmp_mib);
++ ve_tcp_statistics[0] = alloc_percpu(struct tcp_mib);
++ ve_tcp_statistics[1] = alloc_percpu(struct tcp_mib);
++ ve_udp_statistics[0] = alloc_percpu(struct udp_mib);
++ ve_udp_statistics[1] = alloc_percpu(struct udp_mib);
+ if (!
+- (net_statistics[0] && net_statistics[1] && ip_statistics[0]
+- && ip_statistics[1] && tcp_statistics[0] && tcp_statistics[1]
+- && udp_statistics[0] && udp_statistics[1]))
++ (ve_net_statistics[0] && ve_net_statistics[1] && ve_ip_statistics[0]
++ && ve_ip_statistics[1] && ve_tcp_statistics[0] && ve_tcp_statistics[1]
++ && ve_udp_statistics[0] && ve_udp_statistics[1]))
+ return -ENOMEM;
+
+ (void) tcp_mib_init();
+diff -upr linux-2.6.16.orig/net/ipv4/arp.c linux-2.6.16-026test015/net/ipv4/arp.c
+--- linux-2.6.16.orig/net/ipv4/arp.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/arp.c 2006-07-04 14:41:39.000000000 +0400
+@@ -175,7 +175,7 @@ struct neigh_ops arp_broken_ops = {
+ .queue_xmit = dev_queue_xmit,
+ };
+
+-struct neigh_table arp_tbl = {
++struct neigh_table global_arp_tbl = {
+ .family = AF_INET,
+ .entry_size = sizeof(struct neighbour) + 4,
+ .key_len = 4,
+@@ -184,7 +184,7 @@ struct neigh_table arp_tbl = {
+ .proxy_redo = parp_redo,
+ .id = "arp_cache",
+ .parms = {
+- .tbl = &arp_tbl,
++ .tbl = &global_arp_tbl,
+ .base_reachable_time = 30 * HZ,
+ .retrans_time = 1 * HZ,
+ .gc_staletime = 60 * HZ,
+@@ -920,6 +920,9 @@ out:
+
+ static void parp_redo(struct sk_buff *skb)
+ {
++#if defined(CONFIG_NETFILTER) && defined(CONFIG_NETFILTER_DEBUG)
++ skb->nf_debug = 0;
++#endif
+ arp_process(skb);
+ }
+
+@@ -988,7 +991,7 @@ static int arp_req_set(struct arpreq *r,
+ return 0;
+ }
+ if (dev == NULL) {
+- ipv4_devconf.proxy_arp = 1;
++ ve_ipv4_devconf.proxy_arp = 1;
+ return 0;
+ }
+ if (__in_dev_get_rtnl(dev)) {
+@@ -1094,7 +1097,7 @@ static int arp_req_delete(struct arpreq
+ return pneigh_delete(&arp_tbl, &ip, dev);
+ if (mask == 0) {
+ if (dev == NULL) {
+- ipv4_devconf.proxy_arp = 0;
++ ve_ipv4_devconf.proxy_arp = 0;
+ return 0;
+ }
+ if (__in_dev_get_rtnl(dev)) {
+@@ -1240,7 +1243,9 @@ static int arp_proc_init(void);
+
+ void __init arp_init(void)
+ {
+- neigh_table_init(&arp_tbl);
++ get_ve0()->ve_arp_tbl = &global_arp_tbl;
++ if (neigh_table_init(&arp_tbl))
++ panic("cannot initialize ARP tables\n");
+
+ dev_add_pack(&arp_packet_type);
+ arp_proc_init();
+@@ -1372,8 +1377,9 @@ static int arp_seq_open(struct inode *in
+ {
+ struct seq_file *seq;
+ int rc = -ENOMEM;
+- struct neigh_seq_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
+-
++ struct neigh_seq_state *s;
++
++ s = kmalloc(sizeof(*s), GFP_KERNEL);
+ if (!s)
+ goto out;
+
+@@ -1401,7 +1407,7 @@ static struct file_operations arp_seq_fo
+
+ static int __init arp_proc_init(void)
+ {
+- if (!proc_net_fops_create("arp", S_IRUGO, &arp_seq_fops))
++ if (!proc_glob_fops_create("net/arp", S_IRUGO, &arp_seq_fops))
+ return -ENOMEM;
+ return 0;
+ }
+@@ -1421,8 +1427,55 @@ EXPORT_SYMBOL(arp_rcv);
+ EXPORT_SYMBOL(arp_create);
+ EXPORT_SYMBOL(arp_xmit);
+ EXPORT_SYMBOL(arp_send);
+-EXPORT_SYMBOL(arp_tbl);
++EXPORT_SYMBOL(global_arp_tbl);
+
+ #if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
+ EXPORT_SYMBOL(clip_tbl_hook);
+ #endif
++
++int ve_arp_init(struct ve_struct *ve)
++{
++ struct ve_struct *old_env;
++ int err;
++
++ ve->ve_arp_tbl = kmalloc(sizeof(struct neigh_table), GFP_KERNEL);
++ if (ve->ve_arp_tbl == NULL) {
++ err = -ENOMEM;
++ goto out;
++ }
++
++ *(ve->ve_arp_tbl) = global_arp_tbl;
++ ve->ve_arp_tbl->parms.tbl = ve->ve_arp_tbl;
++ old_env = set_exec_env(ve);
++ err = neigh_table_init(ve->ve_arp_tbl);
++ if (err)
++ goto out_free;
++#ifdef CONFIG_SYSCTL
++ neigh_sysctl_register(NULL, &arp_tbl.parms, NET_IPV4,
++ NET_IPV4_NEIGH, "ipv4", NULL, NULL);
++#endif
++ set_exec_env(old_env);
++ err = 0;
++
++out:
++ return err;
++
++out_free:
++ kfree(ve->ve_arp_tbl);
++ ve->ve_arp_tbl = NULL;
++ goto out;
++}
++EXPORT_SYMBOL(ve_arp_init);
++
++void ve_arp_fini(struct ve_struct *ve)
++{
++ if (ve->ve_arp_tbl) {
++#ifdef CONFIG_SYSCTL
++ neigh_sysctl_unregister(&ve->ve_arp_tbl->parms);
++#endif
++ neigh_table_clear(ve->ve_arp_tbl);
++ kfree(ve->ve_arp_tbl);
++ ve->ve_arp_tbl = NULL;
++ }
++}
++EXPORT_SYMBOL(ve_arp_fini);
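
ve_arp_init()/ve_arp_fini() give every VE a private copy of the (renamed) global_arp_tbl template, re-point the embedded parms at the copy, initialise it under the VE's context and tear it down symmetrically. The copy-the-template pattern, stripped of the sysctl and neighbour machinery (a sketch: the table is reduced to an id and a self-referencing parms pointer):

#include <stdio.h>
#include <stdlib.h>

struct neigh_table { const char *id; struct neigh_table *parms_tbl; };
struct ve_struct   { struct neigh_table *ve_arp_tbl; };

static struct neigh_table global_arp_tbl = { "arp_cache", &global_arp_tbl };

static int ve_arp_init_model(struct ve_struct *ve)
{
	ve->ve_arp_tbl = malloc(sizeof(*ve->ve_arp_tbl));
	if (!ve->ve_arp_tbl)
		return -1;
	*ve->ve_arp_tbl = global_arp_tbl;		/* copy the template */
	ve->ve_arp_tbl->parms_tbl = ve->ve_arp_tbl;	/* re-point at the copy */
	return 0;
}

static void ve_arp_fini_model(struct ve_struct *ve)
{
	free(ve->ve_arp_tbl);
	ve->ve_arp_tbl = NULL;
}

int main(void)
{
	struct ve_struct ve = { NULL };

	if (ve_arp_init_model(&ve) == 0)
		printf("private copy of '%s' created\n", ve.ve_arp_tbl->id);
	ve_arp_fini_model(&ve);
	return 0;
}
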
+diff -upr linux-2.6.16.orig/net/ipv4/devinet.c linux-2.6.16-026test015/net/ipv4/devinet.c
+--- linux-2.6.16.orig/net/ipv4/devinet.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/devinet.c 2006-07-04 14:41:39.000000000 +0400
+@@ -71,7 +71,7 @@ struct ipv4_devconf ipv4_devconf = {
+ .shared_media = 1,
+ };
+
+-static struct ipv4_devconf ipv4_devconf_dflt = {
++struct ipv4_devconf ipv4_devconf_dflt = {
+ .accept_redirects = 1,
+ .send_redirects = 1,
+ .secure_redirects = 1,
+@@ -79,10 +79,16 @@ static struct ipv4_devconf ipv4_devconf_
+ .accept_source_route = 1,
+ };
+
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++#define ve_ipv4_devconf_dflt (*(get_exec_env()->_ipv4_devconf_dflt))
++#else
++#define ve_ipv4_devconf_dflt ipv4_devconf_dflt
++#endif
++
+ static void rtmsg_ifa(int event, struct in_ifaddr *);
+
+ static struct notifier_block *inetaddr_chain;
+-static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
++void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
+ int destroy);
+ #ifdef CONFIG_SYSCTL
+ static void devinet_sysctl_register(struct in_device *in_dev,
+@@ -92,7 +98,7 @@ static void devinet_sysctl_unregister(st
+
+ /* Locks all the inet devices. */
+
+-static struct in_ifaddr *inet_alloc_ifa(void)
++struct in_ifaddr *inet_alloc_ifa(void)
+ {
+ struct in_ifaddr *ifa = kmalloc(sizeof(*ifa), GFP_KERNEL);
+
+@@ -103,6 +109,7 @@ static struct in_ifaddr *inet_alloc_ifa(
+
+ return ifa;
+ }
++EXPORT_SYMBOL_GPL(inet_alloc_ifa);
+
+ static void inet_rcu_free_ifa(struct rcu_head *head)
+ {
+@@ -175,6 +182,7 @@ out_kfree:
+ in_dev = NULL;
+ goto out;
+ }
++EXPORT_SYMBOL_GPL(inetdev_init);
+
+ static void in_dev_rcu_put(struct rcu_head *head)
+ {
+@@ -190,7 +198,7 @@ static void inetdev_destroy(struct in_de
+ ASSERT_RTNL();
+
+ dev = in_dev->dev;
+- if (dev == &loopback_dev)
++ if (dev == &ve0_loopback)
+ return;
+
+ in_dev->dead = 1;
+@@ -232,7 +240,7 @@ int inet_addr_onlink(struct in_device *i
+ return 0;
+ }
+
+-static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
++void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
+ int destroy)
+ {
+ struct in_ifaddr *promote = NULL;
+@@ -320,7 +328,7 @@ static void inet_del_ifa(struct in_devic
+ }
+ }
+
+-static int inet_insert_ifa(struct in_ifaddr *ifa)
++int inet_insert_ifa(struct in_ifaddr *ifa)
+ {
+ struct in_device *in_dev = ifa->ifa_dev;
+ struct in_ifaddr *ifa1, **ifap, **last_primary;
+@@ -370,6 +378,7 @@ static int inet_insert_ifa(struct in_ifa
+
+ return 0;
+ }
++EXPORT_SYMBOL_GPL(inet_insert_ifa);
+
+ static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa)
+ {
+@@ -578,7 +587,7 @@ int devinet_ioctl(unsigned int cmd, void
+
+ case SIOCSIFFLAGS:
+ ret = -EACCES;
+- if (!capable(CAP_NET_ADMIN))
++ if (!capable(CAP_VE_NET_ADMIN))
+ goto out;
+ break;
+ case SIOCSIFADDR: /* Set interface address (and family) */
+@@ -586,7 +595,7 @@ int devinet_ioctl(unsigned int cmd, void
+ case SIOCSIFDSTADDR: /* Set the destination address */
+ case SIOCSIFNETMASK: /* Set the netmask for the interface */
+ ret = -EACCES;
+- if (!capable(CAP_NET_ADMIN))
++ if (!capable(CAP_VE_NET_ADMIN))
+ goto out;
+ ret = -EINVAL;
+ if (sin->sin_family != AF_INET)
+@@ -1163,10 +1172,10 @@ static struct rtnetlink_link inet_rtnetl
+ void inet_forward_change(void)
+ {
+ struct net_device *dev;
+- int on = ipv4_devconf.forwarding;
++ int on = ve_ipv4_devconf.forwarding;
+
+- ipv4_devconf.accept_redirects = !on;
+- ipv4_devconf_dflt.forwarding = on;
++ ve_ipv4_devconf.accept_redirects = !on;
++ ve_ipv4_devconf_dflt.forwarding = on;
+
+ read_lock(&dev_base_lock);
+ for (dev = dev_base; dev; dev = dev->next) {
+@@ -1191,9 +1200,9 @@ static int devinet_sysctl_forward(ctl_ta
+ int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+
+ if (write && *valp != val) {
+- if (valp == &ipv4_devconf.forwarding)
++ if (valp == &ve_ipv4_devconf.forwarding)
+ inet_forward_change();
+- else if (valp != &ipv4_devconf_dflt.forwarding)
++ else if (valp != &ve_ipv4_devconf_dflt.forwarding)
+ rt_cache_flush(0);
+ }
+
+@@ -1464,30 +1473,22 @@ static struct devinet_sysctl_table {
+ },
+ };
+
+-static void devinet_sysctl_register(struct in_device *in_dev,
+- struct ipv4_devconf *p)
++static struct devinet_sysctl_table *__devinet_sysctl_register(char *dev_name,
++ int ifindex, struct ipv4_devconf *p)
+ {
+ int i;
+- struct net_device *dev = in_dev ? in_dev->dev : NULL;
+- struct devinet_sysctl_table *t = kmalloc(sizeof(*t), GFP_KERNEL);
+- char *dev_name = NULL;
++ struct devinet_sysctl_table *t;
+
++ t = kmalloc(sizeof(*t), GFP_KERNEL);
+ if (!t)
+- return;
++ goto out;
++
+ memcpy(t, &devinet_sysctl, sizeof(*t));
+ for (i = 0; i < ARRAY_SIZE(t->devinet_vars) - 1; i++) {
+ t->devinet_vars[i].data += (char *)p - (char *)&ipv4_devconf;
+ t->devinet_vars[i].de = NULL;
+ }
+
+- if (dev) {
+- dev_name = dev->name;
+- t->devinet_dev[0].ctl_name = dev->ifindex;
+- } else {
+- dev_name = "default";
+- t->devinet_dev[0].ctl_name = NET_PROTO_CONF_DEFAULT;
+- }
+-
+ /*
+ * Make a copy of dev_name, because '.procname' is regarded as const
+ * by sysctl and we wouldn't want anyone to change it under our feet
+@@ -1495,8 +1496,9 @@ static void devinet_sysctl_register(stru
+ */
+ dev_name = kstrdup(dev_name, GFP_KERNEL);
+ if (!dev_name)
+- goto free;
++ goto out_free_table;
+
++ t->devinet_dev[0].ctl_name = ifindex;
+ t->devinet_dev[0].procname = dev_name;
+ t->devinet_dev[0].child = t->devinet_vars;
+ t->devinet_dev[0].de = NULL;
+@@ -1509,17 +1511,38 @@ static void devinet_sysctl_register(stru
+
+ t->sysctl_header = register_sysctl_table(t->devinet_root_dir, 0);
+ if (!t->sysctl_header)
+- goto free_procname;
++ goto out_free_procname;
+
+- p->sysctl = t;
+- return;
++ return t;
+
+ /* error path */
+- free_procname:
++out_free_procname:
+ kfree(dev_name);
+- free:
++out_free_table:
+ kfree(t);
+- return;
++out:
++ printk(KERN_DEBUG "Can't register net/ipv4/conf sysctls.\n");
++ return NULL;
++}
++
++static void devinet_sysctl_register(struct in_device *in_dev,
++ struct ipv4_devconf *p)
++{
++ struct net_device *dev;
++ char *dev_name;
++ int ifindex;
++
++ dev = in_dev ? in_dev->dev : NULL;
++
++ if (dev) {
++ dev_name = dev->name;
++ ifindex = dev->ifindex;
++ } else {
++ dev_name = "default";
++ ifindex = NET_PROTO_CONF_DEFAULT;
++ }
++
++ p->sysctl = __devinet_sysctl_register(dev_name, ifindex, p);
+ }
+
+ static void devinet_sysctl_unregister(struct ipv4_devconf *p)
+@@ -1532,7 +1555,170 @@ static void devinet_sysctl_unregister(st
+ kfree(t);
+ }
+ }
++
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++static ctl_table net_sysctl_tables[] = {
++ /* 0: net */
++ {
++ .ctl_name = CTL_NET,
++ .procname = "net",
++ .mode = 0555,
++ .child = &net_sysctl_tables[2],
++ },
++ { .ctl_name = 0, },
++ /* 2: net/ipv4 */
++ {
++ .ctl_name = NET_IPV4,
++ .procname = "ipv4",
++ .mode = 0555,
++ .child = &net_sysctl_tables[4],
++ },
++ { .ctl_name = 0, },
++ /* 4, 5: net/ipv4/[vars] */
++ {
++ .ctl_name = NET_IPV4_FORWARD,
++ .procname = "ip_forward",
++ .data = &ipv4_devconf.forwarding,
++ .maxlen = sizeof(int),
++ .mode = 0644,
++ .proc_handler = &ipv4_sysctl_forward,
++ .strategy = &ipv4_sysctl_forward_strategy,
++ },
++ {
++ .ctl_name = NET_IPV4_ROUTE,
++ .procname = "route",
++ .maxlen = 0,
++ .mode = 0555,
++ .child = &net_sysctl_tables[7],
++ },
++ { .ctl_name = 0 },
++ /* 7: net/ipv4/route/flush */
++ {
++ .ctl_name = NET_IPV4_ROUTE_FLUSH,
++ .procname = "flush",
++		.data		= NULL, /* set up below */
++ .maxlen = sizeof(int),
++ .mode = 0644,
++ .proc_handler = &ipv4_sysctl_rtcache_flush,
++ .strategy = &ipv4_sysctl_rtcache_flush_strategy,
++ },
++ { .ctl_name = 0 },
++};
++
++static int ip_forward_sysctl_register(struct ve_struct *ve,
++ struct ipv4_devconf *p)
++{
++ struct ctl_table_header *hdr;
++ ctl_table *root;
++
++ root = clone_sysctl_template(net_sysctl_tables,
++ sizeof(net_sysctl_tables) / sizeof(ctl_table));
++ if (root == NULL)
++ goto out;
++
++ root[4].data = &p->forwarding;
++ root[7].data = &ipv4_flush_delay;
++
++ hdr = register_sysctl_table(root, 1);
++ if (hdr == NULL)
++ goto out_free;
++
++ ve->forward_header = hdr;
++ ve->forward_table = root;
++ return 0;
++
++out_free:
++ free_sysctl_clone(root);
++out:
++ return -ENOMEM;
++}
++
++static inline void ip_forward_sysctl_unregister(struct ve_struct *ve)
++{
++ unregister_sysctl_table(ve->forward_header);
++ ve->forward_header = NULL;
++}
++
++static inline void ip_forward_sysctl_free(struct ve_struct *ve)
++{
++ free_sysctl_clone(ve->forward_table);
++ ve->forward_table = NULL;
++}
++#endif
++#endif
++
++int devinet_sysctl_init(struct ve_struct *ve)
++{
++ int err = 0;
++#ifdef CONFIG_SYSCTL
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++ struct ipv4_devconf *conf, *conf_def;
++
++ err = -ENOMEM;
++
++ conf = kmalloc(sizeof(*conf), GFP_KERNEL);
++ if (!conf)
++ goto err1;
++
++ memcpy(conf, &ipv4_devconf, sizeof(*conf));
++ conf->sysctl = __devinet_sysctl_register("all",
++ NET_PROTO_CONF_ALL, conf);
++ if (!conf->sysctl)
++ goto err2;
++
++ conf_def = kmalloc(sizeof(*conf_def), GFP_KERNEL);
++ if (!conf_def)
++ goto err3;
++
++ memcpy(conf_def, &ipv4_devconf_dflt, sizeof(*conf_def));
++ conf_def->sysctl = __devinet_sysctl_register("default",
++ NET_PROTO_CONF_DEFAULT, conf_def);
++ if (!conf_def->sysctl)
++ goto err4;
++
++ err = ip_forward_sysctl_register(ve, conf);
++ if (err)
++ goto err5;
++
++ ve->_ipv4_devconf = conf;
++ ve->_ipv4_devconf_dflt = conf_def;
++ return 0;
++
++err5:
++ devinet_sysctl_unregister(conf_def);
++err4:
++ kfree(conf_def);
++err3:
++ devinet_sysctl_unregister(conf);
++err2:
++ kfree(conf);
++err1:
+ #endif
++#endif
++ return err;
++}
++
++void devinet_sysctl_fini(struct ve_struct *ve)
++{
++#ifdef CONFIG_SYSCTL
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++ ip_forward_sysctl_unregister(ve);
++ devinet_sysctl_unregister(ve->_ipv4_devconf);
++ devinet_sysctl_unregister(ve->_ipv4_devconf_dflt);
++#endif
++#endif
++}
++
++void devinet_sysctl_free(struct ve_struct *ve)
++{
++#ifdef CONFIG_SYSCTL
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++ ip_forward_sysctl_free(ve);
++ kfree(ve->_ipv4_devconf);
++ kfree(ve->_ipv4_devconf_dflt);
++#endif
++#endif
++}
+
+ void __init devinet_init(void)
+ {
+@@ -1542,13 +1728,18 @@ void __init devinet_init(void)
+ #ifdef CONFIG_SYSCTL
+ devinet_sysctl.sysctl_header =
+ register_sysctl_table(devinet_sysctl.devinet_root_dir, 0);
+- devinet_sysctl_register(NULL, &ipv4_devconf_dflt);
++ __devinet_sysctl_register("default", NET_PROTO_CONF_DEFAULT,
++ &ipv4_devconf_dflt);
+ #endif
+ }
+
+ EXPORT_SYMBOL(devinet_ioctl);
+ EXPORT_SYMBOL(in_dev_finish_destroy);
+ EXPORT_SYMBOL(inet_select_addr);
++EXPORT_SYMBOL(inet_del_ifa);
+ EXPORT_SYMBOL(inetdev_by_index);
++EXPORT_SYMBOL(devinet_sysctl_init);
++EXPORT_SYMBOL(devinet_sysctl_fini);
++EXPORT_SYMBOL(devinet_sysctl_free);
+ EXPORT_SYMBOL(register_inetaddr_notifier);
+ EXPORT_SYMBOL(unregister_inetaddr_notifier);
+diff -upr linux-2.6.16.orig/net/ipv4/fib_frontend.c linux-2.6.16-026test015/net/ipv4/fib_frontend.c
+--- linux-2.6.16.orig/net/ipv4/fib_frontend.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/fib_frontend.c 2006-07-04 14:41:39.000000000 +0400
+@@ -53,14 +53,46 @@
+
+ #define RT_TABLE_MIN RT_TABLE_MAIN
+
++#undef ip_fib_local_table
++#undef ip_fib_main_table
+ struct fib_table *ip_fib_local_table;
+ struct fib_table *ip_fib_main_table;
++void prepare_fib_tables(void)
++{
++#ifdef CONFIG_VE
++ get_ve0()->_local_table = ip_fib_local_table;
++ ip_fib_local_table = (struct fib_table *)0x12345678;
++ get_ve0()->_main_table = ip_fib_main_table;
++ ip_fib_main_table = (struct fib_table *)0x12345678;
++#endif
++}
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++#define ip_fib_local_table get_exec_env()->_local_table
++#define ip_fib_main_table get_exec_env()->_main_table
++#endif
+
+ #else
+
+ #define RT_TABLE_MIN 1
+
++#undef fib_tables
+ struct fib_table *fib_tables[RT_TABLE_MAX+1];
++void prepare_fib_tables(void)
++{
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++ int i;
++
++ BUG_ON(sizeof(fib_tables) !=
++ sizeof(((struct ve_struct *)0)->_fib_tables));
++ memcpy(get_ve0()->_fib_tables, fib_tables, sizeof(fib_tables));
++ for (i = 0; i <= RT_TABLE_MAX; i++)
++ fib_tables[i] = (void *)0x12366678;
++#endif
++}
++
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++#define fib_tables get_exec_env()->_fib_tables
++#endif
+
+ struct fib_table *__fib_new_table(int id)
+ {
+@@ -250,7 +282,7 @@ int ip_rt_ioctl(unsigned int cmd, void _
+ switch (cmd) {
+ case SIOCADDRT: /* Add a route */
+ case SIOCDELRT: /* Delete a route */
+- if (!capable(CAP_NET_ADMIN))
++ if (!capable(CAP_VE_NET_ADMIN))
+ return -EPERM;
+ if (copy_from_user(&r, arg, sizeof(struct rtentry)))
+ return -EFAULT;
+@@ -653,6 +685,7 @@ static struct notifier_block fib_netdev_
+
+ void __init ip_fib_init(void)
+ {
++ prepare_fib_tables();
+ #ifndef CONFIG_IP_MULTIPLE_TABLES
+ ip_fib_local_table = fib_hash_init(RT_TABLE_LOCAL);
+ ip_fib_main_table = fib_hash_init(RT_TABLE_MAIN);
+diff -upr linux-2.6.16.orig/net/ipv4/fib_hash.c linux-2.6.16-026test015/net/ipv4/fib_hash.c
+--- linux-2.6.16.orig/net/ipv4/fib_hash.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/fib_hash.c 2006-07-04 14:41:38.000000000 +0400
+@@ -36,6 +36,7 @@
+ #include <linux/skbuff.h>
+ #include <linux/netlink.h>
+ #include <linux/init.h>
++#include <linux/ve.h>
+
+ #include <net/ip.h>
+ #include <net/protocol.h>
+@@ -73,11 +74,6 @@ struct fn_zone {
+ * can be cheaper than memory lookup, so that FZ_* macros are used.
+ */
+
+-struct fn_hash {
+- struct fn_zone *fn_zones[33];
+- struct fn_zone *fn_zone_list;
+-};
+-
+ static inline u32 fn_hash(u32 key, struct fn_zone *fz)
+ {
+ u32 h = ntohl(key)>>(32 - fz->fz_order);
+@@ -623,7 +619,7 @@ fn_hash_delete(struct fib_table *tb, str
+ return -ESRCH;
+ }
+
+-static int fn_flush_list(struct fn_zone *fz, int idx)
++static int fn_flush_list(struct fn_zone *fz, int idx, int destroy)
+ {
+ struct hlist_head *head = &fz->fz_hash[idx];
+ struct hlist_node *node, *n;
+@@ -638,7 +634,9 @@ static int fn_flush_list(struct fn_zone
+ list_for_each_entry_safe(fa, fa_node, &f->fn_alias, fa_list) {
+ struct fib_info *fi = fa->fa_info;
+
+- if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
++ if (fi == NULL)
++ continue;
++ if (destroy || (fi->fib_flags&RTNH_F_DEAD)) {
+ write_lock_bh(&fib_hash_lock);
+ list_del(&fa->fa_list);
+ if (list_empty(&f->fn_alias)) {
+@@ -660,7 +658,7 @@ static int fn_flush_list(struct fn_zone
+ return found;
+ }
+
+-static int fn_hash_flush(struct fib_table *tb)
++static int __fn_hash_flush(struct fib_table *tb, int destroy)
+ {
+ struct fn_hash *table = (struct fn_hash *) tb->tb_data;
+ struct fn_zone *fz;
+@@ -670,11 +668,84 @@ static int fn_hash_flush(struct fib_tabl
+ int i;
+
+ for (i = fz->fz_divisor - 1; i >= 0; i--)
+- found += fn_flush_list(fz, i);
++ found += fn_flush_list(fz, i, destroy);
+ }
+ return found;
+ }
+
++static int fn_hash_flush(struct fib_table *tb)
++{
++ return __fn_hash_flush(tb, 0);
++}
++
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++void fib_hash_destroy(struct fib_table *tb)
++{
++ __fn_hash_flush(tb, 1);
++ kfree(tb);
++}
++
++/*
++ * Initialization of virtualized networking subsystem.
++ */
++int init_ve_route(struct ve_struct *ve)
++{
++#ifdef CONFIG_IP_MULTIPLE_TABLES
++ if (fib_rules_create())
++ return -ENOMEM;
++ ve->_fib_tables[RT_TABLE_LOCAL] = fib_hash_init(RT_TABLE_LOCAL);
++ if (!ve->_fib_tables[RT_TABLE_LOCAL])
++ goto out_destroy;
++ ve->_fib_tables[RT_TABLE_MAIN] = fib_hash_init(RT_TABLE_MAIN);
++ if (!ve->_fib_tables[RT_TABLE_MAIN])
++ goto out_destroy_local;
++
++ return 0;
++
++out_destroy_local:
++ fib_hash_destroy(ve->_fib_tables[RT_TABLE_LOCAL]);
++out_destroy:
++ fib_rules_destroy();
++ ve->_local_rule = NULL;
++ return -ENOMEM;
++#else
++ ve->_local_table = fib_hash_init(RT_TABLE_LOCAL);
++ if (!ve->_local_table)
++ return -ENOMEM;
++ ve->_main_table = fib_hash_init(RT_TABLE_MAIN);
++ if (!ve->_main_table) {
++ fib_hash_destroy(ve->_local_table);
++ return -ENOMEM;
++ }
++ return 0;
++#endif
++}
++
++void fini_ve_route(struct ve_struct *ve)
++{
++#ifdef CONFIG_IP_MULTIPLE_TABLES
++ int i;
++ for (i=0; i<RT_TABLE_MAX+1; i++)
++ {
++ if (!ve->_fib_tables[i])
++ continue;
++ fib_hash_destroy(ve->_fib_tables[i]);
++ }
++ fib_rules_destroy();
++ ve->_local_rule = NULL;
++#else
++ fib_hash_destroy(ve->_local_table);
++ fib_hash_destroy(ve->_main_table);
++#endif
++ fib_hash_free(ve->_fib_info_hash, ve->_fib_hash_size);
++ fib_hash_free(ve->_fib_info_laddrhash, ve->_fib_hash_size);
++ ve->_fib_info_hash = ve->_fib_info_laddrhash = NULL;
++}
++
++EXPORT_SYMBOL(init_ve_route);
++EXPORT_SYMBOL(fini_ve_route);
++#endif
++
+
+ static inline int
+ fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
+@@ -766,7 +837,7 @@ static int fn_hash_dump(struct fib_table
+ return skb->len;
+ }
+
+-#ifdef CONFIG_IP_MULTIPLE_TABLES
++#if defined(CONFIG_IP_MULTIPLE_TABLES) || defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
+ struct fib_table * fib_hash_init(int id)
+ #else
+ struct fib_table * __init fib_hash_init(int id)
+@@ -1076,13 +1147,13 @@ static struct file_operations fib_seq_fo
+
+ int __init fib_proc_init(void)
+ {
+- if (!proc_net_fops_create("route", S_IRUGO, &fib_seq_fops))
++ if (!proc_glob_fops_create("net/route", S_IRUGO, &fib_seq_fops))
+ return -ENOMEM;
+ return 0;
+ }
+
+ void __init fib_proc_exit(void)
+ {
+- proc_net_remove("route");
++ remove_proc_glob_entry("net/route", NULL);
+ }
+ #endif /* CONFIG_PROC_FS */
+diff -upr linux-2.6.16.orig/net/ipv4/fib_lookup.h linux-2.6.16-026test015/net/ipv4/fib_lookup.h
+--- linux-2.6.16.orig/net/ipv4/fib_lookup.h 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/fib_lookup.h 2006-07-04 14:41:38.000000000 +0400
+@@ -41,5 +41,6 @@ extern struct fib_alias *fib_find_alias(
+ extern int fib_detect_death(struct fib_info *fi, int order,
+ struct fib_info **last_resort,
+ int *last_idx, int *dflt);
++void fib_hash_free(struct hlist_head *hash, int bytes);
+
+ #endif /* _FIB_LOOKUP_H */
+diff -upr linux-2.6.16.orig/net/ipv4/fib_rules.c linux-2.6.16-026test015/net/ipv4/fib_rules.c
+--- linux-2.6.16.orig/net/ipv4/fib_rules.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/fib_rules.c 2006-07-04 14:41:39.000000000 +0400
+@@ -39,6 +39,7 @@
+ #include <linux/proc_fs.h>
+ #include <linux/skbuff.h>
+ #include <linux/netlink.h>
++#include <linux/rtnetlink.h>
+ #include <linux/init.h>
+
+ #include <net/ip.h>
+@@ -99,9 +100,89 @@ static struct fib_rule local_rule = {
+ .r_action = RTN_UNICAST,
+ };
+
+-static struct fib_rule *fib_rules = &local_rule;
+ static DEFINE_RWLOCK(fib_rules_lock);
+
++void __init prepare_fib_rules(void)
++{
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++ get_ve0()->_local_rule = &local_rule;
++ get_ve0()->_fib_rules = &local_rule;
++#endif
++}
++
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++#define local_rule (*(get_exec_env()->_local_rule))
++#define fib_rules (get_exec_env()->_fib_rules)
++#else
++static struct fib_rule *fib_rules = &local_rule;
++#endif
++
++#if defined(CONFIG_VE_CALLS) || defined(CONFIG_VE_CALLS_MODULE)
++int fib_rules_create()
++{
++ struct fib_rule *default_rule, *main_rule, *loc_rule;
++
++ default_rule = kmalloc(sizeof(struct fib_rule), GFP_KERNEL);
++ if (default_rule == NULL)
++ goto out_def;
++ memset(default_rule, 0, sizeof(struct fib_rule));
++ atomic_set(&default_rule->r_clntref, 1);
++ default_rule->r_preference = 0x7FFF;
++ default_rule->r_table = RT_TABLE_DEFAULT;
++ default_rule->r_action = RTN_UNICAST;
++
++ main_rule = kmalloc(sizeof(struct fib_rule), GFP_KERNEL);
++ if (main_rule == NULL)
++ goto out_main;
++ memset(main_rule, 0, sizeof(struct fib_rule));
++ atomic_set(&main_rule->r_clntref, 1);
++ main_rule->r_preference = 0x7FFE;
++ main_rule->r_table = RT_TABLE_MAIN;
++ main_rule->r_action = RTN_UNICAST;
++ main_rule->r_next = default_rule;
++
++ loc_rule = kmalloc(sizeof(struct fib_rule), GFP_KERNEL);
++ if (loc_rule == NULL)
++ goto out_loc;
++ memset(loc_rule, 0, sizeof(struct fib_rule));
++ atomic_set(&loc_rule->r_clntref, 1);
++ loc_rule->r_preference = 0;
++ loc_rule->r_table = RT_TABLE_LOCAL;
++ loc_rule->r_action = RTN_UNICAST;
++ loc_rule->r_next = main_rule;
++
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++ get_exec_env()->_local_rule = loc_rule;
++ get_exec_env()->_fib_rules = loc_rule;
++#endif
++
++ return 0;
++
++out_loc:
++ kfree(main_rule);
++out_main:
++ kfree(default_rule);
++out_def:
++ return -1;
++}
++
++void fib_rules_destroy()
++{
++ struct fib_rule *r;
++
++ rtnl_lock();
++ write_lock_bh(&fib_rules_lock);
++ while(fib_rules != NULL) {
++ r = fib_rules;
++ fib_rules = fib_rules->r_next;
++ r->r_dead = 1;
++ fib_rule_put(r);
++ }
++ write_unlock_bh(&fib_rules_lock);
++ rtnl_unlock();
++}
++#endif
++
+ int inet_rtm_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+ {
+ struct rtattr **rta = arg;
+@@ -435,5 +516,6 @@ int inet_dump_rules(struct sk_buff *skb,
+
+ void __init fib_rules_init(void)
+ {
++ prepare_fib_rules();
+ register_netdevice_notifier(&fib_rules_notifier);
+ }
+diff -upr linux-2.6.16.orig/net/ipv4/fib_semantics.c linux-2.6.16-026test015/net/ipv4/fib_semantics.c
+--- linux-2.6.16.orig/net/ipv4/fib_semantics.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/fib_semantics.c 2006-07-04 14:41:39.000000000 +0400
+@@ -33,6 +33,7 @@
+ #include <linux/netdevice.h>
+ #include <linux/if_arp.h>
+ #include <linux/proc_fs.h>
++#include <linux/ve.h>
+ #include <linux/skbuff.h>
+ #include <linux/netlink.h>
+ #include <linux/init.h>
+@@ -56,6 +57,24 @@ static struct hlist_head *fib_info_laddr
+ static unsigned int fib_hash_size;
+ static unsigned int fib_info_cnt;
+
++void prepare_fib_info(void)
++{
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++ get_ve0()->_fib_info_hash = fib_info_hash;
++ get_ve0()->_fib_info_laddrhash = fib_info_laddrhash;
++ get_ve0()->_fib_hash_size = fib_hash_size;
++ get_ve0()->_fib_info_cnt = fib_info_cnt;
++#endif
++}
++
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++#define fib_info_hash (get_exec_env()->_fib_info_hash)
++#define fib_info_laddrhash (get_exec_env()->_fib_info_laddrhash)
++#define fib_hash_size (get_exec_env()->_fib_hash_size)
++#define fib_info_cnt (get_exec_env()->_fib_info_cnt)
++#endif
++
++
+ #define DEVINDEX_HASHBITS 8
+ #define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
+ static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
+@@ -235,13 +254,15 @@ static struct fib_info *fib_find_info(co
+ return NULL;
+ }
+
+-static inline unsigned int fib_devindex_hashfn(unsigned int val)
++static inline unsigned int fib_devindex_hashfn(unsigned int val,
++ envid_t veid)
+ {
+ unsigned int mask = DEVINDEX_HASHSIZE - 1;
+
+ return (val ^
+ (val >> DEVINDEX_HASHBITS) ^
+- (val >> (DEVINDEX_HASHBITS * 2))) & mask;
++ (val >> (DEVINDEX_HASHBITS * 2)) ^
++ (veid ^ (veid >> 16))) & mask;
+ }
+
+ /* Check, that the gateway is already configured.
+@@ -257,7 +278,7 @@ int ip_fib_check_default(u32 gw, struct
+
+ read_lock(&fib_info_lock);
+
+- hash = fib_devindex_hashfn(dev->ifindex);
++ hash = fib_devindex_hashfn(dev->ifindex, VEID(dev->owner_env));
+ head = &fib_info_devhash[hash];
+ hlist_for_each_entry(nh, node, head, nh_hash) {
+ if (nh->nh_dev == dev &&
+@@ -580,7 +601,7 @@ static struct hlist_head *fib_hash_alloc
+ __get_free_pages(GFP_KERNEL, get_order(bytes));
+ }
+
+-static void fib_hash_free(struct hlist_head *hash, int bytes)
++void fib_hash_free(struct hlist_head *hash, int bytes)
+ {
+ if (!hash)
+ return;
+@@ -837,7 +858,8 @@ link_it:
+
+ if (!nh->nh_dev)
+ continue;
+- hash = fib_devindex_hashfn(nh->nh_dev->ifindex);
++ hash = fib_devindex_hashfn(nh->nh_dev->ifindex,
++ VEID(nh->nh_dev->owner_env));
+ head = &fib_info_devhash[hash];
+ hlist_add_head(&nh->nh_hash, head);
+ } endfor_nexthops(fi)
+@@ -1184,7 +1206,8 @@ int fib_sync_down(u32 local, struct net_
+
+ if (dev) {
+ struct fib_info *prev_fi = NULL;
+- unsigned int hash = fib_devindex_hashfn(dev->ifindex);
++ unsigned int hash = fib_devindex_hashfn(dev->ifindex,
++ VEID(dev->owner_env));
+ struct hlist_head *head = &fib_info_devhash[hash];
+ struct hlist_node *node;
+ struct fib_nh *nh;
+@@ -1249,7 +1272,7 @@ int fib_sync_up(struct net_device *dev)
+ return 0;
+
+ prev_fi = NULL;
+- hash = fib_devindex_hashfn(dev->ifindex);
++ hash = fib_devindex_hashfn(dev->ifindex, VEID(dev->owner_env));
+ head = &fib_info_devhash[hash];
+ ret = 0;
+
+diff -upr linux-2.6.16.orig/net/ipv4/fib_trie.c linux-2.6.16-026test015/net/ipv4/fib_trie.c
+--- linux-2.6.16.orig/net/ipv4/fib_trie.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/fib_trie.c 2006-07-04 14:41:36.000000000 +0400
+@@ -314,11 +314,6 @@ static void __leaf_free_rcu(struct rcu_h
+ kfree(container_of(head, struct leaf, rcu));
+ }
+
+-static inline void free_leaf(struct leaf *leaf)
+-{
+- call_rcu(&leaf->rcu, __leaf_free_rcu);
+-}
+-
+ static void __leaf_info_free_rcu(struct rcu_head *head)
+ {
+ kfree(container_of(head, struct leaf_info, rcu));
+@@ -357,7 +352,12 @@ static void __tnode_free_rcu(struct rcu_
+
+ static inline void tnode_free(struct tnode *tn)
+ {
+- call_rcu(&tn->rcu, __tnode_free_rcu);
++ if(IS_LEAF(tn)) {
++ struct leaf *l = (struct leaf *) tn;
++ call_rcu_bh(&l->rcu, __leaf_free_rcu);
++ }
++ else
++ call_rcu(&tn->rcu, __tnode_free_rcu);
+ }
+
+ static struct leaf *leaf_new(void)
+diff -upr linux-2.6.16.orig/net/ipv4/igmp.c linux-2.6.16-026test015/net/ipv4/igmp.c
+--- linux-2.6.16.orig/net/ipv4/igmp.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/igmp.c 2006-07-04 14:41:39.000000000 +0400
+@@ -2262,6 +2262,8 @@ static inline struct ip_mc_list *igmp_mc
+ state->dev;
+ state->dev = state->dev->next) {
+ struct in_device *in_dev;
++ if (unlikely(!ve_accessible_strict(state->dev->owner_env, get_exec_env())))
++ continue;
+ in_dev = in_dev_get(state->dev);
+ if (!in_dev)
+ continue;
+@@ -2291,6 +2293,8 @@ static struct ip_mc_list *igmp_mc_get_ne
+ state->in_dev = NULL;
+ break;
+ }
++ if (unlikely(!ve_accessible_strict(state->dev->owner_env, get_exec_env())))
++ continue;
+ state->in_dev = in_dev_get(state->dev);
+ if (!state->in_dev)
+ continue;
+@@ -2425,6 +2429,8 @@ static inline struct ip_sf_list *igmp_mc
+ state->dev;
+ state->dev = state->dev->next) {
+ struct in_device *idev;
++ if (unlikely(!ve_accessible_strict(state->dev->owner_env, get_exec_env())))
++ continue;
+ idev = in_dev_get(state->dev);
+ if (unlikely(idev == NULL))
+ continue;
+@@ -2464,6 +2470,8 @@ static struct ip_sf_list *igmp_mcf_get_n
+ state->idev = NULL;
+ goto out;
+ }
++ if (unlikely(!ve_accessible_strict(state->dev->owner_env, get_exec_env())))
++ continue;
+ state->idev = in_dev_get(state->dev);
+ if (!state->idev)
+ continue;
+@@ -2584,8 +2592,8 @@ static struct file_operations igmp_mcf_s
+
+ int __init igmp_mc_proc_init(void)
+ {
+- proc_net_fops_create("igmp", S_IRUGO, &igmp_mc_seq_fops);
+- proc_net_fops_create("mcfilter", S_IRUGO, &igmp_mcf_seq_fops);
++ proc_glob_fops_create("net/igmp", S_IRUGO, &igmp_mc_seq_fops);
++ proc_glob_fops_create("net/mcfilter", S_IRUGO, &igmp_mcf_seq_fops);
+ return 0;
+ }
+ #endif
+diff -upr linux-2.6.16.orig/net/ipv4/inet_connection_sock.c linux-2.6.16-026test015/net/ipv4/inet_connection_sock.c
+--- linux-2.6.16.orig/net/ipv4/inet_connection_sock.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/inet_connection_sock.c 2006-07-04 14:41:38.000000000 +0400
+@@ -25,6 +25,9 @@
+ #include <net/tcp_states.h>
+ #include <net/xfrm.h>
+
++#include <ub/ub_net.h>
++#include <ub/ub_orphan.h>
++
+ #ifdef INET_CSK_DEBUG
+ const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
+ EXPORT_SYMBOL(inet_csk_timer_bug_msg);
+@@ -48,6 +51,7 @@ int inet_csk_bind_conflict(const struct
+ sk_for_each_bound(sk2, node, &tb->owners) {
+ if (sk != sk2 &&
+ !inet_v6_ipv6only(sk2) &&
++ !ve_accessible_strict(VE_OWNER_SK(sk), VE_OWNER_SK(sk2)) &&
+ (!sk->sk_bound_dev_if ||
+ !sk2->sk_bound_dev_if ||
+ sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
+@@ -77,7 +81,9 @@ int inet_csk_get_port(struct inet_hashin
+ struct hlist_node *node;
+ struct inet_bind_bucket *tb;
+ int ret;
++ struct ve_struct *env;
+
++ env = VE_OWNER_SK(sk);
+ local_bh_disable();
+ if (!snum) {
+ int low = sysctl_local_port_range[0];
+@@ -86,11 +92,15 @@ int inet_csk_get_port(struct inet_hashin
+ int rover = net_random() % (high - low) + low;
+
+ do {
+- head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)];
++ head = &hashinfo->bhash[inet_bhashfn(rover,
++ hashinfo->bhash_size, VEID(env))];
+ spin_lock(&head->lock);
+- inet_bind_bucket_for_each(tb, node, &head->chain)
++ inet_bind_bucket_for_each(tb, node, &head->chain) {
++ if (!ve_accessible_strict(VE_OWNER_TB(tb),env))
++ continue;
+ if (tb->port == rover)
+ goto next;
++ }
+ break;
+ next:
+ spin_unlock(&head->lock);
+@@ -113,11 +123,15 @@ int inet_csk_get_port(struct inet_hashin
+ */
+ snum = rover;
+ } else {
+- head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)];
++ head = &hashinfo->bhash[inet_bhashfn(snum,
++ hashinfo->bhash_size, VEID(env))];
+ spin_lock(&head->lock);
+- inet_bind_bucket_for_each(tb, node, &head->chain)
++ inet_bind_bucket_for_each(tb, node, &head->chain) {
++ if (!ve_accessible_strict(VE_OWNER_TB(tb), env))
++ continue;
+ if (tb->port == snum)
+ goto tb_found;
++ }
+ }
+ tb = NULL;
+ goto tb_not_found;
+@@ -136,7 +150,7 @@ tb_found:
+ }
+ tb_not_found:
+ ret = 1;
+- if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, head, snum)) == NULL)
++ if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, head, snum, env)) == NULL)
+ goto fail_unlock;
+ if (hlist_empty(&tb->owners)) {
+ if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
+@@ -541,7 +555,7 @@ void inet_csk_destroy_sock(struct sock *
+
+ sk_refcnt_debug_release(sk);
+
+- atomic_dec(sk->sk_prot->orphan_count);
++ ub_dec_orphan_count(sk);
+ sock_put(sk);
+ }
+
+@@ -621,7 +635,7 @@ void inet_csk_listen_stop(struct sock *s
+
+ sock_orphan(child);
+
+- atomic_inc(sk->sk_prot->orphan_count);
++ ub_inc_orphan_count(sk);
+
+ inet_csk_destroy_sock(child);
+
+diff -upr linux-2.6.16.orig/net/ipv4/inet_diag.c linux-2.6.16-026test015/net/ipv4/inet_diag.c
+--- linux-2.6.16.orig/net/ipv4/inet_diag.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/inet_diag.c 2006-07-04 14:41:38.000000000 +0400
+@@ -673,7 +673,9 @@ static int inet_diag_dump(struct sk_buff
+ struct inet_diag_req *r = NLMSG_DATA(cb->nlh);
+ const struct inet_diag_handler *handler;
+ struct inet_hashinfo *hashinfo;
++ struct ve_struct *ve;
+
++ ve = get_exec_env();
+ handler = inet_diag_table[cb->nlh->nlmsg_type];
+ BUG_ON(handler == NULL);
+ hashinfo = handler->idiag_hashinfo;
+@@ -694,6 +696,8 @@ static int inet_diag_dump(struct sk_buff
+ sk_for_each(sk, node, &hashinfo->listening_hash[i]) {
+ struct inet_sock *inet = inet_sk(sk);
+
++ if (!ve_accessible(VE_OWNER_SK(sk), ve))
++ continue;
+ if (num < s_num) {
+ num++;
+ continue;
+@@ -754,6 +758,8 @@ skip_listen_ht:
+ sk_for_each(sk, node, &head->chain) {
+ struct inet_sock *inet = inet_sk(sk);
+
++ if (!ve_accessible(VE_OWNER_SK(sk), ve))
++ continue;
+ if (num < s_num)
+ goto next_normal;
+ if (!(r->idiag_states & (1 << sk->sk_state)))
+@@ -778,6 +784,8 @@ next_normal:
+ inet_twsk_for_each(tw, node,
+ &hashinfo->ehash[i + hashinfo->ehash_size].chain) {
+
++ if (!ve_accessible_veid(inet_twsk(sk)->tw_owner_env, VEID(ve)))
++ continue;
+ if (num < s_num)
+ goto next_dying;
+ if (r->id.idiag_sport != tw->tw_sport &&
+diff -upr linux-2.6.16.orig/net/ipv4/inet_hashtables.c linux-2.6.16-026test015/net/ipv4/inet_hashtables.c
+--- linux-2.6.16.orig/net/ipv4/inet_hashtables.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/inet_hashtables.c 2006-07-04 14:41:38.000000000 +0400
+@@ -30,7 +30,8 @@
+ */
+ struct inet_bind_bucket *inet_bind_bucket_create(kmem_cache_t *cachep,
+ struct inet_bind_hashbucket *head,
+- const unsigned short snum)
++ const unsigned short snum,
++ struct ve_struct *ve)
+ {
+ struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, SLAB_ATOMIC);
+
+@@ -38,6 +39,7 @@ struct inet_bind_bucket *inet_bind_bucke
+ tb->port = snum;
+ tb->fastreuse = 0;
+ INIT_HLIST_HEAD(&tb->owners);
++ SET_VE_OWNER_TB(tb, ve);
+ hlist_add_head(&tb->node, &head->chain);
+ }
+ return tb;
+@@ -71,10 +73,13 @@ EXPORT_SYMBOL(inet_bind_hash);
+ */
+ static void __inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk)
+ {
+- const int bhash = inet_bhashfn(inet_sk(sk)->num, hashinfo->bhash_size);
+- struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
++ int bhash;
++ struct inet_bind_hashbucket *head;
+ struct inet_bind_bucket *tb;
+
++ bhash = inet_bhashfn(inet_sk(sk)->num, hashinfo->bhash_size,
++ VEID(VE_OWNER_SK(sk)));
++ head = &hashinfo->bhash[bhash];
+ spin_lock(&head->lock);
+ tb = inet_csk(sk)->icsk_bind_hash;
+ __sk_del_bind_node(sk);
+@@ -130,7 +135,8 @@ EXPORT_SYMBOL(inet_listen_wlock);
+ * wildcarded during the search since they can never be otherwise.
+ */
+ struct sock *__inet_lookup_listener(const struct hlist_head *head, const u32 daddr,
+- const unsigned short hnum, const int dif)
++ const unsigned short hnum, const int dif,
++ struct ve_struct *env)
+ {
+ struct sock *result = NULL, *sk;
+ const struct hlist_node *node;
+@@ -139,6 +145,8 @@ struct sock *__inet_lookup_listener(cons
+ sk_for_each(sk, node, head) {
+ const struct inet_sock *inet = inet_sk(sk);
+
++ if (!ve_accessible_strict(VE_OWNER_SK(sk), env))
++ continue;
+ if (inet->num == hnum && !ipv6_only_sock(sk)) {
+ const __u32 rcv_saddr = inet->rcv_saddr;
+ int score = sk->sk_family == PF_INET ? 1 : 0;
+@@ -169,7 +177,8 @@ EXPORT_SYMBOL_GPL(__inet_lookup_listener
+ /* called with local bh disabled */
+ static int __inet_check_established(struct inet_timewait_death_row *death_row,
+ struct sock *sk, __u16 lport,
+- struct inet_timewait_sock **twp)
++ struct inet_timewait_sock **twp,
++ struct ve_struct *ve)
+ {
+ struct inet_hashinfo *hinfo = death_row->hashinfo;
+ struct inet_sock *inet = inet_sk(sk);
+@@ -178,12 +187,15 @@ static int __inet_check_established(stru
+ int dif = sk->sk_bound_dev_if;
+ INET_ADDR_COOKIE(acookie, saddr, daddr)
+ const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
+- unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
+- struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
++ unsigned int hash;
++ struct inet_ehash_bucket *head;
+ struct sock *sk2;
+ const struct hlist_node *node;
+ struct inet_timewait_sock *tw;
+
++ hash = inet_ehashfn(daddr, lport, saddr, inet->dport, VEID(ve));
++ head = inet_ehash_bucket(hinfo, hash);
++
+ prefetch(head->chain.first);
+ write_lock(&head->lock);
+
+@@ -191,7 +203,8 @@ static int __inet_check_established(stru
+ sk_for_each(sk2, node, &(head + hinfo->ehash_size)->chain) {
+ tw = inet_twsk(sk2);
+
+- if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) {
++ if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr,
++ ports, dif, ve)) {
+ if (twsk_unique(sk, sk2, twp))
+ goto unique;
+ else
+@@ -202,7 +215,8 @@ static int __inet_check_established(stru
+
+ /* And established part... */
+ sk_for_each(sk2, node, &head->chain) {
+- if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif))
++ if (INET_MATCH(sk2, hash, acookie, saddr, daddr,
++ ports, dif, ve))
+ goto not_unique;
+ }
+
+@@ -253,7 +267,9 @@ int inet_hash_connect(struct inet_timewa
+ struct inet_bind_hashbucket *head;
+ struct inet_bind_bucket *tb;
+ int ret;
++ struct ve_struct *ve;
+
++ ve = VE_OWNER_SK(sk);
+ if (!snum) {
+ int low = sysctl_local_port_range[0];
+ int high = sysctl_local_port_range[1];
+@@ -268,7 +284,8 @@ int inet_hash_connect(struct inet_timewa
+ local_bh_disable();
+ for (i = 1; i <= range; i++) {
+ port = low + (i + offset) % range;
+- head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)];
++ head = &hinfo->bhash[inet_bhashfn(port,
++ hinfo->bhash_size, VEID(ve))];
+ spin_lock(&head->lock);
+
+ /* Does not bother with rcv_saddr checks,
+@@ -282,13 +299,14 @@ int inet_hash_connect(struct inet_timewa
+ goto next_port;
+ if (!__inet_check_established(death_row,
+ sk, port,
+- &tw))
++ &tw, ve))
+ goto ok;
+ goto next_port;
+ }
+ }
+
+- tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, head, port);
++ tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
++ head, port, ve);
+ if (!tb) {
+ spin_unlock(&head->lock);
+ break;
+@@ -323,7 +341,7 @@ ok:
+ goto out;
+ }
+
+- head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)];
++ head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size, VEID(ve))];
+ tb = inet_csk(sk)->icsk_bind_hash;
+ spin_lock_bh(&head->lock);
+ if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
+@@ -333,7 +351,7 @@ ok:
+ } else {
+ spin_unlock(&head->lock);
+ /* No definite answer... Walk to established hash table */
+- ret = __inet_check_established(death_row, sk, snum, NULL);
++ ret = __inet_check_established(death_row, sk, snum, NULL, ve);
+ out:
+ local_bh_enable();
+ return ret;
+diff -upr linux-2.6.16.orig/net/ipv4/inet_timewait_sock.c linux-2.6.16-026test015/net/ipv4/inet_timewait_sock.c
+--- linux-2.6.16.orig/net/ipv4/inet_timewait_sock.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/inet_timewait_sock.c 2006-07-04 14:41:38.000000000 +0400
+@@ -32,7 +32,8 @@ void __inet_twsk_kill(struct inet_timewa
+ write_unlock(&ehead->lock);
+
+ /* Disassociate with bind bucket. */
+- bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, hashinfo->bhash_size)];
++ bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num,
++ hashinfo->bhash_size, tw->tw_owner_env)];
+ spin_lock(&bhead->lock);
+ tb = tw->tw_tb;
+ __hlist_del(&tw->tw_bind_node);
+@@ -66,7 +67,8 @@ void __inet_twsk_hashdance(struct inet_t
+ Note, that any socket with inet->num != 0 MUST be bound in
+ binding cache, even if it is closed.
+ */
+- bhead = &hashinfo->bhash[inet_bhashfn(inet->num, hashinfo->bhash_size)];
++ bhead = &hashinfo->bhash[inet_bhashfn(inet->num,
++ hashinfo->bhash_size, tw->tw_owner_env)];
+ spin_lock(&bhead->lock);
+ tw->tw_tb = icsk->icsk_bind_hash;
+ BUG_TRAP(icsk->icsk_bind_hash);
+@@ -90,9 +92,14 @@ EXPORT_SYMBOL_GPL(__inet_twsk_hashdance)
+
+ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state)
+ {
+- struct inet_timewait_sock *tw =
+- kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab,
+- SLAB_ATOMIC);
++ struct user_beancounter *ub;
++ struct inet_timewait_sock *tw;
++
++ ub = set_exec_ub(sock_bc(sk)->ub);
++ tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab,
++ SLAB_ATOMIC);
++ (void)set_exec_ub(ub);
++
+ if (tw != NULL) {
+ const struct inet_sock *inet = inet_sk(sk);
+
+diff -upr linux-2.6.16.orig/net/ipv4/ip_forward.c linux-2.6.16-026test015/net/ipv4/ip_forward.c
+--- linux-2.6.16.orig/net/ipv4/ip_forward.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/ip_forward.c 2006-07-04 14:41:38.000000000 +0400
+@@ -87,6 +87,24 @@ int ip_forward(struct sk_buff *skb)
+ if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
+ goto sr_failed;
+
++ /*
++ * We try to optimize forwarding of VE packets:
++ * do not decrement TTL (and so save skb_cow)
++ * during forwarding of outgoing pkts from VE.
++ * For incoming pkts we still do ttl decr,
++ * since such skb is not cloned and does not require
++ * actual cow. So, there is at least one place
++ * in pkts path with mandatory ttl decr, that is
++ * sufficient to prevent routing loops.
++ */
++ iph = skb->nh.iph;
++ if (
++#ifdef CONFIG_IP_ROUTE_NAT
++ (rt->rt_flags & RTCF_NAT) == 0 && /* no NAT mangling expected */
++#endif /* and */
++ (skb->dev->features & NETIF_F_VENET)) /* src is VENET device */
++ goto no_ttl_decr;
++
+ /* We are about to mangle packet. Copy it! */
+ if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+rt->u.dst.header_len))
+ goto drop;
+@@ -95,6 +113,8 @@ int ip_forward(struct sk_buff *skb)
+ /* Decrease ttl after skb cow done */
+ ip_decrease_ttl(iph);
+
++no_ttl_decr:
++
+ /*
+ * We now generate an ICMP HOST REDIRECT giving the route
+ * we calculated.
+diff -upr linux-2.6.16.orig/net/ipv4/ip_fragment.c linux-2.6.16-026test015/net/ipv4/ip_fragment.c
+--- linux-2.6.16.orig/net/ipv4/ip_fragment.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/ip_fragment.c 2006-07-04 14:41:38.000000000 +0400
+@@ -44,6 +44,7 @@
+ #include <linux/udp.h>
+ #include <linux/inet.h>
+ #include <linux/netfilter_ipv4.h>
++#include <linux/ve_owner.h>
+
+ /* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6
+ * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c
+@@ -97,8 +98,12 @@ struct ipq {
+ int iif;
+ unsigned int rid;
+ struct inet_peer *peer;
++ struct ve_struct *owner_env;
+ };
+
++DCL_VE_OWNER_PROTO(IPQ, struct ipq, owner_env)
++DCL_VE_OWNER(IPQ, struct ipq, owner_env)
++
+ /* Hash table. */
+
+ #define IPQ_HASHSZ 64
+@@ -182,7 +187,8 @@ static __inline__ void frag_free_queue(s
+
+ static __inline__ struct ipq *frag_alloc_queue(void)
+ {
+- struct ipq *qp = kmalloc(sizeof(struct ipq), GFP_ATOMIC);
++ struct ipq *qp = kmalloc(sizeof(struct ipq) + sizeof(void *),
++ GFP_ATOMIC);
+
+ if(!qp)
+ return NULL;
+@@ -278,6 +284,9 @@ static void ip_evictor(void)
+ static void ip_expire(unsigned long arg)
+ {
+ struct ipq *qp = (struct ipq *) arg;
++ struct ve_struct *envid;
++
++ envid = set_exec_env(VE_OWNER_IPQ(qp));
+
+ spin_lock(&qp->lock);
+
+@@ -300,6 +309,8 @@ static void ip_expire(unsigned long arg)
+ out:
+ spin_unlock(&qp->lock);
+ ipq_put(qp, NULL);
++
++ (void)set_exec_env(envid);
+ }
+
+ /* Creation primitives. */
+@@ -321,7 +332,8 @@ static struct ipq *ip_frag_intern(unsign
+ qp->saddr == qp_in->saddr &&
+ qp->daddr == qp_in->daddr &&
+ qp->protocol == qp_in->protocol &&
+- qp->user == qp_in->user) {
++ qp->user == qp_in->user &&
++ qp->owner_env == get_exec_env()) {
+ atomic_inc(&qp->refcnt);
+ write_unlock(&ipfrag_lock);
+ qp_in->last_in |= COMPLETE;
+@@ -371,6 +383,8 @@ static struct ipq *ip_frag_create(unsign
+ spin_lock_init(&qp->lock);
+ atomic_set(&qp->refcnt, 1);
+
++ SET_VE_OWNER_IPQ(qp, get_exec_env());
++
+ return ip_frag_intern(hash, qp);
+
+ out_nomem:
+@@ -397,7 +411,8 @@ static inline struct ipq *ip_find(struct
+ qp->saddr == saddr &&
+ qp->daddr == daddr &&
+ qp->protocol == protocol &&
+- qp->user == user) {
++ qp->user == user &&
++ qp->owner_env == get_exec_env()) {
+ atomic_inc(&qp->refcnt);
+ read_unlock(&ipfrag_lock);
+ return qp;
+@@ -719,6 +734,9 @@ struct sk_buff *ip_defrag(struct sk_buff
+ qp->meat == qp->len)
+ ret = ip_frag_reasm(qp, dev);
+
++ if (ret)
++ SET_VE_OWNER_SKB(ret, VE_OWNER_SKB(skb));
++
+ spin_unlock(&qp->lock);
+ ipq_put(qp, NULL);
+ return ret;
+@@ -729,6 +747,51 @@ struct sk_buff *ip_defrag(struct sk_buff
+ return NULL;
+ }
+
++#ifdef CONFIG_VE
++/* XXX */
++void ip_fragment_cleanup(struct ve_struct *envid)
++{
++ int i, progress;
++
++ /* All operations with fragment queues are performed from NET_RX/TX
++ * soft interrupts or from timer context. --Den */
++ local_bh_disable();
++ do {
++ progress = 0;
++ for (i = 0; i < IPQ_HASHSZ; i++) {
++ struct ipq *qp;
++ struct hlist_node *p, *n;
++
++ if (hlist_empty(&ipq_hash[i]))
++ continue;
++inner_restart:
++ read_lock(&ipfrag_lock);
++ hlist_for_each_entry_safe(qp, p, n,
++ &ipq_hash[i], list) {
++ if (!ve_accessible_strict(
++ VE_OWNER_IPQ(qp),
++ envid))
++ continue;
++ atomic_inc(&qp->refcnt);
++ read_unlock(&ipfrag_lock);
++
++ spin_lock(&qp->lock);
++ if (!(qp->last_in&COMPLETE))
++ ipq_kill(qp);
++ spin_unlock(&qp->lock);
++
++ ipq_put(qp, NULL);
++ progress = 1;
++ goto inner_restart;
++ }
++ read_unlock(&ipfrag_lock);
++ }
++ } while(progress);
++ local_bh_enable();
++}
++EXPORT_SYMBOL(ip_fragment_cleanup);
++#endif
++
+ void ipfrag_init(void)
+ {
+ ipfrag_hash_rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^
+diff -upr linux-2.6.16.orig/net/ipv4/ip_output.c linux-2.6.16-026test015/net/ipv4/ip_output.c
+--- linux-2.6.16.orig/net/ipv4/ip_output.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/ip_output.c 2006-07-04 14:41:37.000000000 +0400
+@@ -86,8 +86,6 @@
+
+ int sysctl_ip_default_ttl = IPDEFTTL;
+
+-static int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*));
+-
+ /* Generate a checksum for an outgoing IP datagram. */
+ __inline__ void ip_send_check(struct iphdr *iph)
+ {
+@@ -421,7 +419,7 @@ static void ip_copy_metadata(struct sk_b
+ * single device frame, and queue such a frame for sending.
+ */
+
+-static int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
++int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
+ {
+ struct iphdr *iph;
+ int raw = 0;
+@@ -673,6 +671,8 @@ fail:
+ return err;
+ }
+
++EXPORT_SYMBOL(ip_fragment);
++
+ int
+ ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
+ {
+@@ -1249,11 +1249,7 @@ int ip_push_pending_frames(struct sock *
+ iph->tos = inet->tos;
+ iph->tot_len = htons(skb->len);
+ iph->frag_off = df;
+- if (!df) {
+- __ip_select_ident(iph, &rt->u.dst, 0);
+- } else {
+- iph->id = htons(inet->id++);
+- }
++ ip_select_ident(iph, &rt->u.dst, sk);
+ iph->ttl = ttl;
+ iph->protocol = sk->sk_protocol;
+ iph->saddr = rt->rt_src;
+@@ -1340,12 +1336,13 @@ void ip_send_reply(struct sock *sk, stru
+ char data[40];
+ } replyopts;
+ struct ipcm_cookie ipc;
+- u32 daddr;
++ u32 saddr, daddr;
+ struct rtable *rt = (struct rtable*)skb->dst;
+
+ if (ip_options_echo(&replyopts.opt, skb))
+ return;
+
++ saddr = skb->nh.iph->daddr;
+ daddr = ipc.addr = rt->rt_src;
+ ipc.opt = NULL;
+
+@@ -1359,7 +1356,7 @@ void ip_send_reply(struct sock *sk, stru
+ {
+ struct flowi fl = { .nl_u = { .ip4_u =
+ { .daddr = daddr,
+- .saddr = rt->rt_spec_dst,
++ .saddr = saddr,
+ .tos = RT_TOS(skb->nh.iph->tos) } },
+ /* Not quite clean, but right. */
+ .uli_u = { .ports =
+diff -upr linux-2.6.16.orig/net/ipv4/ipmr.c linux-2.6.16-026test015/net/ipv4/ipmr.c
+--- linux-2.6.16.orig/net/ipv4/ipmr.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/ipmr.c 2006-07-04 14:41:38.000000000 +0400
+@@ -837,7 +837,7 @@ static void mrtsock_destruct(struct sock
+ {
+ rtnl_lock();
+ if (sk == mroute_socket) {
+- ipv4_devconf.mc_forwarding--;
++ ve_ipv4_devconf.mc_forwarding--;
+
+ write_lock_bh(&mrt_lock);
+ mroute_socket=NULL;
+@@ -888,7 +888,7 @@ int ip_mroute_setsockopt(struct sock *sk
+ mroute_socket=sk;
+ write_unlock_bh(&mrt_lock);
+
+- ipv4_devconf.mc_forwarding++;
++ ve_ipv4_devconf.mc_forwarding++;
+ }
+ rtnl_unlock();
+ return ret;
+diff -upr linux-2.6.16.orig/net/ipv4/ipvs/ip_vs_conn.c linux-2.6.16-026test015/net/ipv4/ipvs/ip_vs_conn.c
+--- linux-2.6.16.orig/net/ipv4/ipvs/ip_vs_conn.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/ipvs/ip_vs_conn.c 2006-07-04 14:41:37.000000000 +0400
+@@ -902,7 +902,8 @@ int ip_vs_conn_init(void)
+ /* Allocate ip_vs_conn slab cache */
+ ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
+ sizeof(struct ip_vs_conn), 0,
+- SLAB_HWCACHE_ALIGN, NULL, NULL);
++ SLAB_HWCACHE_ALIGN | SLAB_UBC,
++ NULL, NULL);
+ if (!ip_vs_conn_cachep) {
+ vfree(ip_vs_conn_tab);
+ return -ENOMEM;
+diff -upr linux-2.6.16.orig/net/ipv4/ipvs/ip_vs_core.c linux-2.6.16-026test015/net/ipv4/ipvs/ip_vs_core.c
+--- linux-2.6.16.orig/net/ipv4/ipvs/ip_vs_core.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/ipvs/ip_vs_core.c 2006-07-04 14:41:38.000000000 +0400
+@@ -952,6 +952,10 @@ ip_vs_in(unsigned int hooknum, struct sk
+ * Big tappo: only PACKET_HOST (neither loopback nor mcasts)
+ * ... don't know why 1st test DOES NOT include 2nd (?)
+ */
++ /*
++ * VZ: the question above is right.
++ * The second test is superfluous.
++ */
+ if (unlikely(skb->pkt_type != PACKET_HOST
+ || skb->dev == &loopback_dev || skb->sk)) {
+ IP_VS_DBG(12, "packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n",
+diff -upr linux-2.6.16.orig/net/ipv4/netfilter/arp_tables.c linux-2.6.16-026test015/net/ipv4/netfilter/arp_tables.c
+--- linux-2.6.16.orig/net/ipv4/netfilter/arp_tables.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/netfilter/arp_tables.c 2006-07-04 14:41:36.000000000 +0400
+@@ -941,7 +941,7 @@ static int do_add_counters(void __user *
+
+ write_lock_bh(&t->lock);
+ private = t->private;
+- if (private->number != paddc->num_counters) {
++ if (private->number != tmp.num_counters) {
+ ret = -EINVAL;
+ goto unlock_up_free;
+ }
+diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_core.c linux-2.6.16-026test015/net/ipv4/netfilter/ip_conntrack_core.c
+--- linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_core.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/netfilter/ip_conntrack_core.c 2006-07-04 14:41:39.000000000 +0400
+@@ -49,6 +49,7 @@
+ #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+ #include <linux/netfilter_ipv4/ip_conntrack_core.h>
+ #include <linux/netfilter_ipv4/listhelp.h>
++#include <ub/ub_mem.h>
+
+ #define IP_CONNTRACK_VERSION "2.4"
+
+@@ -60,22 +61,41 @@
+
+ DEFINE_RWLOCK(ip_conntrack_lock);
+
+-/* ip_conntrack_standalone needs this */
+-atomic_t ip_conntrack_count = ATOMIC_INIT(0);
++#ifdef CONFIG_VE_IPTABLES
++#define ve_ip_conntrack_helpers \
++ (get_exec_env()->_ip_conntrack->_ip_conntrack_helpers)
++#define ve_ip_conntrack_max \
++ (get_exec_env()->_ip_conntrack->_ip_conntrack_max)
++#define ve_ip_conntrack_count \
++ (get_exec_env()->_ip_conntrack->_ip_conntrack_count)
++#define ve_ip_conntrack_unconfirmed \
++ (get_exec_env()->_ip_conntrack->_ip_conntrack_unconfirmed)
++#else
+
+ void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
+ LIST_HEAD(ip_conntrack_expect_list);
+ struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
+ static LIST_HEAD(helpers);
++struct list_head *ip_conntrack_hash;
++static LIST_HEAD(unconfirmed);
++#define ve_ip_conntrack_count ip_conntrack_count
++#define ve_ip_conntrack_helpers helpers
++#define ve_ip_conntrack_max ip_conntrack_max
++#define ve_ip_conntrack_unconfirmed unconfirmed
++#endif
++
++/* ip_conntrack_standalone needs this */
++atomic_t ip_conntrack_count = ATOMIC_INIT(0);
++
+ unsigned int ip_conntrack_htable_size = 0;
+ int ip_conntrack_max;
+-struct list_head *ip_conntrack_hash;
+ static kmem_cache_t *ip_conntrack_cachep __read_mostly;
+ static kmem_cache_t *ip_conntrack_expect_cachep __read_mostly;
+ struct ip_conntrack ip_conntrack_untracked;
+ unsigned int ip_ct_log_invalid;
+-static LIST_HEAD(unconfirmed);
++#ifndef CONFIG_VE_IPTABLES
+ static int ip_conntrack_vmalloc;
++#endif
+
+ static unsigned int ip_conntrack_next_id = 1;
+ static unsigned int ip_conntrack_expect_next_id = 1;
+@@ -105,6 +125,9 @@ void ip_ct_deliver_cached_events(const s
+ {
+ struct ip_conntrack_ecache *ecache;
+
++ if (!ve_is_super(get_exec_env()))
++ return;
++
+ local_bh_disable();
+ ecache = &__get_cpu_var(ip_conntrack_ecache);
+ if (ecache->ct == ct)
+@@ -133,6 +156,9 @@ static void ip_ct_event_cache_flush(void
+ struct ip_conntrack_ecache *ecache;
+ int cpu;
+
++ if (!ve_is_super(get_exec_env()))
++ return;
++
+ for_each_cpu(cpu) {
+ ecache = &per_cpu(ip_conntrack_ecache, cpu);
+ if (ecache->ct)
+@@ -226,7 +252,7 @@ __ip_conntrack_expect_find(const struct
+ {
+ struct ip_conntrack_expect *i;
+
+- list_for_each_entry(i, &ip_conntrack_expect_list, list) {
++ list_for_each_entry(i, &ve_ip_conntrack_expect_list, list) {
+ if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
+ atomic_inc(&i->use);
+ return i;
+@@ -255,7 +281,7 @@ find_expectation(const struct ip_conntra
+ {
+ struct ip_conntrack_expect *i;
+
+- list_for_each_entry(i, &ip_conntrack_expect_list, list) {
++ list_for_each_entry(i, &ve_ip_conntrack_expect_list, list) {
+ /* If master is not in hash table yet (ie. packet hasn't left
+ this machine yet), how can other end know about expected?
+ Hence these are not the droids you are looking for (if
+@@ -284,7 +310,7 @@ void ip_ct_remove_expectations(struct ip
+ if (ct->expecting == 0)
+ return;
+
+- list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
++ list_for_each_entry_safe(i, tmp, &ve_ip_conntrack_expect_list, list) {
+ if (i->master == ct && del_timer(&i->timeout)) {
+ ip_ct_unlink_expect(i);
+ ip_conntrack_expect_put(i);
+@@ -302,8 +328,10 @@ clean_from_lists(struct ip_conntrack *ct
+
+ ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+ hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+- LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
+- LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
++ LIST_DELETE(&ve_ip_conntrack_hash[ho],
++ &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
++ LIST_DELETE(&ve_ip_conntrack_hash[hr],
++ &ct->tuplehash[IP_CT_DIR_REPLY]);
+
+ /* Destroy all pending expectations */
+ ip_ct_remove_expectations(ct);
+@@ -329,8 +357,8 @@ destroy_conntrack(struct nf_conntrack *n
+ if (proto && proto->destroy)
+ proto->destroy(ct);
+
+- if (ip_conntrack_destroyed)
+- ip_conntrack_destroyed(ct);
++ if (ve_ip_conntrack_destroyed)
++ ve_ip_conntrack_destroyed(ct);
+
+ write_lock_bh(&ip_conntrack_lock);
+ /* Expectations will have been removed in clean_from_lists,
+@@ -358,7 +386,11 @@ destroy_conntrack(struct nf_conntrack *n
+ static void death_by_timeout(unsigned long ul_conntrack)
+ {
+ struct ip_conntrack *ct = (void *)ul_conntrack;
++#ifdef CONFIG_VE_IPTABLES
++ struct ve_struct *old;
+
++ old = set_exec_env(VE_OWNER_CT(ct));
++#endif
+ write_lock_bh(&ip_conntrack_lock);
+ /* Inside lock so preempt is disabled on module removal path.
+ * Otherwise we can get spurious warnings. */
+@@ -366,6 +398,9 @@ static void death_by_timeout(unsigned lo
+ clean_from_lists(ct);
+ write_unlock_bh(&ip_conntrack_lock);
+ ip_conntrack_put(ct);
++#ifdef CONFIG_VE_IPTABLES
++ (void)set_exec_env(old);
++#endif
+ }
+
+ static inline int
+@@ -386,7 +421,7 @@ __ip_conntrack_find(const struct ip_conn
+ unsigned int hash = hash_conntrack(tuple);
+
+ ASSERT_READ_LOCK(&ip_conntrack_lock);
+- list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
++ list_for_each_entry(h, &ve_ip_conntrack_hash[hash], list) {
+ if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
+ CONNTRACK_STAT_INC(found);
+ return h;
+@@ -418,9 +453,9 @@ static void __ip_conntrack_hash_insert(s
+ unsigned int repl_hash)
+ {
+ ct->id = ++ip_conntrack_next_id;
+- list_prepend(&ip_conntrack_hash[hash],
++ list_prepend(&ve_ip_conntrack_hash[hash],
+ &ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
+- list_prepend(&ip_conntrack_hash[repl_hash],
++ list_prepend(&ve_ip_conntrack_hash[repl_hash],
+ &ct->tuplehash[IP_CT_DIR_REPLY].list);
+ }
+
+@@ -471,11 +506,11 @@ __ip_conntrack_confirm(struct sk_buff **
+ /* See if there's one in the list already, including reverse:
+ NAT could have grabbed it without realizing, since we're
+ not in the hash. If there is, we lost race. */
+- if (!LIST_FIND(&ip_conntrack_hash[hash],
++ if (!LIST_FIND(&ve_ip_conntrack_hash[hash],
+ conntrack_tuple_cmp,
+ struct ip_conntrack_tuple_hash *,
+ &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
+- && !LIST_FIND(&ip_conntrack_hash[repl_hash],
++ && !LIST_FIND(&ve_ip_conntrack_hash[repl_hash],
+ conntrack_tuple_cmp,
+ struct ip_conntrack_tuple_hash *,
+ &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
+@@ -569,7 +604,7 @@ static inline int helper_cmp(const struc
+ static struct ip_conntrack_helper *
+ __ip_conntrack_helper_find( const struct ip_conntrack_tuple *tuple)
+ {
+- return LIST_FIND(&helpers, helper_cmp,
++ return LIST_FIND(&ve_ip_conntrack_helpers, helper_cmp,
+ struct ip_conntrack_helper *,
+ tuple);
+ }
+@@ -605,7 +640,7 @@ void ip_conntrack_helper_put(struct ip_c
+ struct ip_conntrack_protocol *
+ __ip_conntrack_proto_find(u_int8_t protocol)
+ {
+- return ip_ct_protos[protocol];
++ return ve_ip_ct_protos[protocol];
+ }
+
+ /* this is guaranteed to always return a valid protocol helper, since
+@@ -632,29 +667,32 @@ void ip_conntrack_proto_put(struct ip_co
+ }
+
+ struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig,
+- struct ip_conntrack_tuple *repl)
++ struct ip_conntrack_tuple *repl, struct user_beancounter *ub)
+ {
+ struct ip_conntrack *conntrack;
++ struct user_beancounter *old_ub;
+
+ if (!ip_conntrack_hash_rnd_initted) {
+ get_random_bytes(&ip_conntrack_hash_rnd, 4);
+ ip_conntrack_hash_rnd_initted = 1;
+ }
+
+- if (ip_conntrack_max
+- && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
++ if (ve_ip_conntrack_max
++ && atomic_read(&ve_ip_conntrack_count) >= ve_ip_conntrack_max) {
+ unsigned int hash = hash_conntrack(orig);
+ /* Try dropping from this hash chain. */
+- if (!early_drop(&ip_conntrack_hash[hash])) {
++ if (!early_drop(&ve_ip_conntrack_hash[hash])) {
+ if (net_ratelimit())
+- printk(KERN_WARNING
+- "ip_conntrack: table full, dropping"
+- " packet.\n");
++ ve_printk(VE_LOG_BOTH, KERN_WARNING
++ "ip_conntrack: VPS %d: table full, dropping"
++ " packet.\n", VEID(get_exec_env()));
+ return ERR_PTR(-ENOMEM);
+ }
+ }
+
++ old_ub = set_exec_ub(ub);
+ conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
++ (void)set_exec_ub(old_ub);
+ if (!conntrack) {
+ DEBUGP("Can't allocate conntrack.\n");
+ return ERR_PTR(-ENOMEM);
+@@ -669,8 +707,11 @@ struct ip_conntrack *ip_conntrack_alloc(
+ init_timer(&conntrack->timeout);
+ conntrack->timeout.data = (unsigned long)conntrack;
+ conntrack->timeout.function = death_by_timeout;
++#ifdef CONFIG_VE_IPTABLES
++ SET_VE_OWNER_CT(conntrack, get_exec_env());
++#endif
+
+- atomic_inc(&ip_conntrack_count);
++ atomic_inc(&ve_ip_conntrack_count);
+
+ return conntrack;
+ }
+@@ -678,7 +719,7 @@ struct ip_conntrack *ip_conntrack_alloc(
+ void
+ ip_conntrack_free(struct ip_conntrack *conntrack)
+ {
+- atomic_dec(&ip_conntrack_count);
++ atomic_dec(&ve_ip_conntrack_count);
+ kmem_cache_free(ip_conntrack_cachep, conntrack);
+ }
+
+@@ -692,13 +733,22 @@ init_conntrack(struct ip_conntrack_tuple
+ struct ip_conntrack *conntrack;
+ struct ip_conntrack_tuple repl_tuple;
+ struct ip_conntrack_expect *exp;
++ struct user_beancounter *ub;
+
+ if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
+ DEBUGP("Can't invert tuple.\n");
+ return NULL;
+ }
+
+- conntrack = ip_conntrack_alloc(tuple, &repl_tuple);
++#ifdef CONFIG_USER_RESOURCE
++ if (skb->dev != NULL) /* received skb */
++ ub = netdev_bc(skb->dev)->exec_ub;
++ else if (skb->sk != NULL) /* sent skb */
++ ub = sock_bc(skb->sk)->ub;
++ else
++#endif
++ ub = NULL;
++ conntrack = ip_conntrack_alloc(tuple, &repl_tuple, ub);
+ if (conntrack == NULL || IS_ERR(conntrack))
+ return (struct ip_conntrack_tuple_hash *)conntrack;
+
+@@ -733,7 +783,8 @@ init_conntrack(struct ip_conntrack_tuple
+ }
+
+ /* Overload tuple linked list to put us in unconfirmed list. */
+- list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
++ list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list,
++ &ve_ip_conntrack_unconfirmed);
+
+ write_unlock_bh(&ip_conntrack_lock);
+
+@@ -925,7 +976,7 @@ void ip_conntrack_unexpect_related(struc
+
+ write_lock_bh(&ip_conntrack_lock);
+ /* choose the the oldest expectation to evict */
+- list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
++ list_for_each_entry_reverse(i, &ve_ip_conntrack_expect_list, list) {
+ if (expect_matches(i, exp) && del_timer(&i->timeout)) {
+ ip_ct_unlink_expect(i);
+ write_unlock_bh(&ip_conntrack_lock);
+@@ -959,11 +1010,11 @@ void ip_conntrack_expect_put(struct ip_c
+ kmem_cache_free(ip_conntrack_expect_cachep, exp);
+ }
+
+-static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
++void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
+ {
+ atomic_inc(&exp->use);
+ exp->master->expecting++;
+- list_add(&exp->list, &ip_conntrack_expect_list);
++ list_add(&exp->list, &ve_ip_conntrack_expect_list);
+
+ init_timer(&exp->timeout);
+ exp->timeout.data = (unsigned long)exp;
+@@ -975,13 +1026,14 @@ static void ip_conntrack_expect_insert(s
+ atomic_inc(&exp->use);
+ CONNTRACK_STAT_INC(expect_create);
+ }
++EXPORT_SYMBOL_GPL(ip_conntrack_expect_insert);
+
+ /* Race with expectations being used means we could have none to find; OK. */
+ static void evict_oldest_expect(struct ip_conntrack *master)
+ {
+ struct ip_conntrack_expect *i;
+
+- list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
++ list_for_each_entry_reverse(i, &ve_ip_conntrack_expect_list, list) {
+ if (i->master == master) {
+ if (del_timer(&i->timeout)) {
+ ip_ct_unlink_expect(i);
+@@ -1012,7 +1064,7 @@ int ip_conntrack_expect_related(struct i
+ DEBUGP("mask: "); DUMP_TUPLE(&expect->mask);
+
+ write_lock_bh(&ip_conntrack_lock);
+- list_for_each_entry(i, &ip_conntrack_expect_list, list) {
++ list_for_each_entry(i, &ve_ip_conntrack_expect_list, list) {
+ if (expect_matches(i, expect)) {
+ /* Refresh timer: if it's dying, ignore.. */
+ if (refresh_timer(i)) {
+@@ -1060,18 +1112,48 @@ int ip_conntrack_helper_register(struct
+ {
+ BUG_ON(me->timeout == 0);
+ write_lock_bh(&ip_conntrack_lock);
+- list_prepend(&helpers, me);
++ list_prepend(&ve_ip_conntrack_helpers, me);
+ write_unlock_bh(&ip_conntrack_lock);
+
+ return 0;
+ }
+
++int virt_ip_conntrack_helper_register(struct ip_conntrack_helper *me)
++{
++ int ret;
++ struct module *mod = me->me;
++
++ if (!ve_is_super(get_exec_env())) {
++ struct ip_conntrack_helper *tmp;
++ __module_get(mod);
++ ret = -ENOMEM;
++ tmp = kmalloc(sizeof(struct ip_conntrack_helper), GFP_KERNEL);
++ if (!tmp)
++ goto nomem;
++ memcpy(tmp, me, sizeof(struct ip_conntrack_helper));
++ me = tmp;
++ }
++
++ ret = ip_conntrack_helper_register(me);
++ if (ret)
++ goto out;
++
++ return 0;
++out:
++ if (!ve_is_super(get_exec_env())){
++ kfree(me);
++nomem:
++ module_put(mod);
++ }
++ return ret;
++}
++
+ struct ip_conntrack_helper *
+ __ip_conntrack_helper_find_byname(const char *name)
+ {
+ struct ip_conntrack_helper *h;
+
+- list_for_each_entry(h, &helpers, list) {
++ list_for_each_entry(h, &ve_ip_conntrack_helpers, list) {
+ if (!strcmp(h->name, name))
+ return h;
+ }
+@@ -1096,19 +1178,20 @@ void ip_conntrack_helper_unregister(stru
+
+ /* Need write lock here, to delete helper. */
+ write_lock_bh(&ip_conntrack_lock);
+- LIST_DELETE(&helpers, me);
++ LIST_DELETE(&ve_ip_conntrack_helpers, me);
+
+ /* Get rid of expectations */
+- list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
++ list_for_each_entry_safe(exp, tmp, &ve_ip_conntrack_expect_list, list) {
+ if (exp->master->helper == me && del_timer(&exp->timeout)) {
+ ip_ct_unlink_expect(exp);
+ ip_conntrack_expect_put(exp);
+ }
+ }
+ /* Get rid of expecteds, set helpers to NULL. */
+- LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me);
++ LIST_FIND_W(&ve_ip_conntrack_unconfirmed, unhelp,
++ struct ip_conntrack_tuple_hash*, me);
+ for (i = 0; i < ip_conntrack_htable_size; i++)
+- LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
++ LIST_FIND_W(&ve_ip_conntrack_hash[i], unhelp,
+ struct ip_conntrack_tuple_hash *, me);
+ write_unlock_bh(&ip_conntrack_lock);
+
+@@ -1116,6 +1199,25 @@ void ip_conntrack_helper_unregister(stru
+ synchronize_net();
+ }
+
++void virt_ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
++{
++
++ if (!ve_is_super(get_exec_env())) {
++ read_lock_bh(&ip_conntrack_lock);
++ me = list_named_find(&ve_ip_conntrack_helpers, me->name);
++ read_unlock_bh(&ip_conntrack_lock);
++ if (!me)
++ return;
++ }
++
++ ip_conntrack_helper_unregister(me);
++
++ if (!ve_is_super(get_exec_env())) {
++ module_put(me->me);
++ kfree(me);
++ }
++}
++
+ /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
+ void __ip_ct_refresh_acct(struct ip_conntrack *ct,
+ enum ip_conntrack_info ctinfo,
+@@ -1246,13 +1348,13 @@ get_next_corpse(int (*iter)(struct ip_co
+
+ write_lock_bh(&ip_conntrack_lock);
+ for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
+- h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
++ h = LIST_FIND_W(&ve_ip_conntrack_hash[*bucket], do_iter,
+ struct ip_conntrack_tuple_hash *, iter, data);
+ if (h)
+ break;
+ }
+ if (!h)
+- h = LIST_FIND_W(&unconfirmed, do_iter,
++ h = LIST_FIND_W(&ve_ip_conntrack_unconfirmed, do_iter,
+ struct ip_conntrack_tuple_hash *, iter, data);
+ if (h)
+ atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
+@@ -1289,6 +1391,11 @@ getorigdst(struct sock *sk, int optval,
+ struct ip_conntrack_tuple_hash *h;
+ struct ip_conntrack_tuple tuple;
+
++#ifdef CONFIG_VE_IPTABLES
++ if (!get_exec_env()->_ip_conntrack)
++ return -ENOPROTOOPT;
++#endif
++
+ IP_CT_TUPLE_U_BLANK(&tuple);
+ tuple.src.ip = inet->rcv_saddr;
+ tuple.src.u.tcp.port = inet->sport;
+@@ -1318,6 +1425,7 @@ getorigdst(struct sock *sk, int optval,
+ .tuple.dst.u.tcp.port;
+ sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
+ .tuple.dst.ip;
++ memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
+
+ DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
+ NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
+@@ -1359,12 +1467,17 @@ static void free_conntrack_hash(struct l
+ get_order(sizeof(struct list_head) * size));
+ }
+
++static void ip_conntrack_cache_free(void)
++{
++ kmem_cache_destroy(ip_conntrack_expect_cachep);
++ kmem_cache_destroy(ip_conntrack_cachep);
++ nf_unregister_sockopt(&so_getorigdst);
++}
++
+ /* Mishearing the voices in his head, our hero wonders how he's
+ supposed to kill the mall. */
+ void ip_conntrack_cleanup(void)
+ {
+- ip_ct_attach = NULL;
+-
+ /* This makes sure all current packets have passed through
+ netfilter framework. Roll on, two-stage module
+ delete... */
+@@ -1373,19 +1486,32 @@ void ip_conntrack_cleanup(void)
+ ip_ct_event_cache_flush();
+ i_see_dead_people:
+ ip_conntrack_flush();
+- if (atomic_read(&ip_conntrack_count) != 0) {
++ if (atomic_read(&ve_ip_conntrack_count) != 0) {
+ schedule();
+ goto i_see_dead_people;
+ }
+- /* wait until all references to ip_conntrack_untracked are dropped */
+- while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
+- schedule();
+-
+- kmem_cache_destroy(ip_conntrack_cachep);
+- kmem_cache_destroy(ip_conntrack_expect_cachep);
+- free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
++ if (ve_is_super(get_exec_env())) {
++ /* wait until all references to ip_conntrack_untracked are
++ * dropped */
++ while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
++ schedule();
++ ip_ct_attach = NULL;
++ ip_conntrack_cache_free();
++ }
++ free_conntrack_hash(ve_ip_conntrack_hash, ve_ip_conntrack_vmalloc,
+ ip_conntrack_htable_size);
+- nf_unregister_sockopt(&so_getorigdst);
++ ve_ip_conntrack_hash = NULL;
++ INIT_LIST_HEAD(&ve_ip_conntrack_unconfirmed);
++ INIT_LIST_HEAD(&ve_ip_conntrack_expect_list);
++ INIT_LIST_HEAD(&ve_ip_conntrack_helpers);
++ atomic_set(&ve_ip_conntrack_count, 0);
++ ve_ip_conntrack_max = 0;
++#ifdef CONFIG_VE_IPTABLES
++ kfree(ve_ip_ct_protos);
++ ve_ip_ct_protos = NULL;
++ kfree(get_exec_env()->_ip_conntrack);
++ get_exec_env()->_ip_conntrack = NULL;
++#endif
+ }
+
+ static struct list_head *alloc_hashtable(int size, int *vmalloced)
+@@ -1394,13 +1520,13 @@ static struct list_head *alloc_hashtable
+ unsigned int i;
+
+ *vmalloced = 0;
+- hash = (void*)__get_free_pages(GFP_KERNEL,
++ hash = (void*)__get_free_pages(GFP_KERNEL_UBC,
+ get_order(sizeof(struct list_head)
+ * size));
+ if (!hash) {
+ *vmalloced = 1;
+ printk(KERN_WARNING"ip_conntrack: falling back to vmalloc.\n");
+- hash = vmalloc(sizeof(struct list_head) * size);
++ hash = ub_vmalloc(sizeof(struct list_head) * size);
+ }
+
+ if (hash)
+@@ -1436,8 +1562,8 @@ static int set_hashsize(const char *val,
+
+ write_lock_bh(&ip_conntrack_lock);
+ for (i = 0; i < ip_conntrack_htable_size; i++) {
+- while (!list_empty(&ip_conntrack_hash[i])) {
+- h = list_entry(ip_conntrack_hash[i].next,
++ while (!list_empty(&ve_ip_conntrack_hash[i])) {
++ h = list_entry(ve_ip_conntrack_hash[i].next,
+ struct ip_conntrack_tuple_hash, list);
+ list_del(&h->list);
+ bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
+@@ -1445,12 +1571,12 @@ static int set_hashsize(const char *val,
+ }
+ }
+ old_size = ip_conntrack_htable_size;
+- old_vmalloced = ip_conntrack_vmalloc;
+- old_hash = ip_conntrack_hash;
++ old_vmalloced = ve_ip_conntrack_vmalloc;
++ old_hash = ve_ip_conntrack_hash;
+
+ ip_conntrack_htable_size = hashsize;
+- ip_conntrack_vmalloc = vmalloced;
+- ip_conntrack_hash = hash;
++ ve_ip_conntrack_vmalloc = vmalloced;
++ ve_ip_conntrack_hash = hash;
+ ip_conntrack_hash_rnd = rnd;
+ write_unlock_bh(&ip_conntrack_lock);
+
+@@ -1461,9 +1587,8 @@ static int set_hashsize(const char *val,
+ module_param_call(hashsize, set_hashsize, param_get_uint,
+ &ip_conntrack_htable_size, 0600);
+
+-int __init ip_conntrack_init(void)
++static int ip_conntrack_cache_create(void)
+ {
+- unsigned int i;
+ int ret;
+
+ /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
+@@ -1477,70 +1602,127 @@ int __init ip_conntrack_init(void)
+ if (ip_conntrack_htable_size < 16)
+ ip_conntrack_htable_size = 16;
+ }
+- ip_conntrack_max = 8 * ip_conntrack_htable_size;
++ ve_ip_conntrack_max = 8 * ip_conntrack_htable_size;
+
+ printk("ip_conntrack version %s (%u buckets, %d max)"
+ " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
+- ip_conntrack_htable_size, ip_conntrack_max,
++ ip_conntrack_htable_size, ve_ip_conntrack_max,
+ sizeof(struct ip_conntrack));
+
+ ret = nf_register_sockopt(&so_getorigdst);
+ if (ret != 0) {
+ printk(KERN_ERR "Unable to register netfilter socket option\n");
+- return ret;
+- }
+-
+- ip_conntrack_hash = alloc_hashtable(ip_conntrack_htable_size,
+- &ip_conntrack_vmalloc);
+- if (!ip_conntrack_hash) {
+- printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
+- goto err_unreg_sockopt;
++ goto out_sockopt;
+ }
+
++ ret = -ENOMEM;
+ ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
+ sizeof(struct ip_conntrack), 0,
+- 0, NULL, NULL);
++ SLAB_UBC, NULL, NULL);
+ if (!ip_conntrack_cachep) {
+ printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
+- goto err_free_hash;
++ goto err_unreg_sockopt;
+ }
+
+ ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
+ sizeof(struct ip_conntrack_expect),
+- 0, 0, NULL, NULL);
++ 0, SLAB_UBC, NULL, NULL);
+ if (!ip_conntrack_expect_cachep) {
+ printk(KERN_ERR "Unable to create ip_expect slab cache\n");
+ goto err_free_conntrack_slab;
+ }
+
++ return 0;
++
++err_free_conntrack_slab:
++ kmem_cache_destroy(ip_conntrack_cachep);
++err_unreg_sockopt:
++ nf_unregister_sockopt(&so_getorigdst);
++out_sockopt:
++ return ret;
++}
++
++int ip_conntrack_init(void)
++{
++ struct ve_struct *env;
++ unsigned int i;
++ int ret;
++
++ env = get_exec_env();
++#ifdef CONFIG_VE_IPTABLES
++ ret = -ENOMEM;
++ env->_ip_conntrack =
++ kmalloc(sizeof(struct ve_ip_conntrack), GFP_KERNEL);
++ if (!env->_ip_conntrack)
++ goto out;
++ memset(env->_ip_conntrack, 0, sizeof(struct ve_ip_conntrack));
++ if (ve_is_super(env)) {
++ ret = ip_conntrack_cache_create();
++ if (ret)
++ goto cache_fail;
++ } else
++ ve_ip_conntrack_max = 8 * ip_conntrack_htable_size;
++#else /* CONFIG_VE_IPTABLES */
++ ret = ip_conntrack_cache_create();
++ if (ret)
++ goto out;
++#endif
++
++ ret = -ENOMEM;
++ ve_ip_conntrack_hash = alloc_hashtable(ip_conntrack_htable_size,
++ &ve_ip_conntrack_vmalloc);
++ if (!ve_ip_conntrack_hash) {
++ printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
++ goto err_free_cache;
++ }
++
++#ifdef CONFIG_VE_IPTABLES
++ ve_ip_ct_protos = (struct ip_conntrack_protocol **)
++ ub_kmalloc(sizeof(void *)*MAX_IP_CT_PROTO, GFP_KERNEL);
++ if (!ve_ip_ct_protos)
++ goto err_free_hash;
++#endif
+ /* Don't NEED lock here, but good form anyway. */
+ write_lock_bh(&ip_conntrack_lock);
+ for (i = 0; i < MAX_IP_CT_PROTO; i++)
+- ip_ct_protos[i] = &ip_conntrack_generic_protocol;
++ ve_ip_ct_protos[i] = &ip_conntrack_generic_protocol;
+ /* Sew in builtin protocols. */
+- ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
+- ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
+- ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
++ ve_ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
++ ve_ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
++ ve_ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
+ write_unlock_bh(&ip_conntrack_lock);
+
+- /* For use by ipt_REJECT */
+- ip_ct_attach = ip_conntrack_attach;
+-
+- /* Set up fake conntrack:
+- - to never be deleted, not in any hashes */
+- atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
+- /* - and look it like as a confirmed connection */
+- set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
++ INIT_LIST_HEAD(&ve_ip_conntrack_unconfirmed);
++ INIT_LIST_HEAD(&ve_ip_conntrack_expect_list);
++ INIT_LIST_HEAD(&ve_ip_conntrack_helpers);
++
++ if (ve_is_super(env)) {
++ /* For use by ipt_REJECT */
++ ip_ct_attach = ip_conntrack_attach;
++
++ /* Set up fake conntrack:
++ - to never be deleted, not in any hashes */
++ atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
++ /* - and look it like as a confirmed connection */
++ set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
++ }
+
+- return ret;
++ return 0;
+
+-err_free_conntrack_slab:
+- kmem_cache_destroy(ip_conntrack_cachep);
++#ifdef CONFIG_VE_IPTABLES
+ err_free_hash:
+- free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
++#endif
++ free_conntrack_hash(ve_ip_conntrack_hash, ve_ip_conntrack_vmalloc,
+ ip_conntrack_htable_size);
+-err_unreg_sockopt:
+- nf_unregister_sockopt(&so_getorigdst);
+-
+- return -ENOMEM;
++ ve_ip_conntrack_hash = NULL;
++err_free_cache:
++ if (ve_is_super(env))
++ ip_conntrack_cache_free();
++#ifdef CONFIG_VE_IPTABLES
++cache_fail:
++ kfree(env->_ip_conntrack);
++ env->_ip_conntrack = NULL;
++#endif
++out:
++ return ret;
+ }
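[Editorial note] The bulk of the ip_conntrack_core.c changes above is one mechanical substitution: every reference to the old globals (ip_conntrack_hash, ip_conntrack_expect_list, the helpers list, ip_conntrack_count, ip_ct_protos and friends) becomes a ve_ip_conntrack_* / ve_ip_ct_* accessor that resolves through get_exec_env()->_ip_conntrack, so each virtual environment keeps its own hash table, expectation list, helpers, counters and limits, while init/cleanup touch the shared slab caches and the fake "untracked" conntrack only in the root environment (ve_is_super). The user-space model below is only meant to illustrate that indirection; the struct layout, the current_env pointer and the macro names are stand-ins, not the kernel's.

/* Illustrative model: per-environment state selected through a
 * "current environment" pointer, the way the ve_ip_conntrack_* macros
 * resolve through get_exec_env()->_ip_conntrack in the hunks above. */
#include <stdio.h>

struct env_conntrack {
	int count;	/* models _ip_conntrack_count */
	int max;	/* models _ip_conntrack_max   */
};

static struct env_conntrack ve0 = { 0, 1024 };
static struct env_conntrack *current_env = &ve0;

#define ve_count (current_env->count)
#define ve_max   (current_env->max)

int main(void)
{
	struct env_conntrack guest = { 0, 256 };

	ve_count++;			/* charged to ve0 */
	current_env = &guest;		/* "enter" the guest environment */
	ve_count += 2;			/* charged to the guest only */

	current_env = &ve0;
	printf("ve0:   %d/%d\n", ve_count, ve_max);
	current_env = &guest;
	printf("guest: %d/%d\n", ve_count, ve_max);
	return 0;
}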
+diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_ftp.c linux-2.6.16-026test015/net/ipv4/netfilter/ip_conntrack_ftp.c
+--- linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_ftp.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/netfilter/ip_conntrack_ftp.c 2006-07-04 14:41:39.000000000 +0400
+@@ -15,6 +15,7 @@
+ #include <linux/ctype.h>
+ #include <net/checksum.h>
+ #include <net/tcp.h>
++#include <linux/nfcalls.h>
+
+ #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+ #include <linux/netfilter_ipv4/ip_conntrack_ftp.h>
+@@ -425,8 +426,8 @@ static int help(struct sk_buff **pskb,
+
+ /* Now, NAT might want to mangle the packet, and register the
+ * (possibly changed) expectation itself. */
+- if (ip_nat_ftp_hook)
+- ret = ip_nat_ftp_hook(pskb, ctinfo, search[i].ftptype,
++ if (ve_ip_nat_ftp_hook)
++ ret = ve_ip_nat_ftp_hook(pskb, ctinfo, search[i].ftptype,
+ matchoff, matchlen, exp, &seq);
+ else {
+ /* Can't expect this? Best to drop packet now. */
+@@ -452,16 +453,39 @@ out_update_nl:
+ static struct ip_conntrack_helper ftp[MAX_PORTS];
+ static char ftp_names[MAX_PORTS][sizeof("ftp-65535")];
+
+-/* Not __exit: called from init() */
+-static void fini(void)
++void fini_iptable_ftp(void)
+ {
+ int i;
+ for (i = 0; i < ports_c; i++) {
+ DEBUGP("ip_ct_ftp: unregistering helper for port %d\n",
+ ports[i]);
+- ip_conntrack_helper_unregister(&ftp[i]);
++ virt_ip_conntrack_helper_unregister(&ftp[i]);
+ }
++}
++
++int init_iptable_ftp(void)
++{
++ int i, ret;
+
++ for (i = 0; i < ports_c; i++) {
++ DEBUGP("ip_ct_ftp: registering helper for port %d\n",
++ ports[i]);
++ ret = virt_ip_conntrack_helper_register(&ftp[i]);
++ if (ret) {
++ fini_iptable_ftp();
++ return ret;
++ }
++ }
++ return 0;
++}
++
++/* Not __exit: called from init() */
++static void fini(void)
++{
++ KSYMMODUNRESOLVE(ip_conntrack_ftp);
++ KSYMUNRESOLVE(init_iptable_ftp);
++ KSYMUNRESOLVE(fini_iptable_ftp);
++ fini_iptable_ftp();
+ kfree(ftp_buffer);
+ }
+
+@@ -496,13 +520,17 @@ static int __init init(void)
+
+ DEBUGP("ip_ct_ftp: registering helper for port %d\n",
+ ports[i]);
+- ret = ip_conntrack_helper_register(&ftp[i]);
++ ret = virt_ip_conntrack_helper_register(&ftp[i]);
+
+ if (ret) {
+ fini();
+ return ret;
+ }
+ }
++
++ KSYMRESOLVE(init_iptable_ftp);
++ KSYMRESOLVE(fini_iptable_ftp);
++ KSYMMODRESOLVE(ip_conntrack_ftp);
+ return 0;
+ }
+
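[Editorial note] The FTP helper now registers through virt_ip_conntrack_helper_register(): in the root environment it behaves exactly like ip_conntrack_helper_register(), while in a container it takes a module reference and registers a kmalloc'ed copy of the helper, and virt_ip_conntrack_helper_unregister() later finds that copy by name, unregisters it and frees it. The plain-C sketch below models only the copy-on-register / free-on-unregister pairing; the types, the is_root flag and the function names are illustrative, not the kernel interface.

/* Illustrative model: non-root environments register a private copy of
 * a helper template so that unregistering never frees shared state. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct helper {
	char name[16];
};

static struct helper *helper_register(const struct helper *tmpl, int is_root)
{
	struct helper *h;

	if (is_root)
		return (struct helper *)tmpl;	/* root uses the template */

	h = malloc(sizeof(*h));			/* per-environment copy */
	if (!h)
		return NULL;
	memcpy(h, tmpl, sizeof(*h));
	return h;
}

static void helper_unregister(struct helper *h, const struct helper *tmpl)
{
	if (h != tmpl)
		free(h);			/* only copies are freed */
}

int main(void)
{
	static const struct helper ftp_tmpl = { "ftp-21" };
	struct helper *h = helper_register(&ftp_tmpl, 0);

	if (!h)
		return 1;
	printf("registered %s\n", h->name);
	helper_unregister(h, &ftp_tmpl);
	return 0;
}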
+diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_irc.c linux-2.6.16-026test015/net/ipv4/netfilter/ip_conntrack_irc.c
+--- linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_irc.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/netfilter/ip_conntrack_irc.c 2006-07-04 14:41:39.000000000 +0400
+@@ -28,6 +28,7 @@
+ #include <linux/ip.h>
+ #include <net/checksum.h>
+ #include <net/tcp.h>
++#include <linux/nfcalls.h>
+
+ #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+ #include <linux/netfilter_ipv4/ip_conntrack_irc.h>
+@@ -244,6 +245,33 @@ static char irc_names[MAX_PORTS][sizeof(
+
+ static void fini(void);
+
++void fini_iptable_irc(void)
++{
++ int i;
++ for (i = 0; i < ports_c; i++) {
++ DEBUGP("unregistering port %d\n",
++ ports[i]);
++ virt_ip_conntrack_helper_unregister(&irc_helpers[i]);
++ }
++}
++
++int init_iptable_irc(void)
++{
++ int i, ret;
++
++ for (i = 0; i < ports_c; i++) {
++ DEBUGP("port #%d: %d\n", i, ports[i]);
++ ret = virt_ip_conntrack_helper_register(&irc_helpers[i]);
++ if (ret) {
++ printk("ip_conntrack_irc: ERROR registering port %d\n",
++ ports[i]);
++ fini_iptable_irc();
++ return -EBUSY;
++ }
++ }
++ return 0;
++}
++
+ static int __init init(void)
+ {
+ int i, ret;
+@@ -283,7 +311,7 @@ static int __init init(void)
+
+ DEBUGP("port #%d: %d\n", i, ports[i]);
+
+- ret = ip_conntrack_helper_register(hlpr);
++ ret = virt_ip_conntrack_helper_register(hlpr);
+
+ if (ret) {
+ printk("ip_conntrack_irc: ERROR registering port %d\n",
+@@ -292,6 +320,10 @@ static int __init init(void)
+ return -EBUSY;
+ }
+ }
++
++ KSYMRESOLVE(init_iptable_irc);
++ KSYMRESOLVE(fini_iptable_irc);
++ KSYMMODRESOLVE(ip_conntrack_irc);
+ return 0;
+ }
+
+@@ -299,12 +331,10 @@ static int __init init(void)
+ * it is needed by the init function */
+ static void fini(void)
+ {
+- int i;
+- for (i = 0; i < ports_c; i++) {
+- DEBUGP("unregistering port %d\n",
+- ports[i]);
+- ip_conntrack_helper_unregister(&irc_helpers[i]);
+- }
++ KSYMMODUNRESOLVE(ip_conntrack_irc);
++ KSYMUNRESOLVE(init_iptable_irc);
++ KSYMUNRESOLVE(fini_iptable_irc);
++ fini_iptable_irc();
+ kfree(irc_buffer);
+ }
+
+diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_netlink.c linux-2.6.16-026test015/net/ipv4/netfilter/ip_conntrack_netlink.c
+--- linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_netlink.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/netfilter/ip_conntrack_netlink.c 2006-07-04 14:41:39.000000000 +0400
+@@ -29,6 +29,7 @@
+ #include <linux/spinlock.h>
+ #include <linux/interrupt.h>
+ #include <linux/notifier.h>
++#include <net/sock.h>
+
+ #include <linux/netfilter.h>
+ #include <linux/netfilter_ipv4/ip_conntrack.h>
+@@ -39,6 +40,8 @@
+
+ #include <linux/netfilter/nfnetlink.h>
+ #include <linux/netfilter/nfnetlink_conntrack.h>
++#include <ub/beancounter.h>
++#include <ub/ub_sk.h>
+
+ MODULE_LICENSE("GPL");
+
+@@ -403,7 +406,7 @@ ctnetlink_dump_table(struct sk_buff *skb
+
+ read_lock_bh(&ip_conntrack_lock);
+ for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) {
+- list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) {
++ list_for_each_prev(i, &ve_ip_conntrack_hash[cb->args[0]]) {
+ h = (struct ip_conntrack_tuple_hash *) i;
+ if (DIRECTION(h) != IP_CT_DIR_ORIGINAL)
+ continue;
+@@ -440,7 +443,7 @@ ctnetlink_dump_table_w(struct sk_buff *s
+
+ write_lock_bh(&ip_conntrack_lock);
+ for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) {
+- list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) {
++ list_for_each_prev(i, &ve_ip_conntrack_hash[cb->args[0]]) {
+ h = (struct ip_conntrack_tuple_hash *) i;
+ if (DIRECTION(h) != IP_CT_DIR_ORIGINAL)
+ continue;
+@@ -1003,14 +1006,15 @@ ctnetlink_change_conntrack(struct ip_con
+ static int
+ ctnetlink_create_conntrack(struct nfattr *cda[],
+ struct ip_conntrack_tuple *otuple,
+- struct ip_conntrack_tuple *rtuple)
++ struct ip_conntrack_tuple *rtuple,
++ struct user_beancounter *ub)
+ {
+ struct ip_conntrack *ct;
+ int err = -EINVAL;
+
+ DEBUGP("entered %s\n", __FUNCTION__);
+
+- ct = ip_conntrack_alloc(otuple, rtuple);
++ ct = ip_conntrack_alloc(otuple, rtuple, ub);
+ if (ct == NULL || IS_ERR(ct))
+ return -ENOMEM;
+
+@@ -1087,8 +1091,16 @@ ctnetlink_new_conntrack(struct sock *ctn
+ write_unlock_bh(&ip_conntrack_lock);
+ DEBUGP("no such conntrack, create new\n");
+ err = -ENOENT;
+- if (nlh->nlmsg_flags & NLM_F_CREATE)
+- err = ctnetlink_create_conntrack(cda, &otuple, &rtuple);
++ if (nlh->nlmsg_flags & NLM_F_CREATE) {
++#ifdef CONFIG_USER_RESOURCE
++ if (skb->sk)
++ err = ctnetlink_create_conntrack(cda, &otuple,
++ &rtuple, sock_bc(skb->sk)->ub);
++ else
++#endif
++ err = ctnetlink_create_conntrack(cda,
++ &otuple, &rtuple, NULL);
++ }
+ return err;
+ }
+ /* implicit 'else' */
+@@ -1249,7 +1261,7 @@ ctnetlink_exp_dump_table(struct sk_buff
+ DEBUGP("entered %s, last id=%llu\n", __FUNCTION__, *id);
+
+ read_lock_bh(&ip_conntrack_lock);
+- list_for_each_prev(i, &ip_conntrack_expect_list) {
++ list_for_each_prev(i, &ve_ip_conntrack_expect_list) {
+ exp = (struct ip_conntrack_expect *) i;
+ if (exp->id <= *id)
+ continue;
+@@ -1395,7 +1407,7 @@ ctnetlink_del_expect(struct sock *ctnl,
+ write_unlock_bh(&ip_conntrack_lock);
+ return -EINVAL;
+ }
+- list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list,
++ list_for_each_entry_safe(exp, tmp, &ve_ip_conntrack_expect_list,
+ list) {
+ if (exp->master->helper == h
+ && del_timer(&exp->timeout)) {
+@@ -1407,7 +1419,7 @@ ctnetlink_del_expect(struct sock *ctnl,
+ } else {
+ /* This basically means we have to flush everything*/
+ write_lock_bh(&ip_conntrack_lock);
+- list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list,
++ list_for_each_entry_safe(exp, tmp, &ve_ip_conntrack_expect_list,
+ list) {
+ if (del_timer(&exp->timeout)) {
+ ip_ct_unlink_expect(exp);
+@@ -1619,7 +1631,7 @@ static void __exit ctnetlink_exit(void)
+ printk("ctnetlink: unregistering from nfnetlink.\n");
+
+ #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+- ip_conntrack_unregister_notifier(&ctnl_notifier_exp);
++ ip_conntrack_expect_unregister_notifier(&ctnl_notifier_exp);
+ ip_conntrack_unregister_notifier(&ctnl_notifier);
+ #endif
+
+diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_proto_generic.c linux-2.6.16-026test015/net/ipv4/netfilter/ip_conntrack_proto_generic.c
+--- linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_proto_generic.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/netfilter/ip_conntrack_proto_generic.c 2006-07-04 14:41:39.000000000 +0400
+@@ -52,7 +52,7 @@ static int packet(struct ip_conntrack *c
+ const struct sk_buff *skb,
+ enum ip_conntrack_info ctinfo)
+ {
+- ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_generic_timeout);
++ ip_ct_refresh_acct(conntrack, ctinfo, skb, ve_ip_ct_generic_timeout);
+ return NF_ACCEPT;
+ }
+
+diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_proto_icmp.c linux-2.6.16-026test015/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
+--- linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_proto_icmp.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/netfilter/ip_conntrack_proto_icmp.c 2006-07-04 14:41:39.000000000 +0400
+@@ -104,7 +104,7 @@ static int icmp_packet(struct ip_conntra
+ } else {
+ atomic_inc(&ct->proto.icmp.count);
+ ip_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb);
+- ip_ct_refresh_acct(ct, ctinfo, skb, ip_ct_icmp_timeout);
++ ip_ct_refresh_acct(ct, ctinfo, skb, ve_ip_ct_icmp_timeout);
+ }
+
+ return NF_ACCEPT;
+diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_proto_sctp.c linux-2.6.16-026test015/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
+--- linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_proto_sctp.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/netfilter/ip_conntrack_proto_sctp.c 2006-07-04 14:41:36.000000000 +0400
+@@ -235,12 +235,15 @@ static int do_basic_checks(struct ip_con
+ flag = 1;
+ }
+
+- /* Cookie Ack/Echo chunks not the first OR
+- Init / Init Ack / Shutdown compl chunks not the only chunks */
+- if ((sch->type == SCTP_CID_COOKIE_ACK
++ /*
++ * Cookie Ack/Echo chunks not the first OR
++ * Init / Init Ack / Shutdown compl chunks not the only chunks
++ * OR zero-length.
++ */
++ if (((sch->type == SCTP_CID_COOKIE_ACK
+ || sch->type == SCTP_CID_COOKIE_ECHO
+ || flag)
+- && count !=0 ) {
++ && count !=0) || !sch->length) {
+ DEBUGP("Basic checks failed\n");
+ return 1;
+ }
+@@ -251,7 +254,7 @@ static int do_basic_checks(struct ip_con
+ }
+
+ DEBUGP("Basic checks passed\n");
+- return 0;
++ return count == 0;
+ }
+
+ static int new_state(enum ip_conntrack_dir dir,
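[Editorial note] The ip_conntrack_proto_sctp.c hunk is a plain robustness fix rather than part of the virtualization: do_basic_checks() now also fails on a chunk whose length field is zero, and on packets that carry no chunks at all (the final return count == 0;). A zero-length chunk would otherwise keep the chunk walker from ever advancing. The self-contained fragment below illustrates why the guard matters; the chunk layout and the walker are deliberately simplified and are not the kernel's.

/* Illustrative walker over SCTP-style chunks: each chunk declares its
 * own length, so a zero (or truncated) length must be rejected or the
 * cursor never advances and the loop spins forever. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct chunk_hdr {
	uint8_t  type;
	uint8_t  flags;
	uint16_t length;	/* host byte order, for simplicity */
};

static int count_chunks(const uint8_t *buf, size_t len)
{
	size_t off = 0;
	int count = 0;

	while (off + sizeof(struct chunk_hdr) <= len) {
		struct chunk_hdr ch;

		memcpy(&ch, buf + off, sizeof(ch));
		if (ch.length < sizeof(ch))
			return -1;	/* zero or truncated length: reject */
		off += ch.length;	/* advance by the declared length */
		count++;
	}
	return count;
}

int main(void)
{
	uint8_t buf[2 * sizeof(struct chunk_hdr)];
	struct chunk_hdr ch = { 1, 0, sizeof(struct chunk_hdr) };

	memcpy(buf, &ch, sizeof(ch));
	memcpy(buf + sizeof(ch), &ch, sizeof(ch));
	printf("well-formed: %d chunks\n", count_chunks(buf, sizeof(buf)));

	ch.length = 0;			/* a malformed, zero-length chunk */
	memcpy(buf, &ch, sizeof(ch));
	printf("malformed:   %d\n", count_chunks(buf, sizeof(buf)));
	return 0;
}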
+diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_proto_tcp.c linux-2.6.16-026test015/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
+--- linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_proto_tcp.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/netfilter/ip_conntrack_proto_tcp.c 2006-07-04 14:41:39.000000000 +0400
+@@ -98,7 +98,7 @@ unsigned int ip_ct_tcp_timeout_close =
+ to ~13-30min depending on RTO. */
+ unsigned int ip_ct_tcp_timeout_max_retrans = 5 MINS;
+
+-static const unsigned int * tcp_timeouts[]
++const unsigned int * tcp_timeouts[]
+ = { NULL, /* TCP_CONNTRACK_NONE */
+ &ip_ct_tcp_timeout_syn_sent, /* TCP_CONNTRACK_SYN_SENT, */
+ &ip_ct_tcp_timeout_syn_recv, /* TCP_CONNTRACK_SYN_RECV, */
+@@ -762,7 +762,7 @@ static int tcp_in_window(struct ip_ct_tc
+ : "SEQ is under the lower bound (already ACKed data retransmitted)"
+ : "SEQ is over the upper bound (over the window of the receiver)");
+
+- res = ip_ct_tcp_be_liberal;
++ res = ve_ip_ct_tcp_be_liberal;
+ }
+
+ DEBUGP("tcp_in_window: res=%i sender end=%u maxend=%u maxwin=%u "
+@@ -1033,9 +1033,11 @@ static int tcp_packet(struct ip_conntrac
+ && (new_state == TCP_CONNTRACK_FIN_WAIT
+ || new_state == TCP_CONNTRACK_CLOSE))
+ conntrack->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT;
+- timeout = conntrack->proto.tcp.retrans >= ip_ct_tcp_max_retrans
+- && *tcp_timeouts[new_state] > ip_ct_tcp_timeout_max_retrans
+- ? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state];
++ timeout = conntrack->proto.tcp.retrans >= ve_ip_ct_tcp_max_retrans &&
++ ve_ip_ct_tcp_timeouts[new_state] >
++ ve_ip_ct_tcp_timeout_max_retrans
++ ? ve_ip_ct_tcp_timeout_max_retrans :
++ ve_ip_ct_tcp_timeouts[new_state];
+ write_unlock_bh(&tcp_lock);
+
+ ip_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb);
+@@ -1110,7 +1112,7 @@ static int tcp_new(struct ip_conntrack *
+ conntrack->proto.tcp.seen[1].flags = 0;
+ conntrack->proto.tcp.seen[0].loose =
+ conntrack->proto.tcp.seen[1].loose = 0;
+- } else if (ip_ct_tcp_loose == 0) {
++ } else if (ve_ip_ct_tcp_loose == 0) {
+ /* Don't try to pick up connections. */
+ return 0;
+ } else {
+@@ -1134,7 +1136,7 @@ static int tcp_new(struct ip_conntrack *
+ conntrack->proto.tcp.seen[0].flags =
+ conntrack->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM;
+ conntrack->proto.tcp.seen[0].loose =
+- conntrack->proto.tcp.seen[1].loose = ip_ct_tcp_loose;
++ conntrack->proto.tcp.seen[1].loose = ve_ip_ct_tcp_loose;
+ }
+
+ conntrack->proto.tcp.seen[1].td_end = 0;
+diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_proto_udp.c linux-2.6.16-026test015/net/ipv4/netfilter/ip_conntrack_proto_udp.c
+--- linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_proto_udp.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/netfilter/ip_conntrack_proto_udp.c 2006-07-04 14:41:39.000000000 +0400
+@@ -71,12 +71,12 @@ static int udp_packet(struct ip_conntrac
+ stream. Extend timeout. */
+ if (test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) {
+ ip_ct_refresh_acct(conntrack, ctinfo, skb,
+- ip_ct_udp_timeout_stream);
++ ve_ip_ct_udp_timeout_stream);
+ /* Also, more likely to be important, and not a probe */
+ if (!test_and_set_bit(IPS_ASSURED_BIT, &conntrack->status))
+ ip_conntrack_event_cache(IPCT_STATUS, skb);
+ } else
+- ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_udp_timeout);
++ ip_ct_refresh_acct(conntrack, ctinfo, skb, ve_ip_ct_udp_timeout);
+
+ return NF_ACCEPT;
+ }
+diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_standalone.c linux-2.6.16-026test015/net/ipv4/netfilter/ip_conntrack_standalone.c
+--- linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_standalone.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/netfilter/ip_conntrack_standalone.c 2006-07-04 14:41:39.000000000 +0400
+@@ -28,6 +28,7 @@
+ #include <net/checksum.h>
+ #include <net/ip.h>
+ #include <net/route.h>
++#include <linux/nfcalls.h>
+
+ #define ASSERT_READ_LOCK(x)
+ #define ASSERT_WRITE_LOCK(x)
+@@ -46,9 +47,31 @@
+
+ MODULE_LICENSE("GPL");
+
++int ip_conntrack_disable_ve0 = 0;
++module_param(ip_conntrack_disable_ve0, int, 0440);
++
+ extern atomic_t ip_conntrack_count;
++#ifdef CONFIG_VE_IPTABLES
++#include <linux/sched.h>
++#define ve_ip_conntrack_count \
++ (get_exec_env()->_ip_conntrack->_ip_conntrack_count)
++#else
++#define ve_ip_conntrack_count ip_conntrack_count
++#endif
+ DECLARE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
+
++/* Prior to 2.6.15, we had a ip_conntrack_enable_ve0 param. */
++static int warn_set(const char *val, struct kernel_param *kp)
++{
++ printk(KERN_INFO KBUILD_MODNAME
++ ": parameter ip_conntrack_enable_ve0 is obsolete. In ovzkernel"
++ " >= 2.6.15, connection tracking on the hardware node is enabled"
++ " by default; use the ip_conntrack_disable_ve0=1 parameter to"
++ " disable it.\n");
++ return 0;
++}
++module_param_call(ip_conntrack_enable_ve0, warn_set, NULL, NULL, 0);
++
+ static int kill_proto(struct ip_conntrack *i, void *data)
+ {
+ return (i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum ==
+@@ -89,8 +112,8 @@ static struct list_head *ct_get_first(st
+ for (st->bucket = 0;
+ st->bucket < ip_conntrack_htable_size;
+ st->bucket++) {
+- if (!list_empty(&ip_conntrack_hash[st->bucket]))
+- return ip_conntrack_hash[st->bucket].next;
++ if (!list_empty(&ve_ip_conntrack_hash[st->bucket]))
++ return ve_ip_conntrack_hash[st->bucket].next;
+ }
+ return NULL;
+ }
+@@ -100,10 +123,10 @@ static struct list_head *ct_get_next(str
+ struct ct_iter_state *st = seq->private;
+
+ head = head->next;
+- while (head == &ip_conntrack_hash[st->bucket]) {
++ while (head == &ve_ip_conntrack_hash[st->bucket]) {
+ if (++st->bucket >= ip_conntrack_htable_size)
+ return NULL;
+- head = ip_conntrack_hash[st->bucket].next;
++ head = ve_ip_conntrack_hash[st->bucket].next;
+ }
+ return head;
+ }
+@@ -234,7 +257,7 @@ static struct file_operations ct_file_op
+ /* expects */
+ static void *exp_seq_start(struct seq_file *s, loff_t *pos)
+ {
+- struct list_head *e = &ip_conntrack_expect_list;
++ struct list_head *e = &ve_ip_conntrack_expect_list;
+ loff_t i;
+
+ /* strange seq_file api calls stop even if we fail,
+@@ -246,7 +269,7 @@ static void *exp_seq_start(struct seq_fi
+
+ for (i = 0; i <= *pos; i++) {
+ e = e->next;
+- if (e == &ip_conntrack_expect_list)
++ if (e == &ve_ip_conntrack_expect_list)
+ return NULL;
+ }
+ return e;
+@@ -259,7 +282,7 @@ static void *exp_seq_next(struct seq_fil
+ ++*pos;
+ e = e->next;
+
+- if (e == &ip_conntrack_expect_list)
++ if (e == &ve_ip_conntrack_expect_list)
+ return NULL;
+
+ return e;
+@@ -344,7 +367,7 @@ static void ct_cpu_seq_stop(struct seq_f
+
+ static int ct_cpu_seq_show(struct seq_file *seq, void *v)
+ {
+- unsigned int nr_conntracks = atomic_read(&ip_conntrack_count);
++ unsigned int nr_conntracks = atomic_read(&ve_ip_conntrack_count);
+ struct ip_conntrack_stat *st = v;
+
+ if (v == SEQ_START_TOKEN) {
+@@ -541,6 +564,28 @@ static struct nf_hook_ops ip_conntrack_l
+
+ /* From ip_conntrack_core.c */
+ extern int ip_conntrack_max;
++#ifdef CONFIG_VE_IPTABLES
++#define ve_ip_conntrack_max \
++ (get_exec_env()->_ip_conntrack->_ip_conntrack_max)
++#define ve_ip_ct_sysctl_header \
++ (get_exec_env()->_ip_conntrack->_ip_ct_sysctl_header)
++#define ve_ip_ct_net_table \
++ (get_exec_env()->_ip_conntrack->_ip_ct_net_table)
++#define ve_ip_ct_ipv4_table \
++ (get_exec_env()->_ip_conntrack->_ip_ct_ipv4_table)
++#define ve_ip_ct_netfilter_table \
++ (get_exec_env()->_ip_conntrack->_ip_ct_netfilter_table)
++#define ve_ip_ct_sysctl_table \
++ (get_exec_env()->_ip_conntrack->_ip_ct_sysctl_table)
++#else
++#define ve_ip_conntrack_max ip_conntrack_max
++static struct ctl_table_header *ip_ct_sysctl_header;
++#define ve_ip_ct_sysctl_header ip_ct_sysctl_header
++#define ve_ip_ct_net_table ip_ct_net_table
++#define ve_ip_ct_ipv4_table ip_ct_ipv4_table
++#define ve_ip_ct_netfilter_table ip_ct_netfilter_table
++#define ve_ip_ct_sysctl_table ip_ct_sysctl_table
++#endif
+ extern unsigned int ip_conntrack_htable_size;
+
+ /* From ip_conntrack_proto_tcp.c */
+@@ -571,8 +616,6 @@ extern unsigned int ip_ct_generic_timeou
+ static int log_invalid_proto_min = 0;
+ static int log_invalid_proto_max = 255;
+
+-static struct ctl_table_header *ip_ct_sysctl_header;
+-
+ static ctl_table ip_ct_sysctl_table[] = {
+ {
+ .ctl_name = NET_IPV4_NF_CONNTRACK_MAX,
+@@ -781,6 +824,112 @@ static ctl_table ip_ct_net_table[] = {
+ };
+
+ EXPORT_SYMBOL(ip_ct_log_invalid);
++
++#ifdef CONFIG_VE_IPTABLES
++static void ip_conntrack_sysctl_cleanup(void)
++{
++ if (!ve_is_super(get_exec_env())) {
++ kfree(ve_ip_ct_net_table);
++ kfree(ve_ip_ct_ipv4_table);
++ kfree(ve_ip_ct_netfilter_table);
++ kfree(ve_ip_ct_sysctl_table);
++ }
++ ve_ip_ct_net_table = NULL;
++ ve_ip_ct_ipv4_table = NULL;
++ ve_ip_ct_netfilter_table = NULL;
++ ve_ip_ct_sysctl_table = NULL;
++}
++
++#define ALLOC_ENVCTL(field,k,label) \
++ if ( !(field = kmalloc(k*sizeof(ctl_table), GFP_KERNEL)) ) \
++ goto label;
++static int ip_conntrack_sysctl_init(void)
++{
++ int i, ret = 0;
++
++ ret = -ENOMEM;
++ if (ve_is_super(get_exec_env())) {
++ ve_ip_ct_net_table = ip_ct_net_table;
++ ve_ip_ct_ipv4_table = ip_ct_ipv4_table;
++ ve_ip_ct_netfilter_table = ip_ct_netfilter_table;
++ ve_ip_ct_sysctl_table = ip_ct_sysctl_table;
++ } else {
++ /* allocate structures in ve_struct */
++ ALLOC_ENVCTL(ve_ip_ct_net_table, 2, out);
++ ALLOC_ENVCTL(ve_ip_ct_ipv4_table, 2, nomem_1);
++ ALLOC_ENVCTL(ve_ip_ct_netfilter_table, 3, nomem_2);
++ ALLOC_ENVCTL(ve_ip_ct_sysctl_table, 21, nomem_3);
++
++ memcpy(ve_ip_ct_net_table, ip_ct_net_table,
++ 2*sizeof(ctl_table));
++ memcpy(ve_ip_ct_ipv4_table, ip_ct_ipv4_table,
++ 2*sizeof(ctl_table));
++ memcpy(ve_ip_ct_netfilter_table, ip_ct_netfilter_table,
++ 3*sizeof(ctl_table));
++ memcpy(ve_ip_ct_sysctl_table, ip_ct_sysctl_table,
++ 21*sizeof(ctl_table));
++
++ ve_ip_ct_net_table[0].child = ve_ip_ct_ipv4_table;
++ ve_ip_ct_ipv4_table[0].child = ve_ip_ct_netfilter_table;
++ ve_ip_ct_netfilter_table[0].child = ve_ip_ct_sysctl_table;
++ }
++ ve_ip_ct_sysctl_table[0].data = &ve_ip_conntrack_max;
++ ve_ip_ct_netfilter_table[1].data = &ve_ip_conntrack_max;
++ ve_ip_ct_sysctl_table[1].data = &ve_ip_conntrack_count;
++ /* skip ve_ip_ct_sysctl_table[2].data as it is read-only and common
++ * for all environments */
++ ve_ip_ct_tcp_timeouts[1] = ip_ct_tcp_timeout_syn_sent;
++ ve_ip_ct_sysctl_table[3].data = &ve_ip_ct_tcp_timeouts[1];
++ ve_ip_ct_tcp_timeouts[2] = ip_ct_tcp_timeout_syn_recv;
++ ve_ip_ct_sysctl_table[4].data = &ve_ip_ct_tcp_timeouts[2];
++ ve_ip_ct_tcp_timeouts[3] = ip_ct_tcp_timeout_established;
++ ve_ip_ct_sysctl_table[5].data = &ve_ip_ct_tcp_timeouts[3];
++ ve_ip_ct_tcp_timeouts[4] = ip_ct_tcp_timeout_fin_wait;
++ ve_ip_ct_sysctl_table[6].data = &ve_ip_ct_tcp_timeouts[4];
++ ve_ip_ct_tcp_timeouts[5] = ip_ct_tcp_timeout_close_wait;
++ ve_ip_ct_sysctl_table[7].data = &ve_ip_ct_tcp_timeouts[5];
++ ve_ip_ct_tcp_timeouts[6] = ip_ct_tcp_timeout_last_ack;
++ ve_ip_ct_sysctl_table[8].data = &ve_ip_ct_tcp_timeouts[6];
++ ve_ip_ct_tcp_timeouts[7] = ip_ct_tcp_timeout_time_wait;
++ ve_ip_ct_sysctl_table[9].data = &ve_ip_ct_tcp_timeouts[7];
++ ve_ip_ct_tcp_timeouts[8] = ip_ct_tcp_timeout_close;
++ ve_ip_ct_sysctl_table[10].data = &ve_ip_ct_tcp_timeouts[8];
++ ve_ip_ct_udp_timeout = ip_ct_udp_timeout;
++ ve_ip_ct_sysctl_table[11].data = &ve_ip_ct_udp_timeout;
++ ve_ip_ct_udp_timeout_stream = ip_ct_udp_timeout_stream;
++ ve_ip_ct_sysctl_table[12].data = &ve_ip_ct_udp_timeout_stream;
++ ve_ip_ct_icmp_timeout = ip_ct_icmp_timeout;
++ ve_ip_ct_sysctl_table[13].data = &ve_ip_ct_icmp_timeout;
++ ve_ip_ct_generic_timeout = ip_ct_generic_timeout;
++ ve_ip_ct_sysctl_table[14].data = &ve_ip_ct_generic_timeout;
++ ve_ip_ct_log_invalid = ip_ct_log_invalid;
++ ve_ip_ct_sysctl_table[15].data = &ve_ip_ct_log_invalid;
++ ve_ip_ct_tcp_timeout_max_retrans = ip_ct_tcp_timeout_max_retrans;
++ ve_ip_ct_sysctl_table[16].data = &ve_ip_ct_tcp_timeout_max_retrans;
++ ve_ip_ct_tcp_loose = ip_ct_tcp_loose;
++ ve_ip_ct_sysctl_table[17].data = &ve_ip_ct_tcp_loose;
++ ve_ip_ct_tcp_be_liberal = ip_ct_tcp_be_liberal;
++ ve_ip_ct_sysctl_table[18].data = &ve_ip_ct_tcp_be_liberal;
++ ve_ip_ct_tcp_max_retrans = ip_ct_tcp_max_retrans;
++ ve_ip_ct_sysctl_table[19].data = &ve_ip_ct_tcp_max_retrans;
++ for (i = 0; i < 20; i++)
++ ve_ip_ct_sysctl_table[i].owner_env = get_exec_env();
++ ve_ip_ct_netfilter_table[1].owner_env = get_exec_env();
++ return 0;
++
++nomem_3:
++ kfree(ve_ip_ct_netfilter_table);
++ ve_ip_ct_netfilter_table = NULL;
++nomem_2:
++ kfree(ve_ip_ct_ipv4_table);
++ ve_ip_ct_ipv4_table = NULL;
++nomem_1:
++ kfree(ve_ip_ct_net_table);
++ ve_ip_ct_net_table = NULL;
++out:
++ return ret;
++}
++#endif /*CONFIG_VE*/
+ #endif /* CONFIG_SYSCTL */
+
+ static int init_or_cleanup(int init)
+@@ -792,9 +941,16 @@ static int init_or_cleanup(int init)
+
+ if (!init) goto cleanup;
+
++ ret = -ENOENT;
++ if (!ve_is_super(get_exec_env()))
++ __module_get(THIS_MODULE);
++
+ ret = ip_conntrack_init();
+ if (ret < 0)
+- goto cleanup_nothing;
++ goto cleanup_unget;
++
++ if (ve_is_super(get_exec_env()) && ip_conntrack_disable_ve0)
++ return 0;
+
+ #ifdef CONFIG_PROC_FS
+ ret = -ENOMEM;
+@@ -804,98 +960,115 @@ static int init_or_cleanup(int init)
+ proc_exp = proc_net_fops_create("ip_conntrack_expect", 0440,
+ &exp_file_ops);
+ if (!proc_exp) goto cleanup_proc;
++ proc_exp->proc_fops = &exp_file_ops;
+
+- proc_stat = create_proc_entry("ip_conntrack", S_IRUGO, proc_net_stat);
+- if (!proc_stat)
+- goto cleanup_proc_exp;
++ if (ve_is_super(get_exec_env())) {
++ proc_stat = create_proc_entry("ip_conntrack", S_IRUGO, proc_net_stat);
++ if (!proc_stat)
++ goto cleanup_proc_exp;
+
+- proc_stat->proc_fops = &ct_cpu_seq_fops;
+- proc_stat->owner = THIS_MODULE;
++ proc_stat->proc_fops = &ct_cpu_seq_fops;
++ proc_stat->owner = THIS_MODULE;
++ }
+ #endif
+
+- ret = nf_register_hook(&ip_conntrack_defrag_ops);
++ ret = virt_nf_register_hook(&ip_conntrack_defrag_ops);
+ if (ret < 0) {
+ printk("ip_conntrack: can't register pre-routing defrag hook.\n");
+ goto cleanup_proc_stat;
+ }
+- ret = nf_register_hook(&ip_conntrack_defrag_local_out_ops);
++ ret = virt_nf_register_hook(&ip_conntrack_defrag_local_out_ops);
+ if (ret < 0) {
+ printk("ip_conntrack: can't register local_out defrag hook.\n");
+ goto cleanup_defragops;
+ }
+- ret = nf_register_hook(&ip_conntrack_in_ops);
++ ret = virt_nf_register_hook(&ip_conntrack_in_ops);
+ if (ret < 0) {
+ printk("ip_conntrack: can't register pre-routing hook.\n");
+ goto cleanup_defraglocalops;
+ }
+- ret = nf_register_hook(&ip_conntrack_local_out_ops);
++ ret = virt_nf_register_hook(&ip_conntrack_local_out_ops);
+ if (ret < 0) {
+ printk("ip_conntrack: can't register local out hook.\n");
+ goto cleanup_inops;
+ }
+- ret = nf_register_hook(&ip_conntrack_helper_in_ops);
++ ret = virt_nf_register_hook(&ip_conntrack_helper_in_ops);
+ if (ret < 0) {
+ printk("ip_conntrack: can't register local in helper hook.\n");
+ goto cleanup_inandlocalops;
+ }
+- ret = nf_register_hook(&ip_conntrack_helper_out_ops);
++ ret = virt_nf_register_hook(&ip_conntrack_helper_out_ops);
+ if (ret < 0) {
+ printk("ip_conntrack: can't register postrouting helper hook.\n");
+ goto cleanup_helperinops;
+ }
+- ret = nf_register_hook(&ip_conntrack_out_ops);
++ ret = virt_nf_register_hook(&ip_conntrack_out_ops);
+ if (ret < 0) {
+ printk("ip_conntrack: can't register post-routing hook.\n");
+ goto cleanup_helperoutops;
+ }
+- ret = nf_register_hook(&ip_conntrack_local_in_ops);
++ ret = virt_nf_register_hook(&ip_conntrack_local_in_ops);
+ if (ret < 0) {
+ printk("ip_conntrack: can't register local in hook.\n");
+ goto cleanup_inoutandlocalops;
+ }
+ #ifdef CONFIG_SYSCTL
+- ip_ct_sysctl_header = register_sysctl_table(ip_ct_net_table, 0);
+- if (ip_ct_sysctl_header == NULL) {
++#ifdef CONFIG_VE_IPTABLES
++ ret = ip_conntrack_sysctl_init();
++ if (ret < 0)
++ goto cleanup_sysctl;
++#endif
++ ret = -ENOMEM;
++ ve_ip_ct_sysctl_header = register_sysctl_table(ve_ip_ct_net_table, 0);
++ if (ve_ip_ct_sysctl_header == NULL) {
+ printk("ip_conntrack: can't register to sysctl.\n");
+- ret = -ENOMEM;
+- goto cleanup_localinops;
++ goto cleanup_sysctl2;
+ }
+ #endif
+
+- return ret;
++ return 0;
+
+ cleanup:
++ if (ve_is_super(get_exec_env()) && ip_conntrack_disable_ve0)
++ goto cleanup_init;
+ synchronize_net();
+ #ifdef CONFIG_SYSCTL
+- unregister_sysctl_table(ip_ct_sysctl_header);
+- cleanup_localinops:
++ unregister_sysctl_table(ve_ip_ct_sysctl_header);
++ cleanup_sysctl2:
++#ifdef CONFIG_VE_IPTABLES
++ ip_conntrack_sysctl_cleanup();
++ cleanup_sysctl:
++#endif
+ #endif
+- nf_unregister_hook(&ip_conntrack_local_in_ops);
++ virt_nf_unregister_hook(&ip_conntrack_local_in_ops);
+ cleanup_inoutandlocalops:
+- nf_unregister_hook(&ip_conntrack_out_ops);
++ virt_nf_unregister_hook(&ip_conntrack_out_ops);
+ cleanup_helperoutops:
+- nf_unregister_hook(&ip_conntrack_helper_out_ops);
++ virt_nf_unregister_hook(&ip_conntrack_helper_out_ops);
+ cleanup_helperinops:
+- nf_unregister_hook(&ip_conntrack_helper_in_ops);
++ virt_nf_unregister_hook(&ip_conntrack_helper_in_ops);
+ cleanup_inandlocalops:
+- nf_unregister_hook(&ip_conntrack_local_out_ops);
++ virt_nf_unregister_hook(&ip_conntrack_local_out_ops);
+ cleanup_inops:
+- nf_unregister_hook(&ip_conntrack_in_ops);
++ virt_nf_unregister_hook(&ip_conntrack_in_ops);
+ cleanup_defraglocalops:
+- nf_unregister_hook(&ip_conntrack_defrag_local_out_ops);
++ virt_nf_unregister_hook(&ip_conntrack_defrag_local_out_ops);
+ cleanup_defragops:
+- nf_unregister_hook(&ip_conntrack_defrag_ops);
++ virt_nf_unregister_hook(&ip_conntrack_defrag_ops);
+ cleanup_proc_stat:
+ #ifdef CONFIG_PROC_FS
+- remove_proc_entry("ip_conntrack", proc_net_stat);
++ if (ve_is_super(get_exec_env()))
++ remove_proc_entry("ip_conntrack", proc_net_stat);
+ cleanup_proc_exp:
+ proc_net_remove("ip_conntrack_expect");
+ cleanup_proc:
+ proc_net_remove("ip_conntrack");
+- cleanup_init:
+ #endif /* CONFIG_PROC_FS */
++ cleanup_init:
+ ip_conntrack_cleanup();
+- cleanup_nothing:
++ cleanup_unget:
++ if (!ve_is_super(get_exec_env()))
++ module_put(THIS_MODULE);
+ return ret;
+ }
+
+@@ -906,11 +1079,11 @@ int ip_conntrack_protocol_register(struc
+ int ret = 0;
+
+ write_lock_bh(&ip_conntrack_lock);
+- if (ip_ct_protos[proto->proto] != &ip_conntrack_generic_protocol) {
++ if (ve_ip_ct_protos[proto->proto] != &ip_conntrack_generic_protocol) {
+ ret = -EBUSY;
+ goto out;
+ }
+- ip_ct_protos[proto->proto] = proto;
++ ve_ip_ct_protos[proto->proto] = proto;
+ out:
+ write_unlock_bh(&ip_conntrack_lock);
+ return ret;
+@@ -919,7 +1092,7 @@ int ip_conntrack_protocol_register(struc
+ void ip_conntrack_protocol_unregister(struct ip_conntrack_protocol *proto)
+ {
+ write_lock_bh(&ip_conntrack_lock);
+- ip_ct_protos[proto->proto] = &ip_conntrack_generic_protocol;
++ ve_ip_ct_protos[proto->proto] = &ip_conntrack_generic_protocol;
+ write_unlock_bh(&ip_conntrack_lock);
+
+ /* Somebody could be still looking at the proto in bh. */
+@@ -929,17 +1102,39 @@ void ip_conntrack_protocol_unregister(st
+ ip_ct_iterate_cleanup(kill_proto, &proto->proto);
+ }
+
+-static int __init init(void)
++int init_iptable_conntrack(void)
+ {
+ return init_or_cleanup(1);
+ }
+
+-static void __exit fini(void)
++void fini_iptable_conntrack(void)
+ {
+ init_or_cleanup(0);
+ }
+
+-module_init(init);
++static int __init init(void)
++{
++ int err;
++
++ err = init_iptable_conntrack();
++ if (err < 0)
++ return err;
++
++ KSYMRESOLVE(init_iptable_conntrack);
++ KSYMRESOLVE(fini_iptable_conntrack);
++ KSYMMODRESOLVE(ip_conntrack);
++ return 0;
++}
++
++static void __exit fini(void)
++{
++ KSYMMODUNRESOLVE(ip_conntrack);
++ KSYMUNRESOLVE(init_iptable_conntrack);
++ KSYMUNRESOLVE(fini_iptable_conntrack);
++ fini_iptable_conntrack();
++}
++
++subsys_initcall(init);
+ module_exit(fini);
+
+ /* Some modules need us, but don't depend directly on any symbol.
+@@ -956,15 +1151,20 @@ EXPORT_SYMBOL_GPL(ip_conntrack_unregiste
+ EXPORT_SYMBOL_GPL(__ip_ct_event_cache_init);
+ EXPORT_PER_CPU_SYMBOL_GPL(ip_conntrack_ecache);
+ #endif
++EXPORT_SYMBOL(ip_conntrack_disable_ve0);
+ EXPORT_SYMBOL(ip_conntrack_protocol_register);
+ EXPORT_SYMBOL(ip_conntrack_protocol_unregister);
+ EXPORT_SYMBOL(ip_ct_get_tuple);
+ EXPORT_SYMBOL(invert_tuplepr);
+ EXPORT_SYMBOL(ip_conntrack_alter_reply);
++#ifndef CONFIG_VE_IPTABLES
+ EXPORT_SYMBOL(ip_conntrack_destroyed);
++#endif
+ EXPORT_SYMBOL(need_conntrack);
+ EXPORT_SYMBOL(ip_conntrack_helper_register);
+ EXPORT_SYMBOL(ip_conntrack_helper_unregister);
++EXPORT_SYMBOL(virt_ip_conntrack_helper_register);
++EXPORT_SYMBOL(virt_ip_conntrack_helper_unregister);
+ EXPORT_SYMBOL(ip_ct_iterate_cleanup);
+ EXPORT_SYMBOL(__ip_ct_refresh_acct);
+
+@@ -974,14 +1174,18 @@ EXPORT_SYMBOL_GPL(__ip_conntrack_expect_
+ EXPORT_SYMBOL_GPL(ip_conntrack_expect_find);
+ EXPORT_SYMBOL(ip_conntrack_expect_related);
+ EXPORT_SYMBOL(ip_conntrack_unexpect_related);
++#ifndef CONFIG_VE_IPTABLES
+ EXPORT_SYMBOL_GPL(ip_conntrack_expect_list);
++#endif
+ EXPORT_SYMBOL_GPL(ip_ct_unlink_expect);
+
+ EXPORT_SYMBOL(ip_conntrack_tuple_taken);
+ EXPORT_SYMBOL(ip_ct_gather_frags);
+ EXPORT_SYMBOL(ip_conntrack_htable_size);
+ EXPORT_SYMBOL(ip_conntrack_lock);
++#ifndef CONFIG_VE_IPTABLES
+ EXPORT_SYMBOL(ip_conntrack_hash);
++#endif
+ EXPORT_SYMBOL(ip_conntrack_untracked);
+ EXPORT_SYMBOL_GPL(ip_conntrack_find_get);
+ #ifdef CONFIG_IP_NF_NAT_NEEDED
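[Editorial note] The ip_conntrack_standalone.c part carries the most delicate piece of the virtualization: ip_conntrack_sysctl_init() clones the net/ipv4/netfilter ctl_table chain for every non-root environment (the ALLOC_ENVCTL and memcpy calls above), rewires the child pointers, and then repoints each entry's .data at the per-VE copy of the corresponding tunable, so a sysctl write inside a container affects only that container. The hard-coded table indices tie this code to the exact upstream table layout. Below is a compilable miniature of the clone-and-repoint idea; the two-entry table and all names are invented for illustration.

/* Illustrative clone of a small sysctl-like table: the table layout is
 * shared, but each environment's copy points .data at its own storage. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct ctl_entry {
	const char *name;
	int *data;		/* where reads and writes land */
};

struct env_tunables {
	int conntrack_max;
	int tcp_timeout;
};

/* Template table, wired to the root environment's storage. */
static struct env_tunables root = { 65536, 432000 };
static struct ctl_entry template_table[2] = {
	{ "ip_conntrack_max",              &root.conntrack_max },
	{ "ip_ct_tcp_timeout_established", &root.tcp_timeout },
};

static struct ctl_entry *clone_table(struct env_tunables *priv)
{
	struct ctl_entry *t = malloc(sizeof(template_table));

	if (!t)
		return NULL;
	memcpy(t, template_table, sizeof(template_table));
	t[0].data = &priv->conntrack_max;	/* repoint at per-VE storage */
	t[1].data = &priv->tcp_timeout;
	return t;
}

int main(void)
{
	struct env_tunables guest = root;	/* start from root's defaults */
	struct ctl_entry *guest_table = clone_table(&guest);

	if (!guest_table)
		return 1;
	*guest_table[0].data = 1024;		/* a "sysctl -w" inside the VE */
	printf("root  %s = %d\n", template_table[0].name, root.conntrack_max);
	printf("guest %s = %d\n", guest_table[0].name, guest.conntrack_max);
	free(guest_table);
	return 0;
}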
+diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_nat_core.c linux-2.6.16-026test015/net/ipv4/netfilter/ip_nat_core.c
+--- linux-2.6.16.orig/net/ipv4/netfilter/ip_nat_core.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/netfilter/ip_nat_core.c 2006-07-04 14:41:39.000000000 +0400
+@@ -21,6 +21,8 @@
+ #include <linux/icmp.h>
+ #include <linux/udp.h>
+ #include <linux/jhash.h>
++#include <linux/nfcalls.h>
++#include <ub/ub_mem.h>
+
+ #define ASSERT_READ_LOCK(x)
+ #define ASSERT_WRITE_LOCK(x)
+@@ -46,15 +48,24 @@ DEFINE_RWLOCK(ip_nat_lock);
+ /* Calculated at init based on memory size */
+ static unsigned int ip_nat_htable_size;
+
+-static struct list_head *bysource;
+-
+ #define MAX_IP_NAT_PROTO 256
++
++#ifdef CONFIG_VE_IPTABLES
++#define ve_ip_nat_bysource \
++ (get_exec_env()->_ip_conntrack->_ip_nat_bysource)
++#define ve_ip_nat_protos \
++ (get_exec_env()->_ip_conntrack->_ip_nat_protos)
++#else
++static struct list_head *bysource;
++#define ve_ip_nat_bysource bysource
+ static struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO];
++#define ve_ip_nat_protos ip_nat_protos
++#endif
+
+ static inline struct ip_nat_protocol *
+ __ip_nat_proto_find(u_int8_t protonum)
+ {
+- return ip_nat_protos[protonum];
++ return ve_ip_nat_protos[protonum];
+ }
+
+ struct ip_nat_protocol *
+@@ -177,7 +188,7 @@ find_appropriate_src(const struct ip_con
+ struct ip_conntrack *ct;
+
+ read_lock_bh(&ip_nat_lock);
+- list_for_each_entry(ct, &bysource[h], nat.info.bysource) {
++ list_for_each_entry(ct, &ve_ip_nat_bysource[h], nat.info.bysource) {
+ if (same_src(ct, tuple)) {
+ /* Copy source part from reply tuple. */
+ invert_tuplepr(result,
+@@ -291,13 +302,22 @@ get_unique_tuple(struct ip_conntrack_tup
+ ip_nat_proto_put(proto);
+ }
+
++void ip_nat_hash_conntrack(struct ip_conntrack *conntrack)
++{
++ unsigned int srchash
++ = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
++ write_lock_bh(&ip_nat_lock);
++ list_add(&conntrack->nat.info.bysource, &ve_ip_nat_bysource[srchash]);
++ write_unlock_bh(&ip_nat_lock);
++}
++EXPORT_SYMBOL_GPL(ip_nat_hash_conntrack);
++
+ unsigned int
+ ip_nat_setup_info(struct ip_conntrack *conntrack,
+ const struct ip_nat_range *range,
+ unsigned int hooknum)
+ {
+ struct ip_conntrack_tuple curr_tuple, new_tuple;
+- struct ip_nat_info *info = &conntrack->nat.info;
+ int have_to_hash = !(conntrack->status & IPS_NAT_DONE_MASK);
+ enum ip_nat_manip_type maniptype = HOOK2MANIP(hooknum);
+
+@@ -332,14 +352,8 @@ ip_nat_setup_info(struct ip_conntrack *c
+ }
+
+ /* Place in source hash if this is the first time. */
+- if (have_to_hash) {
+- unsigned int srchash
+- = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
+- .tuple);
+- write_lock_bh(&ip_nat_lock);
+- list_add(&info->bysource, &bysource[srchash]);
+- write_unlock_bh(&ip_nat_lock);
+- }
++ if (have_to_hash)
++ ip_nat_hash_conntrack(conntrack);
+
+ /* It's done. */
+ if (maniptype == IP_NAT_MANIP_DST)
+@@ -521,11 +535,11 @@ int ip_nat_protocol_register(struct ip_n
+ int ret = 0;
+
+ write_lock_bh(&ip_nat_lock);
+- if (ip_nat_protos[proto->protonum] != &ip_nat_unknown_protocol) {
++ if (ve_ip_nat_protos[proto->protonum] != &ip_nat_unknown_protocol) {
+ ret = -EBUSY;
+ goto out;
+ }
+- ip_nat_protos[proto->protonum] = proto;
++ ve_ip_nat_protos[proto->protonum] = proto;
+ out:
+ write_unlock_bh(&ip_nat_lock);
+ return ret;
+@@ -536,7 +550,7 @@ EXPORT_SYMBOL(ip_nat_protocol_register);
+ void ip_nat_protocol_unregister(struct ip_nat_protocol *proto)
+ {
+ write_lock_bh(&ip_nat_lock);
+- ip_nat_protos[proto->protonum] = &ip_nat_unknown_protocol;
++ ve_ip_nat_protos[proto->protonum] = &ip_nat_unknown_protocol;
+ write_unlock_bh(&ip_nat_lock);
+
+ /* Someone could be still looking at the proto in a bh. */
+@@ -589,38 +603,55 @@ EXPORT_SYMBOL_GPL(ip_nat_port_nfattr_to_
+ EXPORT_SYMBOL_GPL(ip_nat_port_range_to_nfattr);
+ #endif
+
+-static int __init ip_nat_init(void)
++static int ip_nat_init(void)
+ {
+ size_t i;
++ int ret;
+
+- /* Leave them the same for the moment. */
+- ip_nat_htable_size = ip_conntrack_htable_size;
++ if (ve_is_super(get_exec_env()))
++ ip_nat_htable_size = ip_conntrack_htable_size;
+
+ /* One vmalloc for both hash tables */
+- bysource = vmalloc(sizeof(struct list_head) * ip_nat_htable_size);
+- if (!bysource)
+- return -ENOMEM;
++ ret = -ENOMEM;
++ ve_ip_nat_bysource =
++ ub_vmalloc(sizeof(struct list_head)*ip_nat_htable_size*2);
++ if (!ve_ip_nat_bysource)
++ goto nomem;
++
++#ifdef CONFIG_VE_IPTABLES
++ ve_ip_nat_protos =
++ ub_kmalloc(sizeof(void *)*MAX_IP_NAT_PROTO, GFP_KERNEL);
++ if (!ve_ip_nat_protos)
++ goto nomem2;
++#endif
+
+ /* Sew in builtin protocols. */
+ write_lock_bh(&ip_nat_lock);
+ for (i = 0; i < MAX_IP_NAT_PROTO; i++)
+- ip_nat_protos[i] = &ip_nat_unknown_protocol;
+- ip_nat_protos[IPPROTO_TCP] = &ip_nat_protocol_tcp;
+- ip_nat_protos[IPPROTO_UDP] = &ip_nat_protocol_udp;
+- ip_nat_protos[IPPROTO_ICMP] = &ip_nat_protocol_icmp;
++ ve_ip_nat_protos[i] = &ip_nat_unknown_protocol;
++ ve_ip_nat_protos[IPPROTO_TCP] = &ip_nat_protocol_tcp;
++ ve_ip_nat_protos[IPPROTO_UDP] = &ip_nat_protocol_udp;
++ ve_ip_nat_protos[IPPROTO_ICMP] = &ip_nat_protocol_icmp;
+ write_unlock_bh(&ip_nat_lock);
+
+ for (i = 0; i < ip_nat_htable_size; i++) {
+- INIT_LIST_HEAD(&bysource[i]);
++ INIT_LIST_HEAD(&ve_ip_nat_bysource[i]);
+ }
+
+ /* FIXME: Man, this is a hack. <SIGH> */
+ IP_NF_ASSERT(ip_conntrack_destroyed == NULL);
+- ip_conntrack_destroyed = &ip_nat_cleanup_conntrack;
++ ve_ip_conntrack_destroyed = &ip_nat_cleanup_conntrack;
+
+- /* Initialize fake conntrack so that NAT will skip it */
+- ip_conntrack_untracked.status |= IPS_NAT_DONE_MASK;
++ if (ve_is_super(get_exec_env()))
++ /* Initialize fake conntrack so that NAT will skip it */
++ ip_conntrack_untracked.status |= IPS_NAT_DONE_MASK;
+ return 0;
++#ifdef CONFIG_VE_IPTABLES
++nomem2:
++#endif
++ vfree(ve_ip_nat_bysource);
++nomem:
++ return ret;
+ }
+
+ /* Clear NAT section of all conntracks, in case we're loaded again. */
+@@ -631,14 +662,41 @@ static int clean_nat(struct ip_conntrack
+ return 0;
+ }
+
+-static void __exit ip_nat_cleanup(void)
++static void ip_nat_cleanup(void)
+ {
+ ip_ct_iterate_cleanup(&clean_nat, NULL);
+- ip_conntrack_destroyed = NULL;
+- vfree(bysource);
++ ve_ip_conntrack_destroyed = NULL;
++ vfree(ve_ip_nat_bysource);
++ ve_ip_nat_bysource = NULL;
++#ifdef CONFIG_VE_IPTABLES
++ kfree(ve_ip_nat_protos);
++ ve_ip_nat_protos = NULL;
++#endif
++}
++
++static int __init init(void)
++{
++ int err;
++
++ err = ip_nat_init();
++ if (err < 0)
++ return err;
++
++ KSYMRESOLVE(ip_nat_init);
++ KSYMRESOLVE(ip_nat_cleanup);
++ KSYMMODRESOLVE(ip_nat);
++ return 0;
++}
++
++static void __exit fini(void)
++{
++ KSYMMODUNRESOLVE(ip_nat);
++ KSYMUNRESOLVE(ip_nat_cleanup);
++ KSYMUNRESOLVE(ip_nat_init);
++ ip_nat_cleanup();
+ }
+
+ MODULE_LICENSE("GPL");
+
+-module_init(ip_nat_init);
+-module_exit(ip_nat_cleanup);
++fs_initcall(init);
++module_exit(fini);
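[Editorial note] ip_nat_core.c applies the same recipe to NAT state: the bysource hash and the protocol array become ve_ip_nat_bysource / ve_ip_nat_protos, their allocation goes through ub_vmalloc / ub_kmalloc so it is charged to the owning beancounter, and initialization fills the per-VE protocol table with the unknown-protocol fallback before sewing in TCP, UDP and ICMP, just as ip_conntrack_init() now does for ve_ip_ct_protos. The sketch below shows that fill-with-a-fallback, then-override dispatch pattern in plain C; the handler types and protocol numbers are placeholders.

/* Illustrative per-environment protocol dispatch table: every slot is
 * preloaded with a fallback handler, then the known protocols override
 * their own slots, so lookups never need a NULL check. */
#include <stdio.h>

#define MAX_PROTO 256

typedef const char *(*proto_handler)(void);

static const char *handle_unknown(void) { return "unknown"; }
static const char *handle_tcp(void)     { return "tcp"; }
static const char *handle_udp(void)     { return "udp"; }

static proto_handler protos[MAX_PROTO];	/* one such table per environment */

static void protos_init(void)
{
	int i;

	for (i = 0; i < MAX_PROTO; i++)
		protos[i] = handle_unknown;	/* fallback everywhere */
	protos[6]  = handle_tcp;		/* IPPROTO_TCP */
	protos[17] = handle_udp;		/* IPPROTO_UDP */
}

int main(void)
{
	protos_init();
	printf("proto 6:  %s\n", protos[6]());
	printf("proto 47: %s\n", protos[47]());
	return 0;
}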
+diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_nat_ftp.c linux-2.6.16-026test015/net/ipv4/netfilter/ip_nat_ftp.c
+--- linux-2.6.16.orig/net/ipv4/netfilter/ip_nat_ftp.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/netfilter/ip_nat_ftp.c 2006-07-04 14:41:39.000000000 +0400
+@@ -19,6 +19,7 @@
+ #include <linux/netfilter_ipv4/ip_nat_rule.h>
+ #include <linux/netfilter_ipv4/ip_conntrack_ftp.h>
+ #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
++#include <linux/nfcalls.h>
+
+ MODULE_LICENSE("GPL");
+ MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
+@@ -154,18 +155,43 @@ static unsigned int ip_nat_ftp(struct sk
+ return NF_ACCEPT;
+ }
+
+-static void __exit fini(void)
++#ifdef CONFIG_VE_IPTABLES
++#undef ve_ip_nat_ftp_hook
++#define ve_ip_nat_ftp_hook \
++ (get_exec_env()->_ip_conntrack->_ip_nat_ftp_hook)
++#endif
++int init_iptable_nat_ftp(void)
+ {
+- ip_nat_ftp_hook = NULL;
++ BUG_ON(ve_ip_nat_ftp_hook);
++#ifdef CONFIG_VE_IPTABLES
++ ve_ip_nat_ftp_hook = (ip_nat_helper_func)ip_nat_ftp;
++#else
++ ve_ip_nat_ftp_hook = ip_nat_ftp;
++#endif
++ return 0;
++}
++
++void fini_iptable_nat_ftp(void)
++{
++ ve_ip_nat_ftp_hook = NULL;
+ /* Make sure noone calls it, meanwhile. */
+ synchronize_net();
+ }
+
++static void __exit fini(void)
++{
++ KSYMMODUNRESOLVE(ip_nat_ftp);
++ KSYMUNRESOLVE(init_iptable_nat_ftp);
++ KSYMUNRESOLVE(fini_iptable_nat_ftp);
++ fini_iptable_nat_ftp();
++}
++
+ static int __init init(void)
+ {
+- BUG_ON(ip_nat_ftp_hook);
+- ip_nat_ftp_hook = ip_nat_ftp;
+- return 0;
++ KSYMRESOLVE(init_iptable_nat_ftp);
++ KSYMRESOLVE(fini_iptable_nat_ftp);
++ KSYMMODRESOLVE(ip_nat_ftp);
++ return init_iptable_nat_ftp();
+ }
+
+ /* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */
+diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_nat_irc.c linux-2.6.16-026test015/net/ipv4/netfilter/ip_nat_irc.c
+--- linux-2.6.16.orig/net/ipv4/netfilter/ip_nat_irc.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/netfilter/ip_nat_irc.c 2006-07-04 14:41:39.000000000 +0400
+@@ -23,6 +23,7 @@
+ #include <linux/netfilter_ipv4/ip_conntrack_irc.h>
+ #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+ #include <linux/moduleparam.h>
++#include <linux/nfcalls.h>
+
+ #if 0
+ #define DEBUGP printk
+@@ -96,18 +97,44 @@ static unsigned int help(struct sk_buff
+ return ret;
+ }
+
+-static void __exit fini(void)
++#ifdef CONFIG_VE_IPTABLES
++#undef ve_ip_nat_irc_hook
++#define ve_ip_nat_irc_hook \
++ (get_exec_env()->_ip_conntrack->_ip_nat_irc_hook)
++#endif
++
++int init_iptable_nat_irc(void)
++{
++ BUG_ON(ve_ip_nat_irc_hook);
++#ifdef CONFIG_VE_IPTABLES
++ ve_ip_nat_irc_hook = (ip_nat_helper_func)help;
++#else
++ ve_ip_nat_irc_hook = help;
++#endif
++ return 0;
++}
++
++void fini_iptable_nat_irc(void)
+ {
+- ip_nat_irc_hook = NULL;
++ ve_ip_nat_irc_hook = NULL;
+ /* Make sure noone calls it, meanwhile. */
+ synchronize_net();
+ }
+
++static void __exit fini(void)
++{
++ KSYMMODUNRESOLVE(ip_nat_irc);
++ KSYMUNRESOLVE(init_iptable_nat_irc);
++ KSYMUNRESOLVE(fini_iptable_nat_irc);
++ fini_iptable_nat_irc();
++}
++
+ static int __init init(void)
+ {
+- BUG_ON(ip_nat_irc_hook);
+- ip_nat_irc_hook = help;
+- return 0;
++ KSYMRESOLVE(init_iptable_nat_irc);
++ KSYMRESOLVE(fini_iptable_nat_irc);
++ KSYMMODRESOLVE(ip_nat_irc);
++ return init_iptable_nat_irc();
+ }
+
+ /* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */
+diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_nat_rule.c linux-2.6.16-026test015/net/ipv4/netfilter/ip_nat_rule.c
+--- linux-2.6.16.orig/net/ipv4/netfilter/ip_nat_rule.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/netfilter/ip_nat_rule.c 2006-07-04 14:41:39.000000000 +0400
+@@ -34,6 +34,13 @@
+ #define DEBUGP(format, args...)
+ #endif
+
++#ifdef CONFIG_VE_IPTABLES
++#define ve_ip_nat_table \
++ (get_exec_env()->_ip_conntrack->_ip_nat_table)
++#else
++#define ve_ip_nat_table &nat_table
++#endif
++
+ #define NAT_VALID_HOOKS ((1<<NF_IP_PRE_ROUTING) | (1<<NF_IP_POST_ROUTING) | (1<<NF_IP_LOCAL_OUT))
+
+ static struct
+@@ -41,7 +48,7 @@ static struct
+ struct ipt_replace repl;
+ struct ipt_standard entries[3];
+ struct ipt_error term;
+-} nat_initial_table __initdata
++} nat_initial_table
+ = { { "nat", NAT_VALID_HOOKS, 4,
+ sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error),
+ { [NF_IP_PRE_ROUTING] = 0,
+@@ -235,6 +242,93 @@ static int ipt_dnat_checkentry(const cha
+ return 1;
+ }
+
++#ifdef CONFIG_COMPAT
++static int compat_to_user(void *target, void **dstptr,
++ int *size, int off)
++{
++ struct ipt_entry_target *pt;
++ struct ip_nat_multi_range_compat *pinfo;
++ struct compat_ip_nat_multi_range info;
++ u_int16_t tsize;
++
++ pt = (struct ipt_entry_target *)target;
++ tsize = pt->u.user.target_size;
++ if (__copy_to_user(*dstptr, pt, sizeof(struct ipt_entry_target)))
++ return -EFAULT;
++ pinfo = (struct ip_nat_multi_range_compat *)pt->data;
++ memset(&info, 0, sizeof(struct compat_ip_nat_multi_range));
++ info.rangesize = pinfo->rangesize;
++ info.range[0].flags = pinfo->range[0].flags;
++ info.range[0].min_ip = pinfo->range[0].min_ip;
++ info.range[0].max_ip = pinfo->range[0].max_ip;
++ info.range[0].min = pinfo->range[0].min;
++ info.range[0].max = pinfo->range[0].max;
++ if (__copy_to_user(*dstptr + sizeof(struct ipt_entry_target),
++ &info, sizeof(struct compat_ip_nat_multi_range)))
++ return -EFAULT;
++ tsize -= off;
++ if (put_user(tsize, (u_int16_t *)*dstptr))
++ return -EFAULT;
++ *size -= off;
++ *dstptr += tsize;
++ return 0;
++}
++
++static int compat_from_user(void *target, void **dstptr,
++ int *size, int off)
++{
++ struct compat_ipt_entry_target *pt;
++ struct ipt_entry_target *dstpt;
++ struct compat_ip_nat_multi_range *pinfo;
++ struct ip_nat_multi_range_compat info;
++ u_int16_t tsize;
++
++ pt = (struct compat_ipt_entry_target *)target;
++ dstpt = (struct ipt_entry_target *)*dstptr;
++ tsize = pt->u.user.target_size;
++ memcpy(*dstptr, pt, sizeof(struct compat_ipt_entry_target));
++ pinfo = (struct compat_ip_nat_multi_range *)pt->data;
++ memset(&info, 0, sizeof(struct ip_nat_multi_range_compat));
++ info.rangesize = pinfo->rangesize;
++ info.range[0].flags = pinfo->range[0].flags;
++ info.range[0].min_ip = pinfo->range[0].min_ip;
++ info.range[0].max_ip = pinfo->range[0].max_ip;
++ info.range[0].min = pinfo->range[0].min;
++ info.range[0].max = pinfo->range[0].max;
++ memcpy(*dstptr + sizeof(struct compat_ipt_entry_target),
++ &info, sizeof(struct ip_nat_multi_range_compat));
++ tsize += off;
++ dstpt->u.user.target_size = tsize;
++ *size += off;
++ *dstptr += tsize;
++ return 0;
++}
++
++static int compat(void *target, void **dstptr, int *size, int convert)
++{
++ int ret, off;
++
++ off = IPT_ALIGN(sizeof(struct ip_nat_multi_range_compat)) -
++ COMPAT_IPT_ALIGN(sizeof(struct compat_ip_nat_multi_range));
++ switch (convert) {
++ case COMPAT_TO_USER:
++ ret = compat_to_user(target, dstptr, size, off);
++ break;
++ case COMPAT_FROM_USER:
++ ret = compat_from_user(target, dstptr, size, off);
++ break;
++ case COMPAT_CALC_SIZE:
++ *size += off;
++ ret = 0;
++ break;
++ default:
++ ret = -ENOPROTOOPT;
++ break;
++ }
++ return ret;
++}
++#endif
++
+ inline unsigned int
+ alloc_null_binding(struct ip_conntrack *conntrack,
+ struct ip_nat_info *info,
+@@ -286,7 +380,7 @@ int ip_nat_rule_find(struct sk_buff **ps
+ {
+ int ret;
+
+- ret = ipt_do_table(pskb, hooknum, in, out, &nat_table, NULL);
++ ret = ipt_do_table(pskb, hooknum, in, out, ve_ip_nat_table, NULL);
+
+ if (ret == NF_ACCEPT) {
+ if (!ip_nat_initialized(ct, HOOK2MANIP(hooknum)))
+@@ -300,21 +394,33 @@ static struct ipt_target ipt_snat_reg =
+ .name = "SNAT",
+ .target = ipt_snat_target,
+ .checkentry = ipt_snat_checkentry,
++#ifdef CONFIG_COMPAT
++ .compat = &compat,
++#endif
+ };
+
+ static struct ipt_target ipt_dnat_reg = {
+ .name = "DNAT",
+ .target = ipt_dnat_target,
+ .checkentry = ipt_dnat_checkentry,
++#ifdef CONFIG_COMPAT
++ .compat = &compat,
++#endif
+ };
+
+-int __init ip_nat_rule_init(void)
++int ip_nat_rule_init(void)
+ {
+ int ret;
++ struct ipt_table *tmp_table;
++
++ tmp_table = ipt_register_table(&nat_table,
++ &nat_initial_table.repl);
++ if (IS_ERR(tmp_table))
++ return PTR_ERR(tmp_table);
++#ifdef CONFIG_VE_IPTABLES
++ ve_ip_nat_table = tmp_table;
++#endif
+
+- ret = ipt_register_table(&nat_table, &nat_initial_table.repl);
+- if (ret != 0)
+- return ret;
+ ret = ipt_register_target(&ipt_snat_reg);
+ if (ret != 0)
+ goto unregister_table;
+@@ -328,7 +434,10 @@ int __init ip_nat_rule_init(void)
+ unregister_snat:
+ ipt_unregister_target(&ipt_snat_reg);
+ unregister_table:
+- ipt_unregister_table(&nat_table);
++ ipt_unregister_table(ve_ip_nat_table);
++#ifdef CONFIG_VE_IPTABLES
++ ve_ip_nat_table = NULL;
++#endif
+
+ return ret;
+ }
+@@ -337,5 +446,8 @@ void ip_nat_rule_cleanup(void)
+ {
+ ipt_unregister_target(&ipt_dnat_reg);
+ ipt_unregister_target(&ipt_snat_reg);
+- ipt_unregister_table(&nat_table);
++ ipt_unregister_table(ve_ip_nat_table);
++#ifdef CONFIG_VE_IPTABLES
++ ve_ip_nat_table = NULL;
++#endif
+ }
+diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_nat_snmp_basic.c linux-2.6.16-026test015/net/ipv4/netfilter/ip_nat_snmp_basic.c
+--- linux-2.6.16.orig/net/ipv4/netfilter/ip_nat_snmp_basic.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/netfilter/ip_nat_snmp_basic.c 2006-07-04 14:41:36.000000000 +0400
+@@ -1000,12 +1000,12 @@ static unsigned char snmp_trap_decode(st
+
+ return 1;
+
++err_addr_free:
++ kfree((unsigned long *)trap->ip_address);
++
+ err_id_free:
+ kfree(trap->id);
+
+-err_addr_free:
+- kfree((unsigned long *)trap->ip_address);
+-
+ return 0;
+ }
+
+@@ -1123,11 +1123,10 @@ static int snmp_parse_mangle(unsigned ch
+ struct snmp_v1_trap trap;
+ unsigned char ret = snmp_trap_decode(&ctx, &trap, map, check);
+
+- /* Discard trap allocations regardless */
+- kfree(trap.id);
+- kfree((unsigned long *)trap.ip_address);
+-
+- if (!ret)
++ if (ret) {
++ kfree(trap.id);
++ kfree((unsigned long *)trap.ip_address);
++ } else
+ return ret;
+
+ } else {
+diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_nat_standalone.c linux-2.6.16-026test015/net/ipv4/netfilter/ip_nat_standalone.c
+--- linux-2.6.16.orig/net/ipv4/netfilter/ip_nat_standalone.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/netfilter/ip_nat_standalone.c 2006-07-04 14:41:39.000000000 +0400
+@@ -30,6 +30,7 @@
+ #include <net/ip.h>
+ #include <net/checksum.h>
+ #include <linux/spinlock.h>
++#include <linux/nfcalls.h>
+
+ #define ASSERT_READ_LOCK(x)
+ #define ASSERT_WRITE_LOCK(x)
+@@ -358,45 +359,45 @@ static int init_or_cleanup(int init)
+ {
+ int ret = 0;
+
+- need_conntrack();
+-
+ if (!init) goto cleanup;
+
+-#ifdef CONFIG_XFRM
+- BUG_ON(ip_nat_decode_session != NULL);
+- ip_nat_decode_session = nat_decode_session;
+-#endif
++ if (!ve_is_super(get_exec_env()))
++ __module_get(THIS_MODULE);
++
+ ret = ip_nat_rule_init();
+ if (ret < 0) {
+ printk("ip_nat_init: can't setup rules.\n");
+- goto cleanup_decode_session;
++ goto cleanup_modput;
+ }
+- ret = nf_register_hook(&ip_nat_in_ops);
++ if (ve_is_super(get_exec_env()) && ip_conntrack_disable_ve0)
++ return 0;
++
++ ret = virt_nf_register_hook(&ip_nat_in_ops);
+ if (ret < 0) {
+ printk("ip_nat_init: can't register in hook.\n");
+ goto cleanup_rule_init;
+ }
+- ret = nf_register_hook(&ip_nat_out_ops);
++ ret = virt_nf_register_hook(&ip_nat_out_ops);
+ if (ret < 0) {
+ printk("ip_nat_init: can't register out hook.\n");
+ goto cleanup_inops;
+ }
+- ret = nf_register_hook(&ip_nat_adjust_in_ops);
++ ret = virt_nf_register_hook(&ip_nat_adjust_in_ops);
+ if (ret < 0) {
+ printk("ip_nat_init: can't register adjust in hook.\n");
+ goto cleanup_outops;
+ }
+- ret = nf_register_hook(&ip_nat_adjust_out_ops);
++ ret = virt_nf_register_hook(&ip_nat_adjust_out_ops);
+ if (ret < 0) {
+ printk("ip_nat_init: can't register adjust out hook.\n");
+ goto cleanup_adjustin_ops;
+ }
+- ret = nf_register_hook(&ip_nat_local_out_ops);
++ ret = virt_nf_register_hook(&ip_nat_local_out_ops);
+ if (ret < 0) {
+ printk("ip_nat_init: can't register local out hook.\n");
+ goto cleanup_adjustout_ops;;
+ }
+- ret = nf_register_hook(&ip_nat_local_in_ops);
++ ret = virt_nf_register_hook(&ip_nat_local_in_ops);
+ if (ret < 0) {
+ printk("ip_nat_init: can't register local in hook.\n");
+ goto cleanup_localoutops;
+@@ -404,38 +405,76 @@ static int init_or_cleanup(int init)
+ return ret;
+
+ cleanup:
+- nf_unregister_hook(&ip_nat_local_in_ops);
++ if (ve_is_super(get_exec_env()) && ip_conntrack_disable_ve0)
++ goto cleanup_rule_init;
++ virt_nf_unregister_hook(&ip_nat_local_in_ops);
+ cleanup_localoutops:
+- nf_unregister_hook(&ip_nat_local_out_ops);
++ virt_nf_unregister_hook(&ip_nat_local_out_ops);
+ cleanup_adjustout_ops:
+- nf_unregister_hook(&ip_nat_adjust_out_ops);
++ virt_nf_unregister_hook(&ip_nat_adjust_out_ops);
+ cleanup_adjustin_ops:
+- nf_unregister_hook(&ip_nat_adjust_in_ops);
++ virt_nf_unregister_hook(&ip_nat_adjust_in_ops);
+ cleanup_outops:
+- nf_unregister_hook(&ip_nat_out_ops);
++ virt_nf_unregister_hook(&ip_nat_out_ops);
+ cleanup_inops:
+- nf_unregister_hook(&ip_nat_in_ops);
++ virt_nf_unregister_hook(&ip_nat_in_ops);
+ cleanup_rule_init:
+ ip_nat_rule_cleanup();
+- cleanup_decode_session:
+-#ifdef CONFIG_XFRM
+- ip_nat_decode_session = NULL;
+- synchronize_net();
+-#endif
++ cleanup_modput:
++ if (!ve_is_super(get_exec_env()))
++ module_put(THIS_MODULE);
+ return ret;
+ }
+
+-static int __init init(void)
++int init_iptable_nat(void)
+ {
+ return init_or_cleanup(1);
+ }
+
+-static void __exit fini(void)
++void fini_iptable_nat(void)
+ {
+ init_or_cleanup(0);
+ }
+
+-module_init(init);
++static int __init init(void)
++{
++ int err;
++
++ need_conntrack();
++
++#ifdef CONFIG_XFRM
++ BUG_ON(ip_nat_decode_session != NULL);
++ ip_nat_decode_session = nat_decode_session;
++#endif
++
++ err = init_iptable_nat();
++ if (err < 0) {
++#ifdef CONFIG_XFRM
++ ip_nat_decode_session = NULL;
++ synchronize_net();
++#endif
++ return err;
++ }
++
++ KSYMRESOLVE(init_iptable_nat);
++ KSYMRESOLVE(fini_iptable_nat);
++ KSYMMODRESOLVE(iptable_nat);
++ return 0;
++}
++
++static void __exit fini(void)
++{
++ KSYMMODUNRESOLVE(iptable_nat);
++ KSYMUNRESOLVE(init_iptable_nat);
++ KSYMUNRESOLVE(fini_iptable_nat);
++ fini_iptable_nat();
++#ifdef CONFIG_XFRM
++ ip_nat_decode_session = NULL;
++ synchronize_net();
++#endif
++}
++
++fs_initcall(init);
+ module_exit(fini);
+
+ MODULE_LICENSE("GPL");
+diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_queue.c linux-2.6.16-026test015/net/ipv4/netfilter/ip_queue.c
+--- linux-2.6.16.orig/net/ipv4/netfilter/ip_queue.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/netfilter/ip_queue.c 2006-07-04 14:41:39.000000000 +0400
+@@ -542,8 +542,17 @@ ipq_rcv_sk(struct sock *sk, int len)
+ down(&ipqnl_sem);
+
+ for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) {
++#ifdef CONFIG_VE
++ struct ve_struct *env;
++#endif
+ skb = skb_dequeue(&sk->sk_receive_queue);
++#ifdef CONFIG_VE
++ env = set_exec_env(VE_OWNER_SKB(skb));
+ ipq_rcv_skb(skb);
++ (void)set_exec_env(env);
++#else
++ ipq_rcv_skb(skb);
++#endif
+ kfree_skb(skb);
+ }
+
+diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_tables.c linux-2.6.16-026test015/net/ipv4/netfilter/ip_tables.c
+--- linux-2.6.16.orig/net/ipv4/netfilter/ip_tables.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/netfilter/ip_tables.c 2006-07-04 14:41:39.000000000 +0400
+@@ -24,14 +24,17 @@
+ #include <linux/module.h>
+ #include <linux/icmp.h>
+ #include <net/ip.h>
++#include <net/compat.h>
+ #include <asm/uaccess.h>
+ #include <asm/semaphore.h>
+ #include <linux/proc_fs.h>
+ #include <linux/err.h>
+ #include <linux/cpumask.h>
++#include <ub/ub_mem.h>
+
+ #include <linux/netfilter/x_tables.h>
+ #include <linux/netfilter_ipv4/ip_tables.h>
++#include <linux/nfcalls.h>
+
+ MODULE_LICENSE("GPL");
+ MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+@@ -70,6 +73,14 @@ do { \
+ #define inline
+ #endif
+
++#ifdef CONFIG_VE_IPTABLES
++/* include ve.h and define get_exec_env */
++#include <linux/sched.h>
++#define ve_ipt_standard_target (get_exec_env()->_ipt_standard_target)
++#else
++#define ve_ipt_standard_target &ipt_standard_target
++#endif
++
+ /*
+ We keep a set of rules for each CPU, so we can avoid write-locking
+ them in the softirq when updating the counters and therefore
+@@ -480,7 +491,7 @@ standard_check(const struct ipt_entry_ta
+ if (t->u.target_size
+ != IPT_ALIGN(sizeof(struct ipt_standard_target))) {
+ duprintf("standard_check: target size %u != %u\n",
+- t->u.target_size,
++ t->u.target_size, (unsigned int)
+ IPT_ALIGN(sizeof(struct ipt_standard_target)));
+ return 0;
+ }
+@@ -565,7 +576,7 @@ check_entry(struct ipt_entry *e, const c
+ }
+ t->u.kernel.target = target;
+
+- if (t->u.kernel.target == &ipt_standard_target) {
++ if (t->u.kernel.target == ve_ipt_standard_target) {
+ if (!standard_check(t, size)) {
+ ret = -EINVAL;
+ goto cleanup_matches;
+@@ -790,32 +801,45 @@ get_counters(const struct xt_table_info
+ }
+ }
+
+-static int
+-copy_entries_to_user(unsigned int total_size,
+- struct ipt_table *table,
+- void __user *userptr)
++static inline struct xt_counters * alloc_counters(struct ipt_table *table)
+ {
+- unsigned int off, num, countersize;
+- struct ipt_entry *e;
++ unsigned int countersize;
+ struct xt_counters *counters;
+ struct xt_table_info *private = table->private;
+- int ret = 0;
+- void *loc_cpu_entry;
+
+ /* We need atomic snapshot of counters: rest doesn't change
+ (other than comefrom, which userspace doesn't care
+ about). */
+ countersize = sizeof(struct xt_counters) * private->number;
+- counters = vmalloc_node(countersize, numa_node_id());
++ counters = ub_vmalloc_node(countersize, numa_node_id());
+
+ if (counters == NULL)
+- return -ENOMEM;
++ return ERR_PTR(-ENOMEM);
+
+ /* First, sum counters... */
+ write_lock_bh(&table->lock);
+ get_counters(private, counters);
+ write_unlock_bh(&table->lock);
+
++ return counters;
++}
++
++static int
++copy_entries_to_user(unsigned int total_size,
++ struct ipt_table *table,
++ void __user *userptr)
++{
++ unsigned int off, num;
++ struct ipt_entry *e;
++ struct xt_counters *counters;
++ struct xt_table_info *private = table->private;
++ int ret = 0;
++ void *loc_cpu_entry;
++
++ counters = alloc_counters(table);
++ if (IS_ERR(counters))
++ return PTR_ERR(counters);
++
+ /* choose the copy that is on our node/cpu, ...
+ * This choice is lazy (because current thread is
+ * allowed to migrate to another cpu)
+@@ -875,25 +899,391 @@ copy_entries_to_user(unsigned int total_
+ return ret;
+ }
+
++#ifdef CONFIG_COMPAT
++static DECLARE_MUTEX(compat_ipt_mutex);
++
++struct compat_delta {
++ struct compat_delta *next;
++ u_int16_t offset;
++ short delta;
++};
++
++static struct compat_delta *compat_offsets = NULL;
++
++static int compat_add_offset(u_int16_t offset, short delta)
++{
++ struct compat_delta *tmp;
++
++ tmp = kmalloc(sizeof(struct compat_delta), GFP_KERNEL);
++ if (!tmp)
++ return -ENOMEM;
++ tmp->offset = offset;
++ tmp->delta = delta;
++ if (compat_offsets) {
++ tmp->next = compat_offsets->next;
++ compat_offsets->next = tmp;
++ } else {
++ compat_offsets = tmp;
++ tmp->next = NULL;
++ }
++ return 0;
++}
++
++static void compat_flush_offsets(void)
++{
++ struct compat_delta *tmp, *next;
++
++ if (compat_offsets) {
++ for(tmp = compat_offsets; tmp; tmp = next) {
++ next = tmp->next;
++ kfree(tmp);
++ }
++ compat_offsets = NULL;
++ }
++}
++
++static short compat_calc_jump(u_int16_t offset)
++{
++ struct compat_delta *tmp;
++ short delta;
++
++ for(tmp = compat_offsets, delta = 0; tmp; tmp = tmp->next)
++ if (tmp->offset < offset)
++ delta += tmp->delta;
++ return delta;
++}
++
++struct compat_ipt_standard_target
++{
++ struct compat_ipt_entry_target target;
++ compat_int_t verdict;
++};
++
++#define IPT_ST_OFFSET (sizeof(struct ipt_standard_target) - \
++ sizeof(struct compat_ipt_standard_target))
++
++struct compat_ipt_standard
++{
++ struct compat_ipt_entry entry;
++ struct compat_ipt_standard_target target;
++};
++
++static int compat_ipt_standard_fn(void *target,
++ void **dstptr, int *size, int convert)
++{
++ struct compat_ipt_standard_target compat_st, *pcompat_st;
++ struct ipt_standard_target st, *pst;
++ int ret;
++
++ ret = 0;
++ switch (convert) {
++ case COMPAT_TO_USER:
++ pst = (struct ipt_standard_target *)target;
++ memcpy(&compat_st.target, &pst->target,
++ sizeof(struct ipt_entry_target));
++ compat_st.verdict = pst->verdict;
++ if (compat_st.verdict > 0)
++ compat_st.verdict -=
++ compat_calc_jump(compat_st.verdict);
++ compat_st.target.u.user.target_size =
++ sizeof(struct compat_ipt_standard_target);
++ if (__copy_to_user(*dstptr, &compat_st,
++ sizeof(struct compat_ipt_standard_target)))
++ ret = -EFAULT;
++ *size -= IPT_ST_OFFSET;
++ *dstptr += sizeof(struct compat_ipt_standard_target);
++ break;
++ case COMPAT_FROM_USER:
++ pcompat_st =
++ (struct compat_ipt_standard_target *)target;
++ memcpy(&st.target, &pcompat_st->target,
++ sizeof(struct ipt_entry_target));
++ st.verdict = pcompat_st->verdict;
++ if (st.verdict > 0)
++ st.verdict += compat_calc_jump(st.verdict);
++ st.target.u.user.target_size =
++ sizeof(struct ipt_standard_target);
++ memcpy(*dstptr, &st,
++ sizeof(struct ipt_standard_target));
++ *size += IPT_ST_OFFSET;
++ *dstptr += sizeof(struct ipt_standard_target);
++ break;
++ case COMPAT_CALC_SIZE:
++ *size += IPT_ST_OFFSET;
++ break;
++ default:
++ ret = -ENOPROTOOPT;
++ break;
++ }
++ return ret;
++}
++
++int ipt_target_align_compat(void *target, void **dstptr,
++ int *size, int off, int convert)
++{
++ struct compat_ipt_entry_target *pcompat;
++ struct ipt_entry_target *pt;
++ u_int16_t tsize;
++ int ret;
++
++ ret = 0;
++ switch (convert) {
++ case COMPAT_TO_USER:
++ pt = (struct ipt_entry_target *)target;
++ tsize = pt->u.user.target_size;
++ if (__copy_to_user(*dstptr, pt, tsize)) {
++ ret = -EFAULT;
++ break;
++ }
++ tsize -= off;
++ if (put_user(tsize, (u_int16_t *)*dstptr))
++ ret = -EFAULT;
++ *size -= off;
++ *dstptr += tsize;
++ break;
++ case COMPAT_FROM_USER:
++ pcompat = (struct compat_ipt_entry_target *)target;
++ pt = (struct ipt_entry_target *)*dstptr;
++ tsize = pcompat->u.user.target_size;
++ memcpy(pt, pcompat, tsize);
++ tsize += off;
++ pt->u.user.target_size = tsize;
++ *size += off;
++ *dstptr += tsize;
++ break;
++ case COMPAT_CALC_SIZE:
++ *size += off;
++ break;
++ default:
++ ret = -ENOPROTOOPT;
++ break;
++ }
++ return ret;
++}
++
++int ipt_match_align_compat(void *match, void **dstptr,
++ int *size, int off, int convert)
++{
++ struct compat_ipt_entry_match *pcompat_m;
++ struct ipt_entry_match *pm;
++ u_int16_t msize;
++ int ret;
++
++ ret = 0;
++ switch (convert) {
++ case COMPAT_TO_USER:
++ pm = (struct ipt_entry_match *)match;
++ msize = pm->u.user.match_size;
++ if (__copy_to_user(*dstptr, pm, msize)) {
++ ret = -EFAULT;
++ break;
++ }
++ msize -= off;
++ if (put_user(msize, (u_int16_t *)*dstptr))
++ ret = -EFAULT;
++ *size -= off;
++ *dstptr += msize;
++ break;
++ case COMPAT_FROM_USER:
++ pcompat_m = (struct compat_ipt_entry_match *)match;
++ pm = (struct ipt_entry_match *)*dstptr;
++ msize = pcompat_m->u.user.match_size;
++ memcpy(pm, pcompat_m, msize);
++ msize += off;
++ pm->u.user.match_size = msize;
++ *size += off;
++ *dstptr += msize;
++ break;
++ case COMPAT_CALC_SIZE:
++ *size += off;
++ break;
++ default:
++ ret = -ENOPROTOOPT;
++ break;
++ }
++ return ret;
++}
++
++static int icmp_compat(void *match,
++ void **dstptr, int *size, int convert)
++{
++ int off;
++
++ off = IPT_ALIGN(sizeof(struct ipt_icmp)) -
++ COMPAT_IPT_ALIGN(sizeof(struct ipt_icmp));
++ return ipt_match_align_compat(match, dstptr, size, off, convert);
++}
++
++static inline int
++compat_calc_match(struct ipt_entry_match *m, int * size)
++{
++ if (m->u.kernel.match->compat)
++ m->u.kernel.match->compat(m, NULL, size, COMPAT_CALC_SIZE);
++ return 0;
++}
++
++static int compat_calc_entry(struct ipt_entry *e, struct xt_table_info *info,
++ void *base, struct xt_table_info *newinfo)
++{
++ struct ipt_entry_target *t;
++ u_int16_t entry_offset;
++ int off, i, ret;
++
++ off = 0;
++ entry_offset = (void *)e - base;
++ IPT_MATCH_ITERATE(e, compat_calc_match, &off);
++ t = ipt_get_target(e);
++ if (t->u.kernel.target->compat)
++ t->u.kernel.target->compat(t, NULL, &off, COMPAT_CALC_SIZE);
++ newinfo->size -= off;
++ ret = compat_add_offset(entry_offset, off);
++ if (ret)
++ return ret;
++
++ for (i = 0; i< NF_IP_NUMHOOKS; i++) {
++ if (info->hook_entry[i] && (e < (struct ipt_entry *)
++ (base + info->hook_entry[i])))
++ newinfo->hook_entry[i] -= off;
++ if (info->underflow[i] && (e < (struct ipt_entry *)
++ (base + info->underflow[i])))
++ newinfo->underflow[i] -= off;
++ }
++ return 0;
++}
++
++static int compat_table_info(struct xt_table_info *info,
++ struct xt_table_info *newinfo)
++{
++ void *loc_cpu_entry;
++ int i;
++
++ if (!newinfo || !info)
++ return -EINVAL;
++
++ memset(newinfo, 0, sizeof(struct xt_table_info));
++ newinfo->size = info->size;
++ for (i = 0; i < NF_IP_NUMHOOKS; i++) {
++ newinfo->hook_entry[i] = info->hook_entry[i];
++ newinfo->underflow[i] = info->underflow[i];
++ }
++ loc_cpu_entry = info->entries[raw_smp_processor_id()];
++ return IPT_ENTRY_ITERATE(loc_cpu_entry, info->size,
++ compat_calc_entry, info, loc_cpu_entry, newinfo);
++}
++#endif
++
++static int get_info(void __user *user, int *len)
++{
++ char name[IPT_TABLE_MAXNAMELEN];
++ struct ipt_table *t;
++ int ret, size;
++
++#ifdef CONFIG_COMPAT
++ if (is_current_32bits())
++ size = sizeof(struct compat_ipt_getinfo);
++ else
++#endif
++ size = sizeof(struct ipt_getinfo);
++
++ if (*len != size) {
++ duprintf("length %u != %u\n", *len,
++ (unsigned int)sizeof(struct ipt_getinfo));
++ return -EINVAL;
++ }
++
++ if (copy_from_user(name, user, sizeof(name)) != 0)
++ return -EFAULT;
++
++ name[IPT_TABLE_MAXNAMELEN-1] = '\0';
++#ifdef CONFIG_COMPAT
++ down(&compat_ipt_mutex);
++#endif
++ t = try_then_request_module(xt_find_table_lock(AF_INET, name),
++ "iptable_%s", name);
++ if (t && !IS_ERR(t)) {
++ struct ipt_getinfo info;
++ struct xt_table_info *private = t->private;
++#ifdef CONFIG_COMPAT
++ struct compat_ipt_getinfo compat_info;
++#endif
++ void *pinfo;
++
++#ifdef CONFIG_COMPAT
++ if (is_current_32bits()) {
++ struct xt_table_info tmp;
++ ret = compat_table_info(private, &tmp);
++ compat_flush_offsets();
++ memcpy(compat_info.hook_entry, tmp.hook_entry,
++ sizeof(compat_info.hook_entry));
++ memcpy(compat_info.underflow, tmp.underflow,
++ sizeof(compat_info.underflow));
++ compat_info.valid_hooks = t->valid_hooks;
++ compat_info.num_entries = private->number;
++ compat_info.size = tmp.size;
++ strcpy(compat_info.name, name);
++ pinfo = (void *)&compat_info;
++ } else
++#endif
++ {
++ info.valid_hooks = t->valid_hooks;
++ memcpy(info.hook_entry, private->hook_entry,
++ sizeof(info.hook_entry));
++ memcpy(info.underflow, private->underflow,
++ sizeof(info.underflow));
++ info.num_entries = private->number;
++ info.size = private->size;
++ strcpy(info.name, name);
++ pinfo = (void *)&info;
++ }
++
++ if (copy_to_user(user, pinfo, *len) != 0)
++ ret = -EFAULT;
++ else
++ ret = 0;
++
++ xt_table_unlock(t);
++ module_put(t->me);
++ } else
++ ret = t ? PTR_ERR(t) : -ENOENT;
++#ifdef CONFIG_COMPAT
++ up(&compat_ipt_mutex);
++#endif
++ return ret;
++}
++
+ static int
+-get_entries(const struct ipt_get_entries *entries,
+- struct ipt_get_entries __user *uptr)
++get_entries(struct ipt_get_entries __user *uptr, int *len)
+ {
+ int ret;
++ struct ipt_get_entries get;
+ struct ipt_table *t;
+
+- t = xt_find_table_lock(AF_INET, entries->name);
++ if (*len < sizeof(get)) {
++ duprintf("get_entries: %u < %d\n", *len,
++ (unsigned int)sizeof(get));
++ return -EINVAL;
++ }
++ if (copy_from_user(&get, uptr, sizeof(get)) != 0)
++ return -EFAULT;
++ if (*len != sizeof(struct ipt_get_entries) + get.size) {
++ duprintf("get_entries: %u != %u\n", *len,
++ (unsigned int)(sizeof(struct ipt_get_entries) +
++ get.size));
++ return -EINVAL;
++ }
++
++ t = xt_find_table_lock(AF_INET, get.name);
+ if (t && !IS_ERR(t)) {
+ struct xt_table_info *private = t->private;
+ duprintf("t->private->number = %u\n",
+ private->number);
+- if (entries->size == private->size)
++ if (get.size == private->size)
+ ret = copy_entries_to_user(private->size,
+ t, uptr->entrytable);
+ else {
+ duprintf("get_entries: I've got %u not %u!\n",
+ private->size,
+- entries->size);
++ get.size);
+ ret = -EINVAL;
+ }
+ module_put(t->me);
+@@ -905,71 +1295,39 @@ get_entries(const struct ipt_get_entries
+ }
+
+ static int
+-do_replace(void __user *user, unsigned int len)
++__do_replace(const char *name, unsigned int valid_hooks,
++ struct xt_table_info *newinfo, unsigned int num_counters,
++ void __user *counters_ptr)
+ {
+ int ret;
+- struct ipt_replace tmp;
+ struct ipt_table *t;
+- struct xt_table_info *newinfo, *oldinfo;
++ struct xt_table_info *oldinfo;
+ struct xt_counters *counters;
+- void *loc_cpu_entry, *loc_cpu_old_entry;
+-
+- if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
+- return -EFAULT;
+-
+- /* Hack: Causes ipchains to give correct error msg --RR */
+- if (len != sizeof(tmp) + tmp.size)
+- return -ENOPROTOOPT;
+-
+- /* overflow check */
+- if (tmp.size >= (INT_MAX - sizeof(struct xt_table_info)) / NR_CPUS -
+- SMP_CACHE_BYTES)
+- return -ENOMEM;
+- if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
+- return -ENOMEM;
+-
+- newinfo = xt_alloc_table_info(tmp.size);
+- if (!newinfo)
+- return -ENOMEM;
+-
+- /* choose the copy that is our node/cpu */
+- loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+- if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
+- tmp.size) != 0) {
+- ret = -EFAULT;
+- goto free_newinfo;
+- }
++ void *loc_cpu_old_entry;
+
+- counters = vmalloc(tmp.num_counters * sizeof(struct xt_counters));
++ ret = 0;
++ counters = ub_vmalloc_best(num_counters * sizeof(struct xt_counters));
+ if (!counters) {
+ ret = -ENOMEM;
+- goto free_newinfo;
++ goto out;
+ }
+
+- ret = translate_table(tmp.name, tmp.valid_hooks,
+- newinfo, loc_cpu_entry, tmp.size, tmp.num_entries,
+- tmp.hook_entry, tmp.underflow);
+- if (ret != 0)
+- goto free_newinfo_counters;
+-
+- duprintf("ip_tables: Translated table\n");
+-
+- t = try_then_request_module(xt_find_table_lock(AF_INET, tmp.name),
+- "iptable_%s", tmp.name);
++ t = try_then_request_module(xt_find_table_lock(AF_INET, name),
++ "iptable_%s", name);
+ if (!t || IS_ERR(t)) {
+ ret = t ? PTR_ERR(t) : -ENOENT;
+ goto free_newinfo_counters_untrans;
+ }
+
+ /* You lied! */
+- if (tmp.valid_hooks != t->valid_hooks) {
++ if (valid_hooks != t->valid_hooks) {
+ duprintf("Valid hook crap: %08X vs %08X\n",
+- tmp.valid_hooks, t->valid_hooks);
++ valid_hooks, t->valid_hooks);
+ ret = -EINVAL;
+ goto put_module;
+ }
+
+- oldinfo = xt_replace_table(t, tmp.num_counters, newinfo, &ret);
++ oldinfo = xt_replace_table(t, num_counters, newinfo, &ret);
+ if (!oldinfo)
+ goto put_module;
+
+@@ -989,8 +1347,8 @@ do_replace(void __user *user, unsigned i
+ loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
+ IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL);
+ xt_free_table_info(oldinfo);
+- if (copy_to_user(tmp.counters, counters,
+- sizeof(struct xt_counters) * tmp.num_counters) != 0)
++ if (copy_to_user(counters_ptr, counters,
++ sizeof(struct xt_counters) * num_counters) != 0)
+ ret = -EFAULT;
+ vfree(counters);
+ xt_table_unlock(t);
+@@ -1000,9 +1358,62 @@ do_replace(void __user *user, unsigned i
+ module_put(t->me);
+ xt_table_unlock(t);
+ free_newinfo_counters_untrans:
+- IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry,NULL);
+- free_newinfo_counters:
+ vfree(counters);
++ out:
++ return ret;
++}
++
++static int
++do_replace(void __user *user, unsigned int len)
++{
++ int ret;
++ struct ipt_replace tmp;
++ struct xt_table_info *newinfo;
++ void *loc_cpu_entry;
++
++ if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
++ return -EFAULT;
++
++ /* Hack: Causes ipchains to give correct error msg --RR */
++ if (len != sizeof(tmp) + tmp.size)
++ return -ENOPROTOOPT;
++
++ /* overflow check */
++ if (tmp.size >= (INT_MAX - sizeof(struct xt_table_info)) / NR_CPUS -
++ SMP_CACHE_BYTES)
++ return -ENOMEM;
++ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
++ return -ENOMEM;
++
++ newinfo = xt_alloc_table_info(tmp.size);
++ if (!newinfo)
++ return -ENOMEM;
++
++ /* choose the copy that is our node/cpu */
++ loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
++ if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
++ tmp.size) != 0) {
++ ret = -EFAULT;
++ goto free_newinfo;
++ }
++
++ ret = translate_table(tmp.name, tmp.valid_hooks,
++ newinfo, loc_cpu_entry, tmp.size, tmp.num_entries,
++ tmp.hook_entry, tmp.underflow);
++ if (ret != 0)
++ goto free_newinfo;
++
++ duprintf("ip_tables: Translated table\n");
++
++ ret = __do_replace(tmp.name, tmp.valid_hooks,
++ newinfo, tmp.num_counters,
++ tmp.counters);
++ if (ret)
++ goto free_newinfo_untrans;
++ return 0;
++
++ free_newinfo_untrans:
++ IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry,NULL);
+ free_newinfo:
+ xt_free_table_info(newinfo);
+ return ret;
+@@ -1034,28 +1445,56 @@ static int
+ do_add_counters(void __user *user, unsigned int len)
+ {
+ unsigned int i;
+- struct xt_counters_info tmp, *paddc;
++ struct xt_counters_info tmp;
++ struct xt_counters *paddc;
++ unsigned int num_counters;
++ char *name;
++ int size;
++ void *ptmp;
+ struct ipt_table *t;
+ struct xt_table_info *private;
+ int ret = 0;
+ void *loc_cpu_entry;
++#ifdef CONFIG_COMPAT
++ struct compat_xt_counters_info compat_tmp;
+
+- if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
++ if (is_current_32bits()) {
++ ptmp = &compat_tmp;
++ size = sizeof(struct compat_xt_counters_info);
++ } else
++#endif
++ {
++ ptmp = &tmp;
++ size = sizeof(struct xt_counters_info);
++ }
++
++ if (copy_from_user(ptmp, user, size) != 0)
+ return -EFAULT;
+
+- if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct xt_counters))
++#ifdef CONFIG_COMPAT
++ if (is_current_32bits()) {
++ num_counters = compat_tmp.num_counters;
++ name = compat_tmp.name;
++ } else
++#endif
++ {
++ num_counters = tmp.num_counters;
++ name = tmp.name;
++ }
++
++ if (len != size + num_counters * sizeof(struct xt_counters))
+ return -EINVAL;
+
+- paddc = vmalloc_node(len, numa_node_id());
++ paddc = ub_vmalloc_node(len - size, numa_node_id());
+ if (!paddc)
+ return -ENOMEM;
+
+- if (copy_from_user(paddc, user, len) != 0) {
++ if (copy_from_user(paddc, user + size, len - size) != 0) {
+ ret = -EFAULT;
+ goto free;
+ }
+
+- t = xt_find_table_lock(AF_INET, tmp.name);
++ t = xt_find_table_lock(AF_INET, name);
+ if (!t || IS_ERR(t)) {
+ ret = t ? PTR_ERR(t) : -ENOENT;
+ goto free;
+@@ -1063,7 +1502,7 @@ do_add_counters(void __user *user, unsig
+
+ write_lock_bh(&t->lock);
+ private = t->private;
+- if (private->number != paddc->num_counters) {
++ if (private->number != num_counters) {
+ ret = -EINVAL;
+ goto unlock_up_free;
+ }
+@@ -1074,7 +1513,7 @@ do_add_counters(void __user *user, unsig
+ IPT_ENTRY_ITERATE(loc_cpu_entry,
+ private->size,
+ add_counter_to_entry,
+- paddc->counters,
++ paddc,
+ &i);
+ unlock_up_free:
+ write_unlock_bh(&t->lock);
+@@ -1086,14 +1525,590 @@ do_add_counters(void __user *user, unsig
+ return ret;
+ }
+
++#ifdef CONFIG_COMPAT
++struct compat_ipt_replace {
++ char name[IPT_TABLE_MAXNAMELEN];
++ u32 valid_hooks;
++ u32 num_entries;
++ u32 size;
++ u32 hook_entry[NF_IP_NUMHOOKS];
++ u32 underflow[NF_IP_NUMHOOKS];
++ u32 num_counters;
++ compat_uptr_t counters; /* struct ipt_counters * */
++ struct compat_ipt_entry entries[0];
++};
++
++static inline int compat_copy_match_to_user(struct ipt_entry_match *m,
++ void __user **dstptr, compat_uint_t *size)
++{
++ if (m->u.kernel.match->compat)
++ m->u.kernel.match->compat(m, dstptr, size, COMPAT_TO_USER);
++ else {
++ if (__copy_to_user(*dstptr, m, m->u.match_size))
++ return -EFAULT;
++ *dstptr += m->u.match_size;
++ }
++ return 0;
++}
++
++static int compat_copy_entry_to_user(struct ipt_entry *e,
++ void __user **dstptr, compat_uint_t *size)
++{
++ struct ipt_entry_target __user *t;
++ struct compat_ipt_entry __user *ce;
++ u_int16_t target_offset, next_offset;
++ compat_uint_t origsize;
++ int ret;
++
++ ret = -EFAULT;
++ origsize = *size;
++ ce = (struct compat_ipt_entry __user *)*dstptr;
++ if (__copy_to_user(ce, e, sizeof(struct ipt_entry)))
++ goto out;
++
++ *dstptr += sizeof(struct compat_ipt_entry);
++ ret = IPT_MATCH_ITERATE(e, compat_copy_match_to_user, dstptr, size);
++ target_offset = e->target_offset - (origsize - *size);
++ if (ret)
++ goto out;
++ t = ipt_get_target(e);
++ if (t->u.kernel.target->compat) {
++ ret = t->u.kernel.target->compat(t,
++ dstptr, size, COMPAT_TO_USER);
++ if (ret)
++ goto out;
++ } else {
++ ret = -EFAULT;
++ if (__copy_to_user(*dstptr, t, t->u.target_size))
++ goto out;
++ *dstptr += t->u.target_size;
++ }
++ ret = -EFAULT;
++ next_offset = e->next_offset - (origsize - *size);
++ if (__put_user(target_offset, &ce->target_offset))
++ goto out;
++ if (__put_user(next_offset, &ce->next_offset))
++ goto out;
++ return 0;
++out:
++ return ret;
++}
++
++static inline int
++compat_check_calc_match(struct ipt_entry_match *m,
++ const char *name,
++ const struct ipt_ip *ip,
++ unsigned int hookmask,
++ int *size, int *i)
++{
++ struct ipt_match *match;
++
++ match = try_then_request_module(xt_find_match(AF_INET, m->u.user.name,
++ m->u.user.revision),
++ "ipt_%s", m->u.user.name);
++ if (IS_ERR(match) || !match) {
++ duprintf("compat_check_calc_match: `%s' not found\n",
++ m->u.user.name);
++ return match ? PTR_ERR(match) : -ENOENT;
++ }
++ m->u.kernel.match = match;
++
++ if (m->u.kernel.match->compat)
++ m->u.kernel.match->compat(m, NULL, size, COMPAT_CALC_SIZE);
++
++ (*i)++;
++ return 0;
++}
++
++static inline int
++check_compat_entry_size_and_hooks(struct ipt_entry *e,
++ struct xt_table_info *newinfo,
++ unsigned int *size,
++ unsigned char *base,
++ unsigned char *limit,
++ unsigned int *hook_entries,
++ unsigned int *underflows,
++ unsigned int *i,
++ const char *name)
++{
++ struct ipt_entry_target *t;
++ struct ipt_target *target;
++ u_int16_t entry_offset;
++ int ret, off, h, j;
++
++ duprintf("check_compat_entry_size_and_hooks %p\n", e);
++ if ((unsigned long)e % __alignof__(struct compat_ipt_entry) != 0
++ || (unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit) {
++ duprintf("Bad offset %p, limit = %p\n", e, limit);
++ return -EINVAL;
++ }
++
++ if (e->next_offset < sizeof(struct compat_ipt_entry) +
++ sizeof(struct compat_ipt_entry_target)) {
++ duprintf("checking: element %p size %u\n",
++ e, e->next_offset);
++ return -EINVAL;
++ }
++
++ if (!ip_checkentry(&e->ip)) {
++ duprintf("ip_tables: ip check failed %p %s.\n", e, name);
++ return -EINVAL;
++ }
++
++ off = 0;
++ entry_offset = (void *)e - (void *)base;
++ j = 0;
++ ret = IPT_MATCH_ITERATE(e, compat_check_calc_match, name, &e->ip,
++ e->comefrom, &off, &j);
++ if (ret != 0)
++ goto out;
++
++ t = ipt_get_target(e);
++ target = try_then_request_module(xt_find_target(AF_INET,
++ t->u.user.name,
++ t->u.user.revision),
++ "ipt_%s", t->u.user.name);
++ if (IS_ERR(target) || !target) {
++ duprintf("check_entry: `%s' not found\n", t->u.user.name);
++ ret = target ? PTR_ERR(target) : -ENOENT;
++ goto out;
++ }
++ t->u.kernel.target = target;
++
++ if (t->u.kernel.target->compat)
++ t->u.kernel.target->compat(t, NULL, &off, COMPAT_CALC_SIZE);
++ *size += off;
++ ret = compat_add_offset(entry_offset, off);
++ if (ret)
++ goto out;
++
++ /* Check hooks & underflows */
++ for (h = 0; h < NF_IP_NUMHOOKS; h++) {
++ if ((unsigned char *)e - base == hook_entries[h])
++ newinfo->hook_entry[h] = hook_entries[h];
++ if ((unsigned char *)e - base == underflows[h])
++ newinfo->underflow[h] = underflows[h];
++ }
++
++ /* Clear counters and comefrom */
++ e->counters = ((struct ipt_counters) { 0, 0 });
++ e->comefrom = 0;
++
++ (*i)++;
++ return 0;
++out:
++ IPT_MATCH_ITERATE(e, cleanup_match, &j);
++ return ret;
++}
++
++static inline int compat_copy_match_from_user(struct ipt_entry_match *m,
++ void **dstptr, compat_uint_t *size, const char *name,
++ const struct ipt_ip *ip, unsigned int hookmask)
++{
++ struct ipt_entry_match *dm;
++
++ dm = (struct ipt_entry_match *)*dstptr;
++ if (m->u.kernel.match->compat)
++ m->u.kernel.match->compat(m, dstptr, size, COMPAT_FROM_USER);
++ else {
++ memcpy(*dstptr, m, m->u.match_size);
++ *dstptr += m->u.match_size;
++ }
++
++ if (dm->u.kernel.match->checkentry
++ && !dm->u.kernel.match->checkentry(name, ip, dm->data,
++ dm->u.match_size - sizeof(*dm),
++ hookmask)) {
++ module_put(dm->u.kernel.match->me);
++ duprintf("ip_tables: check failed for `%s'.\n",
++ dm->u.kernel.match->name);
++ return -EINVAL;
++ }
++
++ return 0;
++}
++
++static int compat_copy_entry_from_user(struct ipt_entry *e, void **dstptr,
++ unsigned int *size, const char *name,
++ struct xt_table_info *newinfo, unsigned char *base)
++{
++ struct ipt_entry_target *t;
++ struct ipt_entry *de;
++ unsigned int origsize;
++ int ret, h;
++
++ ret = 0;
++ origsize = *size;
++ de = (struct ipt_entry *)*dstptr;
++ memcpy(de, e, sizeof(struct ipt_entry));
++
++ *dstptr += sizeof(struct compat_ipt_entry);
++ ret = IPT_MATCH_ITERATE(e, compat_copy_match_from_user, dstptr, size,
++ name, &de->ip, de->comefrom);
++ if (ret)
++ goto out;
++ de->target_offset = e->target_offset - (origsize - *size);
++ t = ipt_get_target(e);
++ if (t->u.kernel.target->compat)
++ t->u.kernel.target->compat(t,
++ dstptr, size, COMPAT_FROM_USER);
++ else {
++ memcpy(*dstptr, t, t->u.target_size);
++ *dstptr += t->u.target_size;
++ }
++
++ de->next_offset = e->next_offset - (origsize - *size);
++ for (h = 0; h < NF_IP_NUMHOOKS; h++) {
++ if ((unsigned char *)de - base < newinfo->hook_entry[h])
++ newinfo->hook_entry[h] -= origsize - *size;
++ if ((unsigned char *)de - base < newinfo->underflow[h])
++ newinfo->underflow[h] -= origsize - *size;
++ }
++
++ ret = -EINVAL;
++ t = ipt_get_target(de);
++ if (t->u.kernel.target == &ipt_standard_target) {
++ if (!standard_check(t, *size))
++ goto out;
++ } else if (t->u.kernel.target->checkentry
++ && !t->u.kernel.target->checkentry(name, de, t->data,
++ t->u.target_size
++ - sizeof(*t),
++ de->comefrom)) {
++ module_put(t->u.kernel.target->me);
++ duprintf("ip_tables: compat: check failed for `%s'.\n",
++ t->u.kernel.target->name);
++ goto out;
++ }
++ ret = 0;
++out:
++ return ret;
++}
++
++static int
++translate_compat_table(const char *name,
++ unsigned int valid_hooks,
++ struct xt_table_info **pinfo,
++ void **pentry0,
++ unsigned int total_size,
++ unsigned int number,
++ unsigned int *hook_entries,
++ unsigned int *underflows)
++{
++ unsigned int i;
++ struct xt_table_info *newinfo, *info;
++ void *pos, *entry0, *entry1;
++ unsigned int size;
++ int ret;
++
++ info = *pinfo;
++ entry0 = *pentry0;
++ size = total_size;
++ info->number = number;
++
++ /* Init all hooks to impossible value. */
++ for (i = 0; i < NF_IP_NUMHOOKS; i++) {
++ info->hook_entry[i] = 0xFFFFFFFF;
++ info->underflow[i] = 0xFFFFFFFF;
++ }
++
++ duprintf("translate_compat_table: size %u\n", info->size);
++ i = 0;
++ down(&compat_ipt_mutex);
++ /* Walk through entries, checking offsets. */
++ ret = IPT_ENTRY_ITERATE(entry0, total_size,
++ check_compat_entry_size_and_hooks,
++ info, &size, entry0,
++ entry0 + total_size,
++ hook_entries, underflows, &i, name);
++ if (ret != 0)
++ goto out_unlock;
++
++ ret = -EINVAL;
++ if (i != number) {
++ duprintf("translate_compat_table: %u not %u entries\n",
++ i, number);
++ goto out_unlock;
++ }
++
++ /* Check hooks all assigned */
++ for (i = 0; i < NF_IP_NUMHOOKS; i++) {
++ /* Only hooks which are valid */
++ if (!(valid_hooks & (1 << i)))
++ continue;
++ if (info->hook_entry[i] == 0xFFFFFFFF) {
++ duprintf("Invalid hook entry %u %u\n",
++ i, hook_entries[i]);
++ goto out_unlock;
++ }
++ if (info->underflow[i] == 0xFFFFFFFF) {
++ duprintf("Invalid underflow %u %u\n",
++ i, underflows[i]);
++ goto out_unlock;
++ }
++ }
++
++ ret = -ENOMEM;
++ newinfo = xt_alloc_table_info(size);
++ if (!newinfo)
++ goto out_unlock;
++
++ newinfo->number = number;
++ for (i = 0; i < NF_IP_NUMHOOKS; i++) {
++ newinfo->hook_entry[i] = info->hook_entry[i];
++ newinfo->underflow[i] = info->underflow[i];
++ }
++ entry1 = newinfo->entries[raw_smp_processor_id()];
++ pos = entry1;
++ size = total_size;
++ ret = IPT_ENTRY_ITERATE(entry0, total_size,
++ compat_copy_entry_from_user, &pos, &size,
++ name, newinfo, entry1);
++ compat_flush_offsets();
++ up(&compat_ipt_mutex);
++ if (ret)
++ goto free_newinfo;
++
++ ret = -ELOOP;
++ if (!mark_source_chains(newinfo, valid_hooks, entry1))
++ goto free_newinfo;
++
++ /* And one copy for every other CPU */
++ for_each_cpu(i)
++ if (newinfo->entries[i] && newinfo->entries[i] != entry1)
++ memcpy(newinfo->entries[i], entry1, newinfo->size);
++
++ *pinfo = newinfo;
++ *pentry0 = entry1;
++ xt_free_table_info(info);
++ return 0;
++
++free_newinfo:
++ xt_free_table_info(newinfo);
++out:
++ return ret;
++out_unlock:
++ up(&compat_ipt_mutex);
++ goto out;
++}
++
++static int
++compat_do_replace(void __user *user, unsigned int len)
++{
++ int ret;
++ struct compat_ipt_replace tmp;
++ struct xt_table_info *newinfo;
++ void *loc_cpu_entry;
++
++ if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
++ return -EFAULT;
++
++ /* Hack: Causes ipchains to give correct error msg --RR */
++ if (len != sizeof(tmp) + tmp.size)
++ return -ENOPROTOOPT;
++
++ /* overflow check */
++ if (tmp.size >= (INT_MAX - sizeof(struct xt_table_info)) / NR_CPUS -
++ SMP_CACHE_BYTES)
++ return -ENOMEM;
++ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
++ return -ENOMEM;
++
++ newinfo = xt_alloc_table_info(tmp.size);
++ if (!newinfo)
++ return -ENOMEM;
++
++ /* choose the copy that is our node/cpu */
++ loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
++ if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
++ tmp.size) != 0) {
++ ret = -EFAULT;
++ goto free_newinfo;
++ }
++
++ ret = translate_compat_table(tmp.name, tmp.valid_hooks,
++ &newinfo, &loc_cpu_entry, tmp.size,
++ tmp.num_entries, tmp.hook_entry, tmp.underflow);
++ if (ret != 0)
++ goto free_newinfo;
++
++ duprintf("compat_do_replace: Translated table\n");
++
++ ret = __do_replace(tmp.name, tmp.valid_hooks,
++ newinfo, tmp.num_counters,
++ compat_ptr(tmp.counters));
++ if (ret)
++ goto free_newinfo_untrans;
++ return 0;
++
++ free_newinfo_untrans:
++ IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry,NULL);
++ free_newinfo:
++ xt_free_table_info(newinfo);
++ return ret;
++}
++
++struct compat_ipt_get_entries
++{
++ char name[IPT_TABLE_MAXNAMELEN];
++ compat_uint_t size;
++ struct compat_ipt_entry entrytable[0];
++};
++
++static int compat_copy_entries_to_user(unsigned int total_size,
++ struct ipt_table *table, void __user *userptr)
++{
++ unsigned int off, num;
++ struct compat_ipt_entry e;
++ struct xt_counters *counters;
++ struct xt_table_info *private = table->private;
++ void __user *pos;
++ unsigned int size;
++ int ret = 0;
++ void *loc_cpu_entry;
++
++ counters = alloc_counters(table);
++ if (IS_ERR(counters))
++ return PTR_ERR(counters);
++
++ /* choose the copy that is on our node/cpu, ...
++ * This choice is lazy (because current thread is
++ * allowed to migrate to another cpu)
++ */
++ loc_cpu_entry = private->entries[raw_smp_processor_id()];
++ pos = userptr;
++ size = total_size;
++ ret = IPT_ENTRY_ITERATE(loc_cpu_entry, total_size,
++ compat_copy_entry_to_user, &pos, &size);
++ if (ret)
++ goto free_counters;
++
++ /* ... then go back and fix counters and names */
++ for (off = 0, num = 0; off < size; off += e.next_offset, num++) {
++ unsigned int i;
++ struct ipt_entry_match m;
++ struct ipt_entry_target t;
++
++ ret = -EFAULT;
++ if (copy_from_user(&e, userptr + off,
++ sizeof(struct compat_ipt_entry)))
++ goto free_counters;
++ if (copy_to_user(userptr + off +
++ offsetof(struct compat_ipt_entry, counters),
++ &counters[num], sizeof(counters[num])))
++ goto free_counters;
++
++ for (i = sizeof(struct compat_ipt_entry);
++ i < e.target_offset; i += m.u.match_size) {
++ if (copy_from_user(&m, userptr + off + i,
++ sizeof(struct ipt_entry_match)))
++ goto free_counters;
++ if (copy_to_user(userptr + off + i +
++ offsetof(struct ipt_entry_match, u.user.name),
++ m.u.kernel.match->name,
++ strlen(m.u.kernel.match->name) + 1))
++ goto free_counters;
++ }
++
++ if (copy_from_user(&t, userptr + off + e.target_offset,
++ sizeof(struct ipt_entry_target)))
++ goto free_counters;
++ if (copy_to_user(userptr + off + e.target_offset +
++ offsetof(struct ipt_entry_target, u.user.name),
++ t.u.kernel.target->name,
++ strlen(t.u.kernel.target->name) + 1))
++ goto free_counters;
++ }
++ ret = 0;
++free_counters:
++ vfree(counters);
++ return ret;
++}
++
++static int
++compat_get_entries(struct compat_ipt_get_entries __user *uptr, int *len)
++{
++ int ret;
++ struct compat_ipt_get_entries get;
++ struct ipt_table *t;
++
++
++ if (*len < sizeof(get)) {
++ duprintf("compat_get_entries: %u < %u\n",
++ *len, (unsigned int)sizeof(get));
++ return -EINVAL;
++ }
++
++ if (copy_from_user(&get, uptr, sizeof(get)) != 0)
++ return -EFAULT;
++
++ if (*len != sizeof(struct compat_ipt_get_entries) + get.size) {
++ duprintf("compat_get_entries: %u != %u\n", *len,
++ (unsigned int)(sizeof(struct compat_ipt_get_entries) +
++ get.size));
++ return -EINVAL;
++ }
++
++ down(&compat_ipt_mutex);
++ t = xt_find_table_lock(AF_INET, get.name);
++ if (t && !IS_ERR(t)) {
++ struct xt_table_info *private = t->private;
++ struct xt_table_info info;
++ duprintf("t->private->number = %u\n",
++ private->number);
++ ret = compat_table_info(private, &info);
++ if (!ret && get.size == info.size) {
++ ret = compat_copy_entries_to_user(private->size,
++ t, uptr->entrytable);
++ } else if (!ret) {
++ duprintf("compat_get_entries: I've got %u not %u!\n",
++ private->size,
++ get.size);
++ ret = -EINVAL;
++ }
++ compat_flush_offsets();
++ module_put(t->me);
++ xt_table_unlock(t);
++ } else
++ ret = t ? PTR_ERR(t) : -ENOENT;
++
++ up(&compat_ipt_mutex);
++ return ret;
++}
++
++static int
++compat_do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
++{
++ int ret;
++
++ switch (cmd) {
++ case IPT_SO_GET_INFO:
++ ret = get_info(user, len);
++ break;
++ case IPT_SO_GET_ENTRIES:
++ ret = compat_get_entries(user, len);
++ break;
++ default:
++ duprintf("compat_do_ipt_get_ctl: unknown request %i\n", cmd);
++ ret = -EINVAL;
++ }
++ return ret;
++}
++#endif
++
+ static int
+ do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
+ {
+ int ret;
+
+- if (!capable(CAP_NET_ADMIN))
++ if (!capable(CAP_VE_NET_ADMIN))
+ return -EPERM;
+
++#ifdef CONFIG_COMPAT
++ if (is_current_32bits() && (cmd == IPT_SO_SET_REPLACE))
++ return compat_do_replace(user, len);
++#endif
++
+ switch (cmd) {
+ case IPT_SO_SET_REPLACE:
+ ret = do_replace(user, len);
+@@ -1116,69 +2131,22 @@ do_ipt_get_ctl(struct sock *sk, int cmd,
+ {
+ int ret;
+
+- if (!capable(CAP_NET_ADMIN))
++ if (!capable(CAP_VE_NET_ADMIN))
+ return -EPERM;
+
+- switch (cmd) {
+- case IPT_SO_GET_INFO: {
+- char name[IPT_TABLE_MAXNAMELEN];
+- struct ipt_table *t;
+-
+- if (*len != sizeof(struct ipt_getinfo)) {
+- duprintf("length %u != %u\n", *len,
+- sizeof(struct ipt_getinfo));
+- ret = -EINVAL;
+- break;
+- }
+-
+- if (copy_from_user(name, user, sizeof(name)) != 0) {
+- ret = -EFAULT;
+- break;
+- }
+- name[IPT_TABLE_MAXNAMELEN-1] = '\0';
+-
+- t = try_then_request_module(xt_find_table_lock(AF_INET, name),
+- "iptable_%s", name);
+- if (t && !IS_ERR(t)) {
+- struct ipt_getinfo info;
+- struct xt_table_info *private = t->private;
+-
+- info.valid_hooks = t->valid_hooks;
+- memcpy(info.hook_entry, private->hook_entry,
+- sizeof(info.hook_entry));
+- memcpy(info.underflow, private->underflow,
+- sizeof(info.underflow));
+- info.num_entries = private->number;
+- info.size = private->size;
+- memcpy(info.name, name, sizeof(info.name));
+-
+- if (copy_to_user(user, &info, *len) != 0)
+- ret = -EFAULT;
+- else
+- ret = 0;
+- xt_table_unlock(t);
+- module_put(t->me);
+- } else
+- ret = t ? PTR_ERR(t) : -ENOENT;
+- }
+- break;
++#ifdef CONFIG_COMPAT
++ if (is_current_32bits())
++ return compat_do_ipt_get_ctl(sk, cmd, user, len);
++#endif
+
+- case IPT_SO_GET_ENTRIES: {
+- struct ipt_get_entries get;
++ switch (cmd) {
++ case IPT_SO_GET_INFO:
++ ret = get_info(user, len);
++ break;
+
+- if (*len < sizeof(get)) {
+- duprintf("get_entries: %u < %u\n", *len, sizeof(get));
+- ret = -EINVAL;
+- } else if (copy_from_user(&get, user, sizeof(get)) != 0) {
+- ret = -EFAULT;
+- } else if (*len != sizeof(struct ipt_get_entries) + get.size) {
+- duprintf("get_entries: %u != %u\n", *len,
+- sizeof(struct ipt_get_entries) + get.size);
+- ret = -EINVAL;
+- } else
+- ret = get_entries(&get, user);
++ case IPT_SO_GET_ENTRIES:
++ ret = get_entries(user, len);
+ break;
+- }
+
+ case IPT_SO_GET_REVISION_MATCH:
+ case IPT_SO_GET_REVISION_TARGET: {
+@@ -1214,7 +2182,8 @@ do_ipt_get_ctl(struct sock *sk, int cmd,
+ return ret;
+ }
+
+-int ipt_register_table(struct xt_table *table, const struct ipt_replace *repl)
++struct xt_table *ipt_register_table(struct xt_table *table,
++ const struct ipt_replace *repl)
+ {
+ int ret;
+ struct xt_table_info *newinfo;
+@@ -1224,7 +2193,7 @@ int ipt_register_table(struct xt_table *
+
+ newinfo = xt_alloc_table_info(repl->size);
+ if (!newinfo)
+- return -ENOMEM;
++ return ERR_PTR(-ENOMEM);
+
+ /* choose the copy on our node/cpu
+ * but dont care of preemption
+@@ -1239,15 +2208,14 @@ int ipt_register_table(struct xt_table *
+ repl->underflow);
+ if (ret != 0) {
+ xt_free_table_info(newinfo);
+- return ret;
++ return ERR_PTR(ret);
+ }
+
+- if (xt_register_table(table, &bootstrap, newinfo) != 0) {
++ table = virt_xt_register_table(table, &bootstrap, newinfo);
++ if (IS_ERR(table))
+ xt_free_table_info(newinfo);
+- return ret;
+- }
+
+- return 0;
++ return table;
+ }
+
+ void ipt_unregister_table(struct ipt_table *table)
+@@ -1255,7 +2223,7 @@ void ipt_unregister_table(struct ipt_tab
+ struct xt_table_info *private;
+ void *loc_cpu_entry;
+
+- private = xt_unregister_table(table);
++ private = virt_xt_unregister_table(table);
+
+ /* Decrease module usage counts and free resources */
+ loc_cpu_entry = private->entries[raw_smp_processor_id()];
+@@ -1263,6 +2231,29 @@ void ipt_unregister_table(struct ipt_tab
+ xt_free_table_info(private);
+ }
+
++void ipt_flush_table(struct xt_table *table)
++{
++ struct xt_table *t;
++ void *loc_cpu_entry;
++
++ if (table == NULL)
++ return;
++
++ t = xt_find_table_lock(AF_INET, table->name);
++ if (t && !IS_ERR(t)) {
++ struct xt_table_info *private;
++ private = t->private;
++ loc_cpu_entry = private->entries[raw_smp_processor_id()];
++ IPT_ENTRY_ITERATE(loc_cpu_entry, private->size,
++ cleanup_entry, NULL);
++ if (private->number > private->initial_entries)
++ module_put(t->me);
++ private->size = 0;
++ xt_table_unlock(t);
++ module_put(t->me);
++ }
++}
++
+ /* Returns 1 if the type and code is matched by the range, 0 otherwise */
+ static inline int
+ icmp_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code,
+@@ -1327,6 +2318,9 @@ icmp_checkentry(const char *tablename,
+ /* The built-in targets: standard (NULL) and error. */
+ static struct ipt_target ipt_standard_target = {
+ .name = IPT_STANDARD_TARGET,
++#ifdef CONFIG_COMPAT
++ .compat = &compat_ipt_standard_fn,
++#endif
+ };
+
+ static struct ipt_target ipt_error_target = {
+@@ -1348,43 +2342,107 @@ static struct ipt_match icmp_matchstruct
+ .name = "icmp",
+ .match = &icmp_match,
+ .checkentry = &icmp_checkentry,
++#ifdef CONFIG_COMPAT
++ .compat = &icmp_compat,
++#endif
+ };
+
+-static int __init init(void)
++static int init_iptables(void)
+ {
+ int ret;
+
+- xt_proto_init(AF_INET);
++ if (ve_ipt_standard_target != NULL)
++ return -EEXIST;
++
++ ret = xt_register_target(AF_INET, &ipt_standard_target);
++ if (ret)
++ goto out;
++#ifdef CONFIG_VE_IPTABLES
++ ve_ipt_standard_target = xt_find_target(AF_INET, IPT_STANDARD_TARGET, 0);
++ if (IS_ERR(ve_ipt_standard_target))
++ goto out_standard;
++#endif
++ ret = xt_register_target(AF_INET, &ipt_error_target);
++ if (ret)
++ goto out_error;
++ ret = xt_register_match(AF_INET, &icmp_matchstruct);
++ if (ret)
++ goto out_icmp;
++ ret = xt_proto_init(AF_INET);
++ if (ret)
++ goto out_proc;
++ return 0;
++
++out_proc:
++ xt_unregister_match(AF_INET, &icmp_matchstruct);
++out_icmp:
++ xt_unregister_target(AF_INET, &ipt_error_target);
++out_error:
++#ifdef CONFIG_VE_IPTABLES
++ ve_ipt_standard_target = NULL;
++out_standard:
++#endif
++ xt_unregister_target(AF_INET, &ipt_standard_target);
++out:
++ return ret;
++}
++
++static void fini_iptables(void)
++{
++ xt_proto_fini(AF_INET);
++ xt_unregister_match(AF_INET, &icmp_matchstruct);
++ xt_unregister_target(AF_INET, &ipt_error_target);
++#ifdef CONFIG_VE_IPTABLES
++ ve_ipt_standard_target = NULL;
++#endif
++ xt_unregister_target(AF_INET, &ipt_standard_target);
++}
+
+- /* Noone else will be downing sem now, so we won't sleep */
+- xt_register_target(AF_INET, &ipt_standard_target);
+- xt_register_target(AF_INET, &ipt_error_target);
+- xt_register_match(AF_INET, &icmp_matchstruct);
++static int __init init(void)
++{
++ int ret;
++
++ ret = init_iptables();
++ if (ret)
++ goto out;
+
+ /* Register setsockopt */
+ ret = nf_register_sockopt(&ipt_sockopts);
+ if (ret < 0) {
+ duprintf("Unable to register sockopts.\n");
+- return ret;
++ goto out_sockopts;
+ }
+
++ KSYMRESOLVE(init_iptables);
++ KSYMRESOLVE(fini_iptables);
++ KSYMRESOLVE(ipt_flush_table);
++ KSYMMODRESOLVE(ip_tables);
+ printk("ip_tables: (C) 2000-2006 Netfilter Core Team\n");
+ return 0;
++
++out_sockopts:
++ fini_iptables();
++out:
++ return ret;
+ }
+
+ static void __exit fini(void)
+ {
++ KSYMMODUNRESOLVE(ip_tables);
++ KSYMUNRESOLVE(init_iptables);
++ KSYMUNRESOLVE(fini_iptables);
++ KSYMUNRESOLVE(ipt_flush_table);
+ nf_unregister_sockopt(&ipt_sockopts);
+-
+- xt_unregister_match(AF_INET, &icmp_matchstruct);
+- xt_unregister_target(AF_INET, &ipt_error_target);
+- xt_unregister_target(AF_INET, &ipt_standard_target);
+-
+- xt_proto_fini(AF_INET);
++ fini_iptables();
+ }
+
+ EXPORT_SYMBOL(ipt_register_table);
+ EXPORT_SYMBOL(ipt_unregister_table);
+ EXPORT_SYMBOL(ipt_do_table);
+-module_init(init);
++#ifdef CONFIG_COMPAT
++EXPORT_SYMBOL(ipt_match_align_compat);
++EXPORT_SYMBOL(ipt_target_align_compat);
++#endif
++EXPORT_SYMBOL(ipt_flush_table);
++subsys_initcall(init);
+ module_exit(fini);
+diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ipt_LOG.c linux-2.6.16-026test015/net/ipv4/netfilter/ipt_LOG.c
+--- linux-2.6.16.orig/net/ipv4/netfilter/ipt_LOG.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/netfilter/ipt_LOG.c 2006-07-04 14:41:39.000000000 +0400
+@@ -18,6 +18,7 @@
+ #include <net/udp.h>
+ #include <net/tcp.h>
+ #include <net/route.h>
++#include <linux/nfcalls.h>
+
+ #include <linux/netfilter.h>
+ #include <linux/netfilter_ipv4/ip_tables.h>
+@@ -463,10 +464,25 @@ static int ipt_log_checkentry(const char
+ return 1;
+ }
+
++#ifdef CONFIG_COMPAT
++static int ipt_log_compat(void *target,
++ void **dstptr, int *size, int convert)
++{
++ int off;
++
++ off = IPT_ALIGN(sizeof(struct ipt_log_info)) -
++ COMPAT_IPT_ALIGN(sizeof(struct ipt_log_info));
++ return ipt_target_align_compat(target, dstptr, size, off, convert);
++}
++#endif
++
+ static struct ipt_target ipt_log_reg = {
+ .name = "LOG",
+ .target = ipt_log_target,
+ .checkentry = ipt_log_checkentry,
++#ifdef CONFIG_COMPAT
++ .compat = ipt_log_compat,
++#endif
+ .me = THIS_MODULE,
+ };
+
+@@ -476,24 +492,44 @@ static struct nf_logger ipt_log_logger =
+ .me = THIS_MODULE,
+ };
+
++int init_iptable_LOG(void)
++{
++ return ipt_register_target(&ipt_log_reg);
++}
++
++void fini_iptable_LOG(void)
++{
++ ipt_unregister_target(&ipt_log_reg);
++}
++
+ static int __init init(void)
+ {
+- if (ipt_register_target(&ipt_log_reg))
+- return -EINVAL;
++ int err;
++
++ err = init_iptable_LOG();
++ if (err < 0)
++ return err;
+ if (nf_log_register(PF_INET, &ipt_log_logger) < 0) {
+- printk(KERN_WARNING "ipt_LOG: not logging via system console "
++ ve_printk(VE_LOG, KERN_WARNING "ipt_LOG: not logging via system console "
+ "since somebody else already registered for PF_INET\n");
+ /* we cannot make module load fail here, since otherwise
+ * iptables userspace would abort */
+ }
+
++
++ KSYMRESOLVE(init_iptable_LOG);
++ KSYMRESOLVE(fini_iptable_LOG);
++ KSYMMODRESOLVE(ipt_LOG);
+ return 0;
+ }
+
+ static void __exit fini(void)
+ {
++ KSYMMODUNRESOLVE(ipt_LOG);
++ KSYMUNRESOLVE(init_iptable_LOG);
++ KSYMUNRESOLVE(fini_iptable_LOG);
+ nf_log_unregister_logger(&ipt_log_logger);
+- ipt_unregister_target(&ipt_log_reg);
++ fini_iptable_LOG();
+ }
+
+ module_init(init);
+diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ipt_MASQUERADE.c linux-2.6.16-026test015/net/ipv4/netfilter/ipt_MASQUERADE.c
+--- linux-2.6.16.orig/net/ipv4/netfilter/ipt_MASQUERADE.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/netfilter/ipt_MASQUERADE.c 2006-07-04 14:41:39.000000000 +0400
+@@ -120,6 +120,7 @@ masquerade_target(struct sk_buff **pskb,
+ return ip_nat_setup_info(ct, &newrange, hooknum);
+ }
+
++#if 0
+ static inline int
+ device_cmp(struct ip_conntrack *i, void *ifindex)
+ {
+@@ -175,6 +176,7 @@ static struct notifier_block masq_dev_no
+ static struct notifier_block masq_inet_notifier = {
+ .notifier_call = masq_inet_event,
+ };
++#endif
+
+ static struct ipt_target masquerade = {
+ .name = "MASQUERADE",
+@@ -189,12 +191,16 @@ static int __init init(void)
+
+ ret = ipt_register_target(&masquerade);
+
++#if 0
++/* These notifiers are unnecessary and may
++ lead to an oops in virtual environments */
+ if (ret == 0) {
+ /* Register for device down reports */
+ register_netdevice_notifier(&masq_dev_notifier);
+ /* Register IP address change reports */
+ register_inetaddr_notifier(&masq_inet_notifier);
+ }
++#endif
+
+ return ret;
+ }
+@@ -202,8 +208,8 @@ static int __init init(void)
+ static void __exit fini(void)
+ {
+ ipt_unregister_target(&masquerade);
+- unregister_netdevice_notifier(&masq_dev_notifier);
+- unregister_inetaddr_notifier(&masq_inet_notifier);
++/* unregister_netdevice_notifier(&masq_dev_notifier);
++ unregister_inetaddr_notifier(&masq_inet_notifier); */
+ }
+
+ module_init(init);
+diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ipt_REDIRECT.c linux-2.6.16-026test015/net/ipv4/netfilter/ipt_REDIRECT.c
+--- linux-2.6.16.orig/net/ipv4/netfilter/ipt_REDIRECT.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/netfilter/ipt_REDIRECT.c 2006-07-04 14:41:39.000000000 +0400
+@@ -17,6 +17,7 @@
+ #include <linux/inetdevice.h>
+ #include <net/protocol.h>
+ #include <net/checksum.h>
++#include <linux/nfcalls.h>
+ #include <linux/netfilter_ipv4.h>
+ #include <linux/netfilter_ipv4/ip_nat_rule.h>
+
+@@ -25,7 +26,7 @@ MODULE_AUTHOR("Netfilter Core Team <core
+ MODULE_DESCRIPTION("iptables REDIRECT target module");
+
+ #if 0
+-#define DEBUGP printk
++#define DEBUGP ve_printk
+ #else
+ #define DEBUGP(format, args...)
+ #endif
+@@ -94,8 +95,14 @@ redirect_target(struct sk_buff **pskb,
+
+ rcu_read_lock();
+ indev = __in_dev_get_rcu((*pskb)->dev);
+- if (indev && (ifa = indev->ifa_list))
++ if (indev && (ifa = indev->ifa_list)) {
++ /* because of venet device specifics, we should use
++ * the second ifa in the list */
++ if (IN_LOOPBACK(ntohl(ifa->ifa_local)) &&
++ ifa->ifa_next)
++ ifa = ifa->ifa_next;
+ newdst = ifa->ifa_local;
++ }
+ rcu_read_unlock();
+
+ if (!newdst)
+@@ -119,15 +126,37 @@ static struct ipt_target redirect_reg =
+ .me = THIS_MODULE,
+ };
+
+-static int __init init(void)
++int init_iptable_REDIRECT(void)
+ {
+ return ipt_register_target(&redirect_reg);
+ }
+
+-static void __exit fini(void)
++void fini_iptable_REDIRECT(void)
+ {
+ ipt_unregister_target(&redirect_reg);
+ }
+
++static int __init init(void)
++{
++ int err;
++
++ err = init_iptable_REDIRECT();
++ if (err < 0)
++ return err;
++
++ KSYMRESOLVE(init_iptable_REDIRECT);
++ KSYMRESOLVE(fini_iptable_REDIRECT);
++ KSYMMODRESOLVE(ipt_REDIRECT);
++ return 0;
++}
++
++static void __exit fini(void)
++{
++ KSYMMODUNRESOLVE(ipt_REDIRECT);
++ KSYMUNRESOLVE(init_iptable_REDIRECT);
++ KSYMUNRESOLVE(fini_iptable_REDIRECT);
++ fini_iptable_REDIRECT();
++}
++
+ module_init(init);
+ module_exit(fini);
+diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ipt_REJECT.c linux-2.6.16-026test015/net/ipv4/netfilter/ipt_REJECT.c
+--- linux-2.6.16.orig/net/ipv4/netfilter/ipt_REJECT.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/netfilter/ipt_REJECT.c 2006-07-04 14:41:39.000000000 +0400
+@@ -22,6 +22,7 @@
+ #include <net/ip.h>
+ #include <net/tcp.h>
+ #include <net/route.h>
++#include <linux/nfcalls.h>
+ #include <net/dst.h>
+ #include <linux/netfilter_ipv4/ip_tables.h>
+ #include <linux/netfilter_ipv4/ipt_REJECT.h>
+@@ -322,22 +323,59 @@ static int check(const char *tablename,
+ return 1;
+ }
+
++#ifdef CONFIG_COMPAT
++static int compat(void *target,
++ void **dstptr, int *size, int convert)
++{
++ int off;
++
++ off = IPT_ALIGN(sizeof(struct ipt_reject_info)) -
++ COMPAT_IPT_ALIGN(sizeof(struct ipt_reject_info));
++ return ipt_target_align_compat(target, dstptr, size, off, convert);
++}
++#endif
++
+ static struct ipt_target ipt_reject_reg = {
+ .name = "REJECT",
+ .target = reject,
+ .checkentry = check,
++#ifdef CONFIG_COMPAT
++ .compat = compat,
++#endif
+ .me = THIS_MODULE,
+ };
+
+-static int __init init(void)
++int init_iptable_REJECT(void)
+ {
+ return ipt_register_target(&ipt_reject_reg);
+ }
+
+-static void __exit fini(void)
++void fini_iptable_REJECT(void)
+ {
+ ipt_unregister_target(&ipt_reject_reg);
+ }
+
++static int __init init(void)
++{
++ int err;
++
++ err = init_iptable_REJECT();
++ if (err < 0)
++ return err;
++
++ KSYMRESOLVE(init_iptable_REJECT);
++ KSYMRESOLVE(fini_iptable_REJECT);
++ KSYMMODRESOLVE(ipt_REJECT);
++ return 0;
++}
++
++static void __exit fini(void)
++{
++ KSYMMODUNRESOLVE(ipt_REJECT);
++ KSYMUNRESOLVE(init_iptable_REJECT);
++ KSYMUNRESOLVE(fini_iptable_REJECT);
++ fini_iptable_REJECT();
++}
++
+ module_init(init);
+ module_exit(fini);
+diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ipt_TCPMSS.c linux-2.6.16-026test015/net/ipv4/netfilter/ipt_TCPMSS.c
+--- linux-2.6.16.orig/net/ipv4/netfilter/ipt_TCPMSS.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/netfilter/ipt_TCPMSS.c 2006-07-04 14:41:39.000000000 +0400
+@@ -13,6 +13,7 @@
+
+ #include <linux/ip.h>
+ #include <net/tcp.h>
++#include <linux/nfcalls.h>
+
+ #include <linux/netfilter_ipv4/ip_tables.h>
+ #include <linux/netfilter_ipv4/ipt_TCPMSS.h>
+@@ -242,22 +243,59 @@ ipt_tcpmss_checkentry(const char *tablen
+ return 0;
+ }
+
++#ifdef CONFIG_COMPAT
++static int ipt_tcpmss_compat(void *target,
++ void **dstptr, int *size, int convert)
++{
++ int off;
++
++ off = IPT_ALIGN(sizeof(struct ipt_tcpmss_info)) -
++ COMPAT_IPT_ALIGN(sizeof(struct ipt_tcpmss_info));
++ return ipt_target_align_compat(target, dstptr, size, off, convert);
++}
++#endif
++
+ static struct ipt_target ipt_tcpmss_reg = {
+ .name = "TCPMSS",
+ .target = ipt_tcpmss_target,
+ .checkentry = ipt_tcpmss_checkentry,
++#ifdef CONFIG_COMPAT
++ .compat = ipt_tcpmss_compat,
++#endif
+ .me = THIS_MODULE,
+ };
+
+-static int __init init(void)
++int init_iptable_TCPMSS(void)
+ {
+ return ipt_register_target(&ipt_tcpmss_reg);
+ }
+
+-static void __exit fini(void)
++void fini_iptable_TCPMSS(void)
+ {
+ ipt_unregister_target(&ipt_tcpmss_reg);
+ }
+
++static int __init init(void)
++{
++ int err;
++
++ err = init_iptable_TCPMSS();
++ if (err < 0)
++ return err;
++
++ KSYMRESOLVE(init_iptable_TCPMSS);
++ KSYMRESOLVE(fini_iptable_TCPMSS);
++ KSYMMODRESOLVE(ipt_TCPMSS);
++ return 0;
++}
++
++static void __exit fini(void)
++{
++ KSYMMODUNRESOLVE(ipt_TCPMSS);
++ KSYMUNRESOLVE(init_iptable_TCPMSS);
++ KSYMUNRESOLVE(fini_iptable_TCPMSS);
++ fini_iptable_TCPMSS();
++}
++
+ module_init(init);
+ module_exit(fini);
+diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ipt_TOS.c linux-2.6.16-026test015/net/ipv4/netfilter/ipt_TOS.c
+--- linux-2.6.16.orig/net/ipv4/netfilter/ipt_TOS.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/netfilter/ipt_TOS.c 2006-07-04 14:41:39.000000000 +0400
+@@ -15,6 +15,7 @@
+
+ #include <linux/netfilter_ipv4/ip_tables.h>
+ #include <linux/netfilter_ipv4/ipt_TOS.h>
++#include <linux/nfcalls.h>
+
+ MODULE_LICENSE("GPL");
+ MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+@@ -83,22 +84,59 @@ checkentry(const char *tablename,
+ return 1;
+ }
+
++#ifdef CONFIG_COMPAT
++static int compat(void *target,
++ void **dstptr, int *size, int convert)
++{
++ int off;
++
++ off = IPT_ALIGN(sizeof(struct ipt_tos_target_info)) -
++ COMPAT_IPT_ALIGN(sizeof(struct ipt_tos_target_info));
++ return ipt_target_align_compat(target, dstptr, size, off, convert);
++}
++#endif
++
+ static struct ipt_target ipt_tos_reg = {
+ .name = "TOS",
+ .target = target,
+ .checkentry = checkentry,
++#ifdef CONFIG_COMPAT
++ .compat = compat,
++#endif
+ .me = THIS_MODULE,
+ };
+
+-static int __init init(void)
++int init_iptable_TOS(void)
+ {
+ return ipt_register_target(&ipt_tos_reg);
+ }
+
+-static void __exit fini(void)
++void fini_iptable_TOS(void)
+ {
+ ipt_unregister_target(&ipt_tos_reg);
+ }
+
++static int __init init(void)
++{
++ int err;
++
++ err = init_iptable_TOS();
++ if (err < 0)
++ return err;
++
++ KSYMRESOLVE(init_iptable_TOS);
++ KSYMRESOLVE(fini_iptable_TOS);
++ KSYMMODRESOLVE(ipt_TOS);
++ return 0;
++}
++
++static void __exit fini(void)
++{
++ KSYMMODUNRESOLVE(ipt_TOS);
++ KSYMUNRESOLVE(init_iptable_TOS);
++ KSYMUNRESOLVE(fini_iptable_TOS);
++ fini_iptable_TOS();
++}
++
+ module_init(init);
+ module_exit(fini);
+diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ipt_multiport.c linux-2.6.16-026test015/net/ipv4/netfilter/ipt_multiport.c
+--- linux-2.6.16.orig/net/ipv4/netfilter/ipt_multiport.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/netfilter/ipt_multiport.c 2006-07-04 14:41:39.000000000 +0400
+@@ -13,6 +13,7 @@
+ #include <linux/types.h>
+ #include <linux/udp.h>
+ #include <linux/skbuff.h>
++#include <linux/nfcalls.h>
+
+ #include <linux/netfilter_ipv4/ipt_multiport.h>
+ #include <linux/netfilter_ipv4/ip_tables.h>
+@@ -21,6 +22,13 @@ MODULE_LICENSE("GPL");
+ MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+ MODULE_DESCRIPTION("iptables multiple port match module");
+
++#ifdef CONFIG_VE_IPTABLES
++#include <linux/sched.h>
++#define ve_multiport_match (*(get_exec_env()->_multiport_match))
++#else
++#define ve_multiport_match multiport_match
++#endif
++
+ #if 0
+ #define duprintf(format, args...) printk(format , ## args)
+ #else
+@@ -174,11 +182,36 @@ checkentry_v1(const char *tablename,
+ return (matchsize == IPT_ALIGN(sizeof(struct ipt_multiport_v1)));
+ }
+
++#ifdef CONFIG_COMPAT
++static int compat(void *match,
++ void **dstptr, int *size, int convert)
++{
++ int off;
++
++ off = IPT_ALIGN(sizeof(struct ipt_multiport)) -
++ COMPAT_IPT_ALIGN(sizeof(struct ipt_multiport));
++ return ipt_match_align_compat(match, dstptr, size, off, convert);
++}
++
++static int compat_v1(void *match,
++ void **dstptr, int *size, int convert)
++{
++ int off;
++
++ off = IPT_ALIGN(sizeof(struct ipt_multiport_v1)) -
++ COMPAT_IPT_ALIGN(sizeof(struct ipt_multiport_v1));
++ return ipt_match_align_compat(match, dstptr, size, off, convert);
++}
++#endif
++
+ static struct ipt_match multiport_match = {
+ .name = "multiport",
+ .revision = 0,
+ .match = &match,
+ .checkentry = &checkentry,
++#ifdef CONFIG_COMPAT
++ .compat = &compat,
++#endif
+ .me = THIS_MODULE,
+ };
+
+@@ -187,10 +220,13 @@ static struct ipt_match multiport_match_
+ .revision = 1,
+ .match = &match_v1,
+ .checkentry = &checkentry_v1,
++#ifdef CONFIG_COMPAT
++ .compat = &compat_v1,
++#endif
+ .me = THIS_MODULE,
+ };
+
+-static int __init init(void)
++int init_iptable_multiport(void)
+ {
+ int err;
+
+@@ -204,11 +240,33 @@ static int __init init(void)
+ return err;
+ }
+
+-static void __exit fini(void)
++void fini_iptable_multiport(void)
+ {
+ ipt_unregister_match(&multiport_match);
+ ipt_unregister_match(&multiport_match_v1);
+ }
+
++static int __init init(void)
++{
++ int err;
++
++ err = init_iptable_multiport();
++ if (err < 0)
++ return err;
++
++ KSYMRESOLVE(init_iptable_multiport);
++ KSYMRESOLVE(fini_iptable_multiport);
++ KSYMMODRESOLVE(ipt_multiport);
++ return 0;
++}
++
++static void __exit fini(void)
++{
++ KSYMMODUNRESOLVE(ipt_multiport);
++ KSYMUNRESOLVE(init_iptable_multiport);
++ KSYMUNRESOLVE(fini_iptable_multiport);
++ fini_iptable_multiport();
++}
++
+ module_init(init);
+ module_exit(fini);
+diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ipt_tos.c linux-2.6.16-026test015/net/ipv4/netfilter/ipt_tos.c
+--- linux-2.6.16.orig/net/ipv4/netfilter/ipt_tos.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/netfilter/ipt_tos.c 2006-07-04 14:41:39.000000000 +0400
+@@ -10,6 +10,7 @@
+
+ #include <linux/module.h>
+ #include <linux/skbuff.h>
++#include <linux/nfcalls.h>
+
+ #include <linux/netfilter_ipv4/ipt_tos.h>
+ #include <linux/netfilter_ipv4/ip_tables.h>
+@@ -17,6 +18,13 @@
+ MODULE_LICENSE("GPL");
+ MODULE_DESCRIPTION("iptables TOS match module");
+
++#ifdef CONFIG_VE_IPTABLES
++#include <linux/sched.h>
++#define ve_tos_match (*(get_exec_env()->_tos_match))
++#else
++#define ve_tos_match tos_match
++#endif
++
+ static int
+ match(const struct sk_buff *skb,
+ const struct net_device *in,
+@@ -44,22 +52,59 @@ checkentry(const char *tablename,
+ return 1;
+ }
+
++#ifdef CONFIG_COMPAT
++static int compat(void *match,
++ void **dstptr, int *size, int convert)
++{
++ int off;
++
++ off = IPT_ALIGN(sizeof(struct ipt_tos_info)) -
++ COMPAT_IPT_ALIGN(sizeof(struct ipt_tos_info));
++ return ipt_match_align_compat(match, dstptr, size, off, convert);
++}
++#endif
++
+ static struct ipt_match tos_match = {
+ .name = "tos",
+ .match = &match,
+ .checkentry = &checkentry,
++#ifdef CONFIG_COMPAT
++ .compat = &compat,
++#endif
+ .me = THIS_MODULE,
+ };
+
+-static int __init init(void)
++int init_iptable_tos(void)
+ {
+ return ipt_register_match(&tos_match);
+ }
+
+-static void __exit fini(void)
++void fini_iptable_tos(void)
+ {
+ ipt_unregister_match(&tos_match);
+ }
+
++static int __init init(void)
++{
++ int err;
++
++ err = init_iptable_tos();
++ if (err < 0)
++ return err;
++
++ KSYMRESOLVE(init_iptable_tos);
++ KSYMRESOLVE(fini_iptable_tos);
++ KSYMMODRESOLVE(ipt_tos);
++ return 0;
++}
++
++static void __exit fini(void)
++{
++ KSYMMODUNRESOLVE(ipt_tos);
++ KSYMUNRESOLVE(init_iptable_tos);
++ KSYMUNRESOLVE(fini_iptable_tos);
++ fini_iptable_tos();
++}
++
+ module_init(init);
+ module_exit(fini);
+diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ipt_ttl.c linux-2.6.16-026test015/net/ipv4/netfilter/ipt_ttl.c
+--- linux-2.6.16.orig/net/ipv4/netfilter/ipt_ttl.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/netfilter/ipt_ttl.c 2006-07-04 14:41:39.000000000 +0400
+@@ -11,6 +11,7 @@
+
+ #include <linux/module.h>
+ #include <linux/skbuff.h>
++#include <linux/nfcalls.h>
+
+ #include <linux/netfilter_ipv4/ipt_ttl.h>
+ #include <linux/netfilter_ipv4/ip_tables.h>
+@@ -57,22 +58,58 @@ static int checkentry(const char *tablen
+ return 1;
+ }
+
++#ifdef CONFIG_COMPAT
++static int compat(void *match,
++ void **dstptr, int *size, int convert)
++{
++ int off;
++
++ off = IPT_ALIGN(sizeof(struct ipt_ttl_info)) -
++ COMPAT_IPT_ALIGN(sizeof(struct ipt_ttl_info));
++ return ipt_match_align_compat(match, dstptr, size, off, convert);
++}
++#endif
++
+ static struct ipt_match ttl_match = {
+ .name = "ttl",
+ .match = &match,
+ .checkentry = &checkentry,
++#ifdef CONFIG_COMPAT
++ .compat = &compat,
++#endif
+ .me = THIS_MODULE,
+ };
+
+-static int __init init(void)
++int init_iptable_ttl(void)
+ {
+ return ipt_register_match(&ttl_match);
+ }
+
+-static void __exit fini(void)
++void fini_iptable_ttl(void)
+ {
+ ipt_unregister_match(&ttl_match);
++}
+
++static int __init init(void)
++{
++ int err;
++
++ err = init_iptable_ttl();
++ if (err < 0)
++ return err;
++
++ KSYMRESOLVE(init_iptable_ttl);
++ KSYMRESOLVE(fini_iptable_ttl);
++ KSYMMODRESOLVE(ipt_ttl);
++ return 0;
++}
++
++static void __exit fini(void)
++{
++ KSYMMODUNRESOLVE(ipt_ttl);
++ KSYMUNRESOLVE(init_iptable_ttl);
++ KSYMUNRESOLVE(fini_iptable_ttl);
++ fini_iptable_ttl();
+ }
+
+ module_init(init);
+diff -upr linux-2.6.16.orig/net/ipv4/netfilter/iptable_filter.c linux-2.6.16-026test015/net/ipv4/netfilter/iptable_filter.c
+--- linux-2.6.16.orig/net/ipv4/netfilter/iptable_filter.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/netfilter/iptable_filter.c 2006-07-04 14:41:39.000000000 +0400
+@@ -12,12 +12,20 @@
+
+ #include <linux/module.h>
+ #include <linux/moduleparam.h>
++#include <linux/nfcalls.h>
+ #include <linux/netfilter_ipv4/ip_tables.h>
+
+ MODULE_LICENSE("GPL");
+ MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+ MODULE_DESCRIPTION("iptables filter table");
+
++#ifdef CONFIG_VE_IPTABLES
++#include <linux/sched.h>
++#define ve_packet_filter (get_exec_env()->_ve_ipt_filter_pf)
++#else
++#define ve_packet_filter &packet_filter
++#endif
++
+ #define FILTER_VALID_HOOKS ((1 << NF_IP_LOCAL_IN) | (1 << NF_IP_FORWARD) | (1 << NF_IP_LOCAL_OUT))
+
+ static struct
+@@ -25,7 +33,7 @@ static struct
+ struct ipt_replace repl;
+ struct ipt_standard entries[3];
+ struct ipt_error term;
+-} initial_table __initdata
++} initial_table
+ = { { "filter", FILTER_VALID_HOOKS, 4,
+ sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error),
+ { [NF_IP_LOCAL_IN] = 0,
+@@ -90,7 +98,7 @@ ipt_hook(unsigned int hook,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+ {
+- return ipt_do_table(pskb, hook, in, out, &packet_filter, NULL);
++ return ipt_do_table(pskb, hook, in, out, ve_packet_filter, NULL);
+ }
+
+ static unsigned int
+@@ -108,7 +116,7 @@ ipt_local_out_hook(unsigned int hook,
+ return NF_ACCEPT;
+ }
+
+- return ipt_do_table(pskb, hook, in, out, &packet_filter, NULL);
++ return ipt_do_table(pskb, hook, in, out, ve_packet_filter, NULL);
+ }
+
+ static struct nf_hook_ops ipt_ops[] = {
+@@ -139,56 +147,89 @@ static struct nf_hook_ops ipt_ops[] = {
+ static int forward = NF_ACCEPT;
+ module_param(forward, bool, 0000);
+
+-static int __init init(void)
++int init_iptable_filter(void)
+ {
+ int ret;
+-
+- if (forward < 0 || forward > NF_MAX_VERDICT) {
+- printk("iptables forward must be 0 or 1\n");
+- return -EINVAL;
+- }
+-
+- /* Entry 1 is the FORWARD hook */
+- initial_table.entries[1].target.verdict = -forward - 1;
++ struct ipt_table *tmp_filter;
+
+ /* Register table */
+- ret = ipt_register_table(&packet_filter, &initial_table.repl);
+- if (ret < 0)
+- return ret;
++ tmp_filter = ipt_register_table(&packet_filter,
++ &initial_table.repl);
++ if (IS_ERR(tmp_filter))
++ return PTR_ERR(tmp_filter);
++#ifdef CONFIG_VE_IPTABLES
++ ve_packet_filter = tmp_filter;
++#endif
+
+ /* Register hooks */
+- ret = nf_register_hook(&ipt_ops[0]);
++ ret = virt_nf_register_hook(&ipt_ops[0]);
+ if (ret < 0)
+ goto cleanup_table;
+
+- ret = nf_register_hook(&ipt_ops[1]);
++ ret = virt_nf_register_hook(&ipt_ops[1]);
+ if (ret < 0)
+ goto cleanup_hook0;
+
+- ret = nf_register_hook(&ipt_ops[2]);
++ ret = virt_nf_register_hook(&ipt_ops[2]);
+ if (ret < 0)
+ goto cleanup_hook1;
+
+ return ret;
+
+ cleanup_hook1:
+- nf_unregister_hook(&ipt_ops[1]);
++ virt_nf_unregister_hook(&ipt_ops[1]);
+ cleanup_hook0:
+- nf_unregister_hook(&ipt_ops[0]);
++ virt_nf_unregister_hook(&ipt_ops[0]);
+ cleanup_table:
+- ipt_unregister_table(&packet_filter);
++ ipt_unregister_table(ve_packet_filter);
++#ifdef CONFIG_VE_IPTABLES
++ ve_packet_filter = NULL;
++#endif
+
+ return ret;
+ }
+
+-static void __exit fini(void)
++void fini_iptable_filter(void)
+ {
+ unsigned int i;
+
+ for (i = 0; i < sizeof(ipt_ops)/sizeof(struct nf_hook_ops); i++)
+- nf_unregister_hook(&ipt_ops[i]);
++ virt_nf_unregister_hook(&ipt_ops[i]);
+
+- ipt_unregister_table(&packet_filter);
++ ipt_unregister_table(ve_packet_filter);
++#ifdef CONFIG_VE_IPTABLES
++ ve_packet_filter = NULL;
++#endif
++}
++
++static int __init init(void)
++{
++ int err;
++
++ if (forward < 0 || forward > NF_MAX_VERDICT) {
++ printk("iptables forward must be 0 or 1\n");
++ return -EINVAL;
++ }
++
++ /* Entry 1 is the FORWARD hook */
++ initial_table.entries[1].target.verdict = -forward - 1;
++
++ err = init_iptable_filter();
++ if (err < 0)
++ return err;
++
++ KSYMRESOLVE(init_iptable_filter);
++ KSYMRESOLVE(fini_iptable_filter);
++ KSYMMODRESOLVE(iptable_filter);
++ return 0;
++}
++
++static void __exit fini(void)
++{
++ KSYMMODUNRESOLVE(iptable_filter);
++ KSYMUNRESOLVE(init_iptable_filter);
++ KSYMUNRESOLVE(fini_iptable_filter);
++ fini_iptable_filter();
+ }
+
+ module_init(init);
+diff -upr linux-2.6.16.orig/net/ipv4/netfilter/iptable_mangle.c linux-2.6.16-026test015/net/ipv4/netfilter/iptable_mangle.c
+--- linux-2.6.16.orig/net/ipv4/netfilter/iptable_mangle.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/netfilter/iptable_mangle.c 2006-07-04 14:41:39.000000000 +0400
+@@ -17,6 +17,7 @@
+ #include <linux/skbuff.h>
+ #include <net/sock.h>
+ #include <net/route.h>
++#include <linux/nfcalls.h>
+ #include <linux/ip.h>
+
+ MODULE_LICENSE("GPL");
+@@ -35,7 +36,7 @@ static struct
+ struct ipt_replace repl;
+ struct ipt_standard entries[5];
+ struct ipt_error term;
+-} initial_table __initdata
++} initial_table
+ = { { "mangle", MANGLE_VALID_HOOKS, 6,
+ sizeof(struct ipt_standard) * 5 + sizeof(struct ipt_error),
+ { [NF_IP_PRE_ROUTING] = 0,
+@@ -112,6 +113,13 @@ static struct ipt_table packet_mangler =
+ .af = AF_INET,
+ };
+
++#ifdef CONFIG_VE_IPTABLES
++#include <linux/sched.h>
++#define ve_packet_mangler (get_exec_env()->_ipt_mangle_table)
++#else
++#define ve_packet_mangler &packet_mangler
++#endif
++
+ /* The work comes in here from netfilter.c. */
+ static unsigned int
+ ipt_route_hook(unsigned int hook,
+@@ -120,7 +128,7 @@ ipt_route_hook(unsigned int hook,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+ {
+- return ipt_do_table(pskb, hook, in, out, &packet_mangler, NULL);
++ return ipt_do_table(pskb, hook, in, out, ve_packet_mangler, NULL);
+ }
+
+ static unsigned int
+@@ -149,7 +157,8 @@ ipt_local_hook(unsigned int hook,
+ daddr = (*pskb)->nh.iph->daddr;
+ tos = (*pskb)->nh.iph->tos;
+
+- ret = ipt_do_table(pskb, hook, in, out, &packet_mangler, NULL);
++ ret = ipt_do_table(pskb, hook, in, out, ve_packet_mangler, NULL);
++
+ /* Reroute for ANY change. */
+ if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE
+ && ((*pskb)->nh.iph->saddr != saddr
+@@ -201,60 +210,103 @@ static struct nf_hook_ops ipt_ops[] = {
+ },
+ };
+
+-static int __init init(void)
++static int mangle_init(struct nf_hook_ops ipt_ops[])
+ {
+ int ret;
++ struct ipt_table *tmp_mangler;
+
+ /* Register table */
+- ret = ipt_register_table(&packet_mangler, &initial_table.repl);
+- if (ret < 0)
+- return ret;
++ tmp_mangler = ipt_register_table(&packet_mangler,
++ &initial_table.repl);
++ if (IS_ERR(tmp_mangler))
++ return PTR_ERR(tmp_mangler);
++#ifdef CONFIG_VE_IPTABLES
++ ve_packet_mangler = tmp_mangler;
++#endif
+
+ /* Register hooks */
+- ret = nf_register_hook(&ipt_ops[0]);
++ ret = virt_nf_register_hook(&ipt_ops[0]);
+ if (ret < 0)
+ goto cleanup_table;
+
+- ret = nf_register_hook(&ipt_ops[1]);
++ ret = virt_nf_register_hook(&ipt_ops[1]);
+ if (ret < 0)
+ goto cleanup_hook0;
+
+- ret = nf_register_hook(&ipt_ops[2]);
++ ret = virt_nf_register_hook(&ipt_ops[2]);
+ if (ret < 0)
+ goto cleanup_hook1;
+
+- ret = nf_register_hook(&ipt_ops[3]);
++ ret = virt_nf_register_hook(&ipt_ops[3]);
+ if (ret < 0)
+ goto cleanup_hook2;
+
+- ret = nf_register_hook(&ipt_ops[4]);
++ ret = virt_nf_register_hook(&ipt_ops[4]);
+ if (ret < 0)
+ goto cleanup_hook3;
+
+ return ret;
+
+ cleanup_hook3:
+- nf_unregister_hook(&ipt_ops[3]);
++ virt_nf_unregister_hook(&ipt_ops[3]);
+ cleanup_hook2:
+- nf_unregister_hook(&ipt_ops[2]);
++ virt_nf_unregister_hook(&ipt_ops[2]);
+ cleanup_hook1:
+- nf_unregister_hook(&ipt_ops[1]);
++ virt_nf_unregister_hook(&ipt_ops[1]);
+ cleanup_hook0:
+- nf_unregister_hook(&ipt_ops[0]);
++ virt_nf_unregister_hook(&ipt_ops[0]);
+ cleanup_table:
+- ipt_unregister_table(&packet_mangler);
++ ipt_unregister_table(ve_packet_mangler);
++#ifdef CONFIG_VE_IPTABLES
++ ve_packet_mangler = NULL;
++#endif
+
+ return ret;
+ }
+
+-static void __exit fini(void)
++static void mangle_fini(struct nf_hook_ops ipt_ops[])
+ {
+ unsigned int i;
+
+- for (i = 0; i < sizeof(ipt_ops)/sizeof(struct nf_hook_ops); i++)
+- nf_unregister_hook(&ipt_ops[i]);
++ for (i = 0; i < 5; i++)
++ virt_nf_unregister_hook(&ipt_ops[i]);
++
++ ipt_unregister_table(ve_packet_mangler);
++#ifdef CONFIG_VE_IPTABLES
++ ve_packet_mangler = NULL;
++#endif
++}
++
++int init_iptable_mangle(void)
++{
++ return mangle_init(ipt_ops);
++}
++
++void fini_iptable_mangle(void)
++{
++ mangle_fini(ipt_ops);
++}
++
++static int __init init(void)
++{
++ int err;
++
++ err = init_iptable_mangle();
++ if (err < 0)
++ return err;
++
++ KSYMRESOLVE(init_iptable_mangle);
++ KSYMRESOLVE(fini_iptable_mangle);
++ KSYMMODRESOLVE(iptable_mangle);
++ return 0;
++}
+
+- ipt_unregister_table(&packet_mangler);
++static void __exit fini(void)
++{
++ KSYMMODUNRESOLVE(iptable_mangle);
++ KSYMUNRESOLVE(init_iptable_mangle);
++ KSYMUNRESOLVE(fini_iptable_mangle);
++ fini_iptable_mangle();
+ }
+
+ module_init(init);
+diff -upr linux-2.6.16.orig/net/ipv4/netfilter/iptable_raw.c linux-2.6.16-026test015/net/ipv4/netfilter/iptable_raw.c
+--- linux-2.6.16.orig/net/ipv4/netfilter/iptable_raw.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/netfilter/iptable_raw.c 2006-07-04 14:41:39.000000000 +0400
+@@ -118,12 +118,13 @@ static struct nf_hook_ops ipt_ops[] = {
+
+ static int __init init(void)
+ {
++ struct ipt_table *tmp;
+ int ret;
+
+ /* Register table */
+- ret = ipt_register_table(&packet_raw, &initial_table.repl);
+- if (ret < 0)
+- return ret;
++ tmp = ipt_register_table(&packet_raw, &initial_table.repl);
++ if (IS_ERR(tmp))
++ return PTR_ERR(tmp);
+
+ /* Register hooks */
+ ret = nf_register_hook(&ipt_ops[0]);
+diff -upr linux-2.6.16.orig/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c linux-2.6.16-026test015/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+--- linux-2.6.16.orig/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c 2006-07-04 14:41:36.000000000 +0400
+@@ -354,6 +354,7 @@ getorigdst(struct sock *sk, int optval,
+ .tuple.dst.u.tcp.port;
+ sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
+ .tuple.dst.u3.ip;
++ memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
+
+ DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
+ NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
+diff -upr linux-2.6.16.orig/net/ipv4/proc.c linux-2.6.16-026test015/net/ipv4/proc.c
+--- linux-2.6.16.orig/net/ipv4/proc.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/proc.c 2006-07-04 14:41:39.000000000 +0400
+@@ -258,11 +258,12 @@ static int snmp_seq_show(struct seq_file
+ seq_printf(seq, " %s", snmp4_ipstats_list[i].name);
+
+ seq_printf(seq, "\nIp: %d %d",
+- ipv4_devconf.forwarding ? 1 : 2, sysctl_ip_default_ttl);
++ ve_ipv4_devconf.forwarding ? 1 : 2,
++ sysctl_ip_default_ttl);
+
+ for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)
+ seq_printf(seq, " %lu",
+- fold_field((void **) ip_statistics,
++ fold_field((void **) ve_ip_statistics,
+ snmp4_ipstats_list[i].entry));
+
+ seq_puts(seq, "\nIcmp:");
+@@ -272,7 +273,7 @@ static int snmp_seq_show(struct seq_file
+ seq_puts(seq, "\nIcmp:");
+ for (i = 0; snmp4_icmp_list[i].name != NULL; i++)
+ seq_printf(seq, " %lu",
+- fold_field((void **) icmp_statistics,
++ fold_field((void **) ve_icmp_statistics,
+ snmp4_icmp_list[i].entry));
+
+ seq_puts(seq, "\nTcp:");
+@@ -284,11 +285,11 @@ static int snmp_seq_show(struct seq_file
+ /* MaxConn field is signed, RFC 2012 */
+ if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN)
+ seq_printf(seq, " %ld",
+- fold_field((void **) tcp_statistics,
++ fold_field((void **) ve_tcp_statistics,
+ snmp4_tcp_list[i].entry));
+ else
+ seq_printf(seq, " %lu",
+- fold_field((void **) tcp_statistics,
++ fold_field((void **) ve_tcp_statistics,
+ snmp4_tcp_list[i].entry));
+ }
+
+@@ -299,7 +300,7 @@ static int snmp_seq_show(struct seq_file
+ seq_puts(seq, "\nUdp:");
+ for (i = 0; snmp4_udp_list[i].name != NULL; i++)
+ seq_printf(seq, " %lu",
+- fold_field((void **) udp_statistics,
++ fold_field((void **) ve_udp_statistics,
+ snmp4_udp_list[i].entry));
+
+ seq_putc(seq, '\n');
+@@ -333,7 +334,7 @@ static int netstat_seq_show(struct seq_f
+ seq_puts(seq, "\nTcpExt:");
+ for (i = 0; snmp4_net_list[i].name != NULL; i++)
+ seq_printf(seq, " %lu",
+- fold_field((void **) net_statistics,
++ fold_field((void **) ve_net_statistics,
+ snmp4_net_list[i].entry));
+
+ seq_putc(seq, '\n');
+@@ -357,10 +358,10 @@ int __init ip_misc_proc_init(void)
+ {
+ int rc = 0;
+
+- if (!proc_net_fops_create("netstat", S_IRUGO, &netstat_seq_fops))
++ if (!proc_glob_fops_create("net/netstat", S_IRUGO, &netstat_seq_fops))
+ goto out_netstat;
+
+- if (!proc_net_fops_create("snmp", S_IRUGO, &snmp_seq_fops))
++ if (!proc_glob_fops_create("net/snmp", S_IRUGO, &snmp_seq_fops))
+ goto out_snmp;
+
+ if (!proc_net_fops_create("sockstat", S_IRUGO, &sockstat_seq_fops))
+@@ -368,9 +369,9 @@ int __init ip_misc_proc_init(void)
+ out:
+ return rc;
+ out_sockstat:
+- proc_net_remove("snmp");
++ remove_proc_glob_entry("net/snmp", NULL);
+ out_snmp:
+- proc_net_remove("netstat");
++ remove_proc_glob_entry("net/netstat", NULL);
+ out_netstat:
+ rc = -ENOMEM;
+ goto out;
+diff -upr linux-2.6.16.orig/net/ipv4/raw.c linux-2.6.16-026test015/net/ipv4/raw.c
+--- linux-2.6.16.orig/net/ipv4/raw.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/raw.c 2006-07-04 14:41:38.000000000 +0400
+@@ -114,7 +114,8 @@ struct sock *__raw_v4_lookup(struct sock
+ if (inet->num == num &&
+ !(inet->daddr && inet->daddr != raddr) &&
+ !(inet->rcv_saddr && inet->rcv_saddr != laddr) &&
+- !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
++ !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) &&
++ ve_accessible_strict(VE_OWNER_SK(sk), get_exec_env()))
+ goto found; /* gotcha */
+ }
+ sk = NULL;
+@@ -753,8 +754,12 @@ static struct sock *raw_get_first(struct
+ struct hlist_node *node;
+
+ sk_for_each(sk, node, &raw_v4_htable[state->bucket])
+- if (sk->sk_family == PF_INET)
++ if (sk->sk_family == PF_INET) {
++ if (!ve_accessible(VE_OWNER_SK(sk),
++ get_exec_env()))
++ continue;
+ goto found;
++ }
+ }
+ sk = NULL;
+ found:
+@@ -768,8 +773,14 @@ static struct sock *raw_get_next(struct
+ do {
+ sk = sk_next(sk);
+ try_again:
+- ;
+- } while (sk && sk->sk_family != PF_INET);
++ if (!sk)
++ break;
++ if (sk->sk_family != PF_INET)
++ continue;
++ if (ve_accessible(VE_OWNER_SK(sk),
++ get_exec_env()))
++ break;
++ } while (1);
+
+ if (!sk && ++state->bucket < RAWV4_HTABLE_SIZE) {
+ sk = sk_head(&raw_v4_htable[state->bucket]);
+@@ -886,13 +897,13 @@ static struct file_operations raw_seq_fo
+
+ int __init raw_proc_init(void)
+ {
+- if (!proc_net_fops_create("raw", S_IRUGO, &raw_seq_fops))
++ if (!proc_glob_fops_create("net/raw", S_IRUGO, &raw_seq_fops))
+ return -ENOMEM;
+ return 0;
+ }
+
+ void __init raw_proc_exit(void)
+ {
+- proc_net_remove("raw");
++ remove_proc_glob_entry("net/raw", NULL);
+ }
+ #endif /* CONFIG_PROC_FS */
+diff -upr linux-2.6.16.orig/net/ipv4/route.c linux-2.6.16-026test015/net/ipv4/route.c
+--- linux-2.6.16.orig/net/ipv4/route.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/route.c 2006-07-04 14:41:39.000000000 +0400
+@@ -114,6 +114,8 @@
+
+ #define RT_GC_TIMEOUT (300*HZ)
+
++int ip_rt_src_check = 1;
++
+ static int ip_rt_min_delay = 2 * HZ;
+ static int ip_rt_max_delay = 10 * HZ;
+ static int ip_rt_max_size;
+@@ -253,11 +255,28 @@ static unsigned int rt_hash_code(u32 dad
+ & rt_hash_mask);
+ }
+
++void prepare_rt_cache(void)
++{
++#ifdef CONFIG_VE
++ struct rtable *r;
++ int i;
++
++ for (i = rt_hash_mask; i >= 0; i--) {
++ spin_lock_bh(rt_hash_lock_addr(i));
++ for (r = rt_hash_table[i].chain; r; r = r->u.rt_next) {
++ r->fl.owner_env = get_ve0();
++ }
++ spin_unlock_bh(rt_hash_lock_addr(i));
++ }
++#endif
++}
++
+ #ifdef CONFIG_PROC_FS
+ struct rt_cache_iter_state {
+ int bucket;
+ };
+
++static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r);
+ static struct rtable *rt_cache_get_first(struct seq_file *seq)
+ {
+ struct rtable *r = NULL;
+@@ -270,6 +289,8 @@ static struct rtable *rt_cache_get_first
+ break;
+ rcu_read_unlock_bh();
+ }
++ if (r && !ve_accessible_strict(r->fl.owner_env, get_exec_env()))
++ r = rt_cache_get_next(seq, r);
+ return r;
+ }
+
+@@ -277,14 +298,19 @@ static struct rtable *rt_cache_get_next(
+ {
+ struct rt_cache_iter_state *st = rcu_dereference(seq->private);
+
+- r = r->u.rt_next;
++start:
++ do {
++ r = r->u.rt_next;
++ } while (r && !ve_accessible_strict(r->fl.owner_env, get_exec_env()));
+ while (!r) {
+ rcu_read_unlock_bh();
+ if (--st->bucket < 0)
+- break;
++ goto out;
+ rcu_read_lock_bh();
+ r = rt_hash_table[st->bucket].chain;
+ }
++ goto start;
++out:
+ return r;
+ }
+
+@@ -556,7 +582,8 @@ static inline int compare_keys(struct fl
+ {
+ return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
+ fl1->oif == fl2->oif &&
+- fl1->iif == fl2->iif;
++ fl1->iif == fl2->iif &&
++ ve_accessible_strict(fl1->owner_env, fl2->owner_env);
+ }
+
+ #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+@@ -670,26 +697,105 @@ static void rt_check_expire(unsigned lon
+ mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
+ }
+
++typedef unsigned long rt_flush_gen_t;
++
++#ifdef CONFIG_VE
++
++static rt_flush_gen_t rt_flush_gen;
++
++/* called under rt_flush_lock */
++static void set_rt_flush_required(struct ve_struct *env)
++{
++ /*
++ * If the global generation rt_flush_gen is equal to G, then
++ * the pass considering entries labelled by G is yet to come.
++ */
++ env->rt_flush_required = rt_flush_gen;
++}
++
++static spinlock_t rt_flush_lock;
++static rt_flush_gen_t reset_rt_flush_required(void)
++{
++ rt_flush_gen_t g;
++
++ spin_lock_bh(&rt_flush_lock);
++ g = rt_flush_gen++;
++ spin_unlock_bh(&rt_flush_lock);
++ return g;
++}
++
++static int check_rt_flush_required(struct ve_struct *env, rt_flush_gen_t gen)
++{
++ /* can be checked without the lock */
++ return env->rt_flush_required >= gen;
++}
++
++#else
++
++static void set_rt_flush_required(struct ve_struct *env)
++{
++}
++
++static rt_flush_gen_t reset_rt_flush_required(void)
++{
++ return 0;
++}
++
++#endif
++
+ /* This can run from both BH and non-BH contexts, the latter
+ * in the case of a forced flush event.
+ */
+ static void rt_run_flush(unsigned long dummy)
+ {
+ int i;
+- struct rtable *rth, *next;
++ struct rtable * rth, * next;
++ struct rtable * tail;
++ rt_flush_gen_t gen;
+
+ rt_deadline = 0;
+
+ get_random_bytes(&rt_hash_rnd, 4);
+
++ gen = reset_rt_flush_required();
++
+ for (i = rt_hash_mask; i >= 0; i--) {
++#ifdef CONFIG_VE
++ struct rtable ** prev, * p;
++
++ spin_lock_bh(rt_hash_lock_addr(i));
++ rth = rt_hash_table[i].chain;
++
++ /* defer releasing the head of the list after spin_unlock */
++ for (tail = rth; tail; tail = tail->u.rt_next)
++ if (!check_rt_flush_required(tail->fl.owner_env, gen))
++ break;
++ if (rth != tail)
++ rt_hash_table[i].chain = tail;
++
++ /* call rt_free on entries after the tail requiring flush */
++ prev = &rt_hash_table[i].chain;
++ for (p = *prev; p; p = next) {
++ next = p->u.rt_next;
++ if (!check_rt_flush_required(p->fl.owner_env, gen)) {
++ prev = &p->u.rt_next;
++ } else {
++ *prev = next;
++ rt_free(p);
++ }
++ }
++
++#else
+ spin_lock_bh(rt_hash_lock_addr(i));
+ rth = rt_hash_table[i].chain;
+ if (rth)
+ rt_hash_table[i].chain = NULL;
++ tail = NULL;
++
++#endif
+ spin_unlock_bh(rt_hash_lock_addr(i));
+
+- for (; rth; rth = next) {
++ for (; rth != tail; rth = next) {
+ next = rth->u.rt_next;
+ rt_free(rth);
+ }
+@@ -728,6 +834,8 @@ void rt_cache_flush(int delay)
+ delay = tmo;
+ }
+
++ set_rt_flush_required(get_exec_env());
++
+ if (delay <= 0) {
+ spin_unlock_bh(&rt_flush_lock);
+ rt_run_flush(0);
+@@ -743,9 +851,30 @@ void rt_cache_flush(int delay)
+
+ static void rt_secret_rebuild(unsigned long dummy)
+ {
++ int i;
++ struct rtable *rth, *next;
+ unsigned long now = jiffies;
+
+- rt_cache_flush(0);
++ spin_lock_bh(&rt_flush_lock);
++ del_timer(&rt_flush_timer);
++ spin_unlock_bh(&rt_flush_lock);
++
++ rt_deadline = 0;
++ get_random_bytes(&rt_hash_rnd, 4);
++
++ for (i = rt_hash_mask; i >= 0; i--) {
++ spin_lock_bh(rt_hash_lock_addr(i));
++ rth = rt_hash_table[i].chain;
++ if (rth)
++ rt_hash_table[i].chain = NULL;
++ spin_unlock_bh(rt_hash_lock_addr(i));
++
++ for (; rth; rth = next) {
++ next = rth->u.rt_next;
++ rt_free(rth);
++ }
++ }
++
+ mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
+ }
+
+@@ -1118,7 +1247,9 @@ void ip_rt_redirect(u32 old_gw, u32 dadd
+ struct rtable *rth, **rthp;
+ u32 skeys[2] = { saddr, 0 };
+ int ikeys[2] = { dev->ifindex, 0 };
++ struct ve_struct *ve;
+
++ ve = get_exec_env();
+ tos &= IPTOS_RT_MASK;
+
+ if (!in_dev)
+@@ -1154,6 +1285,10 @@ void ip_rt_redirect(u32 old_gw, u32 dadd
+ rth->fl.fl4_src != skeys[i] ||
+ rth->fl.fl4_tos != tos ||
+ rth->fl.oif != ikeys[k] ||
++#ifdef CONFIG_VE
++ !ve_accessible_strict(rth->fl.owner_env,
++ ve) ||
++#endif
+ rth->fl.iif != 0) {
+ rthp = &rth->u.rt_next;
+ continue;
+@@ -1192,6 +1327,9 @@ void ip_rt_redirect(u32 old_gw, u32 dadd
+ rt->u.dst.neighbour = NULL;
+ rt->u.dst.hh = NULL;
+ rt->u.dst.xfrm = NULL;
++#ifdef CONFIG_VE
++ rt->fl.owner_env = ve;
++#endif
+
+ rt->rt_flags |= RTCF_REDIRECTED;
+
+@@ -1631,6 +1769,9 @@ static int ip_route_input_mc(struct sk_b
+ #ifdef CONFIG_IP_ROUTE_FWMARK
+ rth->fl.fl4_fwmark= skb->nfmark;
+ #endif
++#ifdef CONFIG_VE
++ rth->fl.owner_env = get_exec_env();
++#endif
+ rth->fl.fl4_src = saddr;
+ rth->rt_src = saddr;
+ #ifdef CONFIG_NET_CLS_ROUTE
+@@ -1776,6 +1917,9 @@ static inline int __mkroute_input(struct
+ #ifdef CONFIG_IP_ROUTE_FWMARK
+ rth->fl.fl4_fwmark= skb->nfmark;
+ #endif
++#ifdef CONFIG_VE
++ rth->fl.owner_env = get_exec_env();
++#endif
+ rth->fl.fl4_src = saddr;
+ rth->rt_src = saddr;
+ rth->rt_gateway = daddr;
+@@ -2021,6 +2165,9 @@ local_input:
+ #ifdef CONFIG_IP_ROUTE_FWMARK
+ rth->fl.fl4_fwmark= skb->nfmark;
+ #endif
++#ifdef CONFIG_VE
++ rth->fl.owner_env = get_exec_env();
++#endif
+ rth->fl.fl4_src = saddr;
+ rth->rt_src = saddr;
+ #ifdef CONFIG_NET_CLS_ROUTE
+@@ -2100,6 +2247,9 @@ int ip_route_input(struct sk_buff *skb,
+ #ifdef CONFIG_IP_ROUTE_FWMARK
+ rth->fl.fl4_fwmark == skb->nfmark &&
+ #endif
++#ifdef CONFIG_VE
++ rth->fl.owner_env == get_exec_env() &&
++#endif
+ rth->fl.fl4_tos == tos) {
+ rth->u.dst.lastuse = jiffies;
+ dst_hold(&rth->u.dst);
+@@ -2226,6 +2376,9 @@ static inline int __mkroute_output(struc
+ #ifdef CONFIG_IP_ROUTE_FWMARK
+ rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
+ #endif
++#ifdef CONFIG_VE
++ rth->fl.owner_env = get_exec_env();
++#endif
+ rth->rt_dst = fl->fl4_dst;
+ rth->rt_src = fl->fl4_src;
+ rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
+@@ -2399,10 +2552,13 @@ static int ip_route_output_slow(struct r
+ ZERONET(oldflp->fl4_src))
+ goto out;
+
+- /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
+- dev_out = ip_dev_find(oldflp->fl4_src);
+- if (dev_out == NULL)
+- goto out;
++ if (ip_rt_src_check) {
++ /* It is equivalent to
++ inet_addr_type(saddr) == RTN_LOCAL */
++ dev_out = ip_dev_find(oldflp->fl4_src);
++ if (dev_out == NULL)
++ goto out;
++ }
+
+ /* I removed check for oif == dev_out->oif here.
+ It was wrong for two reasons:
+@@ -2429,6 +2585,12 @@ static int ip_route_output_slow(struct r
+ Luckily, this hack is good workaround.
+ */
+
++ if (dev_out == NULL) {
++ dev_out = ip_dev_find(oldflp->fl4_src);
++ if (dev_out == NULL)
++ goto out;
++ }
++
+ fl.oif = dev_out->ifindex;
+ goto make_route;
+ }
+@@ -2575,6 +2737,7 @@ int __ip_route_output_key(struct rtable
+ #ifdef CONFIG_IP_ROUTE_FWMARK
+ rth->fl.fl4_fwmark == flp->fl4_fwmark &&
+ #endif
++ ve_accessible_strict(rth->fl.owner_env, get_exec_env()) &&
+ !((rth->fl.fl4_tos ^ flp->fl4_tos) &
+ (IPTOS_RT_MASK | RTO_ONLINK))) {
+
+@@ -2705,7 +2868,7 @@ static int rt_fill_info(struct sk_buff *
+ u32 dst = rt->rt_dst;
+
+ if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
+- ipv4_devconf.mc_forwarding) {
++ ve_ipv4_devconf.mc_forwarding) {
+ int err = ipmr_get_route(skb, r, nowait);
+ if (err <= 0) {
+ if (!nowait) {
+@@ -2750,7 +2913,10 @@ int inet_rtm_getroute(struct sk_buff *in
+ /* Reserve room for dummy headers, this skb can pass
+ through good chunk of routing engine.
+ */
+- skb->mac.raw = skb->data;
++ skb->mac.raw = skb->nh.raw = skb->data;
++
++ /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
++ skb->nh.iph->protocol = IPPROTO_ICMP;
+ skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
+
+ if (rta[RTA_SRC - 1])
+@@ -2853,22 +3019,22 @@ void ip_rt_multicast_event(struct in_dev
+ }
+
+ #ifdef CONFIG_SYSCTL
+-static int flush_delay;
++int ipv4_flush_delay;
+
+-static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
++int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
+ struct file *filp, void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+ {
+ if (write) {
+ proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+- rt_cache_flush(flush_delay);
++ rt_cache_flush(ipv4_flush_delay);
+ return 0;
+ }
+
+ return -EINVAL;
+ }
+
+-static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
++int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
+ int __user *name,
+ int nlen,
+ void __user *oldval,
+@@ -2890,7 +3056,7 @@ ctl_table ipv4_route_table[] = {
+ {
+ .ctl_name = NET_IPV4_ROUTE_FLUSH,
+ .procname = "flush",
+- .data = &flush_delay,
++ .data = &ipv4_flush_delay,
+ .maxlen = sizeof(int),
+ .mode = 0200,
+ .proc_handler = &ipv4_sysctl_rtcache_flush,
+@@ -3184,15 +3350,18 @@ int __init ip_rt_init(void)
+ #ifdef CONFIG_PROC_FS
+ {
+ struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
+- if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
+- !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
+- proc_net_stat))) {
++
++ if (!proc_glob_fops_create("net/rt_cache",
++ S_IRUGO, &rt_cache_seq_fops))
++ return -ENOMEM;
++
++ if (!(rtstat_pde = create_proc_glob_entry("net/stat/rt_cache",
++ S_IRUGO, NULL)))
+ return -ENOMEM;
+- }
+ rtstat_pde->proc_fops = &rt_cpu_seq_fops;
+ }
+ #ifdef CONFIG_NET_CLS_ROUTE
+- create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
++ create_proc_read_entry("net/rt_acct", 0, NULL, ip_rt_acct_read, NULL);
+ #endif
+ #endif
+ #ifdef CONFIG_XFRM
+diff -upr linux-2.6.16.orig/net/ipv4/sysctl_net_ipv4.c linux-2.6.16-026test015/net/ipv4/sysctl_net_ipv4.c
+--- linux-2.6.16.orig/net/ipv4/sysctl_net_ipv4.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/sysctl_net_ipv4.c 2006-07-04 14:41:39.000000000 +0400
+@@ -22,6 +22,9 @@
+ /* From af_inet.c */
+ extern int sysctl_ip_nonlocal_bind;
+
++int sysctl_tcp_use_sg = 1;
++EXPORT_SYMBOL(sysctl_tcp_use_sg);
++
+ #ifdef CONFIG_SYSCTL
+ static int zero;
+ static int tcp_retr1_max = 255;
+@@ -33,22 +36,21 @@ struct ipv4_config ipv4_config;
+
+ #ifdef CONFIG_SYSCTL
+
+-static
+ int ipv4_sysctl_forward(ctl_table *ctl, int write, struct file * filp,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+ {
+- int val = ipv4_devconf.forwarding;
++ int val = ve_ipv4_devconf.forwarding;
+ int ret;
+
+ ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+
+- if (write && ipv4_devconf.forwarding != val)
++ if (write && ve_ipv4_devconf.forwarding != val)
+ inet_forward_change();
+
+ return ret;
+ }
+
+-static int ipv4_sysctl_forward_strategy(ctl_table *table,
++int ipv4_sysctl_forward_strategy(ctl_table *table,
+ int __user *name, int nlen,
+ void __user *oldval, size_t __user *oldlenp,
+ void __user *newval, size_t newlen,
+@@ -664,6 +666,14 @@ ctl_table ipv4_table[] = {
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
++ {
++ .ctl_name = NET_TCP_USE_SG,
++ .procname = "tcp_use_sg",
++ .data = &sysctl_tcp_use_sg,
++ .maxlen = sizeof(int),
++ .mode = 0644,
++ .proc_handler = &proc_dointvec,
++ },
+
+ { .ctl_name = 0 }
+ };
+diff -upr linux-2.6.16.orig/net/ipv4/tcp.c linux-2.6.16-026test015/net/ipv4/tcp.c
+--- linux-2.6.16.orig/net/ipv4/tcp.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/tcp.c 2006-07-04 14:41:39.000000000 +0400
+@@ -248,6 +248,7 @@
+ */
+
+ #include <linux/config.h>
++#include <linux/kmem_cache.h>
+ #include <linux/module.h>
+ #include <linux/types.h>
+ #include <linux/fcntl.h>
+@@ -263,6 +264,9 @@
+ #include <net/xfrm.h>
+ #include <net/ip.h>
+
++#include <ub/ub_orphan.h>
++#include <ub/ub_net.h>
++#include <ub/ub_tcp.h>
+
+ #include <asm/uaccess.h>
+ #include <asm/ioctls.h>
+@@ -321,6 +325,7 @@ unsigned int tcp_poll(struct file *file,
+ unsigned int mask;
+ struct sock *sk = sock->sk;
+ struct tcp_sock *tp = tcp_sk(sk);
++ int check_send_space;
+
+ poll_wait(file, sk->sk_sleep, wait);
+ if (sk->sk_state == TCP_LISTEN)
+@@ -335,6 +340,21 @@ unsigned int tcp_poll(struct file *file,
+ if (sk->sk_err)
+ mask = POLLERR;
+
++ check_send_space = 1;
++#ifdef CONFIG_USER_RESOURCE
++ if (!(sk->sk_shutdown & SEND_SHUTDOWN) && sock_has_ubc(sk)) {
++ unsigned long size;
++ size = MAX_TCP_HEADER + tp->mss_cache;
++ if (size > SOCK_MIN_UBCSPACE)
++ size = SOCK_MIN_UBCSPACE;
++ size = skb_charge_size(size);
++ if (ub_sock_makewres_tcp(sk, size)) {
++ check_send_space = 0;
++ ub_sock_sndqueueadd_tcp(sk, size);
++ }
++ }
++#endif
++
+ /*
+ * POLLHUP is certainly not done right. But poll() doesn't
+ * have a notion of HUP in just one direction, and for a
+@@ -378,7 +398,7 @@ unsigned int tcp_poll(struct file *file,
+ sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
+ mask |= POLLIN | POLLRDNORM;
+
+- if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
++ if (check_send_space && !(sk->sk_shutdown & SEND_SHUTDOWN)) {
+ if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
+ mask |= POLLOUT | POLLWRNORM;
+ } else { /* send SIGIO later */
+@@ -528,16 +548,23 @@ static ssize_t do_tcp_sendpages(struct s
+ int copy, i, can_coalesce;
+ int offset = poffset % PAGE_SIZE;
+ int size = min_t(size_t, psize, PAGE_SIZE - offset);
++ unsigned long chargesize = 0;
+
+ if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) {
+ new_segment:
++ chargesize = 0;
+ if (!sk_stream_memory_free(sk))
+ goto wait_for_sndbuf;
+
++ chargesize = skb_charge_size(MAX_TCP_HEADER +
++ tp->mss_cache);
++ if (ub_sock_getwres_tcp(sk, chargesize) < 0)
++ goto wait_for_ubspace;
+ skb = sk_stream_alloc_pskb(sk, 0, 0,
+ sk->sk_allocation);
+ if (!skb)
+ goto wait_for_memory;
++ ub_skb_set_charge(skb, sk, chargesize, UB_TCPSNDBUF);
+
+ skb_entail(sk, tp, skb);
+ copy = size_goal;
+@@ -593,10 +620,14 @@ new_segment:
+ wait_for_sndbuf:
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ wait_for_memory:
++ ub_sock_retwres_tcp(sk, chargesize,
++ skb_charge_size(MAX_TCP_HEADER + tp->mss_cache));
++ chargesize = 0;
++wait_for_ubspace:
+ if (copied)
+ tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
+
+- if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
++ if ((err = sk_stream_wait_memory(sk, &timeo, chargesize)) != 0)
+ goto do_error;
+
+ mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
+@@ -699,6 +730,7 @@ int tcp_sendmsg(struct kiocb *iocb, stru
+ while (--iovlen >= 0) {
+ int seglen = iov->iov_len;
+ unsigned char __user *from = iov->iov_base;
++ unsigned long chargesize = 0;
+
+ iov++;
+
+@@ -709,18 +741,26 @@ int tcp_sendmsg(struct kiocb *iocb, stru
+
+ if (!sk->sk_send_head ||
+ (copy = size_goal - skb->len) <= 0) {
++ unsigned long size;
+
+ new_segment:
+ /* Allocate new segment. If the interface is SG,
+ * allocate skb fitting to single page.
+ */
++ chargesize = 0;
+ if (!sk_stream_memory_free(sk))
+ goto wait_for_sndbuf;
+-
+- skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
+- 0, sk->sk_allocation);
++ size = select_size(sk, tp);
++ chargesize = skb_charge_size(MAX_TCP_HEADER +
++ size);
++ if (ub_sock_getwres_tcp(sk, chargesize) < 0)
++ goto wait_for_ubspace;
++ skb = sk_stream_alloc_pskb(sk, size, 0,
++ sk->sk_allocation);
+ if (!skb)
+ goto wait_for_memory;
++ ub_skb_set_charge(skb, sk, chargesize,
++ UB_TCPSNDBUF);
+
+ /*
+ * Check whether we can use HW checksum.
+@@ -768,6 +808,7 @@ new_segment:
+ } else if (page) {
+ if (off == PAGE_SIZE) {
+ put_page(page);
++ ub_sock_tcp_detachpage(sk);
+ TCP_PAGE(sk) = page = NULL;
+ off = 0;
+ }
+@@ -781,6 +822,9 @@ new_segment:
+ goto wait_for_memory;
+
+ if (!page) {
++ chargesize = PAGE_SIZE;
++ if (ub_sock_tcp_chargepage(sk) < 0)
++ goto wait_for_ubspace;
+ /* Allocate new cache page. */
+ if (!(page = sk_stream_alloc_page(sk)))
+ goto wait_for_memory;
+@@ -812,7 +856,8 @@ new_segment:
+ } else if (off + copy < PAGE_SIZE) {
+ get_page(page);
+ TCP_PAGE(sk) = page;
+- }
++ } else
++ ub_sock_tcp_detachpage(sk);
+ }
+
+ TCP_OFF(sk) = off + copy;
+@@ -843,10 +888,15 @@ new_segment:
+ wait_for_sndbuf:
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ wait_for_memory:
++ ub_sock_retwres_tcp(sk, chargesize,
++ skb_charge_size(MAX_TCP_HEADER+tp->mss_cache));
++ chargesize = 0;
++wait_for_ubspace:
+ if (copied)
+ tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
+
+- if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
++ if ((err = sk_stream_wait_memory(sk, &timeo,
++ chargesize)) != 0)
+ goto do_error;
+
+ mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
+@@ -944,7 +994,18 @@ static void cleanup_rbuf(struct sock *sk
+ #if TCP_DEBUG
+ struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
+
+- BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
++ if (!(skb==NULL || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq))) {
++ printk("KERNEL: assertion: skb==NULL || "
++ "before(tp->copied_seq, skb->end_seq)\n");
++ printk("VE%u pid %d comm %.16s\n",
++ (get_exec_env() ? VEID(get_exec_env()) : 0),
++ current->pid, current->comm);
++ printk("copied=%d, copied_seq=%d, rcv_nxt=%d\n", copied,
++ tp->copied_seq, tp->rcv_nxt);
++ printk("skb->len=%d, skb->seq=%d, skb->end_seq=%d\n",
++ skb->len, TCP_SKB_CB(skb)->seq,
++ TCP_SKB_CB(skb)->end_seq);
++ }
+ #endif
+
+ if (inet_csk_ack_scheduled(sk)) {
+@@ -1168,7 +1229,22 @@ int tcp_recvmsg(struct kiocb *iocb, stru
+ goto found_ok_skb;
+ if (skb->h.th->fin)
+ goto found_fin_ok;
+- BUG_TRAP(flags & MSG_PEEK);
++ if (!(flags & MSG_PEEK)) {
++ printk("KERNEL: assertion: flags&MSG_PEEK\n");
++ printk("VE%u pid %d comm %.16s\n",
++ (get_exec_env() ?
++ VEID(get_exec_env()) : 0),
++ current->pid, current->comm);
++ printk("flags=0x%x, len=%d, copied_seq=%d, "
++ "rcv_nxt=%d\n", flags, len,
++ tp->copied_seq, tp->rcv_nxt);
++ printk("skb->len=%d, *seq=%d, skb->seq=%d, "
++ "skb->end_seq=%d, offset=%d\n",
++ skb->len, *seq,
++ TCP_SKB_CB(skb)->seq,
++ TCP_SKB_CB(skb)->end_seq,
++ offset);
++ }
+ skb = skb->next;
+ } while (skb != (struct sk_buff *)&sk->sk_receive_queue);
+
+@@ -1231,8 +1307,18 @@ int tcp_recvmsg(struct kiocb *iocb, stru
+
+ tp->ucopy.len = len;
+
+- BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
+- (flags & (MSG_PEEK | MSG_TRUNC)));
++ if (!(tp->copied_seq == tp->rcv_nxt ||
++ (flags&(MSG_PEEK|MSG_TRUNC)))) {
++ printk("KERNEL: assertion: tp->copied_seq == "
++ "tp->rcv_nxt || ...\n");
++ printk("VE%u pid %d comm %.16s\n",
++ (get_exec_env() ?
++ VEID(get_exec_env()) : 0),
++ current->pid, current->comm);
++ printk("flags=0x%x, len=%d, copied_seq=%d, "
++ "rcv_nxt=%d\n", flags, len,
++ tp->copied_seq, tp->rcv_nxt);
++ }
+
+ /* Ugly... If prequeue is not empty, we have to
+ * process it before releasing socket, otherwise
+@@ -1583,7 +1669,7 @@ adjudge_to_death:
+ if (tmo > TCP_TIMEWAIT_LEN) {
+ inet_csk_reset_keepalive_timer(sk, tcp_fin_time(sk));
+ } else {
+- atomic_inc(sk->sk_prot->orphan_count);
++ ub_inc_orphan_count(sk);
+ tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
+ goto out;
+ }
+@@ -1591,9 +1677,7 @@ adjudge_to_death:
+ }
+ if (sk->sk_state != TCP_CLOSE) {
+ sk_stream_mem_reclaim(sk);
+- if (atomic_read(sk->sk_prot->orphan_count) > sysctl_tcp_max_orphans ||
+- (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
+- atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
++ if (ub_too_many_orphans(sk, ub_get_orphan_count(sk))) {
+ if (net_ratelimit())
+ printk(KERN_INFO "TCP: too many of orphaned "
+ "sockets\n");
+@@ -1602,7 +1686,7 @@ adjudge_to_death:
+ NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
+ }
+ }
+- atomic_inc(sk->sk_prot->orphan_count);
++ ub_inc_orphan_count(sk);
+
+ if (sk->sk_state == TCP_CLOSE)
+ inet_csk_destroy_sock(sk);
+@@ -2051,7 +2135,7 @@ void __init tcp_init(void)
+ tcp_hashinfo.bind_bucket_cachep =
+ kmem_cache_create("tcp_bind_bucket",
+ sizeof(struct inet_bind_bucket), 0,
+- SLAB_HWCACHE_ALIGN, NULL, NULL);
++ SLAB_HWCACHE_ALIGN | SLAB_UBC, NULL, NULL);
+ if (!tcp_hashinfo.bind_bucket_cachep)
+ panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
+
+diff -upr linux-2.6.16.orig/net/ipv4/tcp_input.c linux-2.6.16-026test015/net/ipv4/tcp_input.c
+--- linux-2.6.16.orig/net/ipv4/tcp_input.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/tcp_input.c 2006-07-04 14:41:37.000000000 +0400
+@@ -72,6 +72,8 @@
+ #include <linux/ipsec.h>
+ #include <asm/unaligned.h>
+
++#include <ub/ub_tcp.h>
++
+ int sysctl_tcp_timestamps = 1;
+ int sysctl_tcp_window_scaling = 1;
+ int sysctl_tcp_sack = 1;
+@@ -252,7 +254,7 @@ static void tcp_grow_window(struct sock
+ /* Check #1 */
+ if (tp->rcv_ssthresh < tp->window_clamp &&
+ (int)tp->rcv_ssthresh < tcp_space(sk) &&
+- !tcp_memory_pressure) {
++ ub_tcp_rmem_allows_expand(sk)) {
+ int incr;
+
+ /* Check #2. Increase window, if skb with such overhead
+@@ -321,6 +323,8 @@ static void tcp_init_buffer_space(struct
+
+ tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp);
+ tp->snd_cwnd_stamp = tcp_time_stamp;
++
++ ub_tcp_update_maxadvmss(sk);
+ }
+
+ /* 5. Recalculate window clamp after socket hit its memory bounds. */
+@@ -332,7 +336,7 @@ static void tcp_clamp_window(struct sock
+
+ if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
+ !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
+- !tcp_memory_pressure &&
++ !ub_tcp_memory_pressure(sk) &&
+ atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
+ sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
+ sysctl_tcp_rmem[2]);
+@@ -3118,7 +3122,7 @@ queue_and_out:
+ !sk_stream_rmem_schedule(sk, skb))) {
+ if (tcp_prune_queue(sk) < 0 ||
+ !sk_stream_rmem_schedule(sk, skb))
+- goto drop;
++ goto drop_part;
+ }
+ sk_stream_set_owner_r(skb, sk);
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
+@@ -3162,6 +3166,12 @@ out_of_window:
+ drop:
+ __kfree_skb(skb);
+ return;
++
++drop_part:
++ if (after(tp->copied_seq, tp->rcv_nxt))
++ tp->rcv_nxt = tp->copied_seq;
++ __kfree_skb(skb);
++ return;
+ }
+
+ /* Out of window. F.e. zero window probe. */
+@@ -3333,6 +3343,10 @@ tcp_collapse(struct sock *sk, struct sk_
+ nskb = alloc_skb(copy+header, GFP_ATOMIC);
+ if (!nskb)
+ return;
++ if (ub_tcprcvbuf_charge_forced(skb->sk, nskb) < 0) {
++ kfree_skb(nskb);
++ return;
++ }
+ skb_reserve(nskb, header);
+ memcpy(nskb->head, skb->head, header);
+ nskb->nh.raw = nskb->head + (skb->nh.raw-skb->head);
+@@ -3429,7 +3443,7 @@ static int tcp_prune_queue(struct sock *
+
+ if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
+ tcp_clamp_window(sk, tp);
+- else if (tcp_memory_pressure)
++ else if (ub_tcp_memory_pressure(sk))
+ tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
+
+ tcp_collapse_ofo_queue(sk);
+@@ -3505,7 +3519,7 @@ static int tcp_should_expand_sndbuf(stru
+ return 0;
+
+ /* If we are under global TCP memory pressure, do not expand. */
+- if (tcp_memory_pressure)
++ if (ub_tcp_memory_pressure(sk))
+ return 0;
+
+ /* If we are under soft global TCP memory pressure, do not expand. */
+@@ -3898,6 +3912,10 @@ int tcp_rcv_established(struct sock *sk,
+
+ if ((int)skb->truesize > sk->sk_forward_alloc)
+ goto step5;
++ /* This is OK not to try to free memory here.
++ * Do this below on slow path. Den */
++ if (ub_tcprcvbuf_charge(sk, skb) < 0)
++ goto step5;
+
+ NET_INC_STATS_BH(LINUX_MIB_TCPHPHITS);
+
+diff -upr linux-2.6.16.orig/net/ipv4/tcp_ipv4.c linux-2.6.16-026test015/net/ipv4/tcp_ipv4.c
+--- linux-2.6.16.orig/net/ipv4/tcp_ipv4.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/tcp_ipv4.c 2006-07-04 14:41:39.000000000 +0400
+@@ -72,6 +72,8 @@
+ #include <net/timewait_sock.h>
+ #include <net/xfrm.h>
+
++#include <ub/ub_tcp.h>
++
+ #include <linux/inet.h>
+ #include <linux/ipv6.h>
+ #include <linux/stddef.h>
+@@ -705,6 +707,7 @@ struct request_sock_ops tcp_request_sock
+ .destructor = tcp_v4_reqsk_destructor,
+ .send_reset = tcp_v4_send_reset,
+ };
++EXPORT_SYMBOL_GPL(tcp_request_sock_ops);
+
+ static struct timewait_sock_ops tcp_timewait_sock_ops = {
+ .twsk_obj_size = sizeof(struct tcp_timewait_sock),
+@@ -979,12 +982,15 @@ static int tcp_v4_checksum_init(struct s
+ */
+ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
+ {
++ struct user_beancounter *ub;
++
++ ub = set_exec_ub(sock_bc(sk)->ub);
+ if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
+ TCP_CHECK_TIMER(sk);
+ if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
+ goto reset;
+ TCP_CHECK_TIMER(sk);
+- return 0;
++ goto restore_context;
+ }
+
+ if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
+@@ -998,7 +1004,7 @@ int tcp_v4_do_rcv(struct sock *sk, struc
+ if (nsk != sk) {
+ if (tcp_child_process(sk, nsk, skb))
+ goto reset;
+- return 0;
++ goto restore_context;
+ }
+ }
+
+@@ -1006,6 +1012,9 @@ int tcp_v4_do_rcv(struct sock *sk, struc
+ if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
+ goto reset;
+ TCP_CHECK_TIMER(sk);
++
++restore_context:
++ (void)set_exec_ub(ub);
+ return 0;
+
+ reset:
+@@ -1017,7 +1026,7 @@ discard:
+ * might be destroyed here. This current version compiles correctly,
+ * but you have been warned.
+ */
+- return 0;
++ goto restore_context;
+
+ csum_err:
+ TCP_INC_STATS_BH(TCP_MIB_INERRS);
+@@ -1302,6 +1311,8 @@ int tcp_v4_destroy_sock(struct sock *sk)
+ * If sendmsg cached page exists, toss it.
+ */
+ if (sk->sk_sndmsg_page) {
++ /* queue is empty, uncharge */
++ ub_sock_tcp_detachpage(sk);
+ __free_page(sk->sk_sndmsg_page);
+ sk->sk_sndmsg_page = NULL;
+ }
+@@ -1316,16 +1327,34 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock);
+ #ifdef CONFIG_PROC_FS
+ /* Proc filesystem TCP sock list dumping. */
+
+-static inline struct inet_timewait_sock *tw_head(struct hlist_head *head)
++static inline struct inet_timewait_sock *tw_head(struct hlist_head *head,
++ envid_t veid)
+ {
+- return hlist_empty(head) ? NULL :
+- list_entry(head->first, struct inet_timewait_sock, tw_node);
++ struct inet_timewait_sock *tw;
++ struct hlist_node *pos;
++
++ if (hlist_empty(head))
++ return NULL;
++ hlist_for_each_entry(tw, pos, head, tw_node) {
++ if (!ve_accessible_veid(tw->tw_owner_env, veid))
++ continue;
++ return tw;
++ }
++ return NULL;
+ }
+
+-static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
++static inline struct inet_timewait_sock *
++ tw_next(struct inet_timewait_sock *tw, envid_t veid)
+ {
+- return tw->tw_node.next ?
+- hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
++ while (1) {
++ if (tw->tw_node.next == NULL)
++ return NULL;
++ tw = hlist_entry(tw->tw_node.next, typeof(*tw), tw_node);
++ if (!ve_accessible_veid(tw->tw_owner_env, veid))
++ continue;
++ return tw;
++ }
++ return NULL; /* make compiler happy */
+ }
+
+ static void *listening_get_next(struct seq_file *seq, void *cur)
+@@ -1334,7 +1363,9 @@ static void *listening_get_next(struct s
+ struct hlist_node *node;
+ struct sock *sk = cur;
+ struct tcp_iter_state* st = seq->private;
++ struct ve_struct *ve;
+
++ ve = get_exec_env();
+ if (!sk) {
+ st->bucket = 0;
+ sk = sk_head(&tcp_hashinfo.listening_hash[0]);
+@@ -1374,6 +1405,8 @@ get_req:
+ }
+ get_sk:
+ sk_for_each_from(sk, node) {
++ if (!ve_accessible(VE_OWNER_SK(sk), ve))
++ continue;
+ if (sk->sk_family == st->family) {
+ cur = sk;
+ goto out;
+@@ -1414,7 +1447,9 @@ static void *established_get_first(struc
+ {
+ struct tcp_iter_state* st = seq->private;
+ void *rc = NULL;
++ struct ve_struct *ve;
+
++ ve = get_exec_env();
+ for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
+ struct sock *sk;
+ struct hlist_node *node;
+@@ -1425,6 +1460,8 @@ static void *established_get_first(struc
+
+ read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
+ sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
++ if (!ve_accessible(VE_OWNER_SK(sk), ve))
++ continue;
+ if (sk->sk_family != st->family) {
+ continue;
+ }
+@@ -1434,6 +1471,8 @@ static void *established_get_first(struc
+ st->state = TCP_SEQ_STATE_TIME_WAIT;
+ inet_twsk_for_each(tw, node,
+ &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
++ if (!ve_accessible_veid(tw->tw_owner_env, VEID(ve)))
++ continue;
+ if (tw->tw_family != st->family) {
+ continue;
+ }
+@@ -1453,16 +1492,17 @@ static void *established_get_next(struct
+ struct inet_timewait_sock *tw;
+ struct hlist_node *node;
+ struct tcp_iter_state* st = seq->private;
++ struct ve_struct *ve;
+
++ ve = get_exec_env();
+ ++st->num;
+
+ if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
+ tw = cur;
+- tw = tw_next(tw);
++ tw = tw_next(tw, VEID(ve));
+ get_tw:
+- while (tw && tw->tw_family != st->family) {
+- tw = tw_next(tw);
+- }
++ while (tw && tw->tw_family != st->family)
++ tw = tw_next(tw, VEID(ve));
+ if (tw) {
+ cur = tw;
+ goto out;
+@@ -1484,12 +1524,15 @@ get_tw:
+ sk = sk_next(sk);
+
+ sk_for_each_from(sk, node) {
++ if (!ve_accessible(VE_OWNER_SK(sk), ve))
++ continue;
+ if (sk->sk_family == st->family)
+ goto found;
+ }
+
+ st->state = TCP_SEQ_STATE_TIME_WAIT;
+- tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
++ tw = tw_head(&tcp_hashinfo.ehash[st->bucket +
++ tcp_hashinfo.ehash_size].chain, VEID(ve));
+ goto get_tw;
+ found:
+ cur = sk;
+@@ -1635,7 +1678,7 @@ int tcp_proc_register(struct tcp_seq_afi
+ afinfo->seq_fops->llseek = seq_lseek;
+ afinfo->seq_fops->release = seq_release_private;
+
+- p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
++ p = proc_glob_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
+ if (p)
+ p->data = afinfo;
+ else
+@@ -1647,7 +1690,8 @@ void tcp_proc_unregister(struct tcp_seq_
+ {
+ if (!afinfo)
+ return;
+- proc_net_remove(afinfo->name);
++
++ remove_proc_glob_entry(afinfo->name, NULL);
+ memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
+ }
+
+@@ -1777,7 +1821,7 @@ out:
+ static struct file_operations tcp4_seq_fops;
+ static struct tcp_seq_afinfo tcp4_seq_afinfo = {
+ .owner = THIS_MODULE,
+- .name = "tcp",
++ .name = "net/tcp",
+ .family = AF_INET,
+ .seq_show = tcp4_seq_show,
+ .seq_fops = &tcp4_seq_fops,
+@@ -1844,6 +1888,86 @@ void __init tcp_v4_init(struct net_proto
+ tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
+ }
+
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++static void tcp_kill_ve_onesk(struct sock *sk)
++{
++ struct tcp_sock *tp = tcp_sk(sk);
++
++ /* Check the assumed state of the socket. */
++ if (!sock_flag(sk, SOCK_DEAD)) {
++ static int printed;
++invalid:
++ if (!printed)
++ printk(KERN_DEBUG "Killing sk: dead %d, state %d, "
++ "wrseq %u unseq %u, wrqu %d.\n",
++ sock_flag(sk, SOCK_DEAD), sk->sk_state,
++ tp->write_seq, tp->snd_una,
++ !skb_queue_empty(&sk->sk_write_queue));
++ printed = 1;
++ return;
++ }
++
++ tcp_send_active_reset(sk, GFP_ATOMIC);
++ switch (sk->sk_state) {
++ case TCP_FIN_WAIT1:
++ case TCP_CLOSING:
++ /* In these 2 states the peer may want us to retransmit
++ * some data and/or FIN. Entering "resetting mode"
++ * instead.
++ */
++ tcp_time_wait(sk, TCP_CLOSE, 0);
++ break;
++ case TCP_FIN_WAIT2:
++		/* For some reason the socket may stay in this state
++ * without turning into a TW bucket. Fix it.
++ */
++ tcp_time_wait(sk, TCP_FIN_WAIT2, 0);
++ break;
++ case TCP_LAST_ACK:
++ /* Just jump into CLOSED state. */
++ tcp_done(sk);
++ break;
++ default:
++ /* The socket must be already close()d. */
++ goto invalid;
++ }
++}
++
++void tcp_v4_kill_ve_sockets(struct ve_struct *envid)
++{
++ struct inet_ehash_bucket *head;
++ int i;
++
++ /* alive */
++ local_bh_disable();
++ head = tcp_hashinfo.ehash;
++ for (i = 0; i < tcp_hashinfo.ehash_size; i++) {
++ struct sock *sk;
++ struct hlist_node *node;
++more_work:
++ write_lock(&head[i].lock);
++ sk_for_each(sk, node, &head[i].chain) {
++ if (ve_accessible_strict(VE_OWNER_SK(sk), envid)) {
++ sock_hold(sk);
++ write_unlock(&head[i].lock);
++
++ bh_lock_sock(sk);
++ /* sk might have disappeared from the hash before
++ * we got the lock */
++ if (sk->sk_state != TCP_CLOSE)
++ tcp_kill_ve_onesk(sk);
++ bh_unlock_sock(sk);
++ sock_put(sk);
++ goto more_work;
++ }
++ }
++ write_unlock(&head[i].lock);
++ }
++ local_bh_enable();
++}
++EXPORT_SYMBOL(tcp_v4_kill_ve_sockets);
++#endif
++
+ EXPORT_SYMBOL(ipv4_specific);
+ EXPORT_SYMBOL(tcp_hashinfo);
+ EXPORT_SYMBOL(tcp_prot);
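Note on the tcp_v4_do_rcv() hunk above: the patch saves the caller's execution beancounter with set_exec_ub(), switches to the socket's owner, and then funnels every exit path through the restore_context label so the saved pointer is always put back. A minimal userspace sketch of that save/switch/restore discipline follows; struct ub_context, current_ub and the helper names are illustrative stand-ins, not the kernel API.

#include <stdio.h>

struct ub_context { const char *name; };

static struct ub_context *current_ub;           /* models the per-task exec beancounter */

static struct ub_context *set_exec_ub(struct ub_context *ub)
{
        struct ub_context *old = current_ub;    /* remember the previous owner ... */
        current_ub = ub;                        /* ... and switch to the new one */
        return old;
}

static int do_rcv(struct ub_context *sock_ub, int fast_path)
{
        struct ub_context *old = set_exec_ub(sock_ub);
        int ret = 0;

        if (fast_path)
                goto restore_context;           /* every exit goes through one label */

        ret = -1;                               /* slow-path outcome */
restore_context:
        (void)set_exec_ub(old);                 /* always undo the switch */
        return ret;
}

int main(void)
{
        struct ub_context ve0 = { "ve0" }, ve101 = { "ve101" };

        current_ub = &ve0;
        do_rcv(&ve101, 1);
        printf("exec ub after call: %s\n", current_ub->name);  /* prints "ve0" */
        return 0;
}

Funnelling through a single label keeps the save/restore pairing obvious even as new error paths are added to the receive routine.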
+diff -upr linux-2.6.16.orig/net/ipv4/tcp_minisocks.c linux-2.6.16-026test015/net/ipv4/tcp_minisocks.c
+--- linux-2.6.16.orig/net/ipv4/tcp_minisocks.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/tcp_minisocks.c 2006-07-04 14:41:39.000000000 +0400
+@@ -29,6 +29,8 @@
+ #include <net/inet_common.h>
+ #include <net/xfrm.h>
+
++#include <ub/ub_net.h>
++
+ #ifdef CONFIG_SYSCTL
+ #define SYNC_INIT 0 /* let the user enable it */
+ #else
+@@ -307,6 +309,8 @@ void tcp_time_wait(struct sock *sk, int
+ tw->tw_ipv6only = np->ipv6only;
+ }
+ #endif
++ tw->tw_owner_env = VEID(VE_OWNER_SK(sk));
++
+ /* Linkage updates. */
+ __inet_twsk_hashdance(tw, sk, &tcp_hashinfo);
+
+@@ -355,6 +359,8 @@ struct sock *tcp_create_openreq_child(st
+ struct tcp_sock *newtp;
+
+ /* Now setup tcp_sock */
++ SET_VE_OWNER_SK(newsk, VE_OWNER_SK(sk));
++
+ newtp = tcp_sk(newsk);
+ newtp->pred_flags = 0;
+ newtp->rcv_nxt = treq->rcv_isn + 1;
+diff -upr linux-2.6.16.orig/net/ipv4/tcp_output.c linux-2.6.16-026test015/net/ipv4/tcp_output.c
+--- linux-2.6.16.orig/net/ipv4/tcp_output.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/tcp_output.c 2006-07-04 14:41:37.000000000 +0400
+@@ -42,6 +42,9 @@
+ #include <linux/module.h>
+ #include <linux/smp_lock.h>
+
++#include <ub/ub_net.h>
++#include <ub/ub_tcp.h>
++
+ /* People can turn this off for buggy TCP's found in printers etc. */
+ int sysctl_tcp_retrans_collapse = 1;
+
+@@ -528,16 +531,26 @@ int tcp_fragment(struct sock *sk, struct
+ if (nsize < 0)
+ nsize = 0;
+
+- if (skb_cloned(skb) &&
+- skb_is_nonlinear(skb) &&
+- pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+- return -ENOMEM;
++ if (skb_cloned(skb) && skb_is_nonlinear(skb)) {
++ unsigned long chargesize;
++ chargesize = skb_bc(skb)->charged;
++ if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
++ return -ENOMEM;
++ ub_sock_retwres_tcp(sk, chargesize, chargesize);
++ ub_tcpsndbuf_charge_forced(sk, skb);
++ }
+
+ /* Get a new skb... force flag on. */
+ buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC);
+ if (buff == NULL)
+ return -ENOMEM; /* We'll just try again later. */
+- sk_charge_skb(sk, buff);
++ if (ub_tcpsndbuf_charge(sk, buff) < 0) {
++ kfree_skb(buff);
++ return -ENOMEM;
++ }
++
++ buff->truesize = skb->len - len;
++ skb->truesize -= buff->truesize;
+
+ /* Correct the sequence numbers. */
+ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
+@@ -978,6 +991,11 @@ static int tso_fragment(struct sock *sk,
+ if (unlikely(buff == NULL))
+ return -ENOMEM;
+
++ if (ub_tcpsndbuf_charge(sk, buff) < 0) {
++ kfree_skb(buff);
++ return -ENOMEM;
++ }
++
+ buff->truesize = nlen;
+ skb->truesize -= nlen;
+
+@@ -1281,7 +1299,7 @@ u32 __tcp_select_window(struct sock *sk)
+ if (free_space < full_space/2) {
+ icsk->icsk_ack.quick = 0;
+
+- if (tcp_memory_pressure)
++ if (ub_tcp_shrink_rcvbuf(sk))
+ tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss);
+
+ if (free_space < mss)
+@@ -1708,6 +1726,7 @@ void tcp_send_fin(struct sock *sk)
+ break;
+ yield();
+ }
++ ub_tcpsndbuf_charge_forced(sk, skb);
+
+ /* Reserve space for headers and prepare control bits. */
+ skb_reserve(skb, MAX_TCP_HEADER);
+@@ -1777,6 +1796,10 @@ int tcp_send_synack(struct sock *sk)
+ struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
+ if (nskb == NULL)
+ return -ENOMEM;
++ if (ub_tcpsndbuf_charge(sk, skb) < 0) {
++ kfree_skb(nskb);
++ return -ENOMEM;
++ }
+ __skb_unlink(skb, &sk->sk_write_queue);
+ skb_header_release(nskb);
+ __skb_queue_head(&sk->sk_write_queue, nskb);
+@@ -1928,6 +1951,10 @@ int tcp_connect(struct sock *sk)
+ buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation);
+ if (unlikely(buff == NULL))
+ return -ENOBUFS;
++ if (ub_tcpsndbuf_charge(sk, buff) < 0) {
++ kfree_skb(buff);
++ return -ENOBUFS;
++ }
+
+ /* Reserve space for headers. */
+ skb_reserve(buff, MAX_TCP_HEADER);
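Note on the tcp_output.c hunks above: every new allocation (tcp_fragment, tso_fragment, tcp_send_synack, tcp_connect) follows the same charge-or-free rule — if ub_tcpsndbuf_charge() fails, the freshly allocated skb is freed at once, so nothing unaccounted ever reaches the write queue. A hedged userspace sketch of that rule follows; charge_sndbuf() and its limit field are invented for illustration and do not mirror the real ub_tcpsndbuf_charge() signature.

#include <stdio.h>
#include <stdlib.h>

struct beancounter { size_t held, limit; };

/* Try to account `size` bytes; fail with no side effects when over the limit. */
static int charge_sndbuf(struct beancounter *bc, size_t size)
{
        if (bc->held + size > bc->limit)
                return -1;
        bc->held += size;
        return 0;
}

/* Allocate-and-charge: the buffer never outlives a failed charge. */
static void *alloc_charged(struct beancounter *bc, size_t size)
{
        void *buf = malloc(size);

        if (buf == NULL)
                return NULL;
        if (charge_sndbuf(bc, size) < 0) {
                free(buf);                      /* mirrors kfree_skb() on charge failure */
                return NULL;
        }
        return buf;
}

int main(void)
{
        struct beancounter bc = { .held = 0, .limit = 4096 };
        void *a = alloc_charged(&bc, 2048);     /* fits under the limit */
        void *b = alloc_charged(&bc, 4096);     /* would exceed it, so it is refused */

        printf("first ok=%d second ok=%d held=%zu\n", a != NULL, b != NULL, bc.held);
        free(a);
        return 0;
}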
+diff -upr linux-2.6.16.orig/net/ipv4/tcp_timer.c linux-2.6.16-026test015/net/ipv4/tcp_timer.c
+--- linux-2.6.16.orig/net/ipv4/tcp_timer.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/tcp_timer.c 2006-07-04 14:41:39.000000000 +0400
+@@ -22,6 +22,8 @@
+
+ #include <linux/module.h>
+ #include <net/tcp.h>
++#include <ub/ub_orphan.h>
++#include <ub/ub_tcp.h>
+
+ int sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
+ int sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
+@@ -67,7 +69,7 @@ static void tcp_write_err(struct sock *s
+ static int tcp_out_of_resources(struct sock *sk, int do_reset)
+ {
+ struct tcp_sock *tp = tcp_sk(sk);
+- int orphans = atomic_read(&tcp_orphan_count);
++ int orphans = ub_get_orphan_count(sk);
+
+ /* If peer does not open window for long time, or did not transmit
+ * anything for long time, penalize it. */
+@@ -78,9 +80,7 @@ static int tcp_out_of_resources(struct s
+ if (sk->sk_err_soft)
+ orphans <<= 1;
+
+- if (orphans >= sysctl_tcp_max_orphans ||
+- (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
+- atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
++ if (ub_too_many_orphans(sk, orphans)) {
+ if (net_ratelimit())
+ printk(KERN_INFO "Out of socket memory\n");
+
+@@ -173,9 +173,12 @@ static int tcp_write_timeout(struct sock
+ static void tcp_delack_timer(unsigned long data)
+ {
+ struct sock *sk = (struct sock*)data;
++ struct ve_struct *env;
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
++ env = set_exec_env(VE_OWNER_SK(sk));
++
+ bh_lock_sock(sk);
+ if (sock_owned_by_user(sk)) {
+ /* Try again later. */
+@@ -224,11 +227,12 @@ static void tcp_delack_timer(unsigned lo
+ TCP_CHECK_TIMER(sk);
+
+ out:
+- if (tcp_memory_pressure)
++ if (ub_tcp_memory_pressure(sk))
+ sk_stream_mem_reclaim(sk);
+ out_unlock:
+ bh_unlock_sock(sk);
+ sock_put(sk);
++ (void)set_exec_env(env);
+ }
+
+ static void tcp_probe_timer(struct sock *sk)
+@@ -283,8 +287,11 @@ static void tcp_probe_timer(struct sock
+ static void tcp_retransmit_timer(struct sock *sk)
+ {
+ struct tcp_sock *tp = tcp_sk(sk);
++ struct ve_struct *env;
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
++ env = set_exec_env(VE_OWNER_SK(sk));
++
+ if (!tp->packets_out)
+ goto out;
+
+@@ -381,15 +388,19 @@ out_reset_timer:
+ if (icsk->icsk_retransmits > sysctl_tcp_retries1)
+ __sk_dst_reset(sk);
+
+-out:;
++out:
++ (void)set_exec_env(env);
+ }
+
+ static void tcp_write_timer(unsigned long data)
+ {
+ struct sock *sk = (struct sock*)data;
++ struct ve_struct *env;
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ int event;
+
++ env = set_exec_env(VE_OWNER_SK(sk));
++
+ bh_lock_sock(sk);
+ if (sock_owned_by_user(sk)) {
+ /* Try again later */
+@@ -423,6 +434,7 @@ out:
+ out_unlock:
+ bh_unlock_sock(sk);
+ sock_put(sk);
++ (void)set_exec_env(env);
+ }
+
+ /*
+@@ -450,10 +462,13 @@ void tcp_set_keepalive(struct sock *sk,
+ static void tcp_keepalive_timer (unsigned long data)
+ {
+ struct sock *sk = (struct sock *) data;
++ struct ve_struct *env;
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ __u32 elapsed;
+
++ env = set_exec_env(VE_OWNER_SK(sk));
++
+ /* Only process if socket is not in use. */
+ bh_lock_sock(sk);
+ if (sock_owned_by_user(sk)) {
+@@ -525,4 +540,5 @@ death:
+ out:
+ bh_unlock_sock(sk);
+ sock_put(sk);
++ (void)set_exec_env(env);
+ }
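Note on the tcp_timer.c hunks above: tcp_out_of_resources() stops consulting the global tcp_orphan_count and sysctl_tcp_max_orphans and instead asks the socket's beancounter (ub_get_orphan_count()/ub_too_many_orphans()), so one container's orphaned sockets cannot trigger resets in another; the timer handlers in the same file also switch the execution environment to the socket's owner for the duration of the callback. A small sketch of such a per-container check follows; struct container_acct and too_many_orphans() are assumptions made for illustration only.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct container_acct {
        size_t orphan_count;    /* orphaned sockets owned by this container */
        size_t orphan_max;      /* per-container ceiling, not a global sysctl */
};

/* Decide using only the owning container's numbers. */
static bool too_many_orphans(const struct container_acct *acct, size_t extra)
{
        return acct->orphan_count + extra > acct->orphan_max;
}

int main(void)
{
        struct container_acct acct = { .orphan_count = 10, .orphan_max = 12 };

        printf("with 1 more: %d\n", too_many_orphans(&acct, 1));   /* 0 */
        printf("with 5 more: %d\n", too_many_orphans(&acct, 5));   /* 1 */
        return 0;
}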
+diff -upr linux-2.6.16.orig/net/ipv4/udp.c linux-2.6.16-026test015/net/ipv4/udp.c
+--- linux-2.6.16.orig/net/ipv4/udp.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv4/udp.c 2006-07-04 14:41:39.000000000 +0400
+@@ -127,7 +127,9 @@ static int udp_v4_get_port(struct sock *
+ struct hlist_node *node;
+ struct sock *sk2;
+ struct inet_sock *inet = inet_sk(sk);
++ struct ve_struct *env;
+
++ env = VE_OWNER_SK(sk);
+ write_lock_bh(&udp_hash_lock);
+ if (snum == 0) {
+ int best_size_so_far, best, result, i;
+@@ -141,7 +143,7 @@ static int udp_v4_get_port(struct sock *
+ struct hlist_head *list;
+ int size;
+
+- list = &udp_hash[result & (UDP_HTABLE_SIZE - 1)];
++ list = &udp_hash[udp_hashfn(result, VEID(env))];
+ if (hlist_empty(list)) {
+ if (result > sysctl_local_port_range[1])
+ result = sysctl_local_port_range[0] +
+@@ -163,7 +165,7 @@ static int udp_v4_get_port(struct sock *
+ result = sysctl_local_port_range[0]
+ + ((result - sysctl_local_port_range[0]) &
+ (UDP_HTABLE_SIZE - 1));
+- if (!udp_lport_inuse(result))
++ if (!udp_lport_inuse(result, env))
+ break;
+ }
+ if (i >= (1 << 16) / UDP_HTABLE_SIZE)
+@@ -172,11 +174,12 @@ gotit:
+ udp_port_rover = snum = result;
+ } else {
+ sk_for_each(sk2, node,
+- &udp_hash[snum & (UDP_HTABLE_SIZE - 1)]) {
++ &udp_hash[udp_hashfn(snum, VEID(env))]) {
+ struct inet_sock *inet2 = inet_sk(sk2);
+
+ if (inet2->num == snum &&
+ sk2 != sk &&
++ ve_accessible_strict(VE_OWNER_SK(sk2), env) &&
+ !ipv6_only_sock(sk2) &&
+ (!sk2->sk_bound_dev_if ||
+ !sk->sk_bound_dev_if ||
+@@ -190,7 +193,7 @@ gotit:
+ }
+ inet->num = snum;
+ if (sk_unhashed(sk)) {
+- struct hlist_head *h = &udp_hash[snum & (UDP_HTABLE_SIZE - 1)];
++ struct hlist_head *h = &udp_hash[udp_hashfn(snum, VEID(env))];
+
+ sk_add_node(sk, h);
+ sock_prot_inc_use(sk->sk_prot);
+@@ -228,11 +231,15 @@ static struct sock *udp_v4_lookup_longwa
+ struct hlist_node *node;
+ unsigned short hnum = ntohs(dport);
+ int badness = -1;
++ struct ve_struct *env;
+
+- sk_for_each(sk, node, &udp_hash[hnum & (UDP_HTABLE_SIZE - 1)]) {
++ env = get_exec_env();
++ sk_for_each(sk, node, &udp_hash[udp_hashfn(hnum, VEID(env))]) {
+ struct inet_sock *inet = inet_sk(sk);
+
+- if (inet->num == hnum && !ipv6_only_sock(sk)) {
++ if (inet->num == hnum &&
++ ve_accessible_strict(VE_OWNER_SK(sk), env) &&
++ !ipv6_only_sock(sk)) {
+ int score = (sk->sk_family == PF_INET ? 1 : 0);
+ if (inet->rcv_saddr) {
+ if (inet->rcv_saddr != daddr)
+@@ -1049,7 +1056,8 @@ static int udp_v4_mcast_deliver(struct s
+ int dif;
+
+ read_lock(&udp_hash_lock);
+- sk = sk_head(&udp_hash[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)]);
++ sk = sk_head(&udp_hash[udp_hashfn(ntohs(uh->dest),
++ VEID(VE_OWNER_SKB(skb)))]);
+ dif = skb->dev->ifindex;
+ sk = udp_v4_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif);
+ if (sk) {
+@@ -1367,10 +1375,14 @@ static struct sock *udp_get_first(struct
+ {
+ struct sock *sk;
+ struct udp_iter_state *state = seq->private;
++ struct ve_struct *env;
+
++ env = get_exec_env();
+ for (state->bucket = 0; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) {
+ struct hlist_node *node;
+ sk_for_each(sk, node, &udp_hash[state->bucket]) {
++ if (!ve_accessible(VE_OWNER_SK(sk), env))
++ continue;
+ if (sk->sk_family == state->family)
+ goto found;
+ }
+@@ -1387,8 +1399,13 @@ static struct sock *udp_get_next(struct
+ do {
+ sk = sk_next(sk);
+ try_again:
+- ;
+- } while (sk && sk->sk_family != state->family);
++ if (!sk)
++ break;
++ if (sk->sk_family != state->family)
++ continue;
++ if (ve_accessible(VE_OWNER_SK(sk), get_exec_env()))
++ break;
++ } while (1);
+
+ if (!sk && ++state->bucket < UDP_HTABLE_SIZE) {
+ sk = sk_head(&udp_hash[state->bucket]);
+@@ -1474,7 +1491,7 @@ int udp_proc_register(struct udp_seq_afi
+ afinfo->seq_fops->llseek = seq_lseek;
+ afinfo->seq_fops->release = seq_release_private;
+
+- p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
++ p = proc_glob_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
+ if (p)
+ p->data = afinfo;
+ else
+@@ -1486,7 +1503,8 @@ void udp_proc_unregister(struct udp_seq_
+ {
+ if (!afinfo)
+ return;
+- proc_net_remove(afinfo->name);
++
++ remove_proc_glob_entry(afinfo->name, NULL);
+ memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
+ }
+
+@@ -1529,7 +1547,7 @@ static int udp4_seq_show(struct seq_file
+ static struct file_operations udp4_seq_fops;
+ static struct udp_seq_afinfo udp4_seq_afinfo = {
+ .owner = THIS_MODULE,
+- .name = "udp",
++ .name = "net/udp",
+ .family = AF_INET,
+ .seq_show = udp4_seq_show,
+ .seq_fops = &udp4_seq_fops,
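Note on the udp.c hunks above: port hashing and lookup now take the owning VE into account — the bucket is chosen by udp_hashfn(port, veid) and every hash walker additionally filters entries with ve_accessible*(). A simplified sketch of such a two-key bucket choice follows; the mixing function below is a guess for illustration only, not the one the patch uses.

#include <stdint.h>
#include <stdio.h>

#define UDP_HTABLE_SIZE 128u    /* must stay a power of two for the mask below */

/* Fold the container id into the bucket choice so equal ports in different
 * containers usually land in different chains (illustrative mixing only). */
static unsigned int udp_bucket(uint16_t port, uint32_t veid)
{
        return (port + veid * 2654435761u) & (UDP_HTABLE_SIZE - 1);
}

int main(void)
{
        printf("port 53 in ve 0:   bucket %u\n", udp_bucket(53, 0));
        printf("port 53 in ve 101: bucket %u\n", udp_bucket(53, 101));
        return 0;
}

Even with separate buckets, the walkers still need the ve_accessible*() checks, because distinct (port, veid) pairs can collide into the same chain.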
+diff -upr linux-2.6.16.orig/net/ipv6/addrconf.c linux-2.6.16-026test015/net/ipv6/addrconf.c
+--- linux-2.6.16.orig/net/ipv6/addrconf.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv6/addrconf.c 2006-07-04 14:41:39.000000000 +0400
+@@ -100,6 +100,7 @@
+ #define TIME_DELTA(a,b) ((unsigned long)((long)(a) - (long)(b)))
+
+ #ifdef CONFIG_SYSCTL
++static struct addrconf_sysctl_table * __addrconf_sysctl_register(struct inet6_dev *idev, char *devname, int ifindex, struct ipv6_devconf *p);
+ static void addrconf_sysctl_register(struct inet6_dev *idev, struct ipv6_devconf *p);
+ static void addrconf_sysctl_unregister(struct ipv6_devconf *p);
+ #endif
+@@ -133,8 +134,6 @@ static DEFINE_SPINLOCK(addrconf_verify_l
+ static void addrconf_join_anycast(struct inet6_ifaddr *ifp);
+ static void addrconf_leave_anycast(struct inet6_ifaddr *ifp);
+
+-static int addrconf_ifdown(struct net_device *dev, int how);
+-
+ static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags);
+ static void addrconf_dad_timer(unsigned long data);
+ static void addrconf_dad_completed(struct inet6_ifaddr *ifp);
+@@ -149,7 +148,7 @@ static int ipv6_chk_same_addr(const stru
+
+ static struct notifier_block *inet6addr_chain;
+
+-struct ipv6_devconf ipv6_devconf = {
++struct ipv6_devconf global_ipv6_devconf = {
+ .forwarding = 0,
+ .hop_limit = IPV6_DEFAULT_HOPLIMIT,
+ .mtu6 = IPV6_MIN_MTU,
+@@ -171,7 +170,7 @@ struct ipv6_devconf ipv6_devconf = {
+ .max_addresses = IPV6_MAX_ADDRESSES,
+ };
+
+-static struct ipv6_devconf ipv6_devconf_dflt = {
++struct ipv6_devconf global_ipv6_devconf_dflt = {
+ .forwarding = 0,
+ .hop_limit = IPV6_DEFAULT_HOPLIMIT,
+ .mtu6 = IPV6_MIN_MTU,
+@@ -192,6 +191,12 @@ static struct ipv6_devconf ipv6_devconf_
+ .max_addresses = IPV6_MAX_ADDRESSES,
+ };
+
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++#define ipv6_devconf_dflt (*(get_exec_env()->_ipv6_devconf_dflt))
++#else
++#define ipv6_devconf_dflt global_ipv6_devconf_dflt
++#endif
++
+ /* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */
+ #if 0
+ const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT;
+@@ -463,8 +468,8 @@ static void addrconf_forward_change(void
+ read_lock(&addrconf_lock);
+ idev = __in6_dev_get(dev);
+ if (idev) {
+- int changed = (!idev->cnf.forwarding) ^ (!ipv6_devconf.forwarding);
+- idev->cnf.forwarding = ipv6_devconf.forwarding;
++ int changed = (!idev->cnf.forwarding) ^ (!ve_ipv6_devconf.forwarding);
++ idev->cnf.forwarding = ve_ipv6_devconf.forwarding;
+ if (changed)
+ dev_forward_change(idev);
+ }
+@@ -1148,9 +1153,10 @@ int ipv6_chk_addr(struct in6_addr *addr,
+ read_lock_bh(&addrconf_hash_lock);
+ for(ifp = inet6_addr_lst[hash]; ifp; ifp=ifp->lst_next) {
+ if (ipv6_addr_equal(&ifp->addr, addr) &&
+- !(ifp->flags&IFA_F_TENTATIVE)) {
++ !(ifp->flags&IFA_F_TENTATIVE) &&
++ ve_accessible_strict(ifp->idev->dev->owner_env, get_exec_env())) {
+ if (dev == NULL || ifp->idev->dev == dev ||
+- !(ifp->scope&(IFA_LINK|IFA_HOST) || strict))
++ !((ifp->scope&(IFA_LINK|IFA_HOST)) || strict))
+ break;
+ }
+ }
+@@ -1166,7 +1172,9 @@ int ipv6_chk_same_addr(const struct in6_
+
+ for(ifp = inet6_addr_lst[hash]; ifp; ifp=ifp->lst_next) {
+ if (ipv6_addr_equal(&ifp->addr, addr)) {
+- if (dev == NULL || ifp->idev->dev == dev)
++ if ((dev == NULL &&
++ ve_accessible_strict(ifp->idev->dev->owner_env, get_exec_env()))
++ || ifp->idev->dev == dev)
+ break;
+ }
+ }
+@@ -1180,9 +1188,10 @@ struct inet6_ifaddr * ipv6_get_ifaddr(st
+
+ read_lock_bh(&addrconf_hash_lock);
+ for(ifp = inet6_addr_lst[hash]; ifp; ifp=ifp->lst_next) {
+- if (ipv6_addr_equal(&ifp->addr, addr)) {
++ if (ipv6_addr_equal(&ifp->addr, addr) &&
++ ve_accessible_strict(ifp->idev->dev->owner_env, get_exec_env())) {
+ if (dev == NULL || ifp->idev->dev == dev ||
+- !(ifp->scope&(IFA_LINK|IFA_HOST) || strict)) {
++ !((ifp->scope&(IFA_LINK|IFA_HOST)) || strict)) {
+ in6_ifa_hold(ifp);
+ break;
+ }
+@@ -1842,7 +1851,7 @@ err_exit:
+ /*
+ * Manual configuration of address on an interface
+ */
+-static int inet6_addr_add(int ifindex, struct in6_addr *pfx, int plen)
++int inet6_addr_add(int ifindex, struct in6_addr *pfx, int plen)
+ {
+ struct inet6_ifaddr *ifp;
+ struct inet6_dev *idev;
+@@ -1871,6 +1880,7 @@ static int inet6_addr_add(int ifindex, s
+
+ return PTR_ERR(ifp);
+ }
++EXPORT_SYMBOL_GPL(inet6_addr_add);
+
+ static int inet6_addr_del(int ifindex, struct in6_addr *pfx, int plen)
+ {
+@@ -1911,7 +1921,7 @@ int addrconf_add_ifaddr(void __user *arg
+ struct in6_ifreq ireq;
+ int err;
+
+- if (!capable(CAP_NET_ADMIN))
++ if (!capable(CAP_VE_NET_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq)))
+@@ -1928,7 +1938,7 @@ int addrconf_del_ifaddr(void __user *arg
+ struct in6_ifreq ireq;
+ int err;
+
+- if (!capable(CAP_NET_ADMIN))
++ if (!capable(CAP_VE_NET_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq)))
+@@ -2270,7 +2280,7 @@ static struct notifier_block ipv6_dev_no
+ .priority = 0
+ };
+
+-static int addrconf_ifdown(struct net_device *dev, int how)
++int addrconf_ifdown(struct net_device *dev, int how)
+ {
+ struct inet6_dev *idev;
+ struct inet6_ifaddr *ifa, **bifa;
+@@ -2278,7 +2288,7 @@ static int addrconf_ifdown(struct net_de
+
+ ASSERT_RTNL();
+
+- if (dev == &loopback_dev && how == 1)
++ if (dev == get_ve0()->_loopback_dev && how == 1)
+ how = 0;
+
+ rt6_ifdown(dev);
+@@ -2386,10 +2396,12 @@ static int addrconf_ifdown(struct net_de
+ }
+ return 0;
+ }
++EXPORT_SYMBOL_GPL(addrconf_ifdown);
+
+ static void addrconf_rs_timer(unsigned long data)
+ {
+ struct inet6_ifaddr *ifp = (struct inet6_ifaddr *) data;
++ struct ve_struct *old_env = set_exec_env(ifp->idev->dev->owner_env);
+
+ if (ifp->idev->cnf.forwarding)
+ goto out;
+@@ -2428,6 +2440,7 @@ static void addrconf_rs_timer(unsigned l
+
+ out:
+ in6_ifa_put(ifp);
++ set_exec_env(old_env);
+ }
+
+ /*
+@@ -2495,6 +2508,7 @@ static void addrconf_dad_timer(unsigned
+ struct inet6_dev *idev = ifp->idev;
+ struct in6_addr unspec;
+ struct in6_addr mcaddr;
++ struct ve_struct *old_env = set_exec_env(ifp->idev->dev->owner_env);
+
+ read_lock_bh(&idev->lock);
+ if (idev->dead) {
+@@ -2527,6 +2541,7 @@ static void addrconf_dad_timer(unsigned
+ ndisc_send_ns(ifp->idev->dev, NULL, &ifp->addr, &mcaddr, &unspec);
+ out:
+ in6_ifa_put(ifp);
++ set_exec_env(old_env);
+ }
+
+ static void addrconf_dad_completed(struct inet6_ifaddr *ifp)
+@@ -2594,8 +2609,11 @@ static struct inet6_ifaddr *if6_get_firs
+
+ for (state->bucket = 0; state->bucket < IN6_ADDR_HSIZE; ++state->bucket) {
+ ifa = inet6_addr_lst[state->bucket];
+- if (ifa)
+- break;
++ while (ifa) {
++ if (ve_accessible_strict(ifa->idev->dev->owner_env, get_exec_env()))
++ return ifa;
++ ifa = ifa->lst_next;
++ }
+ }
+ return ifa;
+ }
+@@ -2606,6 +2624,11 @@ static struct inet6_ifaddr *if6_get_next
+
+ ifa = ifa->lst_next;
+ try_again:
++ while (ifa) {
++ if (ve_accessible_strict(ifa->idev->dev->owner_env, get_exec_env()))
++ break;
++ ifa = ifa->lst_next;
++ }
+ if (!ifa && ++state->bucket < IN6_ADDR_HSIZE) {
+ ifa = inet6_addr_lst[state->bucket];
+ goto try_again;
+@@ -2697,14 +2720,14 @@ static struct file_operations if6_fops =
+
+ int __init if6_proc_init(void)
+ {
+- if (!proc_net_fops_create("if_inet6", S_IRUGO, &if6_fops))
++ if (!proc_glob_fops_create("net/if_inet6", S_IRUGO, &if6_fops))
+ return -ENOMEM;
+ return 0;
+ }
+
+ void if6_proc_exit(void)
+ {
+- proc_net_remove("if_inet6");
++ remove_proc_glob_entry("net/if_inet6", NULL);
+ }
+ #endif /* CONFIG_PROC_FS */
+
+@@ -2717,6 +2740,7 @@ static void addrconf_verify(unsigned lon
+ struct inet6_ifaddr *ifp;
+ unsigned long now, next;
+ int i;
++ struct ve_struct *old_env;
+
+ spin_lock_bh(&addrconf_verify_lock);
+ now = jiffies;
+@@ -2737,6 +2761,8 @@ restart:
+ if (ifp->flags & IFA_F_PERMANENT)
+ continue;
+
++ old_env = set_exec_env(ifp->idev->dev->owner_env);
++
+ spin_lock(&ifp->lock);
+ age = (now - ifp->tstamp) / HZ;
+
+@@ -2751,6 +2777,7 @@ restart:
+ in6_ifa_hold(ifp);
+ read_unlock(&addrconf_hash_lock);
+ ipv6_del_addr(ifp);
++ set_exec_env(old_env);
+ goto restart;
+ } else if (age >= ifp->prefered_lft) {
+ /* jiffies - ifp->tsamp > age >= ifp->prefered_lft */
+@@ -2772,6 +2799,7 @@ restart:
+
+ ipv6_ifa_notify(0, ifp);
+ in6_ifa_put(ifp);
++ set_exec_env(old_env);
+ goto restart;
+ }
+ #ifdef CONFIG_IPV6_PRIVACY
+@@ -2793,6 +2821,7 @@ restart:
+ ipv6_create_tempaddr(ifpub, ifp);
+ in6_ifa_put(ifpub);
+ in6_ifa_put(ifp);
++ set_exec_env(old_env);
+ goto restart;
+ }
+ } else if (time_before(ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ, next))
+@@ -2805,6 +2834,7 @@ restart:
+ next = ifp->tstamp + ifp->prefered_lft * HZ;
+ spin_unlock(&ifp->lock);
+ }
++ set_exec_env(old_env);
+ }
+ read_unlock(&addrconf_hash_lock);
+ }
+@@ -3360,7 +3390,7 @@ int addrconf_sysctl_forward(ctl_table *c
+ ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+
+ if (write && valp != &ipv6_devconf_dflt.forwarding) {
+- if (valp != &ipv6_devconf.forwarding) {
++ if (valp != &ve_ipv6_devconf.forwarding) {
+ if ((!*valp) ^ (!val)) {
+ struct inet6_dev *idev = (struct inet6_dev *)ctl->extra1;
+ if (idev == NULL)
+@@ -3368,7 +3398,7 @@ int addrconf_sysctl_forward(ctl_table *c
+ dev_forward_change(idev);
+ }
+ } else {
+- ipv6_devconf_dflt.forwarding = ipv6_devconf.forwarding;
++ ipv6_devconf_dflt.forwarding = ve_ipv6_devconf.forwarding;
+ addrconf_forward_change();
+ }
+ if (*valp)
+@@ -3411,7 +3441,7 @@ static int addrconf_sysctl_forward_strat
+ }
+
+ if (valp != &ipv6_devconf_dflt.forwarding) {
+- if (valp != &ipv6_devconf.forwarding) {
++ if (valp != &ve_ipv6_devconf.forwarding) {
+ struct inet6_dev *idev = (struct inet6_dev *)table->extra1;
+ int changed;
+ if (unlikely(idev == NULL))
+@@ -3447,7 +3477,7 @@ static struct addrconf_sysctl_table
+ {
+ .ctl_name = NET_IPV6_FORWARDING,
+ .procname = "forwarding",
+- .data = &ipv6_devconf.forwarding,
++ .data = &global_ipv6_devconf.forwarding,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &addrconf_sysctl_forward,
+@@ -3456,7 +3486,7 @@ static struct addrconf_sysctl_table
+ {
+ .ctl_name = NET_IPV6_HOP_LIMIT,
+ .procname = "hop_limit",
+- .data = &ipv6_devconf.hop_limit,
++ .data = &global_ipv6_devconf.hop_limit,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+@@ -3464,7 +3494,7 @@ static struct addrconf_sysctl_table
+ {
+ .ctl_name = NET_IPV6_MTU,
+ .procname = "mtu",
+- .data = &ipv6_devconf.mtu6,
++ .data = &global_ipv6_devconf.mtu6,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+@@ -3472,7 +3502,7 @@ static struct addrconf_sysctl_table
+ {
+ .ctl_name = NET_IPV6_ACCEPT_RA,
+ .procname = "accept_ra",
+- .data = &ipv6_devconf.accept_ra,
++ .data = &global_ipv6_devconf.accept_ra,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+@@ -3480,7 +3510,7 @@ static struct addrconf_sysctl_table
+ {
+ .ctl_name = NET_IPV6_ACCEPT_REDIRECTS,
+ .procname = "accept_redirects",
+- .data = &ipv6_devconf.accept_redirects,
++ .data = &global_ipv6_devconf.accept_redirects,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+@@ -3488,7 +3518,7 @@ static struct addrconf_sysctl_table
+ {
+ .ctl_name = NET_IPV6_AUTOCONF,
+ .procname = "autoconf",
+- .data = &ipv6_devconf.autoconf,
++ .data = &global_ipv6_devconf.autoconf,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+@@ -3496,7 +3526,7 @@ static struct addrconf_sysctl_table
+ {
+ .ctl_name = NET_IPV6_DAD_TRANSMITS,
+ .procname = "dad_transmits",
+- .data = &ipv6_devconf.dad_transmits,
++ .data = &global_ipv6_devconf.dad_transmits,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+@@ -3504,7 +3534,7 @@ static struct addrconf_sysctl_table
+ {
+ .ctl_name = NET_IPV6_RTR_SOLICITS,
+ .procname = "router_solicitations",
+- .data = &ipv6_devconf.rtr_solicits,
++ .data = &global_ipv6_devconf.rtr_solicits,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+@@ -3512,7 +3542,7 @@ static struct addrconf_sysctl_table
+ {
+ .ctl_name = NET_IPV6_RTR_SOLICIT_INTERVAL,
+ .procname = "router_solicitation_interval",
+- .data = &ipv6_devconf.rtr_solicit_interval,
++ .data = &global_ipv6_devconf.rtr_solicit_interval,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+@@ -3521,7 +3551,7 @@ static struct addrconf_sysctl_table
+ {
+ .ctl_name = NET_IPV6_RTR_SOLICIT_DELAY,
+ .procname = "router_solicitation_delay",
+- .data = &ipv6_devconf.rtr_solicit_delay,
++ .data = &global_ipv6_devconf.rtr_solicit_delay,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+@@ -3530,7 +3560,7 @@ static struct addrconf_sysctl_table
+ {
+ .ctl_name = NET_IPV6_FORCE_MLD_VERSION,
+ .procname = "force_mld_version",
+- .data = &ipv6_devconf.force_mld_version,
++ .data = &global_ipv6_devconf.force_mld_version,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+@@ -3539,7 +3569,7 @@ static struct addrconf_sysctl_table
+ {
+ .ctl_name = NET_IPV6_USE_TEMPADDR,
+ .procname = "use_tempaddr",
+- .data = &ipv6_devconf.use_tempaddr,
++ .data = &global_ipv6_devconf.use_tempaddr,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+@@ -3547,7 +3577,7 @@ static struct addrconf_sysctl_table
+ {
+ .ctl_name = NET_IPV6_TEMP_VALID_LFT,
+ .procname = "temp_valid_lft",
+- .data = &ipv6_devconf.temp_valid_lft,
++ .data = &global_ipv6_devconf.temp_valid_lft,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+@@ -3555,7 +3585,7 @@ static struct addrconf_sysctl_table
+ {
+ .ctl_name = NET_IPV6_TEMP_PREFERED_LFT,
+ .procname = "temp_prefered_lft",
+- .data = &ipv6_devconf.temp_prefered_lft,
++ .data = &global_ipv6_devconf.temp_prefered_lft,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+@@ -3563,7 +3593,7 @@ static struct addrconf_sysctl_table
+ {
+ .ctl_name = NET_IPV6_REGEN_MAX_RETRY,
+ .procname = "regen_max_retry",
+- .data = &ipv6_devconf.regen_max_retry,
++ .data = &global_ipv6_devconf.regen_max_retry,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+@@ -3571,7 +3601,7 @@ static struct addrconf_sysctl_table
+ {
+ .ctl_name = NET_IPV6_MAX_DESYNC_FACTOR,
+ .procname = "max_desync_factor",
+- .data = &ipv6_devconf.max_desync_factor,
++ .data = &global_ipv6_devconf.max_desync_factor,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+@@ -3580,7 +3610,7 @@ static struct addrconf_sysctl_table
+ {
+ .ctl_name = NET_IPV6_MAX_ADDRESSES,
+ .procname = "max_addresses",
+- .data = &ipv6_devconf.max_addresses,
++ .data = &global_ipv6_devconf.max_addresses,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+@@ -3635,29 +3665,22 @@ static struct addrconf_sysctl_table
+ },
+ };
+
+-static void addrconf_sysctl_register(struct inet6_dev *idev, struct ipv6_devconf *p)
++static struct addrconf_sysctl_table *
++__addrconf_sysctl_register(struct inet6_dev *idev, char *dev_name, int ifindex, struct ipv6_devconf *p)
+ {
+ int i;
+- struct net_device *dev = idev ? idev->dev : NULL;
+ struct addrconf_sysctl_table *t;
+- char *dev_name = NULL;
+
+ t = kmalloc(sizeof(*t), GFP_KERNEL);
+ if (t == NULL)
+- return;
++ return NULL;
++
+ memcpy(t, &addrconf_sysctl, sizeof(*t));
+ for (i=0; t->addrconf_vars[i].data; i++) {
+- t->addrconf_vars[i].data += (char*)p - (char*)&ipv6_devconf;
++ t->addrconf_vars[i].data += (char*)p - (char*)&global_ipv6_devconf;
+ t->addrconf_vars[i].de = NULL;
+ t->addrconf_vars[i].extra1 = idev; /* embedded; no ref */
+ }
+- if (dev) {
+- dev_name = dev->name;
+- t->addrconf_dev[0].ctl_name = dev->ifindex;
+- } else {
+- dev_name = "default";
+- t->addrconf_dev[0].ctl_name = NET_PROTO_CONF_DEFAULT;
+- }
+
+ /*
+ * Make a copy of dev_name, because '.procname' is regarded as const
+@@ -3668,6 +3691,7 @@ static void addrconf_sysctl_register(str
+ if (!dev_name)
+ goto free;
+
++ t->addrconf_dev[0].ctl_name = ifindex;
+ t->addrconf_dev[0].procname = dev_name;
+
+ t->addrconf_dev[0].child = t->addrconf_vars;
+@@ -3682,9 +3706,7 @@ static void addrconf_sysctl_register(str
+ t->sysctl_header = register_sysctl_table(t->addrconf_root_dir, 0);
+ if (t->sysctl_header == NULL)
+ goto free_procname;
+- else
+- p->sysctl = t;
+- return;
++ return t;
+
+ /* error path */
+ free_procname:
+@@ -3692,7 +3714,26 @@ static void addrconf_sysctl_register(str
+ free:
+ kfree(t);
+
+- return;
++ return NULL;
++}
++
++static void addrconf_sysctl_register(struct inet6_dev *idev, struct ipv6_devconf *p)
++{
++ struct net_device *dev;
++ char *dev_name;
++ int ifindex;
++
++ dev = idev ? idev->dev : NULL;
++
++ if (dev) {
++ dev_name = dev->name;
++ ifindex = dev->ifindex;
++ } else {
++ dev_name = "default";
++ ifindex = NET_PROTO_CONF_DEFAULT;
++ }
++
++ p->sysctl = __addrconf_sysctl_register(idev, dev_name, ifindex, p);
+ }
+
+ static void addrconf_sysctl_unregister(struct ipv6_devconf *p)
+@@ -3706,6 +3747,73 @@ static void addrconf_sysctl_unregister(s
+ }
+ }
+
++int addrconf_sysctl_init(struct ve_struct *ve)
++{
++ int err = 0;
++#ifdef CONFIG_SYSCTL
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++ struct ipv6_devconf *conf, *conf_def;
++
++ err = -ENOMEM;
++
++ conf = kmalloc(sizeof(*conf), GFP_KERNEL);
++ if (!conf)
++ goto err1;
++
++ memcpy(conf, &global_ipv6_devconf, sizeof(*conf));
++ conf->sysctl = __addrconf_sysctl_register(NULL, "all",
++ NET_PROTO_CONF_ALL, conf);
++ if (!conf->sysctl)
++ goto err2;
++
++ conf_def = kmalloc(sizeof(*conf_def), GFP_KERNEL);
++ if (!conf_def)
++ goto err3;
++
++ memcpy(conf_def, &global_ipv6_devconf_dflt, sizeof(*conf_def));
++ conf_def->sysctl = __addrconf_sysctl_register(NULL, "default",
++ NET_PROTO_CONF_DEFAULT, conf_def);
++ if (!conf_def->sysctl)
++ goto err4;
++
++ ve->_ipv6_devconf = conf;
++ ve->_ipv6_devconf_dflt = conf_def;
++ return 0;
++
++err4:
++ kfree(conf_def);
++err3:
++ addrconf_sysctl_unregister(conf);
++err2:
++ kfree(conf);
++err1:
++#endif
++#endif
++ return err;
++}
++EXPORT_SYMBOL(addrconf_sysctl_init);
++
++void addrconf_sysctl_fini(struct ve_struct *ve)
++{
++#ifdef CONFIG_SYSCTL
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++ addrconf_sysctl_unregister(ve->_ipv6_devconf);
++ addrconf_sysctl_unregister(ve->_ipv6_devconf_dflt);
++#endif
++#endif
++}
++EXPORT_SYMBOL(addrconf_sysctl_fini);
++
++void addrconf_sysctl_free(struct ve_struct *ve)
++{
++#ifdef CONFIG_SYSCTL
++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
++ kfree(ve->_ipv6_devconf);
++ kfree(ve->_ipv6_devconf_dflt);
++#endif
++#endif
++}
++EXPORT_SYMBOL(addrconf_sysctl_free);
+
+ #endif
+
+@@ -3731,6 +3839,11 @@ int __init addrconf_init(void)
+ {
+ int err = 0;
+
++#ifdef CONFIG_VE
++ get_ve0()->_ipv6_devconf = &global_ipv6_devconf;
++ get_ve0()->_ipv6_devconf_dflt = &global_ipv6_devconf_dflt;
++#endif
++
+ /* The addrconf netdev notifier requires that loopback_dev
+ * has it's ipv6 private information allocated and setup
+ * before it can bring up and give link-local addresses
+@@ -3772,7 +3885,7 @@ int __init addrconf_init(void)
+ #ifdef CONFIG_SYSCTL
+ addrconf_sysctl.sysctl_header =
+ register_sysctl_table(addrconf_sysctl.addrconf_root_dir, 0);
+- addrconf_sysctl_register(NULL, &ipv6_devconf_dflt);
++ __addrconf_sysctl_register(NULL, "default", NET_PROTO_CONF_DEFAULT, &global_ipv6_devconf_dflt);
+ #endif
+
+ return 0;
+@@ -3789,8 +3902,8 @@ void __exit addrconf_cleanup(void)
+
+ rtnetlink_links[PF_INET6] = NULL;
+ #ifdef CONFIG_SYSCTL
+- addrconf_sysctl_unregister(&ipv6_devconf_dflt);
+- addrconf_sysctl_unregister(&ipv6_devconf);
++ addrconf_sysctl_unregister(&global_ipv6_devconf_dflt);
++ addrconf_sysctl_unregister(&global_ipv6_devconf);
+ #endif
+
+ rtnl_lock();
+@@ -3835,6 +3948,6 @@ void __exit addrconf_cleanup(void)
+ #endif
+
+ #ifdef CONFIG_PROC_FS
+- proc_net_remove("if_inet6");
++ remove_proc_glob_entry("net/if_inet6", NULL);
+ #endif
+ }
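Note on the addrconf.c hunks above: the single ipv6_devconf/ipv6_devconf_dflt pair becomes a per-VE copy selected through get_exec_env(), with addrconf_sysctl_init() cloning the global templates and registering a private sysctl tree for each container. A compact sketch of the "template copied per context, selected at use time" idea follows; struct env, env_of(), conf_template and the ve_devconf macro are illustrative assumptions, not the patch's names.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct devconf { int forwarding; int hop_limit; };

static const struct devconf conf_template = { .forwarding = 0, .hop_limit = 64 };

struct env { struct devconf *conf; };

static struct env *current_env;                 /* models get_exec_env() */

static struct env *env_of(void) { return current_env; }

/* Per-container init: clone the template so later writes stay private. */
static int env_conf_init(struct env *env)
{
        env->conf = malloc(sizeof(*env->conf));
        if (env->conf == NULL)
                return -1;
        memcpy(env->conf, &conf_template, sizeof(conf_template));
        return 0;
}

/* Call sites keep reading "the" config, but it resolves per container. */
#define ve_devconf (*(env_of()->conf))

int main(void)
{
        struct env e;

        current_env = &e;
        if (env_conf_init(&e) != 0)
                return 1;
        ve_devconf.forwarding = 1;              /* private to this env */
        printf("env forwarding=%d, template forwarding=%d\n",
               ve_devconf.forwarding, conf_template.forwarding);
        free(e.conf);
        return 0;
}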
+diff -upr linux-2.6.16.orig/net/ipv6/af_inet6.c linux-2.6.16-026test015/net/ipv6/af_inet6.c
+--- linux-2.6.16.orig/net/ipv6/af_inet6.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv6/af_inet6.c 2006-07-04 14:41:39.000000000 +0400
+@@ -60,6 +60,7 @@
+ #ifdef CONFIG_IPV6_TUNNEL
+ #include <net/ip6_tunnel.h>
+ #endif
++#include <ub/ub_net.h>
+
+ #include <asm/uaccess.h>
+ #include <asm/system.h>
+@@ -160,6 +161,13 @@ lookup_protocol:
+ if (sk == NULL)
+ goto out;
+
++ err = -ENOBUFS;
++ if (ub_sock_charge(sk, PF_INET6, sock->type))
++ goto out_sk_free;
++ /* if charge was successful, sock_init_data() MUST be called to
++	 * set sk->sk_type; otherwise sk will be uncharged against the wrong resource
++ */
++
+ sock_init_data(sock, sk);
+
+ err = 0;
+@@ -234,6 +242,9 @@ out:
+ out_rcu_unlock:
+ rcu_read_unlock();
+ goto out;
++out_sk_free:
++ sk_free(sk);
++ return err;
+ }
+
+
+@@ -650,6 +661,8 @@ int inet6_sk_rebuild_header(struct sock
+ ip6_dst_store(sk, dst, NULL);
+ sk->sk_route_caps = dst->dev->features &
+ ~(NETIF_F_IP_CSUM | NETIF_F_TSO);
++ if (!sysctl_tcp_use_sg)
++ sk->sk_route_caps &= ~NETIF_F_SG;
+ }
+
+ return 0;
+@@ -715,21 +728,21 @@ snmp6_mib_free(void *ptr[2])
+
+ static int __init init_ipv6_mibs(void)
+ {
+- if (snmp6_mib_init((void **)ipv6_statistics, sizeof (struct ipstats_mib),
++ if (snmp6_mib_init((void **)ve_ipv6_statistics, sizeof (struct ipstats_mib),
+ __alignof__(struct ipstats_mib)) < 0)
+ goto err_ip_mib;
+- if (snmp6_mib_init((void **)icmpv6_statistics, sizeof (struct icmpv6_mib),
++ if (snmp6_mib_init((void **)ve_icmpv6_statistics, sizeof (struct icmpv6_mib),
+ __alignof__(struct icmpv6_mib)) < 0)
+ goto err_icmp_mib;
+- if (snmp6_mib_init((void **)udp_stats_in6, sizeof (struct udp_mib),
++ if (snmp6_mib_init((void **)ve_udp_stats_in6, sizeof (struct udp_mib),
+ __alignof__(struct udp_mib)) < 0)
+ goto err_udp_mib;
+ return 0;
+
+ err_udp_mib:
+- snmp6_mib_free((void **)icmpv6_statistics);
++ snmp6_mib_free((void **)ve_icmpv6_statistics);
+ err_icmp_mib:
+- snmp6_mib_free((void **)ipv6_statistics);
++ snmp6_mib_free((void **)ve_ipv6_statistics);
+ err_ip_mib:
+ return -ENOMEM;
+
+@@ -737,9 +750,9 @@ err_ip_mib:
+
+ static void cleanup_ipv6_mibs(void)
+ {
+- snmp6_mib_free((void **)ipv6_statistics);
+- snmp6_mib_free((void **)icmpv6_statistics);
+- snmp6_mib_free((void **)udp_stats_in6);
++ snmp6_mib_free((void **)ve_ipv6_statistics);
++ snmp6_mib_free((void **)ve_icmpv6_statistics);
++ snmp6_mib_free((void **)ve_udp_stats_in6);
+ }
+
+ static int __init inet6_init(void)
+diff -upr linux-2.6.16.orig/net/ipv6/anycast.c linux-2.6.16-026test015/net/ipv6/anycast.c
+--- linux-2.6.16.orig/net/ipv6/anycast.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv6/anycast.c 2006-07-04 14:41:39.000000000 +0400
+@@ -83,7 +83,7 @@ int ipv6_sock_ac_join(struct sock *sk, i
+ struct net_device *dev = NULL;
+ struct inet6_dev *idev;
+ struct ipv6_ac_socklist *pac;
+- int ishost = !ipv6_devconf.forwarding;
++ int ishost = !ve_ipv6_devconf.forwarding;
+ int err = 0;
+
+ if (!capable(CAP_NET_ADMIN))
+@@ -455,6 +455,8 @@ static inline struct ifacaddr6 *ac6_get_
+ state->dev;
+ state->dev = state->dev->next) {
+ struct inet6_dev *idev;
++ if (unlikely(!ve_accessible_strict(state->dev->owner_env, get_exec_env())))
++ continue;
+ idev = in6_dev_get(state->dev);
+ if (!idev)
+ continue;
+@@ -484,6 +486,8 @@ static struct ifacaddr6 *ac6_get_next(st
+ state->idev = NULL;
+ break;
+ }
++ if (unlikely(!ve_accessible_strict(state->dev->owner_env, get_exec_env())))
++ continue;
+ state->idev = in6_dev_get(state->dev);
+ if (!state->idev)
+ continue;
+@@ -579,7 +583,7 @@ static struct file_operations ac6_seq_fo
+
+ int __init ac6_proc_init(void)
+ {
+- if (!proc_net_fops_create("anycast6", S_IRUGO, &ac6_seq_fops))
++ if (!proc_glob_fops_create("net/anycast6", S_IRUGO, &ac6_seq_fops))
+ return -ENOMEM;
+
+ return 0;
+@@ -587,7 +591,7 @@ int __init ac6_proc_init(void)
+
+ void ac6_proc_exit(void)
+ {
+- proc_net_remove("anycast6");
++ remove_proc_glob_entry("net/anycast6", NULL);
+ }
+ #endif
+
+diff -upr linux-2.6.16.orig/net/ipv6/exthdrs.c linux-2.6.16-026test015/net/ipv6/exthdrs.c
+--- linux-2.6.16.orig/net/ipv6/exthdrs.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv6/exthdrs.c 2006-07-04 14:41:36.000000000 +0400
+@@ -489,6 +489,18 @@ int ipv6_parse_hopopts(struct sk_buff *s
+ {
+ struct inet6_skb_parm *opt = IP6CB(skb);
+
++ /*
++ * skb->nh.raw is equal to skb->data, and
++ * skb->h.raw - skb->nh.raw is always equal to
++ * sizeof(struct ipv6hdr) by definition of
++ * hop-by-hop options.
++ */
++ if (!pskb_may_pull(skb, sizeof(struct ipv6hdr) + 8) ||
++ !pskb_may_pull(skb, sizeof(struct ipv6hdr) + ((skb->h.raw[1] + 1) << 3))) {
++ kfree_skb(skb);
++ return -1;
++ }
++
+ opt->hop = sizeof(struct ipv6hdr);
+ if (ip6_parse_tlv(tlvprochopopt_lst, skb)) {
+ skb->h.raw += (skb->h.raw[1]+1)<<3;
+diff -upr linux-2.6.16.orig/net/ipv6/inet6_connection_sock.c linux-2.6.16-026test015/net/ipv6/inet6_connection_sock.c
+--- linux-2.6.16.orig/net/ipv6/inet6_connection_sock.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv6/inet6_connection_sock.c 2006-07-04 14:41:39.000000000 +0400
+@@ -26,6 +26,8 @@
+ #include <net/ip6_route.h>
+ #include <net/sock.h>
+ #include <net/inet6_connection_sock.h>
++#include <ub/ub_net.h>
++#include <ub/ub_orphan.h>
+
+ int inet6_csk_bind_conflict(const struct sock *sk,
+ const struct inet_bind_bucket *tb)
+@@ -36,6 +38,7 @@ int inet6_csk_bind_conflict(const struct
+ /* We must walk the whole port owner list in this case. -DaveM */
+ sk_for_each_bound(sk2, node, &tb->owners) {
+ if (sk != sk2 &&
++ !ve_accessible_strict(VE_OWNER_SK(sk), VE_OWNER_SK(sk2)) &&
+ (!sk->sk_bound_dev_if ||
+ !sk2->sk_bound_dev_if ||
+ sk->sk_bound_dev_if == sk2->sk_bound_dev_if) &&
+@@ -173,6 +176,7 @@ int inet6_csk_xmit(struct sk_buff *skb,
+
+ if (err) {
+ sk->sk_err_soft = -err;
++ kfree_skb(skb);
+ return err;
+ }
+
+@@ -181,12 +185,15 @@ int inet6_csk_xmit(struct sk_buff *skb,
+
+ if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) {
+ sk->sk_route_caps = 0;
++ kfree_skb(skb);
+ return err;
+ }
+
+ ip6_dst_store(sk, dst, NULL);
+ sk->sk_route_caps = dst->dev->features &
+ ~(NETIF_F_IP_CSUM | NETIF_F_TSO);
++ if (!sysctl_tcp_use_sg)
++ sk->sk_route_caps &= ~NETIF_F_SG;
+ }
+
+ skb->dst = dst_clone(dst);
+diff -upr linux-2.6.16.orig/net/ipv6/inet6_hashtables.c linux-2.6.16-026test015/net/ipv6/inet6_hashtables.c
+--- linux-2.6.16.orig/net/ipv6/inet6_hashtables.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv6/inet6_hashtables.c 2006-07-04 14:41:39.000000000 +0400
+@@ -31,9 +31,14 @@ struct sock *inet6_lookup_listener(struc
+ const struct hlist_node *node;
+ struct sock *result = NULL;
+ int score, hiscore = 0;
++ struct ve_struct *env;
++
++ env = get_exec_env();
+
+ read_lock(&hashinfo->lhash_lock);
+- sk_for_each(sk, node, &hashinfo->listening_hash[inet_lhashfn(hnum)]) {
++ sk_for_each(sk, node, &hashinfo->listening_hash[inet_lhashfn(hnum, VEID(env))]) {
++ if (!ve_accessible_strict(VE_OWNER_SK(sk), env))
++ continue;
+ if (inet_sk(sk)->num == hnum && sk->sk_family == PF_INET6) {
+ const struct ipv6_pinfo *np = inet6_sk(sk);
+
+@@ -84,7 +89,8 @@ EXPORT_SYMBOL_GPL(inet6_lookup);
+
+ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
+ struct sock *sk, const __u16 lport,
+- struct inet_timewait_sock **twp)
++ struct inet_timewait_sock **twp,
++ struct ve_struct *ve)
+ {
+ struct inet_hashinfo *hinfo = death_row->hashinfo;
+ struct inet_sock *inet = inet_sk(sk);
+@@ -94,7 +100,7 @@ static int __inet6_check_established(str
+ const int dif = sk->sk_bound_dev_if;
+ const u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
+ const unsigned int hash = inet6_ehashfn(daddr, inet->num, saddr,
+- inet->dport);
++ inet->dport, VEID(ve));
+ struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
+ struct sock *sk2;
+ const struct hlist_node *node;
+@@ -113,7 +119,8 @@ static int __inet6_check_established(str
+ sk2->sk_family == PF_INET6 &&
+ ipv6_addr_equal(&tw6->tw_v6_daddr, saddr) &&
+ ipv6_addr_equal(&tw6->tw_v6_rcv_saddr, daddr) &&
+- sk2->sk_bound_dev_if == sk->sk_bound_dev_if) {
++ sk2->sk_bound_dev_if == sk->sk_bound_dev_if &&
++ ve_accessible_strict(tw->tw_owner_env, VEID(ve))) {
+ if (twsk_unique(sk, sk2, twp))
+ goto unique;
+ else
+@@ -124,7 +131,7 @@ static int __inet6_check_established(str
+
+ /* And established part... */
+ sk_for_each(sk2, node, &head->chain) {
+- if (INET6_MATCH(sk2, hash, saddr, daddr, ports, dif))
++ if (INET6_MATCH(sk2, hash, saddr, daddr, ports, dif, ve))
+ goto not_unique;
+ }
+
+@@ -173,7 +180,9 @@ int inet6_hash_connect(struct inet_timew
+ struct inet_bind_hashbucket *head;
+ struct inet_bind_bucket *tb;
+ int ret;
++ struct ve_struct *ve;
+
++ ve = VE_OWNER_SK(sk);
+ if (snum == 0) {
+ const int low = sysctl_local_port_range[0];
+ const int high = sysctl_local_port_range[1];
+@@ -187,7 +196,8 @@ int inet6_hash_connect(struct inet_timew
+ local_bh_disable();
+ for (i = 1; i <= range; i++) {
+ port = low + (i + offset) % range;
+- head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)];
++ head = &hinfo->bhash[inet_bhashfn(port,
++ hinfo->bhash_size, VEID(ve))];
+ spin_lock(&head->lock);
+
+ /* Does not bother with rcv_saddr checks,
+@@ -201,14 +211,14 @@ int inet6_hash_connect(struct inet_timew
+ goto next_port;
+ if (!__inet6_check_established(death_row,
+ sk, port,
+- &tw))
++ &tw, ve))
+ goto ok;
+ goto next_port;
+ }
+ }
+
+ tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
+- head, port);
++ head, port, ve);
+ if (!tb) {
+ spin_unlock(&head->lock);
+ break;
+@@ -243,7 +253,7 @@ ok:
+ goto out;
+ }
+
+- head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)];
++ head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size, VEID(ve))];
+ tb = inet_csk(sk)->icsk_bind_hash;
+ spin_lock_bh(&head->lock);
+
+@@ -254,7 +264,7 @@ ok:
+ } else {
+ spin_unlock(&head->lock);
+ /* No definite answer... Walk to established hash table */
+- ret = __inet6_check_established(death_row, sk, snum, NULL);
++ ret = __inet6_check_established(death_row, sk, snum, NULL, ve);
+ out:
+ local_bh_enable();
+ return ret;
+diff -upr linux-2.6.16.orig/net/ipv6/ip6_fib.c linux-2.6.16-026test015/net/ipv6/ip6_fib.c
+--- linux-2.6.16.orig/net/ipv6/ip6_fib.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv6/ip6_fib.c 2006-07-04 14:41:39.000000000 +0400
+@@ -1128,8 +1128,12 @@ static int fib6_age(struct rt6_info *rt,
+
+ static DEFINE_SPINLOCK(fib6_gc_lock);
+
++LIST_HEAD(fib6_table_list);
++
+ void fib6_run_gc(unsigned long dummy)
+ {
++ struct fib6_table *tbl;
++
+ if (dummy != ~0UL) {
+ spin_lock_bh(&fib6_gc_lock);
+ gc_args.timeout = dummy ? (int)dummy : ip6_rt_gc_interval;
+@@ -1147,7 +1151,11 @@ void fib6_run_gc(unsigned long dummy)
+
+ write_lock_bh(&rt6_lock);
+ ndisc_dst_gc(&gc_args.more);
+- fib6_clean_tree(&ip6_routing_table, fib6_age, 0, NULL);
++ list_for_each_entry(tbl, &fib6_table_list, list) {
++ struct ve_struct *old_env = set_exec_env(tbl->owner_env);
++ fib6_clean_tree(&tbl->root, fib6_age, 0, NULL);
++ set_exec_env(old_env);
++ }
+ write_unlock_bh(&rt6_lock);
+
+ if (gc_args.more)
+@@ -1163,7 +1171,7 @@ void __init fib6_init(void)
+ {
+ fib6_node_kmem = kmem_cache_create("fib6_nodes",
+ sizeof(struct fib6_node),
+- 0, SLAB_HWCACHE_ALIGN,
++ 0, SLAB_HWCACHE_ALIGN | SLAB_UBC,
+ NULL, NULL);
+ if (!fib6_node_kmem)
+ panic("cannot create fib6_nodes cache");
+diff -upr linux-2.6.16.orig/net/ipv6/ip6_flowlabel.c linux-2.6.16-026test015/net/ipv6/ip6_flowlabel.c
+--- linux-2.6.16.orig/net/ipv6/ip6_flowlabel.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv6/ip6_flowlabel.c 2006-07-04 14:41:39.000000000 +0400
+@@ -417,6 +417,9 @@ int ipv6_flowlabel_opt(struct sock *sk,
+ struct ipv6_fl_socklist *sfl, **sflp;
+ struct ip6_flowlabel *fl;
+
++ if (!ve_is_super(get_exec_env()))
++ return -EPERM;
++
+ if (optlen < sizeof(freq))
+ return -EINVAL;
+
+diff -upr linux-2.6.16.orig/net/ipv6/ip6_output.c linux-2.6.16-026test015/net/ipv6/ip6_output.c
+--- linux-2.6.16.orig/net/ipv6/ip6_output.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv6/ip6_output.c 2006-07-04 14:41:39.000000000 +0400
+@@ -319,7 +319,7 @@ int ip6_forward(struct sk_buff *skb)
+ struct ipv6hdr *hdr = skb->nh.ipv6h;
+ struct inet6_skb_parm *opt = IP6CB(skb);
+
+- if (ipv6_devconf.forwarding == 0)
++ if (ve_ipv6_devconf.forwarding == 0)
+ goto error;
+
+ if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
+@@ -407,6 +407,20 @@ int ip6_forward(struct sk_buff *skb)
+ return -EMSGSIZE;
+ }
+
++	/*
++	 * We try to optimize forwarding of VE packets:
++	 * do not decrement the TTL (and so avoid skb_cow())
++	 * when forwarding outgoing packets from a VE.
++	 * For incoming packets we still decrement the TTL,
++	 * since such an skb is not cloned and does not require
++	 * an actual cow.  So there is at least one place in the
++	 * packet path with a mandatory TTL decrement, which is
++	 * sufficient to prevent routing loops.
++	 */
++ hdr = skb->nh.ipv6h;
++ if (skb->dev->features & NETIF_F_VENET) /* src is VENET device */
++ goto no_ttl_decr;
++
+ if (skb_cow(skb, dst->dev->hard_header_len)) {
+ IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
+ goto drop;
+@@ -418,6 +432,7 @@ int ip6_forward(struct sk_buff *skb)
+
+ hdr->hop_limit--;
+
++no_ttl_decr:
+ IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
+ return NF_HOOK(PF_INET6,NF_IP6_FORWARD, skb, skb->dev, dst->dev, ip6_forward_finish);
+
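Note on the ip6_forward() hunk above: packets leaving a VE over a VENET device skip the hop-limit decrement (and the skb_cow() it would force); incoming packets still pay it, and that single mandatory decrement is what keeps routing loops bounded. A toy model of "decrement at exactly one of the two internal hops" is sketched below; SRC_IS_VENET and the forward() helper are invented for illustration.

#include <stdbool.h>
#include <stdio.h>

#define SRC_IS_VENET 0x1        /* flag on the ingress device, illustrative */

struct pkt { int hop_limit; };

/* Forward one hop; only non-VENET ingress pays the decrement. */
static bool forward(struct pkt *p, unsigned int dev_flags)
{
        if (p->hop_limit <= 1)
                return false;                   /* would expire: drop */
        if (!(dev_flags & SRC_IS_VENET))
                p->hop_limit--;                 /* the single mandatory decrement */
        return true;
}

int main(void)
{
        struct pkt p = { .hop_limit = 64 };

        forward(&p, SRC_IS_VENET);              /* VE -> host: no decrement */
        forward(&p, 0);                         /* host -> outside: decrement once */
        printf("hop_limit now %d\n", p.hop_limit);  /* 63 */
        return 0;
}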
+diff -upr linux-2.6.16.orig/net/ipv6/mcast.c linux-2.6.16-026test015/net/ipv6/mcast.c
+--- linux-2.6.16.orig/net/ipv6/mcast.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv6/mcast.c 2006-07-04 14:41:39.000000000 +0400
+@@ -156,7 +156,7 @@ static int ip6_mc_leave_src(struct sock
+ #define IGMP6_UNSOLICITED_IVAL (10*HZ)
+ #define MLD_QRV_DEFAULT 2
+
+-#define MLD_V1_SEEN(idev) (ipv6_devconf.force_mld_version == 1 || \
++#define MLD_V1_SEEN(idev) (ve_ipv6_devconf.force_mld_version == 1 || \
+ (idev)->cnf.force_mld_version == 1 || \
+ ((idev)->mc_v1_seen && \
+ time_before(jiffies, (idev)->mc_v1_seen)))
+@@ -248,6 +248,7 @@ int ipv6_sock_mc_join(struct sock *sk, i
+
+ return 0;
+ }
++EXPORT_SYMBOL_GPL(ipv6_sock_mc_join);
+
+ /*
+ * socket leave on multicast group
+@@ -2166,15 +2167,18 @@ static void igmp6_leave_group(struct ifm
+ static void mld_gq_timer_expire(unsigned long data)
+ {
+ struct inet6_dev *idev = (struct inet6_dev *)data;
++ struct ve_struct *old_env = set_exec_env(idev->dev->owner_env);
+
+ idev->mc_gq_running = 0;
+ mld_send_report(idev, NULL);
+ __in6_dev_put(idev);
++ set_exec_env(old_env);
+ }
+
+ static void mld_ifc_timer_expire(unsigned long data)
+ {
+ struct inet6_dev *idev = (struct inet6_dev *)data;
++ struct ve_struct *old_env = set_exec_env(idev->dev->owner_env);
+
+ mld_send_cr(idev);
+ if (idev->mc_ifc_count) {
+@@ -2183,6 +2187,7 @@ static void mld_ifc_timer_expire(unsigne
+ mld_ifc_start_timer(idev, idev->mc_maxdelay);
+ }
+ __in6_dev_put(idev);
++ set_exec_env(old_env);
+ }
+
+ static void mld_ifc_event(struct inet6_dev *idev)
+@@ -2197,6 +2202,7 @@ static void mld_ifc_event(struct inet6_d
+ static void igmp6_timer_handler(unsigned long data)
+ {
+ struct ifmcaddr6 *ma = (struct ifmcaddr6 *) data;
++ struct ve_struct *old_env = set_exec_env(ma->idev->dev->owner_env);
+
+ if (MLD_V1_SEEN(ma->idev))
+ igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT);
+@@ -2208,6 +2214,7 @@ static void igmp6_timer_handler(unsigned
+ ma->mca_flags &= ~MAF_TIMER_RUNNING;
+ spin_unlock(&ma->mca_lock);
+ ma_put(ma);
++ set_exec_env(old_env);
+ }
+
+ /* Device going down */
+@@ -2331,6 +2338,8 @@ static inline struct ifmcaddr6 *igmp6_mc
+ state->dev;
+ state->dev = state->dev->next) {
+ struct inet6_dev *idev;
++ if (unlikely(!ve_accessible_strict(state->dev->owner_env, get_exec_env())))
++ continue;
+ idev = in6_dev_get(state->dev);
+ if (!idev)
+ continue;
+@@ -2361,6 +2370,8 @@ static struct ifmcaddr6 *igmp6_mc_get_ne
+ state->idev = NULL;
+ break;
+ }
++ if (unlikely(!ve_accessible_strict(state->dev->owner_env, get_exec_env())))
++ continue;
+ state->idev = in6_dev_get(state->dev);
+ if (!state->idev)
+ continue;
+@@ -2476,6 +2487,8 @@ static inline struct ip6_sf_list *igmp6_
+ state->dev;
+ state->dev = state->dev->next) {
+ struct inet6_dev *idev;
++ if (unlikely(!ve_accessible_strict(state->dev->owner_env, get_exec_env())))
++ continue;
+ idev = in6_dev_get(state->dev);
+ if (unlikely(idev == NULL))
+ continue;
+@@ -2515,6 +2528,8 @@ static struct ip6_sf_list *igmp6_mcf_get
+ state->idev = NULL;
+ goto out;
+ }
++ if (unlikely(!ve_accessible_strict(state->dev->owner_env, get_exec_env())))
++ continue;
+ state->idev = in6_dev_get(state->dev);
+ if (!state->idev)
+ continue;
+@@ -2657,8 +2672,8 @@ int __init igmp6_init(struct net_proto_f
+ np->hop_limit = 1;
+
+ #ifdef CONFIG_PROC_FS
+- proc_net_fops_create("igmp6", S_IRUGO, &igmp6_mc_seq_fops);
+- proc_net_fops_create("mcfilter6", S_IRUGO, &igmp6_mcf_seq_fops);
++ proc_glob_fops_create("net/igmp6", S_IRUGO, &igmp6_mc_seq_fops);
++ proc_glob_fops_create("net/mcfilter6", S_IRUGO, &igmp6_mcf_seq_fops);
+ #endif
+
+ return 0;
+@@ -2670,7 +2685,7 @@ void igmp6_cleanup(void)
+ igmp6_socket = NULL; /* for safety */
+
+ #ifdef CONFIG_PROC_FS
+- proc_net_remove("mcfilter6");
+- proc_net_remove("igmp6");
++ remove_proc_glob_entry("net/mcfilter6", NULL);
++ remove_proc_glob_entry("net/igmp6", NULL);
+ #endif
+ }
+diff -upr linux-2.6.16.orig/net/ipv6/ndisc.c linux-2.6.16-026test015/net/ipv6/ndisc.c
+--- linux-2.6.16.orig/net/ipv6/ndisc.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv6/ndisc.c 2006-07-04 14:41:39.000000000 +0400
+@@ -124,7 +124,7 @@ static struct neigh_ops ndisc_direct_ops
+ .queue_xmit = dev_queue_xmit,
+ };
+
+-struct neigh_table nd_tbl = {
++struct neigh_table global_nd_tbl = {
+ .family = AF_INET6,
+ .entry_size = sizeof(struct neighbour) + sizeof(struct in6_addr),
+ .key_len = sizeof(struct in6_addr),
+@@ -135,7 +135,7 @@ struct neigh_table nd_tbl = {
+ .proxy_redo = pndisc_redo,
+ .id = "ndisc_cache",
+ .parms = {
+- .tbl = &nd_tbl,
++ .tbl = &global_nd_tbl,
+ .base_reachable_time = 30 * HZ,
+ .retrans_time = 1 * HZ,
+ .gc_staletime = 60 * HZ,
+@@ -1660,7 +1660,9 @@ int __init ndisc_init(struct net_proto_f
+ * Initialize the neighbour table
+ */
+
+- neigh_table_init(&nd_tbl);
++ get_ve0()->ve_nd_tbl = &global_nd_tbl;
++ if (neigh_table_init(&nd_tbl))
++ panic("cannot initialize IPv6 NDISC tables\n");
+
+ #ifdef CONFIG_SYSCTL
+ neigh_sysctl_register(NULL, &nd_tbl.parms, NET_IPV6, NET_IPV6_NEIGH,
+@@ -1682,3 +1684,52 @@ void ndisc_cleanup(void)
+ sock_release(ndisc_socket);
+ ndisc_socket = NULL; /* For safety. */
+ }
++
++int ve_ndisc_init(struct ve_struct *ve)
++{
++ struct ve_struct *old_env;
++ int err;
++
++ ve->ve_nd_tbl = kmalloc(sizeof(struct neigh_table), GFP_KERNEL);
++ if (ve->ve_nd_tbl == NULL) {
++ err = -ENOMEM;
++ goto out;
++ }
++
++ *(ve->ve_nd_tbl) = global_nd_tbl;
++ ve->ve_nd_tbl->parms.tbl = ve->ve_nd_tbl;
++ old_env = set_exec_env(ve);
++ err = neigh_table_init(ve->ve_nd_tbl);
++ if (err)
++ goto out_free;
++#ifdef CONFIG_SYSCTL
++ neigh_sysctl_register(NULL, &nd_tbl.parms, NET_IPV6, NET_IPV6_NEIGH,
++ "ipv6",
++ &ndisc_ifinfo_sysctl_change,
++ &ndisc_ifinfo_sysctl_strategy);
++#endif
++ set_exec_env(old_env);
++ err = 0;
++
++out:
++ return err;
++
++out_free:
++ kfree(ve->ve_nd_tbl);
++ ve->ve_nd_tbl = NULL;
++ goto out;
++}
++EXPORT_SYMBOL(ve_ndisc_init);
++
++void ve_ndisc_fini(struct ve_struct *ve)
++{
++ if (ve->ve_nd_tbl) {
++#ifdef CONFIG_SYSCTL
++ neigh_sysctl_unregister(&ve->ve_nd_tbl->parms);
++#endif
++ neigh_table_clear(ve->ve_nd_tbl);
++ kfree(ve->ve_nd_tbl);
++ ve->ve_nd_tbl = NULL;
++ }
++}
++EXPORT_SYMBOL(ve_ndisc_fini);
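Note on the ve_ndisc_init()/ve_ndisc_fini() hunk above: each VE gets its own copy of the global neighbour table; after the struct copy the self-referencing parms.tbl pointer must be re-aimed at the clone, the init runs with the target VE as the execution environment, and every failure path frees what was allocated. A reduced sketch of that clone, re-point and unwind sequence follows; struct ntable and its fields are simplified placeholders, not the kernel layout.

#include <stdio.h>
#include <stdlib.h>

struct ntable {
        struct ntable *self;    /* models parms.tbl pointing back at its table */
        int reachable_time;
};

static struct ntable global_tbl = { .self = &global_tbl, .reachable_time = 30 };

static int table_init(struct ntable *t) { (void)t; return 0; }  /* stand-in */

static struct ntable *clone_table(void)
{
        struct ntable *t = malloc(sizeof(*t));

        if (t == NULL)
                return NULL;
        *t = global_tbl;        /* struct copy brings the stale back-pointer ... */
        t->self = t;            /* ... so re-aim it at the clone */
        if (table_init(t) != 0) {
                free(t);        /* unwind on failure, as ve_ndisc_init() does */
                return NULL;
        }
        return t;
}

int main(void)
{
        struct ntable *t = clone_table();

        if (t != NULL) {
                printf("clone self-points: %d\n", t->self == t);  /* 1 */
                free(t);
        }
        return 0;
}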
+diff -upr linux-2.6.16.orig/net/ipv6/netfilter/ip6_queue.c linux-2.6.16-026test015/net/ipv6/netfilter/ip6_queue.c
+--- linux-2.6.16.orig/net/ipv6/netfilter/ip6_queue.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv6/netfilter/ip6_queue.c 2006-07-04 14:41:39.000000000 +0400
+@@ -540,8 +540,11 @@ ipq_rcv_sk(struct sock *sk, int len)
+ down(&ipqnl_sem);
+
+ for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) {
++ struct ve_struct *env;
+ skb = skb_dequeue(&sk->sk_receive_queue);
++ env = set_exec_env(VE_OWNER_SKB(skb));
+ ipq_rcv_skb(skb);
++ (void)set_exec_env(env);
+ kfree_skb(skb);
+ }
+
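
The two lines added to ipq_rcv_sk() switch into the context of the container that owns each dequeued skb and restore the previous context afterwards. The pattern is simply "swap a current-context pointer, do the work, swap it back"; a stand-alone sketch follows, where struct ctx and set_ctx() are inventions for illustration rather than the OpenVZ API.

#include <stdio.h>

struct ctx { const char *name; };

static struct ctx root = { "root" };
static struct ctx *current_ctx = &root;

/* Install a new current context and hand back the old one,
 * mirroring set_exec_env() in the patch. */
static struct ctx *set_ctx(struct ctx *c)
{
        struct ctx *old = current_ctx;
        current_ctx = c;
        return old;
}

static void handle(const char *item, struct ctx *owner)
{
        struct ctx *old = set_ctx(owner);  /* enter the item's context */
        printf("%s handled in %s\n", item, current_ctx->name);
        (void)set_ctx(old);                /* always restore on the way out */
}

int main(void)
{
        struct ctx ve101 = { "ve101" };
        handle("skb#1", &ve101);
        handle("skb#2", &root);
        return 0;
}
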
+diff -upr linux-2.6.16.orig/net/ipv6/netfilter/ip6_tables.c linux-2.6.16-026test015/net/ipv6/netfilter/ip6_tables.c
+--- linux-2.6.16.orig/net/ipv6/netfilter/ip6_tables.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv6/netfilter/ip6_tables.c 2006-07-04 14:41:39.000000000 +0400
+@@ -32,9 +32,11 @@
+ #include <asm/semaphore.h>
+ #include <linux/proc_fs.h>
+ #include <linux/cpumask.h>
++#include <ub/ub_mem.h>
+
+ #include <linux/netfilter_ipv6/ip6_tables.h>
+ #include <linux/netfilter/x_tables.h>
++#include <linux/nfcalls.h>
+
+ MODULE_LICENSE("GPL");
+ MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+@@ -79,6 +81,14 @@ do { \
+ #define inline
+ #endif
+
++#ifdef CONFIG_VE_IPTABLES
++/* include ve.h and define get_exec_env */
++#include <linux/sched.h>
++#define ve_ip6t_standard_target (get_exec_env()->_ip6t_standard_target)
++#else
++#define ve_ip6t_standard_target &ip6t_standard_target
++#endif
++
+ /*
+ We keep a set of rules for each CPU, so we can avoid write-locking
+ them in the softirq when updating the counters and therefore
+@@ -632,7 +642,7 @@ check_entry(struct ip6t_entry *e, const
+ }
+ t->u.kernel.target = target;
+
+- if (t->u.kernel.target == &ip6t_standard_target) {
++ if (t->u.kernel.target == ve_ip6t_standard_target) {
+ if (!standard_check(t, size)) {
+ ret = -EINVAL;
+ goto cleanup_matches;
+@@ -1120,7 +1130,7 @@ do_add_counters(void __user *user, unsig
+
+ write_lock_bh(&t->lock);
+ private = t->private;
+- if (private->number != paddc->num_counters) {
++ if (private->number != tmp.num_counters) {
+ ret = -EINVAL;
+ goto unlock_up_free;
+ }
+@@ -1148,7 +1158,7 @@ do_ip6t_set_ctl(struct sock *sk, int cmd
+ {
+ int ret;
+
+- if (!capable(CAP_NET_ADMIN))
++ if (!capable(CAP_VE_NET_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+@@ -1173,7 +1183,7 @@ do_ip6t_get_ctl(struct sock *sk, int cmd
+ {
+ int ret;
+
+- if (!capable(CAP_NET_ADMIN))
++ if (!capable(CAP_VE_NET_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+@@ -1271,7 +1281,7 @@ do_ip6t_get_ctl(struct sock *sk, int cmd
+ return ret;
+ }
+
+-int ip6t_register_table(struct xt_table *table,
++struct ip6t_table *ip6t_register_table(struct xt_table *table,
+ const struct ip6t_replace *repl)
+ {
+ int ret;
+@@ -1282,7 +1292,7 @@ int ip6t_register_table(struct xt_table
+
+ newinfo = xt_alloc_table_info(repl->size);
+ if (!newinfo)
+- return -ENOMEM;
++ return ERR_PTR(-ENOMEM);
+
+ /* choose the copy on our node/cpu */
+ loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+@@ -1295,15 +1305,13 @@ int ip6t_register_table(struct xt_table
+ repl->underflow);
+ if (ret != 0) {
+ xt_free_table_info(newinfo);
+- return ret;
++ return ERR_PTR(ret);
+ }
+
+- if (xt_register_table(table, &bootstrap, newinfo) != 0) {
++ table = virt_xt_register_table(table, &bootstrap, newinfo);
++ if (IS_ERR(table))
+ xt_free_table_info(newinfo);
+- return ret;
+- }
+-
+- return 0;
++ return table;
+ }
+
+ void ip6t_unregister_table(struct xt_table *table)
+@@ -1311,7 +1319,7 @@ void ip6t_unregister_table(struct xt_tab
+ struct xt_table_info *private;
+ void *loc_cpu_entry;
+
+- private = xt_unregister_table(table);
++ private = virt_xt_unregister_table(table);
+
+ /* Decrease module usage counts and free resources */
+ loc_cpu_entry = private->entries[raw_smp_processor_id()];
+@@ -1319,6 +1327,29 @@ void ip6t_unregister_table(struct xt_tab
+ xt_free_table_info(private);
+ }
+
++void ip6t_flush_table(struct xt_table *table)
++{
++ struct xt_table *t;
++ void *loc_cpu_entry;
++
++ if (table == NULL)
++ return;
++
++ t = xt_find_table_lock(AF_INET6, table->name);
++ if (t && !IS_ERR(t)) {
++ struct xt_table_info *private;
++ private = t->private;
++ loc_cpu_entry = private->entries[raw_smp_processor_id()];
++ IP6T_ENTRY_ITERATE(loc_cpu_entry, private->size,
++ cleanup_entry, NULL);
++ if (private->number > private->initial_entries)
++ module_put(t->me);
++ private->size = 0;
++ xt_table_unlock(t);
++ module_put(t->me);
++ }
++}
++
+ /* Returns 1 if the type and code is matched by the range, 0 otherwise */
+ static inline int
+ icmp6_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code,
+@@ -1405,36 +1436,93 @@ static struct ip6t_match icmp6_matchstru
+ .checkentry = &icmp6_checkentry,
+ };
+
+-static int __init init(void)
++static int init_ip6tables(void)
+ {
+ int ret;
+
+- xt_proto_init(AF_INET6);
++ if (ve_ip6t_standard_target != NULL)
++ return -EEXIST;
+
+- /* Noone else will be downing sem now, so we won't sleep */
+- xt_register_target(AF_INET6, &ip6t_standard_target);
+- xt_register_target(AF_INET6, &ip6t_error_target);
+- xt_register_match(AF_INET6, &icmp6_matchstruct);
++ ret = xt_register_target(AF_INET6, &ip6t_standard_target);
++ if (ret)
++ goto out;
++#ifdef CONFIG_VE_IPTABLES
++ ve_ip6t_standard_target = xt_find_target(AF_INET6, IP6T_STANDARD_TARGET, 0);
++ if (IS_ERR(ve_ip6t_standard_target))
++ goto out_standard;
++#endif
++ ret = xt_register_target(AF_INET6, &ip6t_error_target);
++ if (ret)
++ goto out_error;
++ ret = xt_register_match(AF_INET6, &icmp6_matchstruct);
++ if (ret)
++ goto out_icmp;
++ ret = xt_proto_init(AF_INET6);
++ if (ret)
++ goto out_proc;
++ return 0;
++
++out_proc:
++ xt_unregister_match(AF_INET6, &icmp6_matchstruct);
++out_icmp:
++ xt_unregister_target(AF_INET6, &ip6t_error_target);
++out_error:
++#ifdef CONFIG_VE_IPTABLES
++ ve_ip6t_standard_target = NULL;
++out_standard:
++#endif
++ xt_unregister_target(AF_INET6, &ip6t_standard_target);
++out:
++ return ret;
++}
++
++static void fini_ip6tables(void)
++{
++ xt_proto_fini(AF_INET6);
++ xt_unregister_match(AF_INET6, &icmp6_matchstruct);
++ xt_unregister_target(AF_INET6, &ip6t_error_target);
++#ifdef CONFIG_VE_IPTABLES
++ ve_ip6t_standard_target = NULL;
++#endif
++ xt_unregister_target(AF_INET6, &ip6t_standard_target);
++}
++
++static int __init init(void)
++{
++ int ret;
++
++ ret = init_ip6tables();
++ if (ret)
++ goto out;
+
+ /* Register setsockopt */
+ ret = nf_register_sockopt(&ip6t_sockopts);
+ if (ret < 0) {
+ duprintf("Unable to register sockopts.\n");
+- xt_proto_fini(AF_INET6);
+- return ret;
++ goto out_sockopts;
+ }
+
++ KSYMRESOLVE(init_ip6tables);
++ KSYMRESOLVE(fini_ip6tables);
++ KSYMRESOLVE(ip6t_flush_table);
++ KSYMMODRESOLVE(ip6_tables);
+ printk("ip6_tables: (C) 2000-2006 Netfilter Core Team\n");
+ return 0;
++
++out_sockopts:
++ fini_ip6tables();
++out:
++ return ret;
+ }
+
+ static void __exit fini(void)
+ {
++ KSYMMODUNRESOLVE(ip6_tables);
++ KSYMUNRESOLVE(init_ip6tables);
++ KSYMUNRESOLVE(fini_ip6tables);
++ KSYMUNRESOLVE(ip6t_flush_table);
+ nf_unregister_sockopt(&ip6t_sockopts);
+- xt_unregister_match(AF_INET6, &icmp6_matchstruct);
+- xt_unregister_target(AF_INET6, &ip6t_error_target);
+- xt_unregister_target(AF_INET6, &ip6t_standard_target);
+- xt_proto_fini(AF_INET6);
++ fini_ip6tables();
+ }
+
+ /*
+@@ -1516,6 +1604,7 @@ EXPORT_SYMBOL(ip6t_do_table);
+ EXPORT_SYMBOL(ip6t_ext_hdr);
+ EXPORT_SYMBOL(ipv6_find_hdr);
+ EXPORT_SYMBOL(ip6_masked_addrcmp);
++EXPORT_SYMBOL(ip6t_flush_table);
+
+-module_init(init);
++subsys_initcall(init);
+ module_exit(fini);
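
A notable interface change in ip6_tables.c above: ip6t_register_table() now returns the registered table, with failures encoded in the pointer, instead of an int, so callers can keep the per-container copy that virt_xt_register_table() hands back. The sketch below shows the pointer-encoded-error convention with simplified stand-ins for ERR_PTR()/IS_ERR()/PTR_ERR(); the real macros live in the kernel's <linux/err.h>.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* Simplified stand-ins for the kernel's ERR_PTR()/IS_ERR()/PTR_ERR(). */
#define MAX_ERRNO 4095
static void *err_ptr(long err) { return (void *)(intptr_t)err; }
static int is_err(const void *p) { return (uintptr_t)p >= (uintptr_t)-MAX_ERRNO; }
static long ptr_err(const void *p) { return (long)(intptr_t)p; }

struct table { const char *name; };

/* Register a table; on failure the error comes back inside the pointer,
 * like the reworked ip6t_register_table() in the patch. */
static struct table *register_table(const char *name)
{
        struct table *t;

        if (!name)
                return err_ptr(-EINVAL);
        t = malloc(sizeof(*t));
        if (!t)
                return err_ptr(-ENOMEM);
        t->name = name;
        return t;
}

int main(void)
{
        struct table *t = register_table("filter");
        if (is_err(t)) {
                fprintf(stderr, "register failed: %ld\n", ptr_err(t));
                return 1;
        }
        printf("registered %s\n", t->name);
        free(t);
        return 0;
}
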
+diff -upr linux-2.6.16.orig/net/ipv6/netfilter/ip6t_LOG.c linux-2.6.16-026test015/net/ipv6/netfilter/ip6t_LOG.c
+--- linux-2.6.16.orig/net/ipv6/netfilter/ip6t_LOG.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv6/netfilter/ip6t_LOG.c 2006-07-04 14:41:39.000000000 +0400
+@@ -20,6 +20,7 @@
+ #include <net/udp.h>
+ #include <net/tcp.h>
+ #include <net/ipv6.h>
++#include <linux/nfcalls.h>
+ #include <linux/netfilter.h>
+ #include <linux/netfilter_ipv6/ip6_tables.h>
+
+@@ -488,10 +489,23 @@ static struct nf_logger ip6t_logger = {
+ .me = THIS_MODULE,
+ };
+
++int init_ip6table_LOG(void)
++{
++ return ip6t_register_target(&ip6t_log_reg);
++}
++
++void fini_ip6table_LOG(void)
++{
++ ip6t_unregister_target(&ip6t_log_reg);
++}
++
+ static int __init init(void)
+ {
+- if (ip6t_register_target(&ip6t_log_reg))
+- return -EINVAL;
++ int err;
++
++ err = init_ip6table_LOG();
++ if (err < 0)
++ return err;
+ if (nf_log_register(PF_INET6, &ip6t_logger) < 0) {
+ printk(KERN_WARNING "ip6t_LOG: not logging via system console "
+ "since somebody else already registered for PF_INET6\n");
+@@ -499,13 +513,19 @@ static int __init init(void)
+ * ip6tables userspace would abort */
+ }
+
++ KSYMRESOLVE(init_ip6table_LOG);
++ KSYMRESOLVE(fini_ip6table_LOG);
++ KSYMMODRESOLVE(ip6t_LOG);
+ return 0;
+ }
+
+ static void __exit fini(void)
+ {
++ KSYMMODUNRESOLVE(ip6t_LOG);
++ KSYMUNRESOLVE(init_ip6table_LOG);
++ KSYMUNRESOLVE(fini_ip6table_LOG);
+ nf_log_unregister_logger(&ip6t_logger);
+- ip6t_unregister_target(&ip6t_log_reg);
++ fini_ip6table_LOG();
+ }
+
+ module_init(init);
+diff -upr linux-2.6.16.orig/net/ipv6/netfilter/ip6t_REJECT.c linux-2.6.16-026test015/net/ipv6/netfilter/ip6t_REJECT.c
+--- linux-2.6.16.orig/net/ipv6/netfilter/ip6t_REJECT.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv6/netfilter/ip6t_REJECT.c 2006-07-04 14:41:39.000000000 +0400
+@@ -26,6 +26,7 @@
+ #include <net/ip6_checksum.h>
+ #include <net/ip6_fib.h>
+ #include <net/ip6_route.h>
++#include <linux/nfcalls.h>
+ #include <net/flow.h>
+ #include <linux/netfilter_ipv6/ip6_tables.h>
+ #include <linux/netfilter_ipv6/ip6t_REJECT.h>
+@@ -268,17 +269,39 @@ static struct ip6t_target ip6t_reject_re
+ .me = THIS_MODULE
+ };
+
+-static int __init init(void)
++int init_ip6table_REJECT(void)
+ {
+ if (ip6t_register_target(&ip6t_reject_reg))
+ return -EINVAL;
+ return 0;
+ }
+
+-static void __exit fini(void)
++void fini_ip6table_REJECT(void)
+ {
+ ip6t_unregister_target(&ip6t_reject_reg);
+ }
+
++static int __init init(void)
++{
++ int err;
++
++ err = init_ip6table_REJECT();
++ if (err < 0)
++ return err;
++
++ KSYMRESOLVE(init_ip6table_REJECT);
++ KSYMRESOLVE(fini_ip6table_REJECT);
++ KSYMMODRESOLVE(ip6t_REJECT);
++ return 0;
++}
++
++static void __exit fini(void)
++{
++ KSYMMODUNRESOLVE(ip6t_REJECT);
++ KSYMUNRESOLVE(init_ip6table_REJECT);
++ KSYMUNRESOLVE(fini_ip6table_REJECT);
++ fini_ip6table_REJECT();
++}
++
+ module_init(init);
+ module_exit(fini);
+diff -upr linux-2.6.16.orig/net/ipv6/netfilter/ip6t_multiport.c linux-2.6.16-026test015/net/ipv6/netfilter/ip6t_multiport.c
+--- linux-2.6.16.orig/net/ipv6/netfilter/ip6t_multiport.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv6/netfilter/ip6t_multiport.c 2006-07-04 14:41:39.000000000 +0400
+@@ -14,6 +14,7 @@
+ #include <linux/udp.h>
+ #include <linux/skbuff.h>
+ #include <linux/in.h>
++#include <linux/nfcalls.h>
+
+ #include <linux/netfilter_ipv6/ip6t_multiport.h>
+ #include <linux/netfilter_ipv6/ip6_tables.h>
+@@ -112,15 +113,37 @@ static struct ip6t_match multiport_match
+ .me = THIS_MODULE,
+ };
+
+-static int __init init(void)
++int init_ip6table_multiport(void)
+ {
+ return ip6t_register_match(&multiport_match);
+ }
+
+-static void __exit fini(void)
++void fini_ip6table_multiport(void)
+ {
+ ip6t_unregister_match(&multiport_match);
+ }
+
++static int __init init(void)
++{
++ int err;
++
++ err = init_ip6table_multiport();
++ if (err < 0)
++ return err;
++
++ KSYMRESOLVE(init_ip6table_multiport);
++ KSYMRESOLVE(fini_ip6table_multiport);
++ KSYMMODRESOLVE(ip6t_multiport);
++ return 0;
++}
++
++static void __exit fini(void)
++{
++ KSYMMODUNRESOLVE(ip6t_multiport);
++ KSYMUNRESOLVE(init_ip6table_multiport);
++ KSYMUNRESOLVE(fini_ip6table_multiport);
++ fini_ip6table_multiport();
++}
++
+ module_init(init);
+ module_exit(fini);
+diff -upr linux-2.6.16.orig/net/ipv6/netfilter/ip6table_filter.c linux-2.6.16-026test015/net/ipv6/netfilter/ip6table_filter.c
+--- linux-2.6.16.orig/net/ipv6/netfilter/ip6table_filter.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv6/netfilter/ip6table_filter.c 2006-07-04 14:41:39.000000000 +0400
+@@ -11,12 +11,20 @@
+
+ #include <linux/module.h>
+ #include <linux/moduleparam.h>
++#include <linux/nfcalls.h>
+ #include <linux/netfilter_ipv6/ip6_tables.h>
+
+ MODULE_LICENSE("GPL");
+ MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+ MODULE_DESCRIPTION("ip6tables filter table");
+
++#ifdef CONFIG_VE_IPTABLES
++#include <linux/sched.h>
++#define ve_packet_filter (get_exec_env()->_ve_ip6t_filter_pf)
++#else
++#define ve_packet_filter &packet_filter
++#endif
++
+ #define FILTER_VALID_HOOKS ((1 << NF_IP6_LOCAL_IN) | (1 << NF_IP6_FORWARD) | (1 << NF_IP6_LOCAL_OUT))
+
+ /* Standard entry. */
+@@ -43,7 +51,7 @@ static struct
+ struct ip6t_replace repl;
+ struct ip6t_standard entries[3];
+ struct ip6t_error term;
+-} initial_table __initdata
++} initial_table
+ = { { "filter", FILTER_VALID_HOOKS, 4,
+ sizeof(struct ip6t_standard) * 3 + sizeof(struct ip6t_error),
+ { [NF_IP6_LOCAL_IN] = 0,
+@@ -108,7 +116,7 @@ ip6t_hook(unsigned int hook,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+ {
+- return ip6t_do_table(pskb, hook, in, out, &packet_filter, NULL);
++ return ip6t_do_table(pskb, hook, in, out, ve_packet_filter, NULL);
+ }
+
+ static unsigned int
+@@ -128,7 +136,7 @@ ip6t_local_out_hook(unsigned int hook,
+ }
+ #endif
+
+- return ip6t_do_table(pskb, hook, in, out, &packet_filter, NULL);
++ return ip6t_do_table(pskb, hook, in, out, ve_packet_filter, NULL);
+ }
+
+ static struct nf_hook_ops ip6t_ops[] = {
+@@ -159,56 +167,89 @@ static struct nf_hook_ops ip6t_ops[] = {
+ static int forward = NF_ACCEPT;
+ module_param(forward, bool, 0000);
+
+-static int __init init(void)
++int init_ip6table_filter(void)
+ {
+ int ret;
+-
+- if (forward < 0 || forward > NF_MAX_VERDICT) {
+- printk("iptables forward must be 0 or 1\n");
+- return -EINVAL;
+- }
+-
+- /* Entry 1 is the FORWARD hook */
+- initial_table.entries[1].target.verdict = -forward - 1;
++ struct ip6t_table *tmp_filter;
+
+ /* Register table */
+- ret = ip6t_register_table(&packet_filter, &initial_table.repl);
+- if (ret < 0)
+- return ret;
++ tmp_filter = ip6t_register_table(&packet_filter,
++ &initial_table.repl);
++ if (IS_ERR(tmp_filter))
++ return PTR_ERR(tmp_filter);
++#ifdef CONFIG_VE_IPTABLES
++ ve_packet_filter = tmp_filter;
++#endif
+
+ /* Register hooks */
+- ret = nf_register_hook(&ip6t_ops[0]);
++ ret = virt_nf_register_hook(&ip6t_ops[0]);
+ if (ret < 0)
+ goto cleanup_table;
+
+- ret = nf_register_hook(&ip6t_ops[1]);
++ ret = virt_nf_register_hook(&ip6t_ops[1]);
+ if (ret < 0)
+ goto cleanup_hook0;
+
+- ret = nf_register_hook(&ip6t_ops[2]);
++ ret = virt_nf_register_hook(&ip6t_ops[2]);
+ if (ret < 0)
+ goto cleanup_hook1;
+
+ return ret;
+
+ cleanup_hook1:
+- nf_unregister_hook(&ip6t_ops[1]);
++ virt_nf_unregister_hook(&ip6t_ops[1]);
+ cleanup_hook0:
+- nf_unregister_hook(&ip6t_ops[0]);
++ virt_nf_unregister_hook(&ip6t_ops[0]);
+ cleanup_table:
+- ip6t_unregister_table(&packet_filter);
++ ip6t_unregister_table(ve_packet_filter);
++#ifdef CONFIG_VE_IPTABLES
++ ve_packet_filter = NULL;
++#endif
+
+ return ret;
+ }
+
+-static void __exit fini(void)
++void fini_ip6table_filter(void)
+ {
+ unsigned int i;
+
+ for (i = 0; i < sizeof(ip6t_ops)/sizeof(struct nf_hook_ops); i++)
+- nf_unregister_hook(&ip6t_ops[i]);
++ virt_nf_unregister_hook(&ip6t_ops[i]);
+
+- ip6t_unregister_table(&packet_filter);
++ ip6t_unregister_table(ve_packet_filter);
++#ifdef CONFIG_VE_IPTABLES
++ ve_packet_filter = NULL;
++#endif
++}
++
++static int __init init(void)
++{
++ int err;
++
++ if (forward < 0 || forward > NF_MAX_VERDICT) {
++ printk("iptables forward must be 0 or 1\n");
++ return -EINVAL;
++ }
++
++ /* Entry 1 is the FORWARD hook */
++ initial_table.entries[1].target.verdict = -forward - 1;
++
++ err = init_ip6table_filter();
++ if (err < 0)
++ return err;
++
++ KSYMRESOLVE(init_ip6table_filter);
++ KSYMRESOLVE(fini_ip6table_filter);
++ KSYMMODRESOLVE(ip6table_filter);
++ return 0;
++}
++
++static void __exit fini(void)
++{
++ KSYMMODUNRESOLVE(ip6table_filter);
++ KSYMUNRESOLVE(init_ip6table_filter);
++ KSYMUNRESOLVE(fini_ip6table_filter);
++ fini_ip6table_filter();
+ }
+
+ module_init(init);
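
ip6table_filter.c (and ip6table_mangle.c below) replace direct uses of &packet_filter with a macro that resolves either to the global table or to a field of the calling container, depending on CONFIG_VE_IPTABLES. A compact sketch of that compile-time indirection, with names invented for illustration:

#include <stdio.h>

#define CONFIG_CT 1   /* flip to 0 to get the single-instance build */

struct filter { int rules; };
struct container { struct filter *filter; };

static struct filter global_filter = { 3 };
static struct container ct0 = { &global_filter };
static struct container *current_ct = &ct0;

#if CONFIG_CT
/* Per-container build: resolve through the calling container,
 * like ve_packet_filter -> get_exec_env()->_ve_ip6t_filter_pf. */
#define ct_filter (current_ct->filter)
#else
/* Single-instance build: fall back to the global object. */
#define ct_filter (&global_filter)
#endif

int main(void)
{
        struct filter f101 = { 7 };
        struct container ct101 = { &f101 };

        printf("rules=%d\n", ct_filter->rules);   /* container 0 sees 3 */
        current_ct = &ct101;
        printf("rules=%d\n", ct_filter->rules);   /* container 101 sees 7 */
        return 0;
}

Because the indirection is hidden behind the existing identifier, the hook functions (ip6t_hook, ip6t_local_out_hook) need only a one-line change each.
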
+diff -upr linux-2.6.16.orig/net/ipv6/netfilter/ip6table_mangle.c linux-2.6.16-026test015/net/ipv6/netfilter/ip6table_mangle.c
+--- linux-2.6.16.orig/net/ipv6/netfilter/ip6table_mangle.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv6/netfilter/ip6table_mangle.c 2006-07-04 14:41:39.000000000 +0400
+@@ -12,6 +12,7 @@
+ */
+ #include <linux/module.h>
+ #include <linux/netfilter_ipv6/ip6_tables.h>
++#include <linux/nfcalls.h>
+
+ MODULE_LICENSE("GPL");
+ MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+@@ -53,7 +54,7 @@ static struct
+ struct ip6t_replace repl;
+ struct ip6t_standard entries[5];
+ struct ip6t_error term;
+-} initial_table __initdata
++} initial_table
+ = { { "mangle", MANGLE_VALID_HOOKS, 6,
+ sizeof(struct ip6t_standard) * 5 + sizeof(struct ip6t_error),
+ { [NF_IP6_PRE_ROUTING] = 0,
+@@ -130,6 +131,13 @@ static struct ip6t_table packet_mangler
+ .af = AF_INET6,
+ };
+
++#ifdef CONFIG_VE_IPTABLES
++#include <linux/sched.h>
++#define ve_packet_mangler (get_exec_env()->_ip6t_mangle_table)
++#else
++#define ve_packet_mangler &packet_mangler
++#endif
++
+ /* The work comes in here from netfilter.c. */
+ static unsigned int
+ ip6t_route_hook(unsigned int hook,
+@@ -138,7 +146,7 @@ ip6t_route_hook(unsigned int hook,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+ {
+- return ip6t_do_table(pskb, hook, in, out, &packet_mangler, NULL);
++ return ip6t_do_table(pskb, hook, in, out, ve_packet_mangler, NULL);
+ }
+
+ static unsigned int
+@@ -174,7 +182,7 @@ ip6t_local_hook(unsigned int hook,
+ /* flowlabel and prio (includes version, which shouldn't change either */
+ flowlabel = *((u_int32_t *) (*pskb)->nh.ipv6h);
+
+- ret = ip6t_do_table(pskb, hook, in, out, &packet_mangler, NULL);
++ ret = ip6t_do_table(pskb, hook, in, out, ve_packet_mangler, NULL);
+
+ if (ret != NF_DROP && ret != NF_STOLEN
+ && (memcmp(&(*pskb)->nh.ipv6h->saddr, &saddr, sizeof(saddr))
+@@ -228,60 +236,93 @@ static struct nf_hook_ops ip6t_ops[] = {
+ },
+ };
+
+-static int __init init(void)
++int init_ip6table_mangle(void)
+ {
+ int ret;
++ struct ip6t_table *tmp_mangler;
+
+ /* Register table */
+- ret = ip6t_register_table(&packet_mangler, &initial_table.repl);
+- if (ret < 0)
+- return ret;
++ tmp_mangler = ip6t_register_table(&packet_mangler,
++ &initial_table.repl);
++ if (IS_ERR(tmp_mangler))
++ return PTR_ERR(tmp_mangler);
++#ifdef CONFIG_VE_IPTABLES
++ ve_packet_mangler = tmp_mangler;
++#endif
+
+ /* Register hooks */
+- ret = nf_register_hook(&ip6t_ops[0]);
++ ret = virt_nf_register_hook(&ip6t_ops[0]);
+ if (ret < 0)
+ goto cleanup_table;
+
+- ret = nf_register_hook(&ip6t_ops[1]);
++ ret = virt_nf_register_hook(&ip6t_ops[1]);
+ if (ret < 0)
+ goto cleanup_hook0;
+
+- ret = nf_register_hook(&ip6t_ops[2]);
++ ret = virt_nf_register_hook(&ip6t_ops[2]);
+ if (ret < 0)
+ goto cleanup_hook1;
+
+- ret = nf_register_hook(&ip6t_ops[3]);
++ ret = virt_nf_register_hook(&ip6t_ops[3]);
+ if (ret < 0)
+ goto cleanup_hook2;
+
+- ret = nf_register_hook(&ip6t_ops[4]);
++ ret = virt_nf_register_hook(&ip6t_ops[4]);
+ if (ret < 0)
+ goto cleanup_hook3;
+
+ return ret;
+
+ cleanup_hook3:
+- nf_unregister_hook(&ip6t_ops[3]);
++ virt_nf_unregister_hook(&ip6t_ops[3]);
+ cleanup_hook2:
+- nf_unregister_hook(&ip6t_ops[2]);
++ virt_nf_unregister_hook(&ip6t_ops[2]);
+ cleanup_hook1:
+- nf_unregister_hook(&ip6t_ops[1]);
++ virt_nf_unregister_hook(&ip6t_ops[1]);
+ cleanup_hook0:
+- nf_unregister_hook(&ip6t_ops[0]);
++ virt_nf_unregister_hook(&ip6t_ops[0]);
+ cleanup_table:
+- ip6t_unregister_table(&packet_mangler);
++ ip6t_unregister_table(ve_packet_mangler);
++#ifdef CONFIG_VE_IPTABLES
++ ve_packet_mangler = NULL;
++#endif
+
+ return ret;
+ }
+
+-static void __exit fini(void)
++void fini_ip6table_mangle(void)
+ {
+ unsigned int i;
+
+ for (i = 0; i < sizeof(ip6t_ops)/sizeof(struct nf_hook_ops); i++)
+- nf_unregister_hook(&ip6t_ops[i]);
++ virt_nf_unregister_hook(&ip6t_ops[i]);
++
++ ip6t_unregister_table(ve_packet_mangler);
++#ifdef CONFIG_VE_IPTABLES
++ ve_packet_mangler = NULL;
++#endif
++}
++
++static int __init init(void)
++{
++ int err;
+
+- ip6t_unregister_table(&packet_mangler);
++ err = init_ip6table_mangle();
++ if (err < 0)
++ return err;
++
++ KSYMRESOLVE(init_ip6table_mangle);
++ KSYMRESOLVE(fini_ip6table_mangle);
++ KSYMMODRESOLVE(ip6table_mangle);
++ return 0;
++}
++
++static void __exit fini(void)
++{
++ KSYMMODUNRESOLVE(ip6table_mangle);
++ KSYMUNRESOLVE(init_ip6table_mangle);
++ KSYMUNRESOLVE(fini_ip6table_mangle);
++ fini_ip6table_mangle();
+ }
+
+ module_init(init);
+diff -upr linux-2.6.16.orig/net/ipv6/netfilter/ip6table_raw.c linux-2.6.16-026test015/net/ipv6/netfilter/ip6table_raw.c
+--- linux-2.6.16.orig/net/ipv6/netfilter/ip6table_raw.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv6/netfilter/ip6table_raw.c 2006-07-04 14:41:39.000000000 +0400
+@@ -145,11 +145,12 @@ static struct nf_hook_ops ip6t_ops[] = {
+ static int __init init(void)
+ {
+ int ret;
++ struct ip6t_table *tmp;
+
+ /* Register table */
+- ret = ip6t_register_table(&packet_raw, &initial_table.repl);
+- if (ret < 0)
+- return ret;
++ tmp = ip6t_register_table(&packet_raw, &initial_table.repl);
++ if (IS_ERR(tmp))
++ return PTR_ERR(tmp);
+
+ /* Register hooks */
+ ret = nf_register_hook(&ip6t_ops[0]);
+diff -upr linux-2.6.16.orig/net/ipv6/proc.c linux-2.6.16-026test015/net/ipv6/proc.c
+--- linux-2.6.16.orig/net/ipv6/proc.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv6/proc.c 2006-07-04 14:41:39.000000000 +0400
+@@ -25,13 +25,18 @@
+ #include <linux/proc_fs.h>
+ #include <linux/seq_file.h>
+ #include <linux/stddef.h>
++#include <linux/ve.h>
+ #include <net/sock.h>
+ #include <net/tcp.h>
+ #include <net/transp_v6.h>
+ #include <net/ipv6.h>
+
+ #ifdef CONFIG_PROC_FS
++#ifdef CONFIG_VE
++#define proc_net_devsnmp6 (get_exec_env()->_proc_net_devsnmp6)
++#else
+ static struct proc_dir_entry *proc_net_devsnmp6;
++#endif
+
+ static int fold_prot_inuse(struct proto *proto)
+ {
+@@ -164,9 +169,9 @@ static int snmp6_seq_show(struct seq_fil
+ seq_printf(seq, "%-32s\t%u\n", "ifIndex", idev->dev->ifindex);
+ snmp6_seq_show_item(seq, (void **)idev->stats.icmpv6, snmp6_icmp6_list);
+ } else {
+- snmp6_seq_show_item(seq, (void **)ipv6_statistics, snmp6_ipstats_list);
+- snmp6_seq_show_item(seq, (void **)icmpv6_statistics, snmp6_icmp6_list);
+- snmp6_seq_show_item(seq, (void **)udp_stats_in6, snmp6_udp6_list);
++ snmp6_seq_show_item(seq, (void **)ve_ipv6_statistics, snmp6_ipstats_list);
++ snmp6_seq_show_item(seq, (void **)ve_icmpv6_statistics, snmp6_icmp6_list);
++ snmp6_seq_show_item(seq, (void **)ve_udp_stats_in6, snmp6_udp6_list);
+ }
+ return 0;
+ }
+@@ -229,15 +234,27 @@ int snmp6_unregister_dev(struct inet6_de
+ return 0;
+ }
+
++int ve_snmp_proc_init(void)
++{
++ proc_net_devsnmp6 = proc_mkdir("dev_snmp6", proc_net);
++ return proc_net_devsnmp6 == NULL ? -ENOMEM : 0;
++}
++EXPORT_SYMBOL(ve_snmp_proc_init);
++
++void ve_snmp_proc_fini(void)
++{
++ proc_net_remove("dev_snmp6");
++}
++EXPORT_SYMBOL(ve_snmp_proc_fini);
++
+ int __init ipv6_misc_proc_init(void)
+ {
+ int rc = 0;
+
+- if (!proc_net_fops_create("snmp6", S_IRUGO, &snmp6_seq_fops))
++ if (!proc_glob_fops_create("net/snmp6", S_IRUGO, &snmp6_seq_fops))
+ goto proc_snmp6_fail;
+
+- proc_net_devsnmp6 = proc_mkdir("dev_snmp6", proc_net);
+- if (!proc_net_devsnmp6)
++ if (ve_snmp_proc_init())
+ goto proc_dev_snmp6_fail;
+
+ if (!proc_net_fops_create("sockstat6", S_IRUGO, &sockstat6_seq_fops))
+@@ -246,9 +263,9 @@ out:
+ return rc;
+
+ proc_sockstat6_fail:
+- proc_net_remove("dev_snmp6");
++ ve_snmp_proc_fini();
+ proc_dev_snmp6_fail:
+- proc_net_remove("snmp6");
++ remove_proc_glob_entry("net/snmp6", NULL);
+ proc_snmp6_fail:
+ rc = -ENOMEM;
+ goto out;
+diff -upr linux-2.6.16.orig/net/ipv6/raw.c linux-2.6.16-026test015/net/ipv6/raw.c
+--- linux-2.6.16.orig/net/ipv6/raw.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv6/raw.c 2006-07-04 14:41:39.000000000 +0400
+@@ -99,6 +99,9 @@ struct sock *__raw_v6_lookup(struct sock
+ if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)
+ continue;
+
++ if (!ve_accessible_strict(VE_OWNER_SK(sk), get_exec_env()))
++ continue;
++
+ if (!ipv6_addr_any(&np->rcv_saddr)) {
+ if (ipv6_addr_equal(&np->rcv_saddr, loc_addr))
+ goto found;
+@@ -1046,8 +1049,14 @@ static struct sock *raw6_get_next(struct
+ do {
+ sk = sk_next(sk);
+ try_again:
+- ;
+- } while (sk && sk->sk_family != PF_INET6);
++ if (!sk)
++ break;
++ if (sk->sk_family != PF_INET6)
++ continue;
++ if (ve_accessible(VE_OWNER_SK(sk),
++ get_exec_env()))
++ break;
++ } while (1);
+
+ if (!sk && ++state->bucket < RAWV6_HTABLE_SIZE) {
+ sk = sk_head(&raw_v6_htable[state->bucket]);
+@@ -1166,13 +1175,13 @@ static struct file_operations raw6_seq_f
+
+ int __init raw6_proc_init(void)
+ {
+- if (!proc_net_fops_create("raw6", S_IRUGO, &raw6_seq_fops))
++ if (!proc_glob_fops_create("net/raw6", S_IRUGO, &raw6_seq_fops))
+ return -ENOMEM;
+ return 0;
+ }
+
+ void raw6_proc_exit(void)
+ {
+- proc_net_remove("raw6");
++ remove_proc_glob_entry("net/raw6", NULL);
+ }
+ #endif /* CONFIG_PROC_FS */
+diff -upr linux-2.6.16.orig/net/ipv6/reassembly.c linux-2.6.16-026test015/net/ipv6/reassembly.c
+--- linux-2.6.16.orig/net/ipv6/reassembly.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv6/reassembly.c 2006-07-04 14:41:39.000000000 +0400
+@@ -43,6 +43,7 @@
+ #include <linux/icmpv6.h>
+ #include <linux/random.h>
+ #include <linux/jhash.h>
++#include <linux/ve_owner.h>
+
+ #include <net/sock.h>
+ #include <net/snmp.h>
+@@ -53,6 +54,7 @@
+ #include <net/rawv6.h>
+ #include <net/ndisc.h>
+ #include <net/addrconf.h>
++#include <linux/ve_owner.h>
+
+ int sysctl_ip6frag_high_thresh = 256*1024;
+ int sysctl_ip6frag_low_thresh = 192*1024;
+@@ -95,8 +97,12 @@ struct frag_queue
+ #define FIRST_IN 2
+ #define LAST_IN 1
+ __u16 nhoffset;
++ struct ve_struct *owner_env;
+ };
+
++DCL_VE_OWNER_PROTO(IP6Q, struct frag_queue, owner_env)
++DCL_VE_OWNER(IP6Q, struct frag_queue, owner_env)
++
+ /* Hash table. */
+
+ #define IP6Q_HASHSZ 64
+@@ -288,6 +294,9 @@ static void ip6_evictor(void)
+ static void ip6_frag_expire(unsigned long data)
+ {
+ struct frag_queue *fq = (struct frag_queue *) data;
++ struct ve_struct *envid;
++
++ envid = set_exec_env(VE_OWNER_IP6Q(fq));
+
+ spin_lock(&fq->lock);
+
+@@ -318,6 +327,8 @@ static void ip6_frag_expire(unsigned lon
+ out:
+ spin_unlock(&fq->lock);
+ fq_put(fq, NULL);
++
++ (void)set_exec_env(envid);
+ }
+
+ /* Creation primitives. */
+@@ -336,7 +347,8 @@ static struct frag_queue *ip6_frag_inter
+ hlist_for_each_entry(fq, n, &ip6_frag_hash[hash], list) {
+ if (fq->id == fq_in->id &&
+ ipv6_addr_equal(&fq_in->saddr, &fq->saddr) &&
+- ipv6_addr_equal(&fq_in->daddr, &fq->daddr)) {
++ ipv6_addr_equal(&fq_in->daddr, &fq->daddr) &&
++ fq->owner_env == get_exec_env()) {
+ atomic_inc(&fq->refcnt);
+ write_unlock(&ip6_frag_lock);
+ fq_in->last_in |= COMPLETE;
+@@ -380,6 +392,8 @@ ip6_frag_create(unsigned int hash, u32 i
+ spin_lock_init(&fq->lock);
+ atomic_set(&fq->refcnt, 1);
+
++ SET_VE_OWNER_IP6Q(fq, get_exec_env());
++
+ return ip6_frag_intern(hash, fq);
+
+ oom:
+@@ -398,7 +412,8 @@ fq_find(u32 id, struct in6_addr *src, st
+ hlist_for_each_entry(fq, n, &ip6_frag_hash[hash], list) {
+ if (fq->id == id &&
+ ipv6_addr_equal(src, &fq->saddr) &&
+- ipv6_addr_equal(dst, &fq->daddr)) {
++ ipv6_addr_equal(dst, &fq->daddr) &&
++ fq->owner_env == get_exec_env()) {
+ atomic_inc(&fq->refcnt);
+ read_unlock(&ip6_frag_lock);
+ return fq;
+@@ -727,6 +742,9 @@ static int ipv6_frag_rcv(struct sk_buff
+ fq->meat == fq->len)
+ ret = ip6_frag_reasm(fq, skbp, dev);
+
++ if (ret > 0)
++ SET_VE_OWNER_SKB(*skbp, VE_OWNER_SKB(skb));
++
+ spin_unlock(&fq->lock);
+ fq_put(fq, NULL);
+ return ret;
+@@ -737,6 +755,50 @@ static int ipv6_frag_rcv(struct sk_buff
+ return -1;
+ }
+
++#ifdef CONFIG_VE
++/* XXX */
++void ip6_frag_cleanup(struct ve_struct *envid)
++{
++ int i, progress;
++
++ local_bh_disable();
++ do {
++ progress = 0;
++ for (i = 0; i < IP6Q_HASHSZ; i++) {
++ struct frag_queue *fq;
++ struct hlist_node *p, *n;
++
++ if (hlist_empty(&ip6_frag_hash[i]))
++ continue;
++inner_restart:
++ read_lock(&ip6_frag_lock);
++ hlist_for_each_entry_safe(fq, p, n,
++ &ip6_frag_hash[i], list) {
++ if (!ve_accessible_strict(
++ VE_OWNER_IP6Q(fq),
++ envid))
++ continue;
++ atomic_inc(&fq->refcnt);
++ read_unlock(&ip6_frag_lock);
++
++ spin_lock(&fq->lock);
++ if (!(fq->last_in&COMPLETE))
++ fq_kill(fq);
++ spin_unlock(&fq->lock);
++
++ fq_put(fq, NULL);
++ progress = 1;
++ goto inner_restart;
++ }
++ read_unlock(&ip6_frag_lock);
++ }
++ } while(progress);
++ local_bh_enable();
++}
++EXPORT_SYMBOL(ip6_frag_cleanup);
++#endif
++
++
+ static struct inet6_protocol frag_protocol =
+ {
+ .handler = ipv6_frag_rcv,
+diff -upr linux-2.6.16.orig/net/ipv6/route.c linux-2.6.16-026test015/net/ipv6/route.c
+--- linux-2.6.16.orig/net/ipv6/route.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv6/route.c 2006-07-04 14:41:39.000000000 +0400
+@@ -52,7 +52,6 @@
+ #include <net/addrconf.h>
+ #include <net/tcp.h>
+ #include <linux/rtnetlink.h>
+-#include <net/dst.h>
+ #include <net/xfrm.h>
+
+ #include <asm/uaccess.h>
+@@ -113,7 +112,6 @@ struct rt6_info ip6_null_entry = {
+ .dst = {
+ .__refcnt = ATOMIC_INIT(1),
+ .__use = 1,
+- .dev = &loopback_dev,
+ .obsolete = -1,
+ .error = -ENETUNREACH,
+ .metrics = { [RTAX_HOPLIMIT - 1] = 255, },
+@@ -128,11 +126,19 @@ struct rt6_info ip6_null_entry = {
+ .rt6i_ref = ATOMIC_INIT(1),
+ };
+
+-struct fib6_node ip6_routing_table = {
+- .leaf = &ip6_null_entry,
+- .fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO,
++struct fib6_table global_fib6_table = {
++ .root = {
++ .leaf = &ip6_null_entry,
++ .fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO,
++ }
+ };
+
++#ifdef CONFIG_VE
++#define ip6_routing_table (get_exec_env()->_fib6_table->root)
++#else
++#define ip6_routing_table (global_ip6_routing_table.root)
++#endif
++
+ /* Protects all the ip6 fib */
+
+ DEFINE_RWLOCK(rt6_lock);
+@@ -778,7 +784,7 @@ static int ipv6_get_mtu(struct net_devic
+
+ int ipv6_get_hoplimit(struct net_device *dev)
+ {
+- int hoplimit = ipv6_devconf.hop_limit;
++ int hoplimit = ve_ipv6_devconf.hop_limit;
+ struct inet6_dev *idev;
+
+ idev = in6_dev_get(dev);
+@@ -1421,10 +1427,12 @@ struct rt6_info *addrconf_dst_alloc(stru
+ rt->rt6i_flags |= RTF_ANYCAST;
+ else
+ rt->rt6i_flags |= RTF_LOCAL;
+- rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
+- if (rt->rt6i_nexthop == NULL) {
++ rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, rt->rt6i_dev);
++ if (IS_ERR(rt->rt6i_nexthop)) {
++ void *err = rt->rt6i_nexthop;
++ rt->rt6i_nexthop = NULL;
+ dst_free((struct dst_entry *) rt);
+- return ERR_PTR(-ENOMEM);
++ return err;
+ }
+
+ ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
+@@ -1640,8 +1648,12 @@ static int rt6_fill_node(struct sk_buff
+ goto rtattr_failure;
+ if (rt->u.dst.neighbour)
+ RTA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
+- if (rt->u.dst.dev)
+- RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->rt6i_dev->ifindex);
++ if (rt->u.dst.dev) {
++ struct net_device *odev = rt->rt6i_dev;
++ if (rt == &ip6_null_entry)
++ odev = &loopback_dev;
++ RTA_PUT(skb, RTA_OIF, sizeof(int), &odev->ifindex);
++ }
+ RTA_PUT(skb, RTA_PRIORITY, 4, &rt->rt6i_metric);
+ ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
+ if (rt->rt6i_expires)
+@@ -2110,23 +2122,31 @@ void __init ip6_route_init(void)
+ if (!ip6_dst_ops.kmem_cachep)
+ panic("cannot create ip6_dst_cache");
+
++#ifdef CONFIG_VE
++ global_fib6_table.owner_env = get_ve0();
++ get_ve0()->_fib6_table = &global_fib6_table;
++#endif
++ list_add(&global_fib6_table.list, &fib6_table_list);
+ fib6_init();
+ #ifdef CONFIG_PROC_FS
+- p = proc_net_create("ipv6_route", 0, rt6_proc_info);
+- if (p)
++ p = create_proc_glob_entry("net/ipv6_route", 0, NULL);
++ if (p) {
+ p->owner = THIS_MODULE;
++ p->get_info = rt6_proc_info;
++ }
+
+ proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
+ #endif
+ #ifdef CONFIG_XFRM
+ xfrm6_init();
+ #endif
++ ip6_null_entry.u.dst.dev = &loopback_dev;
+ }
+
+ void ip6_route_cleanup(void)
+ {
+ #ifdef CONFIG_PROC_FS
+- proc_net_remove("ipv6_route");
++ remove_proc_glob_entry("net/ipv6_route", NULL);
+ proc_net_remove("rt6_stats");
+ #endif
+ #ifdef CONFIG_XFRM
+@@ -2136,3 +2156,35 @@ void ip6_route_cleanup(void)
+ fib6_gc_cleanup();
+ kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
+ }
++
++int init_ve_route6(struct ve_struct *ve)
++{
++ struct ve_struct *old_env = set_exec_env(ve);
++ ve->_fib6_table = kzalloc(sizeof(struct fib6_table), GFP_KERNEL_UBC);
++ if (ve->_fib6_table) {
++ ve->_fib6_table->owner_env = ve;
++ ve->_fib6_table->root.leaf = &ip6_null_entry;
++ ve->_fib6_table->root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
++ write_lock_bh(&rt6_lock);
++ list_add(&ve->_fib6_table->list, &fib6_table_list);
++ write_unlock_bh(&rt6_lock);
++ }
++ set_exec_env(old_env);
++ return ve->_fib6_table ? 0 : -ENOMEM;
++}
++EXPORT_SYMBOL(init_ve_route6);
++
++void fini_ve_route6(struct ve_struct *ve)
++{
++ struct ve_struct *old_env = set_exec_env(ve);
++
++ if (ve->_fib6_table) {
++ rt6_ifdown(NULL);
++ write_lock_bh(&rt6_lock);
++ list_del(&ve->_fib6_table->list);
++ write_unlock_bh(&rt6_lock);
++ kfree(ve->_fib6_table);
++ }
++ set_exec_env(old_env);
++}
++EXPORT_SYMBOL(fini_ve_route6);
+diff -upr linux-2.6.16.orig/net/ipv6/tcp_ipv6.c linux-2.6.16-026test015/net/ipv6/tcp_ipv6.c
+--- linux-2.6.16.orig/net/ipv6/tcp_ipv6.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv6/tcp_ipv6.c 2006-07-04 14:41:39.000000000 +0400
+@@ -62,6 +62,8 @@
+ #include <net/dsfield.h>
+ #include <net/timewait_sock.h>
+
++#include <ub/ub_tcp.h>
++
+ #include <asm/uaccess.h>
+
+ #include <linux/proc_fs.h>
+@@ -77,7 +79,7 @@ static void tcp_v6_send_check(struct soc
+
+ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
+
+-static struct inet_connection_sock_af_ops ipv6_mapped;
++struct inet_connection_sock_af_ops ipv6_mapped;
+ static struct inet_connection_sock_af_ops ipv6_specific;
+
+ static int tcp_v6_get_port(struct sock *sk, unsigned short snum)
+@@ -273,6 +275,8 @@ static int tcp_v6_connect(struct sock *s
+ ip6_dst_store(sk, dst, NULL);
+ sk->sk_route_caps = dst->dev->features &
+ ~(NETIF_F_IP_CSUM | NETIF_F_TSO);
++ if (!sysctl_tcp_use_sg)
++ sk->sk_route_caps &= ~NETIF_F_SG;
+
+ icsk->icsk_ext_hdr_len = 0;
+ if (np->opt)
+@@ -933,6 +937,8 @@ static struct sock * tcp_v6_syn_recv_soc
+ ip6_dst_store(newsk, dst, NULL);
+ newsk->sk_route_caps = dst->dev->features &
+ ~(NETIF_F_IP_CSUM | NETIF_F_TSO);
++ if (!sysctl_tcp_use_sg)
++ newsk->sk_route_caps &= ~NETIF_F_SG;
+
+ newtcp6sk = (struct tcp6_sock *)newsk;
+ inet_sk(newsk)->pinet6 = &newtcp6sk->inet6;
+@@ -1040,6 +1046,8 @@ static int tcp_v6_do_rcv(struct sock *sk
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct tcp_sock *tp;
+ struct sk_buff *opt_skb = NULL;
++ struct user_beancounter *ub;
++
+
+ /* Imagine: socket is IPv6. IPv4 packet arrives,
+ goes to IPv4 receive handler and backlogged.
+@@ -1052,6 +1060,8 @@ static int tcp_v6_do_rcv(struct sock *sk
+ if (skb->protocol == htons(ETH_P_IP))
+ return tcp_v4_do_rcv(sk, skb);
+
++ ub = set_exec_ub(sock_bc(sk)->ub);
++
+ if (sk_filter(sk, skb, 0))
+ goto discard;
+
+@@ -1083,7 +1093,7 @@ static int tcp_v6_do_rcv(struct sock *sk
+ TCP_CHECK_TIMER(sk);
+ if (opt_skb)
+ goto ipv6_pktoptions;
+- return 0;
++ goto restore_context;
+ }
+
+ if (skb->len < (skb->h.th->doff<<2) || tcp_checksum_complete(skb))
+@@ -1104,7 +1114,7 @@ static int tcp_v6_do_rcv(struct sock *sk
+ goto reset;
+ if (opt_skb)
+ __kfree_skb(opt_skb);
+- return 0;
++ goto restore_context;
+ }
+ }
+
+@@ -1114,6 +1124,9 @@ static int tcp_v6_do_rcv(struct sock *sk
+ TCP_CHECK_TIMER(sk);
+ if (opt_skb)
+ goto ipv6_pktoptions;
++
++restore_context:
++ (void)set_exec_ub(ub);
+ return 0;
+
+ reset:
+@@ -1122,7 +1135,7 @@ discard:
+ if (opt_skb)
+ __kfree_skb(opt_skb);
+ kfree_skb(skb);
+- return 0;
++ goto restore_context;
+ csum_err:
+ TCP_INC_STATS_BH(TCP_MIB_INERRS);
+ goto discard;
+@@ -1154,7 +1167,7 @@ ipv6_pktoptions:
+
+ if (opt_skb)
+ kfree_skb(opt_skb);
+- return 0;
++ goto restore_context;
+ }
+
+ static int tcp_v6_rcv(struct sk_buff **pskb)
+@@ -1315,7 +1328,7 @@ static struct inet_connection_sock_af_op
+ * TCP over IPv4 via INET6 API
+ */
+
+-static struct inet_connection_sock_af_ops ipv6_mapped = {
++struct inet_connection_sock_af_ops ipv6_mapped = {
+ .queue_xmit = ip_queue_xmit,
+ .send_check = tcp_v4_send_check,
+ .rebuild_header = inet_sk_rebuild_header,
+@@ -1329,6 +1342,7 @@ static struct inet_connection_sock_af_op
+ .addr2sockaddr = inet6_csk_addr2sockaddr,
+ .sockaddr_len = sizeof(struct sockaddr_in6)
+ };
++EXPORT_SYMBOL_GPL(ipv6_mapped);
+
+
+
+@@ -1535,7 +1549,7 @@ out:
+ static struct file_operations tcp6_seq_fops;
+ static struct tcp_seq_afinfo tcp6_seq_afinfo = {
+ .owner = THIS_MODULE,
+- .name = "tcp6",
++ .name = "net/tcp6",
+ .family = AF_INET6,
+ .seq_show = tcp6_seq_show,
+ .seq_fops = &tcp6_seq_fops,
+diff -upr linux-2.6.16.orig/net/ipv6/udp.c linux-2.6.16-026test015/net/ipv6/udp.c
+--- linux-2.6.16.orig/net/ipv6/udp.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv6/udp.c 2006-07-04 14:41:39.000000000 +0400
+@@ -69,7 +69,9 @@ static int udp_v6_get_port(struct sock *
+ {
+ struct sock *sk2;
+ struct hlist_node *node;
++ struct ve_struct *env;
+
++ env = VE_OWNER_SK(sk);
+ write_lock_bh(&udp_hash_lock);
+ if (snum == 0) {
+ int best_size_so_far, best, result, i;
+@@ -83,7 +85,7 @@ static int udp_v6_get_port(struct sock *
+ int size;
+ struct hlist_head *list;
+
+- list = &udp_hash[result & (UDP_HTABLE_SIZE - 1)];
++ list = &udp_hash[udp_hashfn(result, VEID(env))];
+ if (hlist_empty(list)) {
+ if (result > sysctl_local_port_range[1])
+ result = sysctl_local_port_range[0] +
+@@ -105,7 +107,7 @@ static int udp_v6_get_port(struct sock *
+ result = sysctl_local_port_range[0]
+ + ((result - sysctl_local_port_range[0]) &
+ (UDP_HTABLE_SIZE - 1));
+- if (!udp_lport_inuse(result))
++ if (!udp_lport_inuse(result, env))
+ break;
+ }
+ if (i >= (1 << 16) / UDP_HTABLE_SIZE)
+@@ -114,9 +116,10 @@ gotit:
+ udp_port_rover = snum = result;
+ } else {
+ sk_for_each(sk2, node,
+- &udp_hash[snum & (UDP_HTABLE_SIZE - 1)]) {
++ &udp_hash[udp_hashfn(snum, VEID(env))]) {
+ if (inet_sk(sk2)->num == snum &&
+ sk2 != sk &&
++ ve_accessible_strict(VE_OWNER_SK(sk2), env) &&
+ (!sk2->sk_bound_dev_if ||
+ !sk->sk_bound_dev_if ||
+ sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
+@@ -128,7 +131,7 @@ gotit:
+
+ inet_sk(sk)->num = snum;
+ if (sk_unhashed(sk)) {
+- sk_add_node(sk, &udp_hash[snum & (UDP_HTABLE_SIZE - 1)]);
++ sk_add_node(sk, &udp_hash[udp_hashfn(snum, VEID(env))]);
+ sock_prot_inc_use(sk->sk_prot);
+ }
+ write_unlock_bh(&udp_hash_lock);
+@@ -161,12 +164,15 @@ static struct sock *udp_v6_lookup(struct
+ struct hlist_node *node;
+ unsigned short hnum = ntohs(dport);
+ int badness = -1;
++ struct ve_struct *env;
+
+ read_lock(&udp_hash_lock);
+- sk_for_each(sk, node, &udp_hash[hnum & (UDP_HTABLE_SIZE - 1)]) {
++ env = get_exec_env();
++ sk_for_each(sk, node, &udp_hash[udp_hashfn(hnum, VEID(env))]) {
+ struct inet_sock *inet = inet_sk(sk);
+
+- if (inet->num == hnum && sk->sk_family == PF_INET6) {
++ if (inet->num == hnum && sk->sk_family == PF_INET6 &&
++ ve_accessible_strict(VE_OWNER_SK(sk), env)) {
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ int score = 0;
+ if (inet->dport) {
+@@ -415,7 +421,8 @@ static void udpv6_mcast_deliver(struct u
+ int dif;
+
+ read_lock(&udp_hash_lock);
+- sk = sk_head(&udp_hash[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)]);
++ sk = sk_head(&udp_hash[udp_hashfn(ntohs(uh->dest),
++ VEID(VE_OWNER_SKB(skb)))]);
+ dif = skb->dev->ifindex;
+ sk = udp_v6_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif);
+ if (!sk) {
+@@ -1018,7 +1025,7 @@ static int udp6_seq_show(struct seq_file
+ static struct file_operations udp6_seq_fops;
+ static struct udp_seq_afinfo udp6_seq_afinfo = {
+ .owner = THIS_MODULE,
+- .name = "udp6",
++ .name = "net/udp6",
+ .family = AF_INET6,
+ .seq_show = udp6_seq_show,
+ .seq_fops = &udp6_seq_fops,
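
In udp.c above, udp_v6_get_port() and udp_v6_lookup() now hash the port together with the container id (udp_hashfn(..., VEID(env))) and skip sockets owned by other containers, so the same port bound in two containers neither collides on bind nor matches the other's lookups. A small stand-alone sketch of that idea; the hash and the ownership check are illustrative, not the kernel's exact functions.

#include <stdio.h>

#define HTABLE_SIZE 128

struct sock { unsigned short port; unsigned int veid; struct sock *next; };

static struct sock *htable[HTABLE_SIZE];

/* Mix the container id into the bucket choice, like udp_hashfn(). */
static unsigned int hashfn(unsigned short port, unsigned int veid)
{
        return (port + (veid << 5) + veid) & (HTABLE_SIZE - 1);
}

static void add_sock(struct sock *sk)
{
        unsigned int h = hashfn(sk->port, sk->veid);
        sk->next = htable[h];
        htable[h] = sk;
}

/* A socket only matches a lookup made from its own container. */
static struct sock *lookup(unsigned short port, unsigned int veid)
{
        struct sock *sk;
        for (sk = htable[hashfn(port, veid)]; sk; sk = sk->next)
                if (sk->port == port && sk->veid == veid)
                        return sk;
        return NULL;
}

int main(void)
{
        struct sock a = { 53, 0 }, b = { 53, 101 };
        add_sock(&a);
        add_sock(&b);
        printf("ve0:   %p\n", (void *)lookup(53, 0));
        printf("ve101: %p\n", (void *)lookup(53, 101));
        printf("ve202: %p\n", (void *)lookup(53, 202));  /* NULL: not visible */
        return 0;
}
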
+diff -upr linux-2.6.16.orig/net/ipv6/xfrm6_policy.c linux-2.6.16-026test015/net/ipv6/xfrm6_policy.c
+--- linux-2.6.16.orig/net/ipv6/xfrm6_policy.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/ipv6/xfrm6_policy.c 2006-07-04 14:41:36.000000000 +0400
+@@ -191,16 +191,18 @@ error:
+ static inline void
+ _decode_session6(struct sk_buff *skb, struct flowi *fl)
+ {
+- u16 offset = sizeof(struct ipv6hdr);
++ u16 offset = skb->h.raw - skb->nh.raw;
+ struct ipv6hdr *hdr = skb->nh.ipv6h;
+- struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
+- u8 nexthdr = skb->nh.ipv6h->nexthdr;
++ struct ipv6_opt_hdr *exthdr;
++ u8 nexthdr = skb->nh.raw[IP6CB(skb)->nhoff];
+
+ memset(fl, 0, sizeof(struct flowi));
+ ipv6_addr_copy(&fl->fl6_dst, &hdr->daddr);
+ ipv6_addr_copy(&fl->fl6_src, &hdr->saddr);
+
+ while (pskb_may_pull(skb, skb->nh.raw + offset + 1 - skb->data)) {
++ exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
++
+ switch (nexthdr) {
+ case NEXTHDR_ROUTING:
+ case NEXTHDR_HOP:
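
The xfrm6_policy.c change derives the starting offset from the already-parsed headers and recomputes exthdr on every pass of the loop, so the pointer stays in step with pskb_may_pull() as the walk moves from one extension header to the next. A stripped-down user-space walk over a byte buffer shows the same loop shape; the header layout is reduced to next-header/length pairs and is not the kernel parser.

#include <stdio.h>
#include <stddef.h>

/* Generic IPv6 extension header prefix: next header type, then length
 * in 8-octet units not counting the first 8 octets. */
struct ext_hdr { unsigned char nexthdr; unsigned char hdrlen; };

#define NEXTHDR_HOP     0
#define NEXTHDR_ROUTING 43
#define NEXTHDR_TCP     6

static void walk(const unsigned char *pkt, size_t len,
                 unsigned char nexthdr, size_t offset)
{
        while (offset + sizeof(struct ext_hdr) <= len) {
                const struct ext_hdr *eh;

                /* Recompute the header pointer each pass, as the patch
                 * now does inside the pskb_may_pull() loop. */
                eh = (const struct ext_hdr *)(pkt + offset);

                if (nexthdr != NEXTHDR_HOP && nexthdr != NEXTHDR_ROUTING) {
                        printf("upper layer %u at offset %zu\n", nexthdr, offset);
                        return;
                }
                printf("ext hdr %u at offset %zu\n", nexthdr, offset);
                nexthdr = eh->nexthdr;
                offset += (size_t)(eh->hdrlen + 1) * 8;
        }
}

int main(void)
{
        /* hop-by-hop (8 bytes) followed by a fake upper-layer header */
        unsigned char pkt[16] = { NEXTHDR_TCP, 0 };

        walk(pkt, sizeof(pkt), NEXTHDR_HOP, 0);
        return 0;
}
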
+diff -upr linux-2.6.16.orig/net/netfilter/core.c linux-2.6.16-026test015/net/netfilter/core.c
+--- linux-2.6.16.orig/net/netfilter/core.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/netfilter/core.c 2006-07-04 14:41:39.000000000 +0400
+@@ -32,16 +32,24 @@
+ * of skbuffs queued for userspace, and not deregister a hook unless
+ * this is zero, but that sucks. Now, we simply check when the
+ * packets come back: if the hook is gone, the packet is discarded. */
++static DEFINE_SPINLOCK(nf_hook_lock);
++
+ struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS];
+ EXPORT_SYMBOL(nf_hooks);
+-static DEFINE_SPINLOCK(nf_hook_lock);
++#ifdef CONFIG_VE_IPTABLES
++#define ve_nf_hooks \
++ ((struct list_head (*)[NF_MAX_HOOKS])(get_exec_env()->_nf_hooks))
++#else
++#define ve_nf_hooks nf_hooks
++#endif
++
+
+ int nf_register_hook(struct nf_hook_ops *reg)
+ {
+ struct list_head *i;
+
+ spin_lock_bh(&nf_hook_lock);
+- list_for_each(i, &nf_hooks[reg->pf][reg->hooknum]) {
++ list_for_each(i, &ve_nf_hooks[reg->pf][reg->hooknum]) {
+ if (reg->priority < ((struct nf_hook_ops *)i)->priority)
+ break;
+ }
+@@ -53,6 +61,33 @@ int nf_register_hook(struct nf_hook_ops
+ }
+ EXPORT_SYMBOL(nf_register_hook);
+
++int virt_nf_register_hook(struct nf_hook_ops *reg)
++{
++ int ret = 0;
++
++ if (!ve_is_super(get_exec_env())) {
++ struct nf_hook_ops *tmp;
++ ret = -ENOMEM;
++ tmp = kmalloc(sizeof(struct nf_hook_ops), GFP_KERNEL);
++ if (!tmp)
++ goto nomem;
++ memcpy(tmp, reg, sizeof(struct nf_hook_ops));
++ reg = tmp;
++ }
++
++ ret = nf_register_hook(reg);
++ if (ret)
++ goto out;
++
++ return 0;
++out:
++ if (!ve_is_super(get_exec_env()))
++ kfree(reg);
++nomem:
++ return ret;
++}
++EXPORT_SYMBOL(virt_nf_register_hook);
++
+ void nf_unregister_hook(struct nf_hook_ops *reg)
+ {
+ spin_lock_bh(&nf_hook_lock);
+@@ -63,6 +98,29 @@ void nf_unregister_hook(struct nf_hook_o
+ }
+ EXPORT_SYMBOL(nf_unregister_hook);
+
++int virt_nf_unregister_hook(struct nf_hook_ops *reg)
++{
++ struct nf_hook_ops *i;
++
++ spin_lock_bh(&nf_hook_lock);
++ list_for_each_entry(i, &ve_nf_hooks[reg->pf][reg->hooknum], list) {
++ if (reg->hook == i->hook) {
++ reg = i;
++ break;
++ }
++ }
++ spin_unlock_bh(&nf_hook_lock);
++ if (reg != i)
++ return -ENOENT;
++
++ nf_unregister_hook(reg);
++
++ if (!ve_is_super(get_exec_env()))
++ kfree(reg);
++ return 0;
++}
++EXPORT_SYMBOL(virt_nf_unregister_hook);
++
+ unsigned int nf_iterate(struct list_head *head,
+ struct sk_buff **skb,
+ int hook,
+@@ -120,9 +178,9 @@ int nf_hook_slow(int pf, unsigned int ho
+ /* We may already have this, but read-locks nest anyway */
+ rcu_read_lock();
+
+- elem = &nf_hooks[pf][hook];
++ elem = &ve_nf_hooks[pf][hook];
+ next_hook:
+- verdict = nf_iterate(&nf_hooks[pf][hook], pskb, hook, indev,
++ verdict = nf_iterate(&ve_nf_hooks[pf][hook], pskb, hook, indev,
+ outdev, &elem, okfn, hook_thresh);
+ if (verdict == NF_ACCEPT || verdict == NF_STOP) {
+ ret = 1;
+@@ -195,13 +253,54 @@ struct proc_dir_entry *proc_net_netfilte
+ EXPORT_SYMBOL(proc_net_netfilter);
+ #endif
+
+-void __init netfilter_init(void)
++void init_nf_hooks(struct list_head (*nh)[NF_MAX_HOOKS])
+ {
+ int i, h;
+ for (i = 0; i < NPROTO; i++) {
+ for (h = 0; h < NF_MAX_HOOKS; h++)
+- INIT_LIST_HEAD(&nf_hooks[i][h]);
++ INIT_LIST_HEAD(&ve_nf_hooks[i][h]);
+ }
++}
++
++int init_netfilter(void)
++{
++#ifdef CONFIG_VE_IPTABLES
++ struct ve_struct *envid;
++
++ envid = get_exec_env();
++ envid->_nf_hooks = kmalloc(sizeof(nf_hooks), GFP_KERNEL);
++ if (envid->_nf_hooks == NULL)
++ return -ENOMEM;
++
++ /* FIXME: charge ubc */
++
++ init_nf_hooks(envid->_nf_hooks);
++ return 0;
++#else
++ init_nf_hooks(nf_hooks);
++ return 0;
++#endif
++}
++EXPORT_SYMBOL(init_netfilter);
++
++#ifdef CONFIG_VE_IPTABLES
++void fini_netfilter(void)
++{
++ struct ve_struct *envid;
++
++ envid = get_exec_env();
++ if (envid->_nf_hooks != NULL)
++ kfree(envid->_nf_hooks);
++ envid->_nf_hooks = NULL;
++
++ /* FIXME: uncharge ubc */
++}
++EXPORT_SYMBOL(fini_netfilter);
++#endif
++
++void __init netfilter_init(void)
++{
++ init_netfilter();
+
+ #ifdef CONFIG_PROC_FS
+ proc_net_netfilter = proc_mkdir("netfilter", proc_net);
+@@ -214,3 +313,4 @@ void __init netfilter_init(void)
+ if (netfilter_log_init() < 0)
+ panic("cannot initialize nf_log");
+ }
++
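
virt_nf_register_hook()/virt_nf_unregister_hook() in net/netfilter/core.c above copy the nf_hook_ops descriptor whenever a non-root container registers it, so each container's hook list owns its own node and teardown can simply kfree() the copy. The same copy-on-register idiom recurs for xt targets, matches and tables later in the patch. A hedged sketch of the idiom, with the container test and list handling heavily simplified:

#include <stdlib.h>
#include <string.h>

struct hook_ops { int hooknum; void (*fn)(void); struct hook_ops *next; };

static struct hook_ops *hooks;       /* per-container list in the real code */
static int in_root_container = 0;    /* pretend we run inside a container */

static void register_hook(struct hook_ops *ops)
{
        ops->next = hooks;
        hooks = ops;
}

/* Non-root containers register a private copy of the caller's descriptor,
 * mirroring virt_nf_register_hook() in the patch. */
static int virt_register_hook(const struct hook_ops *ops)
{
        struct hook_ops *node = (struct hook_ops *)ops;

        if (!in_root_container) {
                node = malloc(sizeof(*node));
                if (!node)
                        return -1;
                memcpy(node, ops, sizeof(*node));
        }
        register_hook(node);
        return 0;
}

static void noop(void) { }

int main(void)
{
        struct hook_ops template_ops = { 1, noop, NULL };

        if (virt_register_hook(&template_ops))
                return 1;
        /* teardown frees only the copies made for containers */
        if (!in_root_container) {
                free(hooks);
                hooks = NULL;
        }
        return 0;
}
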
+diff -upr linux-2.6.16.orig/net/netfilter/nf_conntrack_netlink.c linux-2.6.16-026test015/net/netfilter/nf_conntrack_netlink.c
+--- linux-2.6.16.orig/net/netfilter/nf_conntrack_netlink.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/netfilter/nf_conntrack_netlink.c 2006-07-04 14:41:36.000000000 +0400
+@@ -1641,7 +1641,7 @@ static void __exit ctnetlink_exit(void)
+ printk("ctnetlink: unregistering from nfnetlink.\n");
+
+ #ifdef CONFIG_NF_CONNTRACK_EVENTS
+- nf_conntrack_unregister_notifier(&ctnl_notifier_exp);
++ nf_conntrack_expect_unregister_notifier(&ctnl_notifier_exp);
+ nf_conntrack_unregister_notifier(&ctnl_notifier);
+ #endif
+
+diff -upr linux-2.6.16.orig/net/netfilter/nf_conntrack_proto_sctp.c linux-2.6.16-026test015/net/netfilter/nf_conntrack_proto_sctp.c
+--- linux-2.6.16.orig/net/netfilter/nf_conntrack_proto_sctp.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/netfilter/nf_conntrack_proto_sctp.c 2006-07-04 14:41:36.000000000 +0400
+@@ -240,12 +240,15 @@ static int do_basic_checks(struct nf_con
+ flag = 1;
+ }
+
+- /* Cookie Ack/Echo chunks not the first OR
+- Init / Init Ack / Shutdown compl chunks not the only chunks */
+- if ((sch->type == SCTP_CID_COOKIE_ACK
++ /*
++ * Cookie Ack/Echo chunks not the first OR
++ * Init / Init Ack / Shutdown compl chunks not the only chunks
++ * OR zero-length.
++ */
++ if (((sch->type == SCTP_CID_COOKIE_ACK
+ || sch->type == SCTP_CID_COOKIE_ECHO
+ || flag)
+- && count !=0 ) {
++ && count !=0) || !sch->length) {
+ DEBUGP("Basic checks failed\n");
+ return 1;
+ }
+@@ -256,7 +259,7 @@ static int do_basic_checks(struct nf_con
+ }
+
+ DEBUGP("Basic checks passed\n");
+- return 0;
++ return count == 0;
+ }
+
+ static int new_state(enum ip_conntrack_dir dir,
+diff -upr linux-2.6.16.orig/net/netfilter/nf_queue.c linux-2.6.16-026test015/net/netfilter/nf_queue.c
+--- linux-2.6.16.orig/net/netfilter/nf_queue.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/netfilter/nf_queue.c 2006-07-04 14:41:39.000000000 +0400
+@@ -209,12 +209,12 @@ void nf_reinject(struct sk_buff *skb, st
+ /* Drop reference to owner of hook which queued us. */
+ module_put(info->elem->owner);
+
+- list_for_each_rcu(i, &nf_hooks[info->pf][info->hook]) {
++ list_for_each_rcu(i, &ve_nf_hooks[info->pf][info->hook]) {
+ if (i == elem)
+ break;
+ }
+
+- if (i == &nf_hooks[info->pf][info->hook]) {
++ if (i == &ve_nf_hooks[info->pf][info->hook]) {
+ /* The module which sent it to userspace is gone. */
+ NFDEBUG("%s: module disappeared, dropping packet.\n",
+ __FUNCTION__);
+@@ -235,7 +235,7 @@ void nf_reinject(struct sk_buff *skb, st
+
+ if (verdict == NF_ACCEPT) {
+ next_hook:
+- verdict = nf_iterate(&nf_hooks[info->pf][info->hook],
++ verdict = nf_iterate(&ve_nf_hooks[info->pf][info->hook],
+ &skb, info->hook,
+ info->indev, info->outdev, &elem,
+ info->okfn, INT_MIN);
+diff -upr linux-2.6.16.orig/net/netfilter/nf_sockopt.c linux-2.6.16-026test015/net/netfilter/nf_sockopt.c
+--- linux-2.6.16.orig/net/netfilter/nf_sockopt.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/netfilter/nf_sockopt.c 2006-07-04 14:41:39.000000000 +0400
+@@ -80,6 +80,12 @@ static int nf_sockopt(struct sock *sk, i
+ struct nf_sockopt_ops *ops;
+ int ret;
+
++#ifdef CONFIG_VE_IPTABLES
++ if (!get_exec_env()->_nf_hooks ||
++ !get_exec_env()->_ipt_standard_target)
++ return -ENOPROTOOPT;
++#endif
++
+ if (down_interruptible(&nf_sockopt_mutex) != 0)
+ return -EINTR;
+
+diff -upr linux-2.6.16.orig/net/netfilter/x_tables.c linux-2.6.16-026test015/net/netfilter/x_tables.c
+--- linux-2.6.16.orig/net/netfilter/x_tables.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/netfilter/x_tables.c 2006-07-04 14:41:39.000000000 +0400
+@@ -24,6 +24,10 @@
+
+ #include <linux/netfilter/x_tables.h>
+ #include <linux/netfilter_arp.h>
++#include <linux/nfcalls.h>
++
++#include <ub/beancounter.h>
++#include <ub/ub_mem.h>
+
+ MODULE_LICENSE("GPL");
+ MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+@@ -38,7 +42,13 @@ struct xt_af {
+ struct list_head tables;
+ };
+
++#ifdef CONFIG_VE_IPTABLES
++/* include ve.h and define get_exec_env */
++#include <linux/sched.h>
++#define xt (get_exec_env()->_xt)
++#else
+ static struct xt_af *xt;
++#endif
+
+ #ifdef DEBUG_IP_FIREWALL_USER
+ #define duprintf(format, args...) printk(format , ## args)
+@@ -52,17 +62,52 @@ enum {
+ MATCH,
+ };
+
++#ifdef CONFIG_USER_RESOURCE
++#define UB_NUMXTENT 23
++static int charge_xtables(struct user_beancounter *ub, unsigned long size)
++{
++ if (ub == NULL)
++ return 0;
++ return charge_beancounter(ub, UB_NUMXTENT, size, 1);
++}
++static void uncharge_xtables(struct user_beancounter *ub, unsigned long size)
++{
++ if (ub == NULL)
++ return;
++ uncharge_beancounter(ub, UB_NUMXTENT, size);
++}
++#endif /* CONFIG_USER_RESOURCE */
++
+ /* Registration hooks for targets. */
+ int
+ xt_register_target(int af, struct xt_target *target)
+ {
+ int ret;
++ struct module *mod = target->me;
++
++ if (!ve_is_super(get_exec_env())) {
++ struct xt_target *tmp;
++ __module_get(mod);
++ ret = -ENOMEM;
++ tmp = ub_kmalloc(sizeof(struct xt_target), GFP_KERNEL);
++ if (!tmp)
++ goto nomem;
++ memcpy(tmp, target, sizeof(struct xt_target));
++ target = tmp;
++ }
+
+ ret = down_interruptible(&xt[af].mutex);
+ if (ret != 0)
+- return ret;
++ goto out;
+ list_add(&target->list, &xt[af].target);
+ up(&xt[af].mutex);
++ return 0;
++out:
++ if (!ve_is_super(get_exec_env())) {
++ kfree(target);
++nomem:
++ module_put(mod);
++ }
+ return ret;
+ }
+ EXPORT_SYMBOL(xt_register_target);
+@@ -71,8 +116,21 @@ void
+ xt_unregister_target(int af, struct xt_target *target)
+ {
+ down(&xt[af].mutex);
++ if (!ve_is_super(get_exec_env())) {
++ target = list_named_find(&xt[af].target, target->name);
++ if (!target) {
++ up(&xt[af].mutex);
++ return;
++ }
++ }
++
+ LIST_DELETE(&xt[af].target, target);
+ up(&xt[af].mutex);
++
++ if (!ve_is_super(get_exec_env())) {
++ module_put(target->me);
++ kfree(target);
++ }
+ }
+ EXPORT_SYMBOL(xt_unregister_target);
+
+@@ -80,14 +138,33 @@ int
+ xt_register_match(int af, struct xt_match *match)
+ {
+ int ret;
++ struct module *mod = match->me;
++
++ if (!ve_is_super(get_exec_env())) {
++ struct xt_match *tmp;
++ __module_get(mod);
++ ret = -ENOMEM;
++ tmp = ub_kmalloc(sizeof(struct xt_match), GFP_KERNEL);
++ if (!tmp)
++ goto nomem;
++ memcpy(tmp, match, sizeof(struct xt_match));
++ match = tmp;
++ }
+
+ ret = down_interruptible(&xt[af].mutex);
+ if (ret != 0)
+- return ret;
++ goto out;
+
+ list_add(&match->list, &xt[af].match);
+ up(&xt[af].mutex);
+
++ return 0;
++out:
++ if (!ve_is_super(get_exec_env())) {
++ kfree(match);
++nomem:
++ module_put(mod);
++ }
+ return ret;
+ }
+ EXPORT_SYMBOL(xt_register_match);
+@@ -96,8 +173,21 @@ void
+ xt_unregister_match(int af, struct xt_match *match)
+ {
+ down(&xt[af].mutex);
++ if (!ve_is_super(get_exec_env())) {
++ match = list_named_find(&xt[af].match, match->name);
++ if (!match) {
++ up(&xt[af].mutex);
++ return;
++ }
++ }
++
+ LIST_DELETE(&xt[af].match, match);
+ up(&xt[af].mutex);
++
++ if (!ve_is_super(get_exec_env())) {
++ module_put(match->me);
++ kfree(match);
++ }
+ }
+ EXPORT_SYMBOL(xt_unregister_match);
+
+@@ -246,7 +336,7 @@ struct xt_table_info *xt_alloc_table_inf
+ if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > num_physpages)
+ return NULL;
+
+- newinfo = kzalloc(sizeof(struct xt_table_info), GFP_KERNEL);
++ newinfo = kzalloc(sizeof(struct xt_table_info), GFP_KERNEL_UBC);
+ if (!newinfo)
+ return NULL;
+
+@@ -255,10 +345,10 @@ struct xt_table_info *xt_alloc_table_inf
+ for_each_cpu(cpu) {
+ if (size <= PAGE_SIZE)
+ newinfo->entries[cpu] = kmalloc_node(size,
+- GFP_KERNEL,
++ GFP_KERNEL_UBC,
+ cpu_to_node(cpu));
+ else
+- newinfo->entries[cpu] = vmalloc_node(size,
++ newinfo->entries[cpu] = ub_vmalloc_node(size,
+ cpu_to_node(cpu));
+
+ if (newinfo->entries[cpu] == NULL) {
+@@ -315,6 +405,9 @@ xt_replace_table(struct xt_table *table,
+ int *error)
+ {
+ struct xt_table_info *oldinfo, *private;
++#ifdef CONFIG_USER_RESOURCE
++ struct user_beancounter *old_ub, *new_ub;
++#endif
+
+ /* Do the substitution. */
+ write_lock_bh(&table->lock);
+@@ -328,6 +421,21 @@ xt_replace_table(struct xt_table *table,
+ return NULL;
+ }
+ oldinfo = private;
++
++#ifdef CONFIG_USER_RESOURCE
++ new_ub = mem_ub(newinfo);
++ if (charge_xtables(new_ub, newinfo->number)) {
++ oldinfo = NULL;
++ write_unlock_bh(&table->lock);
++ *error = -ENOMEM;
++ return NULL;
++ }
++ if (num_counters) {
++ old_ub = mem_ub(oldinfo);
++ uncharge_xtables(old_ub, oldinfo->number);
++ }
++#endif
++
+ table->private = newinfo;
+ newinfo->initial_entries = oldinfo->initial_entries;
+ write_unlock_bh(&table->lock);
+@@ -355,6 +463,7 @@ int xt_register_table(struct xt_table *t
+
+ /* Simplifies replace_table code. */
+ table->private = bootstrap;
++ rwlock_init(&table->lock);
+ if (!xt_replace_table(table, 0, newinfo, &ret))
+ goto unlock;
+
+@@ -364,7 +473,6 @@ int xt_register_table(struct xt_table *t
+ /* save number of initial entries */
+ private->initial_entries = private->number;
+
+- rwlock_init(&table->lock);
+ list_prepend(&xt[table->af].tables, table);
+
+ ret = 0;
+@@ -374,6 +482,39 @@ int xt_register_table(struct xt_table *t
+ }
+ EXPORT_SYMBOL_GPL(xt_register_table);
+
++struct xt_table * virt_xt_register_table(struct xt_table *table,
++ struct xt_table_info *bootstrap,
++ struct xt_table_info *newinfo)
++{
++ int ret;
++ struct module *mod = table->me;
++
++ if (!ve_is_super(get_exec_env())) {
++ struct xt_table *tmp;
++ __module_get(mod);
++ ret = -ENOMEM;
++ tmp = ub_kmalloc(sizeof(struct xt_table), GFP_KERNEL);
++ if (!tmp)
++ goto nomem;
++ memcpy(tmp, table, sizeof(struct xt_table));
++ table = tmp;
++ }
++
++ ret = xt_register_table(table, bootstrap, newinfo);
++ if (ret)
++ goto out;
++
++ return table;
++out:
++ if (!ve_is_super(get_exec_env())) {
++ kfree(table);
++nomem:
++ module_put(mod);
++ }
++ return ERR_PTR(ret);
++}
++EXPORT_SYMBOL_GPL(virt_xt_register_table);
++
+ void *xt_unregister_table(struct xt_table *table)
+ {
+ struct xt_table_info *private;
+@@ -383,10 +524,27 @@ void *xt_unregister_table(struct xt_tabl
+ LIST_DELETE(&xt[table->af].tables, table);
+ up(&xt[table->af].mutex);
+
++#ifdef CONFIG_USER_RESOURCE
++ uncharge_xtables(mem_ub(private), private->number);
++#endif
++
+ return private;
+ }
+ EXPORT_SYMBOL_GPL(xt_unregister_table);
+
++void *virt_xt_unregister_table(struct xt_table *table)
++{
++ void *ret;
++
++ ret = xt_unregister_table(table);
++ if (!ve_is_super(get_exec_env())) {
++ module_put(table->me);
++ kfree(table);
++ }
++ return ret;
++}
++EXPORT_SYMBOL_GPL(virt_xt_unregister_table);
++
+ #ifdef CONFIG_PROC_FS
+ static char *xt_proto_prefix[NPROTO] = {
+ [AF_INET] = "ip",
+@@ -597,10 +755,13 @@ void xt_proto_fini(int af)
+ EXPORT_SYMBOL_GPL(xt_proto_fini);
+
+
+-static int __init xt_init(void)
++int init_xtables(void)
+ {
+ int i;
+
++ if (xt)
++ return -EEXIST;
++
+ xt = kmalloc(sizeof(struct xt_af) * NPROTO, GFP_KERNEL);
+ if (!xt)
+ return -ENOMEM;
+@@ -614,11 +775,34 @@ static int __init xt_init(void)
+ return 0;
+ }
+
+-static void __exit xt_fini(void)
++void fini_xtables(void)
+ {
+ kfree(xt);
++ xt = NULL;
++}
++
++static int __init xt_init(void)
++{
++ int err;
++
++ err = init_xtables();
++ if (err)
++ return err;
++
++ KSYMRESOLVE(init_xtables);
++ KSYMRESOLVE(fini_xtables);
++ KSYMMODRESOLVE(x_tables);
++ return 0;
++}
++
++static void __exit xt_fini(void)
++{
++ KSYMMODUNRESOLVE(x_tables);
++ KSYMUNRESOLVE(init_xtables);
++ KSYMUNRESOLVE(fini_xtables);
++ fini_xtables();
+ }
+
+-module_init(xt_init);
++subsys_initcall(xt_init);
+ module_exit(xt_fini);
+
+diff -upr linux-2.6.16.orig/net/netfilter/xt_conntrack.c linux-2.6.16-026test015/net/netfilter/xt_conntrack.c
+--- linux-2.6.16.orig/net/netfilter/xt_conntrack.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/netfilter/xt_conntrack.c 2006-07-04 14:41:39.000000000 +0400
+@@ -20,6 +20,8 @@
+
+ #include <linux/netfilter/x_tables.h>
+ #include <linux/netfilter/xt_conntrack.h>
++#include <linux/netfilter_ipv4/ip_tables.h>
++#include <linux/nfcalls.h>
+
+ MODULE_LICENSE("GPL");
+ MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
+@@ -213,25 +215,145 @@ static int check(const char *tablename,
+ return 1;
+ }
+
++#ifdef CONFIG_COMPAT
++static int compat_to_user(void *match, void **dstptr,
++ int *size, int off)
++{
++ struct ipt_entry_match *pm;
++ struct xt_conntrack_info *pinfo;
++ struct compat_xt_conntrack_info info;
++ u_int16_t msize;
++
++ pm = (struct ipt_entry_match *)match;
++ msize = pm->u.user.match_size;
++ if (__copy_to_user(*dstptr, pm, sizeof(struct ipt_entry_match)))
++ return -EFAULT;
++ pinfo = (struct xt_conntrack_info *)pm->data;
++ memset(&info, 0, sizeof(struct compat_xt_conntrack_info));
++ info.statemask = pinfo->statemask;
++ info.statusmask = pinfo->statusmask;
++ memcpy(info.tuple, pinfo->tuple, IP_CT_DIR_MAX *
++ sizeof(struct ip_conntrack_tuple));
++ memcpy(info.sipmsk, pinfo->sipmsk,
++ IP_CT_DIR_MAX * sizeof(struct in_addr));
++ memcpy(info.dipmsk, pinfo->dipmsk,
++ IP_CT_DIR_MAX * sizeof(struct in_addr));
++ info.expires_min = pinfo->expires_min;
++ info.expires_max = pinfo->expires_max;
++ info.flags = pinfo->flags;
++ info.invflags = pinfo->invflags;
++ if (__copy_to_user(*dstptr + sizeof(struct ipt_entry_match),
++ &info, sizeof(struct compat_xt_conntrack_info)))
++ return -EFAULT;
++ msize -= off;
++ if (put_user(msize, (u_int16_t *)*dstptr))
++ return -EFAULT;
++ *size -= off;
++ *dstptr += msize;
++ return 0;
++}
++
++static int compat_from_user(void *match, void **dstptr,
++ int *size, int off)
++{
++ struct compat_ipt_entry_match *pm;
++ struct ipt_entry_match *dstpm;
++ struct compat_xt_conntrack_info *pinfo;
++ struct xt_conntrack_info info;
++ u_int16_t msize;
++
++ pm = (struct compat_ipt_entry_match *)match;
++ dstpm = (struct ipt_entry_match *)*dstptr;
++ msize = pm->u.user.match_size;
++ memcpy(*dstptr, pm, sizeof(struct compat_ipt_entry_match));
++ pinfo = (struct compat_xt_conntrack_info *)pm->data;
++ memset(&info, 0, sizeof(struct xt_conntrack_info));
++ info.statemask = pinfo->statemask;
++ info.statusmask = pinfo->statusmask;
++ memcpy(info.tuple, pinfo->tuple, IP_CT_DIR_MAX *
++ sizeof(struct ip_conntrack_tuple));
++ memcpy(info.sipmsk, pinfo->sipmsk,
++ IP_CT_DIR_MAX * sizeof(struct in_addr));
++ memcpy(info.dipmsk, pinfo->dipmsk,
++ IP_CT_DIR_MAX * sizeof(struct in_addr));
++ info.expires_min = pinfo->expires_min;
++ info.expires_max = pinfo->expires_max;
++ info.flags = pinfo->flags;
++ info.invflags = pinfo->invflags;
++ memcpy(*dstptr + sizeof(struct compat_ipt_entry_match),
++ &info, sizeof(struct xt_conntrack_info));
++ msize += off;
++ dstpm->u.user.match_size = msize;
++ *size += off;
++ *dstptr += msize;
++ return 0;
++}
++
++static int compat(void *match, void **dstptr, int *size, int convert)
++{
++ int ret, off;
++
++ off = XT_ALIGN(sizeof(struct xt_conntrack_info)) -
++ COMPAT_XT_ALIGN(sizeof(struct compat_xt_conntrack_info));
++ switch (convert) {
++ case COMPAT_TO_USER:
++ ret = compat_to_user(match, dstptr, size, off);
++ break;
++ case COMPAT_FROM_USER:
++ ret = compat_from_user(match, dstptr, size, off);
++ break;
++ case COMPAT_CALC_SIZE:
++ *size += off;
++ ret = 0;
++ break;
++ default:
++ ret = -ENOPROTOOPT;
++ break;
++ }
++ return ret;
++}
++#endif
++
+ static struct xt_match conntrack_match = {
+ .name = "conntrack",
+ .match = &match,
+ .checkentry = &check,
++#ifdef CONFIG_COMPAT
++ .compat = &compat,
++#endif
+ .me = THIS_MODULE,
+ };
+
++int init_xt_conntrack_match(void)
++{
++ return xt_register_match(AF_INET, &conntrack_match);
++}
++
++void fini_xt_conntrack_match(void)
++{
++ xt_unregister_match(AF_INET, &conntrack_match);
++}
++
+ static int __init init(void)
+ {
+ int ret;
+ need_conntrack();
+- ret = xt_register_match(AF_INET, &conntrack_match);
+-
++ ret = init_xt_conntrack_match();
++ if (ret < 0)
++ return ret;
++
++ KSYMRESOLVE(init_xt_conntrack_match);
++ KSYMRESOLVE(fini_xt_conntrack_match);
++ KSYMMODRESOLVE(xt_conntrack);
+ return ret;
+ }
+
+ static void __exit fini(void)
+ {
+- xt_unregister_match(AF_INET, &conntrack_match);
++ KSYMMODUNRESOLVE(xt_conntrack);
++ KSYMUNRESOLVE(init_xt_conntrack_match);
++ KSYMUNRESOLVE(fini_xt_conntrack_match);
++ fini_xt_conntrack_match();
+ }
+
+ module_init(init);
+diff -upr linux-2.6.16.orig/net/netfilter/xt_helper.c linux-2.6.16-026test015/net/netfilter/xt_helper.c
+--- linux-2.6.16.orig/net/netfilter/xt_helper.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/netfilter/xt_helper.c 2006-07-04 14:41:39.000000000 +0400
+@@ -24,6 +24,8 @@
+ #endif
+ #include <linux/netfilter/x_tables.h>
+ #include <linux/netfilter/xt_helper.h>
++#include <linux/netfilter_ipv4/ip_tables.h>
++#include <linux/nfcalls.h>
+
+ MODULE_LICENSE("GPL");
+ MODULE_AUTHOR("Martin Josefsson <gandalf@netfilter.org>");
+@@ -148,23 +150,107 @@ static int check(const char *tablename,
+ return 1;
+ }
+
++#ifdef CONFIG_COMPAT
++static int compat_to_user(void *match, void **dstptr,
++ int *size, int off)
++{
++ struct ipt_entry_match *pm;
++ struct xt_helper_info *pinfo;
++ struct compat_xt_helper_info info;
++ u_int16_t msize;
++
++ pm = (struct ipt_entry_match *)match;
++ msize = pm->u.user.match_size;
++ if (__copy_to_user(*dstptr, pm, sizeof(struct ipt_entry_match)))
++ return -EFAULT;
++ pinfo = (struct xt_helper_info *)pm->data;
++ memset(&info, 0, sizeof(struct compat_xt_helper_info));
++ info.invert = pinfo->invert;
++ memcpy(info.name, pinfo->name, 30);
++ if (__copy_to_user(*dstptr + sizeof(struct ipt_entry_match),
++ &info, sizeof(struct compat_xt_helper_info)))
++ return -EFAULT;
++ msize -= off;
++ if (put_user(msize, (u_int16_t *)*dstptr))
++ return -EFAULT;
++ *size -= off;
++ *dstptr += msize;
++ return 0;
++}
++
++static int compat_from_user(void *match, void **dstptr,
++ int *size, int off)
++{
++ struct compat_ipt_entry_match *pm;
++ struct ipt_entry_match *dstpm;
++ struct compat_xt_helper_info *pinfo;
++ struct xt_helper_info info;
++ u_int16_t msize;
++
++ pm = (struct compat_ipt_entry_match *)match;
++ dstpm = (struct ipt_entry_match *)*dstptr;
++ msize = pm->u.user.match_size;
++ memcpy(*dstptr, pm, sizeof(struct compat_ipt_entry_match));
++ pinfo = (struct compat_xt_helper_info *)pm->data;
++ memset(&info, 0, sizeof(struct xt_helper_info));
++ info.invert = pinfo->invert;
++ memcpy(info.name, pinfo->name, 30);
++ memcpy(*dstptr + sizeof(struct compat_ipt_entry_match),
++ &info, sizeof(struct xt_helper_info));
++ msize += off;
++ dstpm->u.user.match_size = msize;
++ *size += off;
++ *dstptr += msize;
++ return 0;
++}
++
++static int compat(void *match, void **dstptr, int *size, int convert)
++{
++ int ret, off;
++
++ off = XT_ALIGN(sizeof(struct xt_helper_info)) -
++ COMPAT_XT_ALIGN(sizeof(struct compat_xt_helper_info));
++ switch (convert) {
++ case COMPAT_TO_USER:
++ ret = compat_to_user(match, dstptr, size, off);
++ break;
++ case COMPAT_FROM_USER:
++ ret = compat_from_user(match, dstptr, size, off);
++ break;
++ case COMPAT_CALC_SIZE:
++ *size += off;
++ ret = 0;
++ break;
++ default:
++ ret = -ENOPROTOOPT;
++ break;
++ }
++ return ret;
++}
++#endif
++
+ static struct xt_match helper_match = {
+ .name = "helper",
+ .match = &match,
+ .checkentry = &check,
++#ifdef CONFIG_COMPAT
++ .compat = &compat,
++#endif
+ .me = THIS_MODULE,
+ };
+ static struct xt_match helper6_match = {
+ .name = "helper",
+ .match = &match,
+ .checkentry = &check,
++#ifdef CONFIG_COMPAT
++ .compat = &compat,
++#endif
+ .me = THIS_MODULE,
+ };
+
+-static int __init init(void)
++int init_xt_helper(void)
+ {
+ int ret;
+- need_conntrack();
+
+ ret = xt_register_match(AF_INET, &helper_match);
+ if (ret < 0)
+@@ -177,12 +263,35 @@ static int __init init(void)
+ return ret;
+ }
+
+-static void __exit fini(void)
++void fini_xt_helper(void)
+ {
+ xt_unregister_match(AF_INET, &helper_match);
+ xt_unregister_match(AF_INET6, &helper6_match);
+ }
+
++static int __init init(void)
++{
++ int err;
++
++ need_conntrack();
++ err = init_xt_helper();
++ if (err < 0)
++ return err;
++
++ KSYMRESOLVE(init_xt_helper);
++ KSYMRESOLVE(fini_xt_helper);
++ KSYMMODRESOLVE(xt_helper);
++ return 0;
++}
++
++static void __exit fini(void)
++{
++ KSYMMODUNRESOLVE(xt_helper);
++ KSYMUNRESOLVE(init_xt_helper);
++ KSYMUNRESOLVE(fini_xt_helper);
++ fini_xt_helper();
++}
++
+ module_init(init);
+ module_exit(fini);
+
+diff -upr linux-2.6.16.orig/net/netfilter/xt_length.c linux-2.6.16-026test015/net/netfilter/xt_length.c
+--- linux-2.6.16.orig/net/netfilter/xt_length.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/netfilter/xt_length.c 2006-07-04 14:41:39.000000000 +0400
+@@ -13,6 +13,7 @@
+
+ #include <linux/netfilter/xt_length.h>
+ #include <linux/netfilter/x_tables.h>
++#include <linux/nfcalls.h>
+
+ MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
+ MODULE_DESCRIPTION("IP tables packet length matching module");
+@@ -63,20 +64,38 @@ checkentry(const char *tablename,
+ return 1;
+ }
+
++#ifdef CONFIG_COMPAT
++static int compat(void *match,
++ void **dstptr, int *size, int convert)
++{
++ int off;
++
++ off = XT_ALIGN(sizeof(struct xt_length_info)) -
++ COMPAT_XT_ALIGN(sizeof(struct xt_length_info));
++ return ipt_match_align_compat(match, dstptr, size, off, convert);
++}
++#endif
++
+ static struct xt_match length_match = {
+ .name = "length",
+ .match = &match,
+ .checkentry = &checkentry,
++#ifdef CONFIG_COMPAT
++ .compat = &compat,
++#endif
+ .me = THIS_MODULE,
+ };
+ static struct xt_match length6_match = {
+ .name = "length",
+ .match = &match6,
+ .checkentry = &checkentry,
++#ifdef CONFIG_COMPAT
++ .compat = &compat,
++#endif
+ .me = THIS_MODULE,
+ };
+
+-static int __init init(void)
++int init_xt_length(void)
+ {
+ int ret;
+ ret = xt_register_match(AF_INET, &length_match);
+@@ -89,11 +108,33 @@ static int __init init(void)
+ return ret;
+ }
+
+-static void __exit fini(void)
++void fini_xt_length(void)
+ {
+ xt_unregister_match(AF_INET, &length_match);
+ xt_unregister_match(AF_INET6, &length6_match);
+ }
+
++static int __init init(void)
++{
++ int err;
++
++ err = init_xt_length();
++ if (err < 0)
++ return err;
++
++ KSYMRESOLVE(init_xt_length);
++ KSYMRESOLVE(fini_xt_length);
++ KSYMMODRESOLVE(xt_length);
++ return 0;
++}
++
++static void __exit fini(void)
++{
++ KSYMMODUNRESOLVE(xt_length);
++ KSYMUNRESOLVE(init_xt_length);
++ KSYMUNRESOLVE(fini_xt_length);
++ fini_xt_length();
++}
++
+ module_init(init);
+ module_exit(fini);
+diff -upr linux-2.6.16.orig/net/netfilter/xt_limit.c linux-2.6.16-026test015/net/netfilter/xt_limit.c
+--- linux-2.6.16.orig/net/netfilter/xt_limit.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/netfilter/xt_limit.c 2006-07-04 14:41:39.000000000 +0400
+@@ -17,9 +17,11 @@
+ #include <linux/skbuff.h>
+ #include <linux/spinlock.h>
+ #include <linux/interrupt.h>
++#include <linux/nfcalls.h>
+
+ #include <linux/netfilter/x_tables.h>
+ #include <linux/netfilter/xt_limit.h>
++#include <linux/netfilter_ipv4/ip_tables.h>
+
+ MODULE_LICENSE("GPL");
+ MODULE_AUTHOR("Herve Eychenne <rv@wallfire.org>");
+@@ -27,6 +29,13 @@ MODULE_DESCRIPTION("iptables rate limit
+ MODULE_ALIAS("ipt_limit");
+ MODULE_ALIAS("ip6t_limit");
+
++#ifdef CONFIG_VE_IPTABLES
++#include <linux/sched.h>
++#define ve_ipt_limit_reg (*(get_exec_env()->_ipt_limit_reg))
++#else
++#define ve_ipt_limit_reg ipt_limit_reg
++#endif
++
+ /* The algorithm used is the Simple Token Bucket Filter (TBF)
+ * see net/sched/sch_tbf.c in the linux source tree
+ */
+@@ -137,20 +146,108 @@ ipt_limit_checkentry(const char *tablena
+ return 1;
+ }
+
++#ifdef CONFIG_COMPAT
++static int ipt_limit_compat_to_user(void *match, void **dstptr,
++ int *size, int off)
++{
++ struct ipt_entry_match *pm;
++ struct xt_rateinfo *pinfo;
++ struct compat_xt_rateinfo rinfo;
++ u_int16_t msize;
++
++ pm = (struct ipt_entry_match *)match;
++ msize = pm->u.user.match_size;
++ if (__copy_to_user(*dstptr, pm, sizeof(struct ipt_entry_match)))
++ return -EFAULT;
++ pinfo = (struct xt_rateinfo *)pm->data;
++ memset(&rinfo, 0, sizeof(struct compat_xt_rateinfo));
++ rinfo.avg = pinfo->avg;
++ rinfo.burst = pinfo->burst;
++ if (__copy_to_user(*dstptr + sizeof(struct ipt_entry_match),
++ &rinfo, sizeof(struct compat_xt_rateinfo)))
++ return -EFAULT;
++ msize -= off;
++ if (put_user(msize, (u_int16_t *)*dstptr))
++ return -EFAULT;
++ *size -= off;
++ *dstptr += msize;
++ return 0;
++}
++
++static int ipt_limit_compat_from_user(void *match, void **dstptr,
++ int *size, int off)
++{
++ struct compat_ipt_entry_match *pm;
++ struct ipt_entry_match *dstpm;
++ struct compat_xt_rateinfo *pinfo;
++ struct xt_rateinfo rinfo;
++ u_int16_t msize;
++
++ pm = (struct compat_ipt_entry_match *)match;
++ dstpm = (struct ipt_entry_match *)*dstptr;
++ msize = pm->u.user.match_size;
++ memcpy(*dstptr, pm, sizeof(struct compat_ipt_entry_match));
++ pinfo = (struct compat_xt_rateinfo *)pm->data;
++ memset(&rinfo, 0, sizeof(struct xt_rateinfo));
++ rinfo.avg = pinfo->avg;
++ rinfo.burst = pinfo->burst;
++ memcpy(*dstptr + sizeof(struct compat_ipt_entry_match),
++ &rinfo, sizeof(struct xt_rateinfo));
++ msize += off;
++ dstpm->u.user.match_size = msize;
++ *size += off;
++ *dstptr += msize;
++ return 0;
++}
++
++static int ipt_limit_compat(void *match, void **dstptr,
++ int *size, int convert)
++{
++ int ret, off;
++
++ off = XT_ALIGN(sizeof(struct xt_rateinfo)) -
++ COMPAT_XT_ALIGN(sizeof(struct compat_xt_rateinfo));
++ switch (convert) {
++ case COMPAT_TO_USER:
++ ret = ipt_limit_compat_to_user(match,
++ dstptr, size, off);
++ break;
++ case COMPAT_FROM_USER:
++ ret = ipt_limit_compat_from_user(match,
++ dstptr, size, off);
++ break;
++ case COMPAT_CALC_SIZE:
++ *size += off;
++ ret = 0;
++ break;
++ default:
++ ret = -ENOPROTOOPT;
++ break;
++ }
++ return ret;
++}
++#endif
++
+ static struct xt_match ipt_limit_reg = {
+ .name = "limit",
+ .match = ipt_limit_match,
+ .checkentry = ipt_limit_checkentry,
++#ifdef CONFIG_COMPAT
++ .compat = ipt_limit_compat,
++#endif
+ .me = THIS_MODULE,
+ };
+ static struct xt_match limit6_reg = {
+ .name = "limit",
+ .match = ipt_limit_match,
+ .checkentry = ipt_limit_checkentry,
++#ifdef CONFIG_COMPAT
++ .compat = ipt_limit_compat,
++#endif
+ .me = THIS_MODULE,
+ };
+
+-static int __init init(void)
++int init_xt_limit(void)
+ {
+ int ret;
+
+@@ -165,11 +262,33 @@ static int __init init(void)
+ return ret;
+ }
+
+-static void __exit fini(void)
++void fini_xt_limit(void)
+ {
+ xt_unregister_match(AF_INET, &ipt_limit_reg);
+ xt_unregister_match(AF_INET6, &limit6_reg);
+ }
+
++static int __init init(void)
++{
++ int err;
++
++ err = init_xt_limit();
++ if (err < 0)
++ return err;
++
++ KSYMRESOLVE(init_xt_limit);
++ KSYMRESOLVE(fini_xt_limit);
++ KSYMMODRESOLVE(xt_limit);
++ return 0;
++}
++
++static void __exit fini(void)
++{
++ KSYMMODUNRESOLVE(xt_limit);
++ KSYMUNRESOLVE(init_xt_limit);
++ KSYMUNRESOLVE(fini_xt_limit);
++ fini_xt_limit();
++}
++
+ module_init(init);
+ module_exit(fini);
+diff -upr linux-2.6.16.orig/net/netfilter/xt_sctp.c linux-2.6.16-026test015/net/netfilter/xt_sctp.c
+--- linux-2.6.16.orig/net/netfilter/xt_sctp.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/netfilter/xt_sctp.c 2006-07-04 14:41:36.000000000 +0400
+@@ -62,7 +62,7 @@ match_packet(const struct sk_buff *skb,
+
+ do {
+ sch = skb_header_pointer(skb, offset, sizeof(_sch), &_sch);
+- if (sch == NULL) {
++ if (sch == NULL || sch->length == 0) {
+ duprintf("Dropping invalid SCTP packet.\n");
+ *hotdrop = 1;
+ return 0;
+diff -upr linux-2.6.16.orig/net/netfilter/xt_state.c linux-2.6.16-026test015/net/netfilter/xt_state.c
+--- linux-2.6.16.orig/net/netfilter/xt_state.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/netfilter/xt_state.c 2006-07-04 14:41:39.000000000 +0400
+@@ -10,9 +10,11 @@
+
+ #include <linux/module.h>
+ #include <linux/skbuff.h>
++#include <linux/nfcalls.h>
+ #include <net/netfilter/nf_conntrack_compat.h>
+ #include <linux/netfilter/x_tables.h>
+ #include <linux/netfilter/xt_state.h>
++#include <linux/netfilter_ipv4/ip_tables.h>
+
+ MODULE_LICENSE("GPL");
+ MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
+@@ -55,10 +57,90 @@ static int check(const char *tablename,
+ return 1;
+ }
+
++#ifdef CONFIG_COMPAT
++static int compat_to_user(void *match, void **dstptr,
++ int *size, int off)
++{
++ struct ipt_entry_match *pm;
++ struct xt_state_info *pinfo;
++ struct compat_xt_state_info info;
++ u_int16_t msize;
++
++ pm = (struct ipt_entry_match *)match;
++ msize = pm->u.user.match_size;
++ if (__copy_to_user(*dstptr, pm, sizeof(struct ipt_entry_match)))
++ return -EFAULT;
++ pinfo = (struct xt_state_info *)pm->data;
++ memset(&info, 0, sizeof(struct compat_xt_state_info));
++ info.statemask = pinfo->statemask;
++ if (__copy_to_user(*dstptr + sizeof(struct ipt_entry_match),
++ &info, sizeof(struct compat_xt_state_info)))
++ return -EFAULT;
++ msize -= off;
++ if (put_user(msize, (u_int16_t *)*dstptr))
++ return -EFAULT;
++ *size -= off;
++ *dstptr += msize;
++ return 0;
++}
++
++static int compat_from_user(void *match, void **dstptr,
++ int *size, int off)
++{
++ struct compat_ipt_entry_match *pm;
++ struct ipt_entry_match *dstpm;
++ struct compat_xt_state_info *pinfo;
++ struct xt_state_info info;
++ u_int16_t msize;
++
++ pm = (struct compat_ipt_entry_match *)match;
++ dstpm = (struct ipt_entry_match *)*dstptr;
++ msize = pm->u.user.match_size;
++ memcpy(*dstptr, pm, sizeof(struct compat_ipt_entry_match));
++ pinfo = (struct compat_xt_state_info *)pm->data;
++ memset(&info, 0, sizeof(struct xt_state_info));
++ info.statemask = pinfo->statemask;
++ memcpy(*dstptr + sizeof(struct compat_ipt_entry_match),
++ &info, sizeof(struct xt_state_info));
++ msize += off;
++ dstpm->u.user.match_size = msize;
++ *size += off;
++ *dstptr += msize;
++ return 0;
++}
++
++static int compat(void *match, void **dstptr, int *size, int convert)
++{
++ int ret, off;
++
++ off = XT_ALIGN(sizeof(struct xt_state_info)) -
++ COMPAT_XT_ALIGN(sizeof(struct compat_xt_state_info));
++ switch (convert) {
++ case COMPAT_TO_USER:
++ ret = compat_to_user(match, dstptr, size, off);
++ break;
++ case COMPAT_FROM_USER:
++ ret = compat_from_user(match, dstptr, size, off);
++ break;
++ case COMPAT_CALC_SIZE:
++ *size += off;
++ ret = 0;
++ break;
++ default:
++ ret = -ENOPROTOOPT;
++ break;
++ }
++ return ret;
++}
++#endif
++
+ static struct xt_match state_match = {
+ .name = "state",
+ .match = &match,
+ .checkentry = &check,
++#ifdef CONFIG_COMPAT
++ .compat = &compat,
++#endif
+ .me = THIS_MODULE,
+ };
+
+@@ -66,15 +148,16 @@ static struct xt_match state6_match = {
+ .name = "state",
+ .match = &match,
+ .checkentry = &check,
++#ifdef CONFIG_COMPAT
++ .compat = &compat,
++#endif
+ .me = THIS_MODULE,
+ };
+
+-static int __init init(void)
++int init_xt_state(void)
+ {
+ int ret;
+
+- need_conntrack();
+-
+ ret = xt_register_match(AF_INET, &state_match);
+ if (ret < 0)
+ return ret;
+@@ -86,11 +169,34 @@ static int __init init(void)
+ return ret;
+ }
+
+-static void __exit fini(void)
++void fini_xt_state(void)
+ {
+ xt_unregister_match(AF_INET, &state_match);
+ xt_unregister_match(AF_INET6, &state6_match);
+ }
+
++static int __init init(void)
++{
++ int err;
++
++ need_conntrack();
++ err = init_xt_state();
++ if (err < 0)
++ return err;
++
++ KSYMRESOLVE(init_xt_state);
++ KSYMRESOLVE(fini_xt_state);
++ KSYMMODRESOLVE(xt_state);
++ return 0;
++}
++
++static void __exit fini(void)
++{
++ KSYMMODUNRESOLVE(xt_state);
++ KSYMUNRESOLVE(init_xt_state);
++ KSYMUNRESOLVE(fini_xt_state);
++ fini_xt_state();
++}
++
+ module_init(init);
+ module_exit(fini);
+diff -upr linux-2.6.16.orig/net/netfilter/xt_tcpmss.c linux-2.6.16-026test015/net/netfilter/xt_tcpmss.c
+--- linux-2.6.16.orig/net/netfilter/xt_tcpmss.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/netfilter/xt_tcpmss.c 2006-07-04 14:41:39.000000000 +0400
+@@ -11,6 +11,7 @@
+ #include <linux/module.h>
+ #include <linux/skbuff.h>
+ #include <net/tcp.h>
++#include <linux/nfcalls.h>
+
+ #include <linux/netfilter/xt_tcpmss.h>
+ #include <linux/netfilter/x_tables.h>
+@@ -133,10 +134,25 @@ checkentry6(const char *tablename,
+ return 1;
+ }
+
++#ifdef CONFIG_COMPAT
++static int compat(void *match,
++ void **dstptr, int *size, int convert)
++{
++ int off;
++
++ off = XT_ALIGN(sizeof(struct xt_tcpmss_match_info)) -
++ COMPAT_XT_ALIGN(sizeof(struct xt_tcpmss_match_info));
++ return ipt_match_align_compat(match, dstptr, size, off, convert);
++}
++#endif
++
+ static struct xt_match tcpmss_match = {
+ .name = "tcpmss",
+ .match = &match,
+ .checkentry = &checkentry,
++#ifdef CONFIG_COMPAT
++ .compat = &compat,
++#endif
+ .me = THIS_MODULE,
+ };
+
+@@ -144,11 +160,14 @@ static struct xt_match tcpmss6_match = {
+ .name = "tcpmss",
+ .match = &match,
+ .checkentry = &checkentry6,
++#ifdef CONFIG_COMPAT
++ .compat = &compat,
++#endif
+ .me = THIS_MODULE,
+ };
+
+
+-static int __init init(void)
++int init_xt_tcpmss(void)
+ {
+ int ret;
+ ret = xt_register_match(AF_INET, &tcpmss_match);
+@@ -162,11 +181,33 @@ static int __init init(void)
+ return ret;
+ }
+
+-static void __exit fini(void)
++void fini_xt_tcpmss(void)
+ {
+ xt_unregister_match(AF_INET6, &tcpmss6_match);
+ xt_unregister_match(AF_INET, &tcpmss_match);
+ }
+
++static int __init init(void)
++{
++ int err;
++
++ err = init_xt_tcpmss();
++ if (err < 0)
++ return err;
++
++ KSYMRESOLVE(init_xt_tcpmss);
++ KSYMRESOLVE(fini_xt_tcpmss);
++ KSYMMODRESOLVE(xt_tcpmss);
++ return 0;
++}
++
++static void __exit fini(void)
++{
++ KSYMMODUNRESOLVE(xt_tcpmss);
++ KSYMUNRESOLVE(init_xt_tcpmss);
++ KSYMUNRESOLVE(fini_xt_tcpmss);
++ fini_xt_tcpmss();
++}
++
+ module_init(init);
+ module_exit(fini);
+diff -upr linux-2.6.16.orig/net/netfilter/xt_tcpudp.c linux-2.6.16-026test015/net/netfilter/xt_tcpudp.c
+--- linux-2.6.16.orig/net/netfilter/xt_tcpudp.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/netfilter/xt_tcpudp.c 2006-07-04 14:41:39.000000000 +0400
+@@ -5,6 +5,7 @@
+ #include <net/ipv6.h>
+ #include <net/tcp.h>
+ #include <net/udp.h>
++#include <linux/nfcalls.h>
+ #include <linux/netfilter/x_tables.h>
+ #include <linux/netfilter/xt_tcpudp.h>
+ #include <linux/netfilter_ipv4/ip_tables.h>
+@@ -266,10 +267,35 @@ udp6_checkentry(const char *tablename,
+ return 1;
+ }
+
++#ifdef CONFIG_COMPAT
++static int tcp_compat(void *match,
++ void **dstptr, int *size, int convert)
++{
++ int off;
++
++ off = XT_ALIGN(sizeof(struct xt_tcp)) -
++ COMPAT_XT_ALIGN(sizeof(struct xt_tcp));
++ return ipt_match_align_compat(match, dstptr, size, off, convert);
++}
++
++static int udp_compat(void *match,
++ void **dstptr, int *size, int convert)
++{
++ int off;
++
++ off = XT_ALIGN(sizeof(struct xt_udp)) -
++ COMPAT_XT_ALIGN(sizeof(struct xt_udp));
++ return ipt_match_align_compat(match, dstptr, size, off, convert);
++}
++#endif
++
+ static struct xt_match tcp_matchstruct = {
+ .name = "tcp",
+ .match = &tcp_match,
+ .checkentry = &tcp_checkentry,
++#ifdef CONFIG_COMPAT
++ .compat = &tcp_compat,
++#endif
+ .me = THIS_MODULE,
+ };
+ static struct xt_match tcp6_matchstruct = {
+@@ -283,6 +309,9 @@ static struct xt_match udp_matchstruct =
+ .name = "udp",
+ .match = &udp_match,
+ .checkentry = &udp_checkentry,
++#ifdef CONFIG_COMPAT
++ .compat = &udp_compat,
++#endif
+ .me = THIS_MODULE,
+ };
+ static struct xt_match udp6_matchstruct = {
+@@ -292,7 +321,7 @@ static struct xt_match udp6_matchstruct
+ .me = THIS_MODULE,
+ };
+
+-static int __init init(void)
++int init_xt_tcpudp(void)
+ {
+ int ret;
+ ret = xt_register_match(AF_INET, &tcp_matchstruct);
+@@ -322,7 +351,7 @@ out_unreg_tcp:
+ return ret;
+ }
+
+-static void __exit fini(void)
++void fini_xt_tcpudp(void)
+ {
+ xt_unregister_match(AF_INET6, &udp6_matchstruct);
+ xt_unregister_match(AF_INET, &udp_matchstruct);
+@@ -330,5 +359,27 @@ static void __exit fini(void)
+ xt_unregister_match(AF_INET, &tcp_matchstruct);
+ }
+
++static int __init init(void)
++{
++ int err;
++
++ err = init_xt_tcpudp();
++ if (err < 0)
++ return err;
++
++ KSYMRESOLVE(init_xt_tcpudp);
++ KSYMRESOLVE(fini_xt_tcpudp);
++ KSYMMODRESOLVE(xt_tcpudp);
++ return 0;
++}
++
++static void __exit fini(void)
++{
++ KSYMMODUNRESOLVE(xt_tcpudp);
++ KSYMUNRESOLVE(init_xt_tcpudp);
++ KSYMUNRESOLVE(fini_xt_tcpudp);
++ fini_xt_tcpudp();
++}
++
+ module_init(init);
+ module_exit(fini);
+diff -upr linux-2.6.16.orig/net/netlink/af_netlink.c linux-2.6.16-026test015/net/netlink/af_netlink.c
+--- linux-2.6.16.orig/net/netlink/af_netlink.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/netlink/af_netlink.c 2006-07-04 14:41:39.000000000 +0400
+@@ -60,27 +60,14 @@
+ #include <net/sock.h>
+ #include <net/scm.h>
+ #include <net/netlink.h>
++#include <net/netlink_sock.h>
++
++#include <ub/beancounter.h>
++#include <ub/ub_net.h>
+
+ #define Nprintk(a...)
+ #define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8)
+
+-struct netlink_sock {
+- /* struct sock has to be the first member of netlink_sock */
+- struct sock sk;
+- u32 pid;
+- u32 dst_pid;
+- u32 dst_group;
+- u32 flags;
+- u32 subscriptions;
+- u32 ngroups;
+- unsigned long *groups;
+- unsigned long state;
+- wait_queue_head_t wait;
+- struct netlink_callback *cb;
+- spinlock_t cb_lock;
+- void (*data_ready)(struct sock *sk, int bytes);
+- struct module *module;
+-};
+
+ #define NETLINK_KERNEL_SOCKET 0x1
+ #define NETLINK_RECV_PKTINFO 0x2
+@@ -209,7 +196,10 @@ static __inline__ struct sock *netlink_l
+ read_lock(&nl_table_lock);
+ head = nl_pid_hashfn(hash, pid);
+ sk_for_each(sk, node, head) {
+- if (nlk_sk(sk)->pid == pid) {
++		/* VEs should find sockets, created by kernel */
++ if ((nlk_sk(sk)->pid == pid) &&
++ (!pid || ve_accessible_strict(VE_OWNER_SK(sk),
++ get_exec_env()))){
+ sock_hold(sk);
+ goto found;
+ }
+@@ -309,7 +299,9 @@ static int netlink_insert(struct sock *s
+ head = nl_pid_hashfn(hash, pid);
+ len = 0;
+ sk_for_each(osk, node, head) {
+- if (nlk_sk(osk)->pid == pid)
++		if ((nlk_sk(osk)->pid == pid) &&
++			ve_accessible_strict(VE_OWNER_SK(osk),
++ get_exec_env()))
+ break;
+ len++;
+ }
+@@ -362,6 +354,8 @@ static int __netlink_create(struct socke
+ sk = sk_alloc(PF_NETLINK, GFP_KERNEL, &netlink_proto, 1);
+ if (!sk)
+ return -ENOMEM;
++ if (ub_other_sock_charge(sk))
++ goto out_free;
+
+ sock_init_data(sock, sk);
+
+@@ -372,6 +366,10 @@ static int __netlink_create(struct socke
+ sk->sk_destruct = netlink_sock_destruct;
+ sk->sk_protocol = protocol;
+ return 0;
++
++out_free:
++ sk_free(sk);
++ return -ENOMEM;
+ }
+
+ static int netlink_create(struct socket *sock, int protocol)
+@@ -425,6 +423,7 @@ static int netlink_release(struct socket
+ return 0;
+
+ netlink_remove(sk);
++ sock_orphan(sk);
+ nlk = nlk_sk(sk);
+
+ spin_lock(&nlk->cb_lock);
+@@ -439,7 +438,6 @@ static int netlink_release(struct socket
+ /* OK. Socket is unlinked, and, therefore,
+ no new packets will arrive */
+
+- sock_orphan(sk);
+ sock->sk = NULL;
+ wake_up_interruptible_all(&nlk->wait);
+
+@@ -477,7 +475,7 @@ static int netlink_autobind(struct socke
+ struct hlist_head *head;
+ struct sock *osk;
+ struct hlist_node *node;
+- s32 pid = current->tgid;
++ s32 pid = virt_pid(current);
+ int err;
+ static s32 rover = -4097;
+
+@@ -486,7 +484,9 @@ retry:
+ netlink_table_grab();
+ head = nl_pid_hashfn(hash, pid);
+ sk_for_each(osk, node, head) {
+- if (nlk_sk(osk)->pid == pid) {
++ if ((nlk_sk(osk)->pid == pid) &&
++ ve_accessible_strict(VE_OWNER_SK(osk),
++ get_exec_env())) {
+ /* Bind collision, search negative pid values. */
+ pid = rover--;
+ if (rover > -4097)
+@@ -511,7 +511,7 @@ retry:
+ static inline int netlink_capable(struct socket *sock, unsigned int flag)
+ {
+ return (nl_table[sock->sk->sk_protocol].nl_nonroot & flag) ||
+- capable(CAP_NET_ADMIN);
++ capable(CAP_VE_NET_ADMIN);
+ }
+
+ static void
+@@ -845,6 +845,9 @@ static inline int do_one_broadcast(struc
+ !test_bit(p->group - 1, nlk->groups))
+ goto out;
+
++ if (!ve_accessible_strict(get_exec_env(), VE_OWNER_SK(sk)))
++ goto out;
++
+ if (p->failure) {
+ netlink_overrun(sk);
+ goto out;
+@@ -942,6 +945,9 @@ static inline int do_one_set_err(struct
+ !test_bit(p->group - 1, nlk->groups))
+ goto out;
+
++ if (!ve_accessible_strict(get_exec_env(), VE_OWNER_SK(sk)))
++ goto out;
++
+ sk->sk_err = p->code;
+ sk->sk_error_report(sk);
+ out:
+@@ -1076,12 +1082,17 @@ static int netlink_sendmsg(struct kiocb
+ struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
+ struct sock *sk = sock->sk;
+ struct netlink_sock *nlk = nlk_sk(sk);
+- struct sockaddr_nl *addr=msg->msg_name;
++ struct sockaddr_nl *addr = msg->msg_name;
+ u32 dst_pid;
+- u32 dst_group;
+ struct sk_buff *skb;
+ int err;
+ struct scm_cookie scm;
++ struct sock *dstsk;
++ long timeo;
++ int no_ubc, no_buf;
++ unsigned long chargesize;
++
++ DECLARE_WAITQUEUE(wait, current);
+
+ if (msg->msg_flags&MSG_OOB)
+ return -EOPNOTSUPP;
+@@ -1092,17 +1103,16 @@ static int netlink_sendmsg(struct kiocb
+ if (err < 0)
+ return err;
+
++ /* Broadcasts from user to kernel are disabled. This is OK
++ * according to ANK */
+ if (msg->msg_namelen) {
+ if (addr->nl_family != AF_NETLINK)
+ return -EINVAL;
+ dst_pid = addr->nl_pid;
+- dst_group = ffs(addr->nl_groups);
+- if (dst_group && !netlink_capable(sock, NL_NONROOT_SEND))
++ if (addr->nl_groups && !netlink_capable(sock, NL_NONROOT_SEND))
+ return -EPERM;
+- } else {
++ } else
+ dst_pid = nlk->dst_pid;
+- dst_group = nlk->dst_group;
+- }
+
+ if (!nlk->pid) {
+ err = netlink_autobind(sock);
+@@ -1115,12 +1125,12 @@ static int netlink_sendmsg(struct kiocb
+ goto out;
+ err = -ENOBUFS;
+ skb = alloc_skb(len, GFP_KERNEL);
+- if (skb==NULL)
++ if (skb == NULL)
+ goto out;
+
+ NETLINK_CB(skb).pid = nlk->pid;
+ NETLINK_CB(skb).dst_pid = dst_pid;
+- NETLINK_CB(skb).dst_group = dst_group;
++ NETLINK_CB(skb).dst_group = 0;
+ NETLINK_CB(skb).loginuid = audit_get_loginuid(current->audit_context);
+ memcpy(NETLINK_CREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
+
+@@ -1131,25 +1141,88 @@ static int netlink_sendmsg(struct kiocb
+ */
+
+ err = -EFAULT;
+- if (memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len)) {
+- kfree_skb(skb);
+- goto out;
+- }
++ if (memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len))
++ goto out_free;
+
+ err = security_netlink_send(sk, skb);
+- if (err) {
+- kfree_skb(skb);
+- goto out;
++ if (err)
++ goto out_free;
++
++ timeo = sock_sndtimeo(sk, msg->msg_flags&MSG_DONTWAIT);
++retry:
++ dstsk = netlink_getsockbypid(sk, dst_pid);
++ if (IS_ERR(dstsk)) {
++ err = PTR_ERR(dstsk);
++ goto out_free;
+ }
+
+- if (dst_group) {
+- atomic_inc(&skb->users);
+- netlink_broadcast(sk, skb, dst_pid, dst_group, GFP_KERNEL);
++ nlk = nlk_sk(dstsk);
++#ifdef NL_EMULATE_DEV
++ if (nlk->handler) {
++ skb_orphan(skb);
++ err = nlk->handler(protocol, skb);
++ goto out_put;
++ }
++#endif
++
++ /* BTW, it could be done once, before the retry loop */
++ chargesize = skb_charge_fullsize(skb);
++ no_ubc = ub_sock_getwres_other(sk, chargesize);
++ no_buf = atomic_read(&dstsk->sk_rmem_alloc) > dstsk->sk_rcvbuf ||
++ test_bit(0, &nlk->state);
++ if (no_ubc || no_buf) {
++ wait_queue_head_t *sleep;
++
++ if (!no_ubc)
++ ub_sock_retwres_other(sk, chargesize,
++ SOCK_MIN_UBCSPACE_CH);
++ err = -EAGAIN;
++ if (timeo == 0) {
++ kfree_skb(skb);
++ goto out_put;
++ }
++
++ /* wake up comes to different queues */
++ sleep = no_ubc ? sk->sk_sleep : &nlk->wait;
++ __set_current_state(TASK_INTERRUPTIBLE);
++ add_wait_queue(sleep, &wait);
++
++	/* this if can't be moved up because ub_sock_snd_queue_add()
++ * may change task state to TASK_RUNNING */
++ if (no_ubc)
++ ub_sock_sndqueueadd_other(sk, chargesize);
++
++ if ((atomic_read(&dstsk->sk_rmem_alloc) > dstsk->sk_rcvbuf ||
++ test_bit(0, &nlk->state) || no_ubc) &&
++ !sock_flag(dstsk, SOCK_DEAD))
++ timeo = schedule_timeout(timeo);
++
++ __set_current_state(TASK_RUNNING);
++ remove_wait_queue(sleep, &wait);
++ if (no_ubc)
++ ub_sock_sndqueuedel(sk);
++ sock_put(dstsk);
++
++ if (!signal_pending(current))
++ goto retry;
++ err = sock_intr_errno(timeo);
++ goto out_free;
+ }
+- err = netlink_unicast(sk, skb, dst_pid, msg->msg_flags&MSG_DONTWAIT);
+
++ skb_orphan(skb);
++ skb_set_owner_r(skb, dstsk);
++ ub_skb_set_charge(skb, sk, chargesize, UB_OTHERSOCKBUF);
++ skb_queue_tail(&dstsk->sk_receive_queue, skb);
++ dstsk->sk_data_ready(dstsk, len);
++ err = len;
++out_put:
++ sock_put(dstsk);
+ out:
+ return err;
++
++out_free:
++ kfree_skb(skb);
++ return err;
+ }
+
+ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock,
+@@ -1303,6 +1376,10 @@ static int netlink_dump(struct sock *sk)
+ skb = sock_rmalloc(sk, NLMSG_GOODSIZE, 0, GFP_KERNEL);
+ if (!skb)
+ return -ENOBUFS;
++ if (ub_nlrcvbuf_charge(skb, sk) < 0) {
++ kfree_skb(skb);
++ return -EACCES;
++ }
+
+ spin_lock(&nlk->cb_lock);
+
+@@ -1365,9 +1442,9 @@ int netlink_dump_start(struct sock *ssk,
+ return -ECONNREFUSED;
+ }
+ nlk = nlk_sk(sk);
+- /* A dump is in progress... */
++ /* A dump or destruction is in progress... */
+ spin_lock(&nlk->cb_lock);
+- if (nlk->cb) {
++ if (nlk->cb || sock_flag(sk, SOCK_DEAD)) {
+ spin_unlock(&nlk->cb_lock);
+ netlink_destroy_callback(cb);
+ sock_put(sk);
+@@ -1471,8 +1548,15 @@ void netlink_run_queue(struct sock *sk,
+ *qlen = skb_queue_len(&sk->sk_receive_queue);
+
+ for (; *qlen; (*qlen)--) {
++ int ret;
++ struct ve_struct *old_env;
+ skb = skb_dequeue(&sk->sk_receive_queue);
+- if (netlink_rcv_skb(skb, cb)) {
++
++ old_env = set_exec_env(VE_OWNER_SKB(skb));
++ ret = netlink_rcv_skb(skb, cb);
++ (void)set_exec_env(old_env);
++
++ if (ret) {
+ if (skb->len)
+ skb_queue_head(&sk->sk_receive_queue, skb);
+ else {
+@@ -1740,6 +1824,7 @@ enomem:
+
+ sock_register(&netlink_family_ops);
+ #ifdef CONFIG_PROC_FS
++	/* FIXME: virtualize before giving access from VEs */
+ proc_net_fops_create("netlink", 0, &netlink_seq_fops);
+ #endif
+ /* The netlink device handler may be needed early. */
+diff -upr linux-2.6.16.orig/net/packet/af_packet.c linux-2.6.16-026test015/net/packet/af_packet.c
+--- linux-2.6.16.orig/net/packet/af_packet.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/packet/af_packet.c 2006-07-04 14:41:38.000000000 +0400
+@@ -79,6 +79,8 @@
+ #include <linux/module.h>
+ #include <linux/init.h>
+
++#include <ub/ub_net.h>
++
+ #ifdef CONFIG_INET
+ #include <net/inet_common.h>
+ #endif
+@@ -280,7 +282,8 @@ static int packet_rcv_spkt(struct sk_buf
+ * so that this procedure is noop.
+ */
+
+- if (skb->pkt_type == PACKET_LOOPBACK)
++ if (skb->pkt_type == PACKET_LOOPBACK ||
++ !ve_accessible(VE_OWNER_SKB(skb), VE_OWNER_SK(sk)))
+ goto out;
+
+ if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
+@@ -472,6 +475,9 @@ static int packet_rcv(struct sk_buff *sk
+ sk = pt->af_packet_priv;
+ po = pkt_sk(sk);
+
++ if (!ve_accessible(VE_OWNER_SKB(skb), VE_OWNER_SK(sk)))
++ goto drop;
++
+ skb->dev = dev;
+
+ if (dev->hard_header) {
+@@ -531,6 +537,9 @@ static int packet_rcv(struct sk_buff *sk
+ if (pskb_trim(skb, snaplen))
+ goto drop_n_acct;
+
++ if (ub_sockrcvbuf_charge(sk, skb))
++ goto drop_n_acct;
++
+ skb_set_owner_r(skb, sk);
+ skb->dev = NULL;
+ dst_release(skb->dst);
+@@ -581,6 +590,9 @@ static int tpacket_rcv(struct sk_buff *s
+ sk = pt->af_packet_priv;
+ po = pkt_sk(sk);
+
++ if (!ve_accessible(VE_OWNER_SKB(skb), VE_OWNER_SK(sk)))
++ goto drop;
++
+ if (dev->hard_header) {
+ if (sk->sk_type != SOCK_DGRAM)
+ skb_push(skb, skb->data - skb->mac.raw);
+@@ -630,6 +642,12 @@ static int tpacket_rcv(struct sk_buff *s
+ if (snaplen > skb->len-skb->data_len)
+ snaplen = skb->len-skb->data_len;
+
++ if (copy_skb &&
++ ub_sockrcvbuf_charge(sk, copy_skb)) {
++ spin_lock(&sk->sk_receive_queue.lock);
++ goto ring_is_full;
++ }
++
+ spin_lock(&sk->sk_receive_queue.lock);
+ h = (struct tpacket_hdr *)packet_lookup_frame(po, po->head);
+
+@@ -1010,6 +1028,8 @@ static int packet_create(struct socket *
+ sk = sk_alloc(PF_PACKET, GFP_KERNEL, &packet_proto, 1);
+ if (sk == NULL)
+ goto out;
++ if (ub_other_sock_charge(sk))
++ goto out_free;
+
+ sock->ops = &packet_ops;
+ #ifdef CONFIG_SOCK_PACKET
+@@ -1048,6 +1068,9 @@ static int packet_create(struct socket *
+ sk_add_node(sk, &packet_sklist);
+ write_unlock_bh(&packet_sklist_lock);
+ return(0);
++
++out_free:
++ sk_free(sk);
+ out:
+ return err;
+ }
+@@ -1430,11 +1453,16 @@ static int packet_notifier(struct notifi
+ struct sock *sk;
+ struct hlist_node *node;
+ struct net_device *dev = (struct net_device*)data;
++ struct ve_struct *ve;
+
++ ve = get_exec_env();
+ read_lock(&packet_sklist_lock);
+ sk_for_each(sk, node, &packet_sklist) {
+ struct packet_sock *po = pkt_sk(sk);
+
++ if (!ve_accessible_strict(VE_OWNER_SK(sk), ve))
++ continue;
++
+ switch (msg) {
+ case NETDEV_UNREGISTER:
+ #ifdef CONFIG_PACKET_MULTICAST
+@@ -1845,6 +1873,8 @@ static inline struct sock *packet_seq_id
+ struct hlist_node *node;
+
+ sk_for_each(s, node, &packet_sklist) {
++ if (!ve_accessible(VE_OWNER_SK(s), get_exec_env()))
++ continue;
+ if (!off--)
+ return s;
+ }
+@@ -1860,9 +1890,13 @@ static void *packet_seq_start(struct seq
+ static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+ {
+ ++*pos;
+- return (v == SEQ_START_TOKEN)
+- ? sk_head(&packet_sklist)
+- : sk_next((struct sock*)v) ;
++ do {
++ v = (v == SEQ_START_TOKEN)
++ ? sk_head(&packet_sklist)
++ : sk_next((struct sock*)v);
++ } while (v != NULL &&
++ !ve_accessible(VE_OWNER_SK((struct sock*)v), get_exec_env()));
++ return v;
+ }
+
+ static void packet_seq_stop(struct seq_file *seq, void *v)
+@@ -1918,7 +1952,7 @@ static struct file_operations packet_seq
+
+ static void __exit packet_exit(void)
+ {
+- proc_net_remove("packet");
++ remove_proc_glob_entry("net/packet", NULL);
+ unregister_netdevice_notifier(&packet_netdev_notifier);
+ sock_unregister(PF_PACKET);
+ proto_unregister(&packet_proto);
+@@ -1933,7 +1967,7 @@ static int __init packet_init(void)
+
+ sock_register(&packet_family_ops);
+ register_netdevice_notifier(&packet_netdev_notifier);
+- proc_net_fops_create("packet", 0, &packet_seq_fops);
++ proc_glob_fops_create("net/packet", 0, &packet_seq_fops);
+ out:
+ return rc;
+ }
+diff -upr linux-2.6.16.orig/net/sched/sch_cbq.c linux-2.6.16-026test015/net/sched/sch_cbq.c
+--- linux-2.6.16.orig/net/sched/sch_cbq.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/sched/sch_cbq.c 2006-07-04 14:41:37.000000000 +0400
+@@ -932,8 +932,8 @@ cbq_dequeue_prio(struct Qdisc *sch, int
+
+ if (cl->deficit <= 0) {
+ q->active[prio] = cl;
+- cl = cl->next_alive;
+ cl->deficit += cl->quantum;
++ cl = cl->next_alive;
+ }
+ return skb;
+
+@@ -1109,17 +1109,19 @@ static void cbq_normalize_quanta(struct
+
+ for (h=0; h<16; h++) {
+ for (cl = q->classes[h]; cl; cl = cl->next) {
++ long mtu;
+ /* BUGGGG... Beware! This expression suffer of
+ arithmetic overflows!
+ */
+ if (cl->priority == prio) {
+- cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
+- q->quanta[prio];
+- }
+- if (cl->quantum <= 0 || cl->quantum>32*cl->qdisc->dev->mtu) {
+- printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->classid, cl->quantum);
+- cl->quantum = cl->qdisc->dev->mtu/2 + 1;
++ cl->quantum = (cl->weight * cl->allot) /
++ (q->quanta[prio] / q->nclasses[prio]);
+ }
++ mtu = cl->qdisc->dev->mtu;
++ if (cl->quantum <= mtu/2)
++ cl->quantum = mtu/2 + 1;
++ else if (cl->quantum > 32*mtu)
++ cl->quantum = 32*mtu;
+ }
+ }
+ }
+diff -upr linux-2.6.16.orig/net/sched/sch_generic.c linux-2.6.16-026test015/net/sched/sch_generic.c
+--- linux-2.6.16.orig/net/sched/sch_generic.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/sched/sch_generic.c 2006-07-04 14:41:38.000000000 +0400
+@@ -97,6 +97,7 @@ int qdisc_restart(struct net_device *dev
+
+ /* Dequeue packet */
+ if ((skb = q->dequeue(q)) != NULL) {
++ struct ve_struct *envid;
+ unsigned nolock = (dev->features & NETIF_F_LLTX);
+ /*
+ * When the driver has LLTX set it does its own locking
+@@ -107,6 +108,7 @@ int qdisc_restart(struct net_device *dev
+ * of lock congestion it should return -1 and the packet
+ * will be requeued.
+ */
++ envid = set_exec_env(VE_OWNER_SKB(skb));
+ if (!nolock) {
+ if (!spin_trylock(&dev->xmit_lock)) {
+ collision:
+@@ -121,6 +123,7 @@ int qdisc_restart(struct net_device *dev
+ kfree_skb(skb);
+ if (net_ratelimit())
+ printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name);
++ (void)set_exec_env(envid);
+ return -1;
+ }
+ __get_cpu_var(netdev_rx_stat).cpu_collision++;
+@@ -146,6 +149,7 @@ int qdisc_restart(struct net_device *dev
+ spin_unlock(&dev->xmit_lock);
+ }
+ spin_lock(&dev->queue_lock);
++ (void)set_exec_env(envid);
+ return -1;
+ }
+ if (ret == NETDEV_TX_LOCKED && nolock) {
+@@ -177,6 +181,7 @@ int qdisc_restart(struct net_device *dev
+ requeue:
+ q->ops->requeue(skb, q);
+ netif_schedule(dev);
++ (void)set_exec_env(envid);
+ return 1;
+ }
+ BUG_ON((int) q->q.qlen < 0);
+@@ -625,3 +630,4 @@ EXPORT_SYMBOL(qdisc_reset);
+ EXPORT_SYMBOL(qdisc_restart);
+ EXPORT_SYMBOL(qdisc_lock_tree);
+ EXPORT_SYMBOL(qdisc_unlock_tree);
++EXPORT_SYMBOL(dev_shutdown);
+diff -upr linux-2.6.16.orig/net/sched/sch_teql.c linux-2.6.16-026test015/net/sched/sch_teql.c
+--- linux-2.6.16.orig/net/sched/sch_teql.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/sched/sch_teql.c 2006-07-04 14:41:38.000000000 +0400
+@@ -189,6 +189,9 @@ static int teql_qdisc_init(struct Qdisc
+ struct teql_master *m = (struct teql_master*)sch->ops;
+ struct teql_sched_data *q = qdisc_priv(sch);
+
++ if (!capable(CAP_NET_ADMIN))
++ return -EPERM;
++
+ if (dev->hard_header_len > m->dev->hard_header_len)
+ return -EINVAL;
+
+diff -upr linux-2.6.16.orig/net/sctp/inqueue.c linux-2.6.16-026test015/net/sctp/inqueue.c
+--- linux-2.6.16.orig/net/sctp/inqueue.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/sctp/inqueue.c 2006-07-04 14:41:36.000000000 +0400
+@@ -149,6 +149,7 @@ struct sctp_chunk *sctp_inq_pop(struct s
+ /* This is the first chunk in the packet. */
+ chunk->singleton = 1;
+ ch = (sctp_chunkhdr_t *) chunk->skb->data;
++ chunk->data_accepted = 0;
+ }
+
+ chunk->chunk_hdr = ch;
+diff -upr linux-2.6.16.orig/net/sctp/sm_statefuns.c linux-2.6.16-026test015/net/sctp/sm_statefuns.c
+--- linux-2.6.16.orig/net/sctp/sm_statefuns.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/sctp/sm_statefuns.c 2006-07-04 14:41:36.000000000 +0400
+@@ -636,8 +636,9 @@ sctp_disposition_t sctp_sf_do_5_1D_ce(co
+ */
+ chunk->subh.cookie_hdr =
+ (struct sctp_signed_cookie *)chunk->skb->data;
+- skb_pull(chunk->skb,
+- ntohs(chunk->chunk_hdr->length) - sizeof(sctp_chunkhdr_t));
++ if (!pskb_pull(chunk->skb, ntohs(chunk->chunk_hdr->length) -
++ sizeof(sctp_chunkhdr_t)))
++ goto nomem;
+
+ /* 5.1 D) Upon reception of the COOKIE ECHO chunk, Endpoint
+ * "Z" will reply with a COOKIE ACK chunk after building a TCB
+@@ -965,7 +966,8 @@ sctp_disposition_t sctp_sf_beat_8_3(cons
+ */
+ chunk->subh.hb_hdr = (sctp_heartbeathdr_t *) chunk->skb->data;
+ paylen = ntohs(chunk->chunk_hdr->length) - sizeof(sctp_chunkhdr_t);
+- skb_pull(chunk->skb, paylen);
++ if (!pskb_pull(chunk->skb, paylen))
++ goto nomem;
+
+ reply = sctp_make_heartbeat_ack(asoc, chunk,
+ chunk->subh.hb_hdr, paylen);
+@@ -1028,6 +1030,12 @@ sctp_disposition_t sctp_sf_backbeat_8_3(
+ commands);
+
+ hbinfo = (sctp_sender_hb_info_t *) chunk->skb->data;
++ /* Make sure that the length of the parameter is what we expect */
++ if (ntohs(hbinfo->param_hdr.length) !=
++ sizeof(sctp_sender_hb_info_t)) {
++ return SCTP_DISPOSITION_DISCARD;
++ }
++
+ from_addr = hbinfo->daddr;
+ link = sctp_assoc_lookup_paddr(asoc, &from_addr);
+
+@@ -1860,8 +1868,9 @@ sctp_disposition_t sctp_sf_do_5_2_4_dupc
+ * are in good shape.
+ */
+ chunk->subh.cookie_hdr = (struct sctp_signed_cookie *)chunk->skb->data;
+- skb_pull(chunk->skb, ntohs(chunk->chunk_hdr->length) -
+- sizeof(sctp_chunkhdr_t));
++ if (!pskb_pull(chunk->skb, ntohs(chunk->chunk_hdr->length) -
++ sizeof(sctp_chunkhdr_t)))
++ goto nomem;
+
+ /* In RFC 2960 5.2.4 3, if both Verification Tags in the State Cookie
+ * of a duplicate COOKIE ECHO match the Verification Tags of the
+@@ -5151,7 +5160,9 @@ static int sctp_eat_data(const struct sc
+ int tmp;
+ __u32 tsn;
+ int account_value;
++ struct sctp_tsnmap *map = (struct sctp_tsnmap *)&asoc->peer.tsn_map;
+ struct sock *sk = asoc->base.sk;
++ int rcvbuf_over = 0;
+
+ data_hdr = chunk->subh.data_hdr = (sctp_datahdr_t *)chunk->skb->data;
+ skb_pull(chunk->skb, sizeof(sctp_datahdr_t));
+@@ -5162,10 +5173,16 @@ static int sctp_eat_data(const struct sc
+ /* ASSERT: Now skb->data is really the user data. */
+
+ /*
+- * if we are established, and we have used up our receive
+- * buffer memory, drop the frame
++ * If we are established, and we have used up our receive buffer
++	 * memory, think about dropping the frame.
++ * Note that we have an opportunity to improve performance here.
++ * If we accept one chunk from an skbuff, we have to keep all the
++ * memory of that skbuff around until the chunk is read into user
++ * space. Therefore, once we accept 1 chunk we may as well accept all
++ * remaining chunks in the skbuff. The data_accepted flag helps us do
++ * that.
+ */
+- if (asoc->state == SCTP_STATE_ESTABLISHED) {
++ if ((asoc->state == SCTP_STATE_ESTABLISHED) && (!chunk->data_accepted)) {
+ /*
+ * If the receive buffer policy is 1, then each
+ * association can allocate up to sk_rcvbuf bytes
+@@ -5176,9 +5193,25 @@ static int sctp_eat_data(const struct sc
+ account_value = atomic_read(&asoc->rmem_alloc);
+ else
+ account_value = atomic_read(&sk->sk_rmem_alloc);
+-
+- if (account_value > sk->sk_rcvbuf)
+- return SCTP_IERROR_IGNORE_TSN;
++ if (account_value > sk->sk_rcvbuf) {
++ /*
++ * We need to make forward progress, even when we are
++ * under memory pressure, so we always allow the
++ * next tsn after the ctsn ack point to be accepted.
++ * This lets us avoid deadlocks in which we have to
++ * drop frames that would otherwise let us drain the
++ * receive queue.
++ */
++ if ((sctp_tsnmap_get_ctsn(map) + 1) != tsn)
++ return SCTP_IERROR_IGNORE_TSN;
++
++ /*
++ * We're going to accept the frame but we should renege
++ * to make space for it. This will send us down that
++ * path later in this function.
++ */
++ rcvbuf_over = 1;
++ }
+ }
+
+ /* Process ECN based congestion.
+@@ -5226,6 +5259,7 @@ static int sctp_eat_data(const struct sc
+ datalen -= sizeof(sctp_data_chunk_t);
+
+ deliver = SCTP_CMD_CHUNK_ULP;
++ chunk->data_accepted = 1;
+
+ /* Think about partial delivery. */
+ if ((datalen >= asoc->rwnd) && (!asoc->ulpq.pd_mode)) {
+@@ -5242,7 +5276,8 @@ static int sctp_eat_data(const struct sc
+ * large spill over.
+ */
+ if (!asoc->rwnd || asoc->rwnd_over ||
+- (datalen > asoc->rwnd + asoc->frag_point)) {
++ (datalen > asoc->rwnd + asoc->frag_point) ||
++ rcvbuf_over) {
+
+ /* If this is the next TSN, consider reneging to make
+ * room. Note: Playing nice with a confused sender. A
+@@ -5250,8 +5285,8 @@ static int sctp_eat_data(const struct sc
+ * space and in the future we may want to detect and
+ * do more drastic reneging.
+ */
+- if (sctp_tsnmap_has_gap(&asoc->peer.tsn_map) &&
+- (sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map) + 1) == tsn) {
++ if (sctp_tsnmap_has_gap(map) &&
++ (sctp_tsnmap_get_ctsn(map) + 1) == tsn) {
+ SCTP_DEBUG_PRINTK("Reneging for tsn:%u\n", tsn);
+ deliver = SCTP_CMD_RENEGE;
+ } else {
+diff -upr linux-2.6.16.orig/net/sctp/sm_statetable.c linux-2.6.16-026test015/net/sctp/sm_statetable.c
+--- linux-2.6.16.orig/net/sctp/sm_statetable.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/sctp/sm_statetable.c 2006-07-04 14:41:36.000000000 +0400
+@@ -366,9 +366,9 @@ const sctp_sm_table_entry_t *sctp_sm_loo
+ /* SCTP_STATE_EMPTY */ \
+ {.fn = sctp_sf_ootb, .name = "sctp_sf_ootb"}, \
+ /* SCTP_STATE_CLOSED */ \
+- {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \
++ {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \
+ /* SCTP_STATE_COOKIE_WAIT */ \
+- {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \
++ {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \
+ /* SCTP_STATE_COOKIE_ECHOED */ \
+ {.fn = sctp_sf_do_ecne, .name = "sctp_sf_do_ecne"}, \
+ /* SCTP_STATE_ESTABLISHED */ \
+@@ -380,7 +380,7 @@ const sctp_sm_table_entry_t *sctp_sm_loo
+ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+ {.fn = sctp_sf_do_ecne, .name = "sctp_sf_do_ecne"}, \
+ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+- {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \
++ {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \
+ } /* TYPE_SCTP_ECN_ECNE */
+
+ #define TYPE_SCTP_ECN_CWR { \
+@@ -401,7 +401,7 @@ const sctp_sm_table_entry_t *sctp_sm_loo
+ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+ {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \
+ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+- {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \
++ {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \
+ } /* TYPE_SCTP_ECN_CWR */
+
+ #define TYPE_SCTP_SHUTDOWN_COMPLETE { \
+@@ -647,7 +647,7 @@ chunk_event_table_unknown[SCTP_STATE_NUM
+ /* SCTP_STATE_EMPTY */ \
+ {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \
+ /* SCTP_STATE_CLOSED */ \
+- {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \
++ {.fn = sctp_sf_error_closed, .name = "sctp_sf_error_closed"}, \
+ /* SCTP_STATE_COOKIE_WAIT */ \
+ {.fn = sctp_sf_do_prm_requestheartbeat, \
+ .name = "sctp_sf_do_prm_requestheartbeat"}, \
+diff -upr linux-2.6.16.orig/net/sctp/ulpqueue.c linux-2.6.16-026test015/net/sctp/ulpqueue.c
+--- linux-2.6.16.orig/net/sctp/ulpqueue.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/sctp/ulpqueue.c 2006-07-04 14:41:36.000000000 +0400
+@@ -279,6 +279,7 @@ static inline void sctp_ulpq_store_reasm
+ static struct sctp_ulpevent *sctp_make_reassembled_event(struct sk_buff_head *queue, struct sk_buff *f_frag, struct sk_buff *l_frag)
+ {
+ struct sk_buff *pos;
++ struct sk_buff *new = NULL;
+ struct sctp_ulpevent *event;
+ struct sk_buff *pnext, *last;
+ struct sk_buff *list = skb_shinfo(f_frag)->frag_list;
+@@ -297,11 +298,33 @@ static struct sctp_ulpevent *sctp_make_r
+ */
+ if (last)
+ last->next = pos;
+- else
+- skb_shinfo(f_frag)->frag_list = pos;
++ else {
++ if (skb_cloned(f_frag)) {
++ /* This is a cloned skb, we can't just modify
++ * the frag_list. We need a new skb to do that.
++ * Instead of calling skb_unshare(), we'll do it
++ * ourselves since we need to delay the free.
++ */
++ new = skb_copy(f_frag, GFP_ATOMIC);
++ if (!new)
++ return NULL; /* try again later */
++
++ new->sk = f_frag->sk;
++
++ skb_shinfo(new)->frag_list = pos;
++ } else
++ skb_shinfo(f_frag)->frag_list = pos;
++ }
+
+ /* Remove the first fragment from the reassembly queue. */
+ __skb_unlink(f_frag, queue);
++
++ /* if we did unshare, then free the old skb and re-assign */
++ if (new) {
++ kfree_skb(f_frag);
++ f_frag = new;
++ }
++
+ while (pos) {
+
+ pnext = pos->next;
+diff -upr linux-2.6.16.orig/net/socket.c linux-2.6.16-026test015/net/socket.c
+--- linux-2.6.16.orig/net/socket.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/socket.c 2006-07-04 14:41:39.000000000 +0400
+@@ -84,6 +84,7 @@
+ #include <linux/compat.h>
+ #include <linux/kmod.h>
+ #include <linux/audit.h>
++#include <linux/in.h>
+
+ #ifdef CONFIG_NET_RADIO
+ #include <linux/wireless.h> /* Note : will define WIRELESS_EXT */
+@@ -1075,6 +1076,49 @@ int sock_wake_async(struct socket *sock,
+ return 0;
+ }
+
++int vz_security_proto_check(int family, int type, int protocol)
++{
++#ifdef CONFIG_VE
++ if (ve_is_super(get_exec_env()))
++ return 0;
++
++ switch (family) {
++ case PF_UNSPEC:
++ case PF_PACKET:
++ case PF_NETLINK:
++ case PF_UNIX:
++ break;
++ case PF_INET:
++ switch (protocol) {
++ case IPPROTO_IP:
++ case IPPROTO_ICMP:
++ case IPPROTO_TCP:
++ case IPPROTO_UDP:
++ case IPPROTO_RAW:
++ break;
++ default:
++ return -EAFNOSUPPORT;
++ }
++ break;
++ case PF_INET6:
++ switch (protocol) {
++ case IPPROTO_IP:
++ case IPPROTO_ICMPV6:
++ case IPPROTO_TCP:
++ case IPPROTO_UDP:
++ case IPPROTO_RAW:
++ break;
++ default:
++ return -EAFNOSUPPORT;
++ }
++ break;
++ default:
++ return -EAFNOSUPPORT;
++ }
++#endif
++ return 0;
++}
++
+ static int __sock_create(int family, int type, int protocol, struct socket **res, int kern)
+ {
+ int err;
+@@ -1102,6 +1146,11 @@ static int __sock_create(int family, int
+ family = PF_PACKET;
+ }
+
++ /* VZ compatibility layer */
++ err = vz_security_proto_check(family, type, protocol);
++ if (err < 0)
++ return err;
++
+ err = security_socket_create(family, type, protocol, kern);
+ if (err)
+ return err;
+diff -upr linux-2.6.16.orig/net/sunrpc/clnt.c linux-2.6.16-026test015/net/sunrpc/clnt.c
+--- linux-2.6.16.orig/net/sunrpc/clnt.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/sunrpc/clnt.c 2006-07-04 14:41:38.000000000 +0400
+@@ -168,10 +168,10 @@ rpc_new_client(struct rpc_xprt *xprt, ch
+ }
+
+ /* save the nodename */
+- clnt->cl_nodelen = strlen(system_utsname.nodename);
++ clnt->cl_nodelen = strlen(ve_utsname.nodename);
+ if (clnt->cl_nodelen > UNX_MAXNODENAME)
+ clnt->cl_nodelen = UNX_MAXNODENAME;
+- memcpy(clnt->cl_nodename, system_utsname.nodename, clnt->cl_nodelen);
++ memcpy(clnt->cl_nodename, ve_utsname.nodename, clnt->cl_nodelen);
+ return clnt;
+
+ out_no_auth:
+diff -upr linux-2.6.16.orig/net/sunrpc/sched.c linux-2.6.16-026test015/net/sunrpc/sched.c
+--- linux-2.6.16.orig/net/sunrpc/sched.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/sunrpc/sched.c 2006-07-04 14:41:38.000000000 +0400
+@@ -605,7 +605,9 @@ EXPORT_SYMBOL(rpc_exit_task);
+ static int __rpc_execute(struct rpc_task *task)
+ {
+ int status = 0;
++ struct ve_struct *env;
+
++ env = set_exec_env(get_ve0());
+ dprintk("RPC: %4d rpc_execute flgs %x\n",
+ task->tk_pid, task->tk_flags);
+
+@@ -693,6 +695,7 @@ static int __rpc_execute(struct rpc_task
+ rpc_mark_complete_task(task);
+ /* Release all resources associated with the task */
+ rpc_release_task(task);
++ (void)set_exec_env(env);
+ return status;
+ }
+
+diff -upr linux-2.6.16.orig/net/sunrpc/svcsock.c linux-2.6.16-026test015/net/sunrpc/svcsock.c
+--- linux-2.6.16.orig/net/sunrpc/svcsock.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/sunrpc/svcsock.c 2006-07-04 14:41:38.000000000 +0400
+@@ -361,6 +361,9 @@ svc_sendto(struct svc_rqst *rqstp, struc
+ size_t base = xdr->page_base;
+ unsigned int pglen = xdr->page_len;
+ unsigned int flags = MSG_MORE;
++ struct ve_struct *old_env;
++
++ old_env = set_exec_env(get_ve0());
+
+ slen = xdr->len;
+
+@@ -425,6 +428,8 @@ out:
+ rqstp->rq_sock, xdr->head[0].iov_base, xdr->head[0].iov_len, xdr->len, len,
+ rqstp->rq_addr.sin_addr.s_addr);
+
++ (void)set_exec_env(old_env);
++
+ return len;
+ }
+
+@@ -437,9 +442,12 @@ svc_recv_available(struct svc_sock *svsk
+ mm_segment_t oldfs;
+ struct socket *sock = svsk->sk_sock;
+ int avail, err;
++ struct ve_struct *old_env;
+
+ oldfs = get_fs(); set_fs(KERNEL_DS);
++ old_env = set_exec_env(get_ve0());
+ err = sock->ops->ioctl(sock, TIOCINQ, (unsigned long) &avail);
++ (void)set_exec_env(old_env);
+ set_fs(oldfs);
+
+ return (err >= 0)? avail : err;
+@@ -454,6 +462,7 @@ svc_recvfrom(struct svc_rqst *rqstp, str
+ struct msghdr msg;
+ struct socket *sock;
+ int len, alen;
++ struct ve_struct *old_env;
+
+ rqstp->rq_addrlen = sizeof(rqstp->rq_addr);
+ sock = rqstp->rq_sock->sk_sock;
+@@ -465,7 +474,9 @@ svc_recvfrom(struct svc_rqst *rqstp, str
+
+ msg.msg_flags = MSG_DONTWAIT;
+
++ old_env = set_exec_env(get_ve0());
+ len = kernel_recvmsg(sock, &msg, iov, nr, buflen, MSG_DONTWAIT);
++	(void)set_exec_env(old_env);
+
+ /* sock_recvmsg doesn't fill in the name/namelen, so we must..
+ * possibly we should cache this in the svc_sock structure
+@@ -761,17 +772,19 @@ svc_tcp_accept(struct svc_sock *svsk)
+ const struct proto_ops *ops;
+ struct svc_sock *newsvsk;
+ int err, slen;
++ struct ve_struct *old_env;
+
+ dprintk("svc: tcp_accept %p sock %p\n", svsk, sock);
+ if (!sock)
+ return;
+
++ old_env = set_exec_env(get_ve0());
+ err = sock_create_lite(PF_INET, SOCK_STREAM, IPPROTO_TCP, &newsock);
+ if (err) {
+ if (err == -ENOMEM)
+ printk(KERN_WARNING "%s: no more sockets!\n",
+ serv->sv_name);
+- return;
++ goto restore;
+ }
+
+ dprintk("svc: tcp_accept %p allocated\n", newsock);
+@@ -865,6 +878,8 @@ svc_tcp_accept(struct svc_sock *svsk)
+
+ }
+
++ (void)set_exec_env(old_env);
++
+ if (serv->sv_stats)
+ serv->sv_stats->nettcpconn++;
+
+@@ -872,6 +887,8 @@ svc_tcp_accept(struct svc_sock *svsk)
+
+ failed:
+ sock_release(newsock);
++restore:
++ (void)set_exec_env(old_env);
+ return;
+ }
+
+@@ -1388,6 +1405,7 @@ svc_create_socket(struct svc_serv *serv,
+ struct socket *sock;
+ int error;
+ int type;
++ struct ve_struct *old_env;
+
+ dprintk("svc: svc_create_socket(%s, %d, %u.%u.%u.%u:%d)\n",
+ serv->sv_program->pg_name, protocol,
+@@ -1401,8 +1419,10 @@ svc_create_socket(struct svc_serv *serv,
+ }
+ type = (protocol == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM;
+
++ old_env = set_exec_env(get_ve0());
++
+ if ((error = sock_create_kern(PF_INET, type, protocol, &sock)) < 0)
+- return error;
++ goto restore;
+
+ if (sin != NULL) {
+ if (type == SOCK_STREAM)
+@@ -1418,12 +1438,16 @@ svc_create_socket(struct svc_serv *serv,
+ goto bummer;
+ }
+
+- if ((svsk = svc_setup_socket(serv, sock, &error, 1)) != NULL)
++ if ((svsk = svc_setup_socket(serv, sock, &error, 1)) != NULL) {
++ (void)set_exec_env(old_env);
+ return 0;
++ }
+
+ bummer:
+ dprintk("svc: svc_create_socket error = %d\n", -error);
+ sock_release(sock);
++restore:
++ (void)set_exec_env(old_env);
+ return error;
+ }
+
+diff -upr linux-2.6.16.orig/net/unix/af_unix.c linux-2.6.16-026test015/net/unix/af_unix.c
+--- linux-2.6.16.orig/net/unix/af_unix.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/unix/af_unix.c 2006-07-04 14:41:38.000000000 +0400
+@@ -118,6 +118,9 @@
+ #include <net/checksum.h>
+ #include <linux/security.h>
+
++#include <ub/ub_net.h>
++#include <ub/beancounter.h>
++
+ int sysctl_unix_max_dgram_qlen = 10;
+
+ struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
+@@ -235,6 +238,8 @@ static struct sock *__unix_find_socket_b
+ sk_for_each(s, node, &unix_socket_table[hash ^ type]) {
+ struct unix_sock *u = unix_sk(s);
+
++ if (!ve_accessible(VE_OWNER_SK(s), get_exec_env()))
++ continue;
+ if (u->addr->len == len &&
+ !memcmp(u->addr->name, sunname, len))
+ goto found;
+@@ -439,7 +444,7 @@ static int unix_listen(struct socket *so
+ sk->sk_max_ack_backlog = backlog;
+ sk->sk_state = TCP_LISTEN;
+ /* set credentials so connect can copy them */
+- sk->sk_peercred.pid = current->tgid;
++ sk->sk_peercred.pid = virt_tgid(current);
+ sk->sk_peercred.uid = current->euid;
+ sk->sk_peercred.gid = current->egid;
+ err = 0;
+@@ -553,6 +558,8 @@ static struct sock * unix_create1(struct
+ sk = sk_alloc(PF_UNIX, GFP_KERNEL, &unix_proto, 1);
+ if (!sk)
+ goto out;
++ if (ub_other_sock_charge(sk))
++ goto out_sk_free;
+
+ atomic_inc(&unix_nr_socks);
+
+@@ -571,6 +578,9 @@ static struct sock * unix_create1(struct
+ unix_insert_socket(unix_sockets_unbound, sk);
+ out:
+ return sk;
++out_sk_free:
++ sk_free(sk);
++ return NULL;
+ }
+
+ static int unix_create(struct socket *sock, int protocol)
+@@ -676,7 +686,7 @@ static struct sock *unix_find_other(stru
+ err = path_lookup(sunname->sun_path, LOOKUP_FOLLOW, &nd);
+ if (err)
+ goto fail;
+- err = vfs_permission(&nd, MAY_WRITE);
++ err = vfs_permission(&nd, MAY_WRITE, NULL);
+ if (err)
+ goto put_fail;
+
+@@ -932,6 +942,7 @@ static int unix_stream_connect(struct so
+ int st;
+ int err;
+ long timeo;
++ unsigned long chargesize;
+
+ err = unix_mkname(sunaddr, addr_len, &hash);
+ if (err < 0)
+@@ -960,6 +971,10 @@ static int unix_stream_connect(struct so
+ skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
+ if (skb == NULL)
+ goto out;
++ chargesize = skb_charge_fullsize(skb);
++ if (ub_sock_getwres_other(newsk, chargesize) < 0)
++ goto out;
++ ub_skb_set_charge(skb, newsk, chargesize, UB_OTHERSOCKBUF);
+
+ restart:
+ /* Find listening sock. */
+@@ -1043,7 +1058,7 @@ restart:
+ unix_peer(newsk) = sk;
+ newsk->sk_state = TCP_ESTABLISHED;
+ newsk->sk_type = sk->sk_type;
+- newsk->sk_peercred.pid = current->tgid;
++ newsk->sk_peercred.pid = virt_tgid(current);
+ newsk->sk_peercred.uid = current->euid;
+ newsk->sk_peercred.gid = current->egid;
+ newu = unix_sk(newsk);
+@@ -1107,7 +1122,7 @@ static int unix_socketpair(struct socket
+ sock_hold(skb);
+ unix_peer(ska)=skb;
+ unix_peer(skb)=ska;
+- ska->sk_peercred.pid = skb->sk_peercred.pid = current->tgid;
++ ska->sk_peercred.pid = skb->sk_peercred.pid = virt_tgid(current);
+ ska->sk_peercred.uid = skb->sk_peercred.uid = current->euid;
+ ska->sk_peercred.gid = skb->sk_peercred.gid = current->egid;
+
+@@ -1433,6 +1448,16 @@ static int unix_stream_sendmsg(struct ki
+
+ size=len-sent;
+
++ if (msg->msg_flags & MSG_DONTWAIT)
++ ub_sock_makewres_other(sk, skb_charge_size(size));
++ if (sock_bc(sk) != NULL &&
++ sock_bc(sk)->poll_reserv >=
++ SOCK_MIN_UBCSPACE &&
++ skb_charge_size(size) >
++ sock_bc(sk)->poll_reserv)
++ size = skb_charge_datalen(sock_bc(sk)->poll_reserv);
++
++
+ /* Keep two messages in the pipe so it schedules better */
+ if (size > sk->sk_sndbuf / 2 - 64)
+ size = sk->sk_sndbuf / 2 - 64;
+@@ -1444,7 +1469,8 @@ static int unix_stream_sendmsg(struct ki
+ * Grab a buffer
+ */
+
+- skb=sock_alloc_send_skb(sk,size,msg->msg_flags&MSG_DONTWAIT, &err);
++ skb = sock_alloc_send_skb2(sk, size, SOCK_MIN_UBCSPACE,
++ msg->msg_flags&MSG_DONTWAIT, &err);
+
+ if (skb==NULL)
+ goto out_err;
+@@ -1869,6 +1895,7 @@ static unsigned int unix_poll(struct fil
+ {
+ struct sock *sk = sock->sk;
+ unsigned int mask;
++ int no_ub_res;
+
+ poll_wait(file, sk->sk_sleep, wait);
+ mask = 0;
+@@ -1879,6 +1906,10 @@ static unsigned int unix_poll(struct fil
+ if (sk->sk_shutdown == SHUTDOWN_MASK)
+ mask |= POLLHUP;
+
++ no_ub_res = ub_sock_makewres_other(sk, SOCK_MIN_UBCSPACE_CH);
++ if (no_ub_res)
++ ub_sock_sndqueueadd_other(sk, SOCK_MIN_UBCSPACE_CH);
++
+ /* readable? */
+ if (!skb_queue_empty(&sk->sk_receive_queue) ||
+ (sk->sk_shutdown & RCV_SHUTDOWN))
+@@ -1892,7 +1923,7 @@ static unsigned int unix_poll(struct fil
+ * we set writable also when the other side has shut down the
+ * connection. This prevents stuck sockets.
+ */
+- if (unix_writable(sk))
++ if (!no_ub_res && unix_writable(sk))
+ mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
+
+ return mask;
+@@ -2044,7 +2075,7 @@ static int __init af_unix_init(void)
+
+ sock_register(&unix_family_ops);
+ #ifdef CONFIG_PROC_FS
+- proc_net_fops_create("unix", 0, &unix_seq_fops);
++ proc_glob_fops_create("net/unix", 0, &unix_seq_fops);
+ #endif
+ unix_sysctl_register();
+ out:
+@@ -2055,7 +2086,7 @@ static void __exit af_unix_exit(void)
+ {
+ sock_unregister(PF_UNIX);
+ unix_sysctl_unregister();
+- proc_net_remove("unix");
++ remove_proc_glob_entry("net/unix", NULL);
+ proto_unregister(&unix_proto);
+ }
+
+diff -upr linux-2.6.16.orig/net/unix/garbage.c linux-2.6.16-026test015/net/unix/garbage.c
+--- linux-2.6.16.orig/net/unix/garbage.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/net/unix/garbage.c 2006-07-04 14:41:39.000000000 +0400
+@@ -76,6 +76,7 @@
+ #include <linux/netdevice.h>
+ #include <linux/file.h>
+ #include <linux/proc_fs.h>
++#include <linux/module.h>
+
+ #include <net/sock.h>
+ #include <net/af_unix.h>
+@@ -135,7 +136,7 @@ void unix_notinflight(struct file *fp)
+ atomic_dec(&unix_tot_inflight);
+ }
+ }
+-
++EXPORT_SYMBOL_GPL(unix_notinflight);
+
+ /*
+ * Garbage Collector Support Functions
+diff -upr linux-2.6.16.orig/security/commoncap.c linux-2.6.16-026test015/security/commoncap.c
+--- linux-2.6.16.orig/security/commoncap.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/security/commoncap.c 2006-07-04 14:41:38.000000000 +0400
+@@ -35,7 +35,7 @@ EXPORT_SYMBOL(cap_netlink_send);
+
+ int cap_netlink_recv(struct sk_buff *skb)
+ {
+- if (!cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN))
++ if (!cap_raised(NETLINK_CB(skb).eff_cap, CAP_VE_NET_ADMIN))
+ return -EPERM;
+ return 0;
+ }
+@@ -197,7 +197,7 @@ int cap_inode_setxattr(struct dentry *de
+ {
+ if (!strncmp(name, XATTR_SECURITY_PREFIX,
+ sizeof(XATTR_SECURITY_PREFIX) - 1) &&
+- !capable(CAP_SYS_ADMIN))
++ !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_ADMIN))
+ return -EPERM;
+ return 0;
+ }
+@@ -206,7 +206,7 @@ int cap_inode_removexattr(struct dentry
+ {
+ if (!strncmp(name, XATTR_SECURITY_PREFIX,
+ sizeof(XATTR_SECURITY_PREFIX) - 1) &&
+- !capable(CAP_SYS_ADMIN))
++ !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_ADMIN))
+ return -EPERM;
+ return 0;
+ }
+@@ -312,7 +312,7 @@ void cap_task_reparent_to_init (struct t
+
+ int cap_syslog (int type)
+ {
+- if ((type != 3 && type != 10) && !capable(CAP_SYS_ADMIN))
++ if ((type != 3 && type != 10) && !capable(CAP_VE_SYS_ADMIN))
+ return -EPERM;
+ return 0;
+ }
+diff -upr linux-2.6.16.orig/security/keys/key.c linux-2.6.16-026test015/security/keys/key.c
+--- linux-2.6.16.orig/security/keys/key.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/security/keys/key.c 2006-07-04 14:41:36.000000000 +0400
+@@ -785,6 +785,10 @@ key_ref_t key_create_or_update(key_ref_t
+
+ key_check(keyring);
+
++ key_ref = ERR_PTR(-ENOTDIR);
++ if (keyring->type != &key_type_keyring)
++ goto error_2;
++
+ down_write(&keyring->sem);
+
+ /* if we're going to allocate a new key, we're going to have
+diff -upr linux-2.6.16.orig/security/keys/keyring.c linux-2.6.16-026test015/security/keys/keyring.c
+--- linux-2.6.16.orig/security/keys/keyring.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/security/keys/keyring.c 2006-07-04 14:41:36.000000000 +0400
+@@ -437,6 +437,7 @@ EXPORT_SYMBOL(keyring_search);
+ /*
+ * search the given keyring only (no recursion)
+ * - keyring must be locked by caller
++ * - caller must guarantee that the keyring is a keyring
+ */
+ key_ref_t __keyring_search_one(key_ref_t keyring_ref,
+ const struct key_type *ktype,
+diff -upr linux-2.6.16.orig/security/selinux/hooks.c linux-2.6.16-026test015/security/selinux/hooks.c
+--- linux-2.6.16.orig/security/selinux/hooks.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/security/selinux/hooks.c 2006-07-04 14:41:38.000000000 +0400
+@@ -4167,12 +4167,12 @@ static int selinux_setprocattr(struct ta
+ struct task_struct *g, *t;
+ struct mm_struct *mm = p->mm;
+ read_lock(&tasklist_lock);
+- do_each_thread(g, t)
++ do_each_thread_ve(g, t)
+ if (t->mm == mm && t != p) {
+ read_unlock(&tasklist_lock);
+ return -EPERM;
+ }
+- while_each_thread(g, t);
++ while_each_thread_ve(g, t);
+ read_unlock(&tasklist_lock);
+ }
+
+diff -upr linux-2.6.16.orig/security/selinux/ss/mls.c linux-2.6.16-026test015/security/selinux/ss/mls.c
+--- linux-2.6.16.orig/security/selinux/ss/mls.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/security/selinux/ss/mls.c 2006-07-04 14:41:36.000000000 +0400
+@@ -264,7 +264,7 @@ int mls_context_to_sid(char oldc,
+
+ if (!selinux_mls_enabled) {
+ if (def_sid != SECSID_NULL && oldc)
+- *scontext += strlen(*scontext);
++ *scontext += strlen(*scontext)+1;
+ return 0;
+ }
+
+diff -upr linux-2.6.16.orig/security/selinux/ss/services.c linux-2.6.16-026test015/security/selinux/ss/services.c
+--- linux-2.6.16.orig/security/selinux/ss/services.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/security/selinux/ss/services.c 2006-07-04 14:41:36.000000000 +0400
+@@ -592,6 +592,10 @@ int security_sid_to_context(u32 sid, cha
+
+ *scontext_len = strlen(initial_sid_to_string[sid]) + 1;
+ scontextp = kmalloc(*scontext_len,GFP_ATOMIC);
++ if (!scontextp) {
++ rc = -ENOMEM;
++ goto out;
++ }
+ strcpy(scontextp, initial_sid_to_string[sid]);
+ *scontext = scontextp;
+ goto out;
+diff -upr linux-2.6.16.orig/sound/isa/opti9xx/opti92x-ad1848.c linux-2.6.16-026test015/sound/isa/opti9xx/opti92x-ad1848.c
+--- linux-2.6.16.orig/sound/isa/opti9xx/opti92x-ad1848.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/sound/isa/opti9xx/opti92x-ad1848.c 2006-07-04 14:41:36.000000000 +0400
+@@ -2088,9 +2088,11 @@ static int __init alsa_card_opti9xx_init
+ int error;
+ struct platform_device *device;
+
++#ifdef CONFIG_PNP
+ pnp_register_card_driver(&opti9xx_pnpc_driver);
+ if (snd_opti9xx_pnp_is_probed)
+ return 0;
++#endif
+ if (! is_isapnp_selected()) {
+ error = platform_driver_register(&snd_opti9xx_driver);
+ if (error < 0)
+@@ -2102,7 +2104,9 @@ static int __init alsa_card_opti9xx_init
+ }
+ platform_driver_unregister(&snd_opti9xx_driver);
+ }
++#ifdef CONFIG_PNP
+ pnp_unregister_card_driver(&opti9xx_pnpc_driver);
++#endif
+ #ifdef MODULE
+ printk(KERN_ERR "no OPTi " CHIP_NAME " soundcard found\n");
+ #endif
+@@ -2115,7 +2119,9 @@ static void __exit alsa_card_opti9xx_exi
+ platform_device_unregister(snd_opti9xx_platform_device);
+ platform_driver_unregister(&snd_opti9xx_driver);
+ }
++#ifdef CONFIG_PNP
+ pnp_unregister_card_driver(&opti9xx_pnpc_driver);
++#endif
+ }
+
+ module_init(alsa_card_opti9xx_init)
+diff -upr linux-2.6.16.orig/sound/oss/dmasound/tas_common.c linux-2.6.16-026test015/sound/oss/dmasound/tas_common.c
+--- linux-2.6.16.orig/sound/oss/dmasound/tas_common.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/sound/oss/dmasound/tas_common.c 2006-07-04 14:41:36.000000000 +0400
+@@ -195,8 +195,8 @@ tas_init(int driver_id, const char *driv
+
+ printk(KERN_INFO "tas driver [%s])\n", driver_name);
+
+-#ifndef CONFIG_I2C_KEYWEST
+- request_module("i2c-keywest");
++#ifndef CONFIG_I2C_POWERMAC
++ request_module("i2c-powermac");
+ #endif
+ tas_node = find_devices("deq");
+ if (tas_node == NULL)
+diff -upr linux-2.6.16.orig/sound/pci/hda/patch_realtek.c linux-2.6.16-026test015/sound/pci/hda/patch_realtek.c
+--- linux-2.6.16.orig/sound/pci/hda/patch_realtek.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/sound/pci/hda/patch_realtek.c 2006-07-04 14:41:36.000000000 +0400
+@@ -2948,6 +2948,8 @@ static struct hda_board_config alc260_cf
+ { .modelname = "basic", .config = ALC260_BASIC },
+ { .pci_subvendor = 0x104d, .pci_subdevice = 0x81bb,
+ .config = ALC260_BASIC }, /* Sony VAIO */
++ { .pci_subvendor = 0x152d, .pci_subdevice = 0x0729,
++ .config = ALC260_BASIC }, /* CTL Travel Master U553W */
+ { .modelname = "hp", .config = ALC260_HP },
+ { .pci_subvendor = 0x103c, .pci_subdevice = 0x3010, .config = ALC260_HP },
+ { .pci_subvendor = 0x103c, .pci_subdevice = 0x3011, .config = ALC260_HP },
+diff -upr linux-2.6.16.orig/sound/ppc/daca.c linux-2.6.16-026test015/sound/ppc/daca.c
+--- linux-2.6.16.orig/sound/ppc/daca.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/sound/ppc/daca.c 2006-07-04 14:41:36.000000000 +0400
+@@ -256,7 +256,7 @@ int __init snd_pmac_daca_init(struct snd
+
+ #ifdef CONFIG_KMOD
+ if (current->fs->root)
+- request_module("i2c-keywest");
++ request_module("i2c-powermac");
+ #endif /* CONFIG_KMOD */
+
+ mix = kmalloc(sizeof(*mix), GFP_KERNEL);
+diff -upr linux-2.6.16.orig/sound/ppc/tumbler.c linux-2.6.16-026test015/sound/ppc/tumbler.c
+--- linux-2.6.16.orig/sound/ppc/tumbler.c 2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/sound/ppc/tumbler.c 2006-07-04 14:41:36.000000000 +0400
+@@ -1314,7 +1314,7 @@ int __init snd_pmac_tumbler_init(struct
+
+ #ifdef CONFIG_KMOD
+ if (current->fs->root)
+- request_module("i2c-keywest");
++ request_module("i2c-powermac");
+ #endif /* CONFIG_KMOD */
+
+ mix = kmalloc(sizeof(*mix), GFP_KERNEL);
diff --git a/openvz-sources/026.015-r2/0101_diff-merge-2.6.16.24-20060714.patch b/openvz-sources/026.015-r2/0101_diff-merge-2.6.16.24-20060714.patch
new file mode 100644
index 0000000..0f43c67
--- /dev/null
+++ b/openvz-sources/026.015-r2/0101_diff-merge-2.6.16.24-20060714.patch
@@ -0,0 +1,19 @@
+From: OpenVZ team <devel@openvz.org>
+Date: Fri, 14 Jul 2006 10:23:43 +0000 (+0400)
+Subject: Merged 2.6.16.24 from /linux/kernel/git/stable/linux-2.6.16.y
+X-Git-Url: http://10.0.101.105/cgi-bin/gitweb.cgi?p=kernel;a=commitdiff;h=9a23ec204b88ab5e678dc3e33fe03d7531167e66
+
+Merged 2.6.16.24 from /linux/kernel/git/stable/linux-2.6.16.y
+---
+
+--- a/kernel/sys.c
++++ b/kernel/sys.c
+@@ -1954,7 +1954,7 @@ asmlinkage long sys_prctl(int option, un
+ error = current->mm->dumpable;
+ break;
+ case PR_SET_DUMPABLE:
+- if (arg2 < 0 || arg2 > 2) {
++ if (arg2 < 0 || arg2 > 1) {
+ error = -EINVAL;
+ break;
+ }
diff --git a/openvz-sources/026.015-r2/0102_procfs-dumpable-race.patch b/openvz-sources/026.015-r2/0102_procfs-dumpable-race.patch
new file mode 100644
index 0000000..a02bf91
--- /dev/null
+++ b/openvz-sources/026.015-r2/0102_procfs-dumpable-race.patch
@@ -0,0 +1,20 @@
+Index: linux-2.6.16-gentoo-r12/fs/proc/base.c
+===================================================================
+--- linux-2.6.16-gentoo-r12.orig/fs/proc/base.c
++++ linux-2.6.16-gentoo-r12/fs/proc/base.c
+@@ -1367,6 +1367,7 @@ static int pid_revalidate(struct dentry
+ inode->i_uid = 0;
+ inode->i_gid = 0;
+ }
++ inode->i_mode &= ~(S_ISUID | S_ISGID);
+ security_task_to_inode(task, inode);
+ return 1;
+ }
+@@ -1394,6 +1395,7 @@ static int tid_fd_revalidate(struct dent
+ inode->i_uid = 0;
+ inode->i_gid = 0;
+ }
++ inode->i_mode &= ~(S_ISUID | S_ISGID);
+ security_task_to_inode(task, inode);
+ return 1;
+ }