diff options
author | Christian Heim <phreak@gentoo.org> | 2006-07-15 14:47:37 +0000 |
---|---|---|
committer | Christian Heim <phreak@gentoo.org> | 2006-07-15 14:47:37 +0000 |
commit | e4c83cd472e7986c2fce3dbd0c12b9edce2299ce (patch) | |
tree | 8740eab35358cab40fb55f26fb412c40a78c7ced /openvz-sources | |
parent | Adding the missing patch to 026.015-r1 (diff) | |
download | misc-e4c83cd472e7986c2fce3dbd0c12b9edce2299ce.tar.gz misc-e4c83cd472e7986c2fce3dbd0c12b9edce2299ce.tar.bz2 misc-e4c83cd472e7986c2fce3dbd0c12b9edce2299ce.zip |
Fixing #140444 / CVE-2006-3626
svn path=/; revision=404
Diffstat (limited to 'openvz-sources')
4 files changed, 91221 insertions, 0 deletions
diff --git a/openvz-sources/026.015-r2/0001_linux-2.6.0-nonintconfig.patch b/openvz-sources/026.015-r2/0001_linux-2.6.0-nonintconfig.patch new file mode 100644 index 0000000..a7fe97d --- /dev/null +++ b/openvz-sources/026.015-r2/0001_linux-2.6.0-nonintconfig.patch @@ -0,0 +1,99 @@ +--- ./scripts/kconfig/Makefile.nonint 2006-01-03 06:21:10.000000000 +0300 ++++ ./scripts/kconfig/Makefile 2006-01-16 16:59:19.000000000 +0300 +@@ -42,6 +42,10 @@ update-po-config: $(obj)/kxgettext + $(Q)rm -f arch/um/Kconfig_arch + $(Q)rm -f scripts/kconfig/linux_*.pot scripts/kconfig/config.pot + ++nonint_oldconfig: scripts/kconfig/conf ++ ./scripts/kconfig/conf -b arch/$(ARCH)/Kconfig ++ ++ + .PHONY: randconfig allyesconfig allnoconfig allmodconfig defconfig + + randconfig: $(obj)/conf +--- ./scripts/kconfig/conf.c.nonint 2006-01-03 06:21:10.000000000 +0300 ++++ ./scripts/kconfig/conf.c 2006-01-16 16:10:30.000000000 +0300 +@@ -20,6 +20,7 @@ enum { + ask_all, + ask_new, + ask_silent, ++ dont_ask, + set_default, + set_yes, + set_mod, +@@ -36,6 +37,8 @@ static struct menu *rootEntry; + + static char nohelp_text[] = N_("Sorry, no help available for this option yet.\n"); + ++static int return_value = 0; ++ + static void strip(char *str) + { + char *p = str; +@@ -102,6 +105,12 @@ static void conf_askvalue(struct symbol + fflush(stdout); + fgets(line, 128, stdin); + return; ++ case dont_ask: ++ if (!sym_has_value(sym)) { ++ fprintf(stderr,"CONFIG_%s\n",sym->name); ++ return_value++; ++ } ++ return; + case set_default: + printf("%s\n", def); + return; +@@ -346,6 +355,10 @@ static int conf_choice(struct menu *menu + printf("?"); + printf("]: "); + switch (input_mode) { ++ case dont_ask: ++ cnt = def; ++ printf("%d\n", cnt); ++ break; + case ask_new: + case ask_silent: + if (!is_new) { +@@ -482,7 +495,10 @@ static void check_conf(struct menu *menu + if (!conf_cnt++) + printf(_("*\n* Restart config...\n*\n")); + rootEntry = menu_get_parent_menu(menu); +- conf(rootEntry); ++ if (input_mode == 
dont_ask) ++ fprintf(stderr,"CONFIG_%s\n",sym->name); ++ else ++ conf(rootEntry); + } + } + +@@ -501,6 +517,9 @@ int main(int ac, char **av) + case 'o': + input_mode = ask_new; + break; ++ case 'b': ++ input_mode = dont_ask; ++ break; + case 's': + input_mode = ask_silent; + valid_stdin = isatty(0) && isatty(1) && isatty(2); +@@ -565,6 +584,7 @@ int main(int ac, char **av) + } + case ask_all: + case ask_new: ++ case dont_ask: + conf_read(NULL); + break; + case set_no: +@@ -603,10 +623,10 @@ int main(int ac, char **av) + do { + conf_cnt = 0; + check_conf(&rootmenu); +- } while (conf_cnt); ++ } while ((conf_cnt) && (input_mode != dont_ask)); + if (conf_write(NULL)) { + fprintf(stderr, _("\n*** Error during writing of the kernel configuration.\n\n")); + return 1; + } +- return 0; ++ return return_value; + } diff --git a/openvz-sources/026.015-r2/0100_patch-026test015-core.patch b/openvz-sources/026.015-r2/0100_patch-026test015-core.patch new file mode 100644 index 0000000..94452f7 --- /dev/null +++ b/openvz-sources/026.015-r2/0100_patch-026test015-core.patch @@ -0,0 +1,91083 @@ +diff -upr linux-2.6.16.orig/COPYING.SWsoft linux-2.6.16-026test015/COPYING.SWsoft +--- linux-2.6.16.orig/COPYING.SWsoft 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/COPYING.SWsoft 2006-07-04 14:41:37.000000000 +0400 +@@ -0,0 +1,350 @@ ++ ++Nothing in this license should be construed as a grant by SWsoft of any rights ++beyond the rights specified in the GNU General Public License, and nothing in ++this license should be construed as a waiver by SWsoft of its patent, copyright ++and/or trademark rights, beyond the waiver required by the GNU General Public ++License. This license is expressly inapplicable to any product that is not ++within the scope of the GNU General Public License ++ ++---------------------------------------- ++ ++ GNU GENERAL PUBLIC LICENSE ++ Version 2, June 1991 ++ ++ Copyright (C) 1989, 1991 Free Software Foundation, Inc. 
++ 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++ Everyone is permitted to copy and distribute verbatim copies ++ of this license document, but changing it is not allowed. ++ ++ Preamble ++ ++ The licenses for most software are designed to take away your ++freedom to share and change it. By contrast, the GNU General Public ++License is intended to guarantee your freedom to share and change free ++software--to make sure the software is free for all its users. This ++General Public License applies to most of the Free Software ++Foundation's software and to any other program whose authors commit to ++using it. (Some other Free Software Foundation software is covered by ++the GNU Library General Public License instead.) You can apply it to ++your programs, too. ++ ++ When we speak of free software, we are referring to freedom, not ++price. Our General Public Licenses are designed to make sure that you ++have the freedom to distribute copies of free software (and charge for ++this service if you wish), that you receive source code or can get it ++if you want it, that you can change the software or use pieces of it ++in new free programs; and that you know you can do these things. ++ ++ To protect your rights, we need to make restrictions that forbid ++anyone to deny you these rights or to ask you to surrender the rights. ++These restrictions translate to certain responsibilities for you if you ++distribute copies of the software, or if you modify it. ++ ++ For example, if you distribute copies of such a program, whether ++gratis or for a fee, you must give the recipients all the rights that ++you have. You must make sure that they, too, receive or can get the ++source code. And you must show them these terms so they know their ++rights. ++ ++ We protect your rights with two steps: (1) copyright the software, and ++(2) offer you this license which gives you legal permission to copy, ++distribute and/or modify the software. 
++ ++ Also, for each author's protection and ours, we want to make certain ++that everyone understands that there is no warranty for this free ++software. If the software is modified by someone else and passed on, we ++want its recipients to know that what they have is not the original, so ++that any problems introduced by others will not reflect on the original ++authors' reputations. ++ ++ Finally, any free program is threatened constantly by software ++patents. We wish to avoid the danger that redistributors of a free ++program will individually obtain patent licenses, in effect making the ++program proprietary. To prevent this, we have made it clear that any ++patent must be licensed for everyone's free use or not licensed at all. ++ ++ The precise terms and conditions for copying, distribution and ++modification follow. ++ ++ GNU GENERAL PUBLIC LICENSE ++ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION ++ ++ 0. This License applies to any program or other work which contains ++a notice placed by the copyright holder saying it may be distributed ++under the terms of this General Public License. The "Program", below, ++refers to any such program or work, and a "work based on the Program" ++means either the Program or any derivative work under copyright law: ++that is to say, a work containing the Program or a portion of it, ++either verbatim or with modifications and/or translated into another ++language. (Hereinafter, translation is included without limitation in ++the term "modification".) Each licensee is addressed as "you". ++ ++Activities other than copying, distribution and modification are not ++covered by this License; they are outside its scope. The act of ++running the Program is not restricted, and the output from the Program ++is covered only if its contents constitute a work based on the ++Program (independent of having been made by running the Program). ++Whether that is true depends on what the Program does. ++ ++ 1. 
You may copy and distribute verbatim copies of the Program's ++source code as you receive it, in any medium, provided that you ++conspicuously and appropriately publish on each copy an appropriate ++copyright notice and disclaimer of warranty; keep intact all the ++notices that refer to this License and to the absence of any warranty; ++and give any other recipients of the Program a copy of this License ++along with the Program. ++ ++You may charge a fee for the physical act of transferring a copy, and ++you may at your option offer warranty protection in exchange for a fee. ++ ++ 2. You may modify your copy or copies of the Program or any portion ++of it, thus forming a work based on the Program, and copy and ++distribute such modifications or work under the terms of Section 1 ++above, provided that you also meet all of these conditions: ++ ++ a) You must cause the modified files to carry prominent notices ++ stating that you changed the files and the date of any change. ++ ++ b) You must cause any work that you distribute or publish, that in ++ whole or in part contains or is derived from the Program or any ++ part thereof, to be licensed as a whole at no charge to all third ++ parties under the terms of this License. ++ ++ c) If the modified program normally reads commands interactively ++ when run, you must cause it, when started running for such ++ interactive use in the most ordinary way, to print or display an ++ announcement including an appropriate copyright notice and a ++ notice that there is no warranty (or else, saying that you provide ++ a warranty) and that users may redistribute the program under ++ these conditions, and telling the user how to view a copy of this ++ License. (Exception: if the Program itself is interactive but ++ does not normally print such an announcement, your work based on ++ the Program is not required to print an announcement.) ++ ++These requirements apply to the modified work as a whole. 
If ++identifiable sections of that work are not derived from the Program, ++and can be reasonably considered independent and separate works in ++themselves, then this License, and its terms, do not apply to those ++sections when you distribute them as separate works. But when you ++distribute the same sections as part of a whole which is a work based ++on the Program, the distribution of the whole must be on the terms of ++this License, whose permissions for other licensees extend to the ++entire whole, and thus to each and every part regardless of who wrote it. ++ ++Thus, it is not the intent of this section to claim rights or contest ++your rights to work written entirely by you; rather, the intent is to ++exercise the right to control the distribution of derivative or ++collective works based on the Program. ++ ++In addition, mere aggregation of another work not based on the Program ++with the Program (or with a work based on the Program) on a volume of ++a storage or distribution medium does not bring the other work under ++the scope of this License. ++ ++ 3. 
You may copy and distribute the Program (or a work based on it, ++under Section 2) in object code or executable form under the terms of ++Sections 1 and 2 above provided that you also do one of the following: ++ ++ a) Accompany it with the complete corresponding machine-readable ++ source code, which must be distributed under the terms of Sections ++ 1 and 2 above on a medium customarily used for software interchange; or, ++ ++ b) Accompany it with a written offer, valid for at least three ++ years, to give any third party, for a charge no more than your ++ cost of physically performing source distribution, a complete ++ machine-readable copy of the corresponding source code, to be ++ distributed under the terms of Sections 1 and 2 above on a medium ++ customarily used for software interchange; or, ++ ++ c) Accompany it with the information you received as to the offer ++ to distribute corresponding source code. (This alternative is ++ allowed only for noncommercial distribution and only if you ++ received the program in object code or executable form with such ++ an offer, in accord with Subsection b above.) ++ ++The source code for a work means the preferred form of the work for ++making modifications to it. For an executable work, complete source ++code means all the source code for all modules it contains, plus any ++associated interface definition files, plus the scripts used to ++control compilation and installation of the executable. However, as a ++special exception, the source code distributed need not include ++anything that is normally distributed (in either source or binary ++form) with the major components (compiler, kernel, and so on) of the ++operating system on which the executable runs, unless that component ++itself accompanies the executable. 
++ ++If distribution of executable or object code is made by offering ++access to copy from a designated place, then offering equivalent ++access to copy the source code from the same place counts as ++distribution of the source code, even though third parties are not ++compelled to copy the source along with the object code. ++ ++ 4. You may not copy, modify, sublicense, or distribute the Program ++except as expressly provided under this License. Any attempt ++otherwise to copy, modify, sublicense or distribute the Program is ++void, and will automatically terminate your rights under this License. ++However, parties who have received copies, or rights, from you under ++this License will not have their licenses terminated so long as such ++parties remain in full compliance. ++ ++ 5. You are not required to accept this License, since you have not ++signed it. However, nothing else grants you permission to modify or ++distribute the Program or its derivative works. These actions are ++prohibited by law if you do not accept this License. Therefore, by ++modifying or distributing the Program (or any work based on the ++Program), you indicate your acceptance of this License to do so, and ++all its terms and conditions for copying, distributing or modifying ++the Program or works based on it. ++ ++ 6. Each time you redistribute the Program (or any work based on the ++Program), the recipient automatically receives a license from the ++original licensor to copy, distribute or modify the Program subject to ++these terms and conditions. You may not impose any further ++restrictions on the recipients' exercise of the rights granted herein. ++You are not responsible for enforcing compliance by third parties to ++this License. ++ ++ 7. 
If, as a consequence of a court judgment or allegation of patent ++infringement or for any other reason (not limited to patent issues), ++conditions are imposed on you (whether by court order, agreement or ++otherwise) that contradict the conditions of this License, they do not ++excuse you from the conditions of this License. If you cannot ++distribute so as to satisfy simultaneously your obligations under this ++License and any other pertinent obligations, then as a consequence you ++may not distribute the Program at all. For example, if a patent ++license would not permit royalty-free redistribution of the Program by ++all those who receive copies directly or indirectly through you, then ++the only way you could satisfy both it and this License would be to ++refrain entirely from distribution of the Program. ++ ++If any portion of this section is held invalid or unenforceable under ++any particular circumstance, the balance of the section is intended to ++apply and the section as a whole is intended to apply in other ++circumstances. ++ ++It is not the purpose of this section to induce you to infringe any ++patents or other property right claims or to contest validity of any ++such claims; this section has the sole purpose of protecting the ++integrity of the free software distribution system, which is ++implemented by public license practices. Many people have made ++generous contributions to the wide range of software distributed ++through that system in reliance on consistent application of that ++system; it is up to the author/donor to decide if he or she is willing ++to distribute software through any other system and a licensee cannot ++impose that choice. ++ ++This section is intended to make thoroughly clear what is believed to ++be a consequence of the rest of this License. ++ ++ 8. 
If the distribution and/or use of the Program is restricted in ++certain countries either by patents or by copyrighted interfaces, the ++original copyright holder who places the Program under this License ++may add an explicit geographical distribution limitation excluding ++those countries, so that distribution is permitted only in or among ++countries not thus excluded. In such case, this License incorporates ++the limitation as if written in the body of this License. ++ ++ 9. The Free Software Foundation may publish revised and/or new versions ++of the General Public License from time to time. Such new versions will ++be similar in spirit to the present version, but may differ in detail to ++address new problems or concerns. ++ ++Each version is given a distinguishing version number. If the Program ++specifies a version number of this License which applies to it and "any ++later version", you have the option of following the terms and conditions ++either of that version or of any later version published by the Free ++Software Foundation. If the Program does not specify a version number of ++this License, you may choose any version ever published by the Free Software ++Foundation. ++ ++ 10. If you wish to incorporate parts of the Program into other free ++programs whose distribution conditions are different, write to the author ++to ask for permission. For software which is copyrighted by the Free ++Software Foundation, write to the Free Software Foundation; we sometimes ++make exceptions for this. Our decision will be guided by the two goals ++of preserving the free status of all derivatives of our free software and ++of promoting the sharing and reuse of software generally. ++ ++ NO WARRANTY ++ ++ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY ++FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN ++OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES ++PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED ++OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS ++TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE ++PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, ++REPAIR OR CORRECTION. ++ ++ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING ++WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR ++REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, ++INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING ++OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED ++TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY ++YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER ++PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE ++POSSIBILITY OF SUCH DAMAGES. ++ ++ END OF TERMS AND CONDITIONS ++ ++ How to Apply These Terms to Your New Programs ++ ++ If you develop a new program, and you want it to be of the greatest ++possible use to the public, the best way to achieve this is to make it ++free software which everyone can redistribute and change under these terms. ++ ++ To do so, attach the following notices to the program. It is safest ++to attach them to the start of each source file to most effectively ++convey the exclusion of warranty; and each file should have at least ++the "copyright" line and a pointer to where the full notice is found. 
++ ++ <one line to give the program's name and a brief idea of what it does.> ++ Copyright (C) <year> <name of author> ++ ++ This program is free software; you can redistribute it and/or modify ++ it under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 2 of the License, or ++ (at your option) any later version. ++ ++ This program is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ GNU General Public License for more details. ++ ++ You should have received a copy of the GNU General Public License ++ along with this program; if not, write to the Free Software ++ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++ ++ ++Also add information on how to contact you by electronic and paper mail. ++ ++If the program is interactive, make it output a short notice like this ++when it starts in an interactive mode: ++ ++ Gnomovision version 69, Copyright (C) year name of author ++ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. ++ This is free software, and you are welcome to redistribute it ++ under certain conditions; type `show c' for details. ++ ++The hypothetical commands `show w' and `show c' should show the appropriate ++parts of the General Public License. Of course, the commands you use may ++be called something other than `show w' and `show c'; they could even be ++mouse-clicks or menu items--whatever suits your program. ++ ++You should also get your employer (if you work as a programmer) or your ++school, if any, to sign a "copyright disclaimer" for the program, if ++necessary. Here is a sample; alter the names: ++ ++ Yoyodyne, Inc., hereby disclaims all copyright interest in the program ++ `Gnomovision' (which makes passes at compilers) written by James Hacker. 
++ ++ <signature of Ty Coon>, 1 April 1989 ++ Ty Coon, President of Vice ++ ++This General Public License does not permit incorporating your program into ++proprietary programs. If your program is a subroutine library, you may ++consider it more useful to permit linking proprietary applications with the ++library. If this is what you want to do, use the GNU Library General ++Public License instead of this License. +diff -upr linux-2.6.16.orig/Documentation/dvb/get_dvb_firmware linux-2.6.16-026test015/Documentation/dvb/get_dvb_firmware +--- linux-2.6.16.orig/Documentation/dvb/get_dvb_firmware 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/Documentation/dvb/get_dvb_firmware 2006-07-04 14:41:36.000000000 +0400 +@@ -240,9 +240,9 @@ sub dibusb { + } + + sub nxt2002 { +- my $sourcefile = "Broadband4PC_4_2_11.zip"; ++ my $sourcefile = "Technisat_DVB-PC_4_4_COMPACT.zip"; + my $url = "http://www.bbti.us/download/windows/$sourcefile"; +- my $hash = "c6d2ea47a8f456d887ada0cfb718ff2a"; ++ my $hash = "476befae8c7c1bb9648954060b1eec1f"; + my $outfile = "dvb-fe-nxt2002.fw"; + my $tmpdir = tempdir(DIR => "/tmp", CLEANUP => 1); + +@@ -250,8 +250,8 @@ sub nxt2002 { + + wgetfile($sourcefile, $url); + unzip($sourcefile, $tmpdir); +- verify("$tmpdir/SkyNETU.sys", $hash); +- extract("$tmpdir/SkyNETU.sys", 375832, 5908, $outfile); ++ verify("$tmpdir/SkyNET.sys", $hash); ++ extract("$tmpdir/SkyNET.sys", 331624, 5908, $outfile); + + $outfile; + } +diff -upr linux-2.6.16.orig/Documentation/vsched.txt linux-2.6.16-026test015/Documentation/vsched.txt +--- linux-2.6.16.orig/Documentation/vsched.txt 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/Documentation/vsched.txt 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,83 @@ ++Copyright (C) 2005 SWsoft. All rights reserved. ++Licensing governed by "linux/COPYING.SWsoft" file. 
++ ++Hierarchical CPU schedulers ++~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ ++Hierarchical CPU scheduler is a stack of CPU schedulers which allows ++to organize different policies of scheduling in the system and/or between ++groups of processes. ++ ++Virtuozzo uses a hierarchical Fair CPU scheduler organized as a 2-stage ++CPU scheduler, where the scheduling decisions are made in 2 steps: ++1. On the first step Fair CPU scheduler selects a group of processes ++ which should get some CPU time. ++2. Then standard Linux scheduler chooses a process inside the group. ++Such scheduler efficiently allows to isolate one group of processes ++from another and still allows a group to use more than 1 CPU on SMP systems. ++ ++This document describes a new middle layer of Virtuozzo hierarchical CPU ++scheduler which makes decisions after Fair scheduler, but before Linux ++scheduler and which is called VCPU scheduler. ++ ++ ++Where VCPU scheduler comes from? ++~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ ++Existing hierarchical CPU scheduler uses isolated algorithms on each stage ++of decision making, i.e. every scheduler makes its decisions without ++taking into account the details of other schedulers. This can lead to a number ++of problems described below. ++ ++On SMP systems there are possible situations when the first CPU scheduler ++in the hierarchy (e.g. Fair scheduler) wants to schedule some group of ++processes on the physical CPU, but the underlying process scheduler ++(e.g. Linux O(1) CPU scheduler) is unable to schedule any processes ++on this physical CPU. Usually this happens due to the fact that Linux ++kernel scheduler uses per-physical CPU runqueues. ++ ++Another problem is that Linux scheduler also knows nothing about ++Fair scheduler and can't balance efficiently without taking into account ++statistics about process groups from Fair scheduler. 
Without such ++statistics Linux scheduler can concentrate all processes on one physical ++CPU, thus making CPU consuming highly inefficient. ++ ++VCPU scheduler solves these problems by adding a new layer between ++Fair schedule and Linux scheduler. ++ ++VCPU scheduler ++~~~~~~~~~~~~~~ ++ ++VCPU scheduler is a CPU scheduler which splits notion of ++physical and virtual CPUs (VCPU and PCPU). This means that tasks are ++running on virtual CPU runqueues, while VCPUs are running on PCPUs. ++ ++The Virtuozzo hierarchical fair scheduler becomes 3 stage CPU scheduler: ++1. First, Fair CPU scheduler select a group of processes. ++2. Then VCPU scheduler select a virtual CPU to run (this is actually ++ a runqueue). ++3. Standard Linux scheduler chooses a process from the runqueue. ++ ++For example on the picture below PCPU0 executes tasks from ++VCPU1 runqueue and PCPU1 is idle: ++ ++ virtual | physical | virtual ++ idle CPUs | CPUs | CPUS ++--------------------|------------------------|-------------------------- ++ | | ----------------- ++ | | | virtual sched X | ++ | | | ----------- | ++ | | | | VCPU0 | | ++ | | | ----------- | ++ ------------ | ----------- | ----------- | ++| idle VCPU0 | | | PCPU0 | <---> | | VCPU1 | | ++ ------------ | ----------- | ----------- | ++ | | ----------------- ++ | | ++ | | ----------------- ++ | | | virtual sched Y | ++ ------------ ----------- | | ----------- | ++| idle VCPU1 | <---> | PCPU1 | | | | VCPU0 | | ++ ------------ ----------- | | ----------- | ++ | | ----------------- ++ | | +diff -upr linux-2.6.16.orig/Makefile linux-2.6.16-026test015/Makefile +--- linux-2.6.16.orig/Makefile 2006-07-04 14:41:39.000000000 +0400 ++++ linux-2.6.16-026test015/Makefile 2006-07-04 14:41:39.000000000 +0400 +@@ -1,7 +1,7 @@ + VERSION = 2 + PATCHLEVEL = 6 + SUBLEVEL = 16 +-EXTRAVERSION = ++EXTRAVERSION = -026test015 + NAME=Sliding Snow Leopard + + # *DOCUMENTATION* +diff -upr linux-2.6.16.orig/arch/alpha/kernel/setup.c 
linux-2.6.16-026test015/arch/alpha/kernel/setup.c +--- linux-2.6.16.orig/arch/alpha/kernel/setup.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/alpha/kernel/setup.c 2006-07-04 14:41:36.000000000 +0400 +@@ -24,6 +24,7 @@ + #include <linux/config.h> /* CONFIG_ALPHA_LCA etc */ + #include <linux/mc146818rtc.h> + #include <linux/console.h> ++#include <linux/cpu.h> + #include <linux/errno.h> + #include <linux/init.h> + #include <linux/string.h> +@@ -477,6 +478,22 @@ page_is_ram(unsigned long pfn) + #undef PFN_PHYS + #undef PFN_MAX + ++static int __init ++register_cpus(void) ++{ ++ int i; ++ ++ for_each_possible_cpu(i) { ++ struct cpu *p = kzalloc(sizeof(*p), GFP_KERNEL); ++ if (!p) ++ return -ENOMEM; ++ register_cpu(p, i, NULL); ++ } ++ return 0; ++} ++ ++arch_initcall(register_cpus); ++ + void __init + setup_arch(char **cmdline_p) + { +diff -upr linux-2.6.16.orig/arch/alpha/kernel/smp.c linux-2.6.16-026test015/arch/alpha/kernel/smp.c +--- linux-2.6.16.orig/arch/alpha/kernel/smp.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/alpha/kernel/smp.c 2006-07-04 14:41:36.000000000 +0400 +@@ -439,7 +439,7 @@ setup_smp(void) + if ((cpu->flags & 0x1cc) == 0x1cc) { + smp_num_probed++; + /* Assume here that "whami" == index */ +- cpu_set(i, cpu_possible_map); ++ cpu_set(i, cpu_present_mask); + cpu->pal_revision = boot_cpu_palrev; + } + +@@ -450,9 +450,8 @@ setup_smp(void) + } + } else { + smp_num_probed = 1; +- cpu_set(boot_cpuid, cpu_possible_map); ++ cpu_set(boot_cpuid, cpu_present_mask); + } +- cpu_present_mask = cpumask_of_cpu(boot_cpuid); + + printk(KERN_INFO "SMP: %d CPUs probed -- cpu_present_mask = %lx\n", + smp_num_probed, cpu_possible_map.bits[0]); +@@ -488,9 +487,8 @@ void __devinit + smp_prepare_boot_cpu(void) + { + /* +- * Mark the boot cpu (current cpu) as both present and online ++ * Mark the boot cpu (current cpu) as online + */ +- cpu_set(smp_processor_id(), cpu_present_mask); + cpu_set(smp_processor_id(), 
cpu_online_map); + } + +diff -upr linux-2.6.16.orig/arch/alpha/lib/strncpy.S linux-2.6.16-026test015/arch/alpha/lib/strncpy.S +--- linux-2.6.16.orig/arch/alpha/lib/strncpy.S 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/alpha/lib/strncpy.S 2006-07-04 14:41:36.000000000 +0400 +@@ -43,8 +43,8 @@ strncpy: + + .align 4 + $multiword: +- subq $24, 1, $2 # clear the final bits in the prev word +- or $2, $24, $2 ++ subq $27, 1, $2 # clear the final bits in the prev word ++ or $2, $27, $2 + zapnot $1, $2, $1 + subq $18, 1, $18 + +@@ -70,8 +70,8 @@ $multiword: + bne $18, 0b + + 1: ldq_u $1, 0($16) # clear the leading bits in the final word +- subq $27, 1, $2 +- or $2, $27, $2 ++ subq $24, 1, $2 ++ or $2, $24, $2 + + zap $1, $2, $1 + stq_u $1, 0($16) +diff -upr linux-2.6.16.orig/arch/arm/kernel/smp.c linux-2.6.16-026test015/arch/arm/kernel/smp.c +--- linux-2.6.16.orig/arch/arm/kernel/smp.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/arm/kernel/smp.c 2006-07-04 14:41:38.000000000 +0400 +@@ -197,7 +197,7 @@ int __cpuexit __cpu_disable(void) + local_flush_tlb_all(); + + read_lock(&tasklist_lock); +- for_each_process(p) { ++ for_each_process_all(p) { + if (p->mm) + cpu_clear(cpu, p->mm->cpu_vm_mask); + } +diff -upr linux-2.6.16.orig/arch/frv/mm/mmu-context.c linux-2.6.16-026test015/arch/frv/mm/mmu-context.c +--- linux-2.6.16.orig/arch/frv/mm/mmu-context.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/frv/mm/mmu-context.c 2006-07-04 14:41:38.000000000 +0400 +@@ -181,7 +181,7 @@ int cxn_pin_by_pid(pid_t pid) + + /* get a handle on the mm_struct */ + read_lock(&tasklist_lock); +- tsk = find_task_by_pid(pid); ++ tsk = find_task_by_pid_ve(pid); + if (tsk) { + ret = -EINVAL; + +diff -upr linux-2.6.16.orig/arch/i386/Kconfig linux-2.6.16-026test015/arch/i386/Kconfig +--- linux-2.6.16.orig/arch/i386/Kconfig 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/i386/Kconfig 2006-07-04 
14:41:39.000000000 +0400 +@@ -216,6 +216,8 @@ config NR_CPUS + This is purely to save memory - each supported CPU adds + approximately eight kilobytes to the kernel image. + ++source "kernel/Kconfig.fairsched" ++ + config SCHED_SMT + bool "SMT (Hyperthreading) scheduler support" + depends on SMP +@@ -268,6 +270,14 @@ config X86_VISWS_APIC + depends on X86_VISWS + default y + ++config NMI_WATCHDOG ++ bool "NMI Watchdog" ++ default y ++ help ++ If you say Y here the kernel will activate NMI watchdog by default ++ on boot. You can still activate NMI watchdog via nmi_watchdog ++ command line option even if you say N here. ++ + config X86_MCE + bool "Machine Check Exception" + depends on !X86_VOYAGER +@@ -1071,12 +1081,16 @@ endmenu + + source "arch/i386/Kconfig.debug" + ++source "kernel/Kconfig.openvz" ++ + source "security/Kconfig" + + source "crypto/Kconfig" + + source "lib/Kconfig" + ++source "kernel/ub/Kconfig" ++ + # + # Use the generic interrupt handling code in kernel/irq/: + # +diff -upr linux-2.6.16.orig/arch/i386/kernel/apic.c linux-2.6.16-026test015/arch/i386/kernel/apic.c +--- linux-2.6.16.orig/arch/i386/kernel/apic.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/i386/kernel/apic.c 2006-07-04 14:41:38.000000000 +0400 +@@ -1177,6 +1177,7 @@ inline void smp_local_timer_interrupt(st + fastcall void smp_apic_timer_interrupt(struct pt_regs *regs) + { + int cpu = smp_processor_id(); ++ struct ve_struct *ve; + + /* + * the NMI deadlock-detector uses this. +@@ -1193,9 +1194,11 @@ fastcall void smp_apic_timer_interrupt(s + * Besides, if we don't timer interrupts ignore the global + * interrupt lock, which is the WrongThing (tm) to do. 
+ */ ++ ve = set_exec_env(get_ve0()); + irq_enter(); + smp_local_timer_interrupt(regs); + irq_exit(); ++ (void)set_exec_env(ve); + } + + #ifndef CONFIG_SMP +diff -upr linux-2.6.16.orig/arch/i386/kernel/apm.c linux-2.6.16-026test015/arch/i386/kernel/apm.c +--- linux-2.6.16.orig/arch/i386/kernel/apm.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/i386/kernel/apm.c 2006-07-04 14:41:36.000000000 +0400 +@@ -1081,7 +1081,7 @@ static int apm_console_blank(int blank) + break; + } + +- if (error == APM_NOT_ENGAGED && state != APM_STATE_READY) { ++ if (error == APM_NOT_ENGAGED) { + static int tried; + int eng_error; + if (tried++ == 0) { +diff -upr linux-2.6.16.orig/arch/i386/kernel/cpu/amd.c linux-2.6.16-026test015/arch/i386/kernel/cpu/amd.c +--- linux-2.6.16.orig/arch/i386/kernel/cpu/amd.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/i386/kernel/cpu/amd.c 2006-07-04 14:41:36.000000000 +0400 +@@ -207,6 +207,8 @@ static void __init init_amd(struct cpuin + set_bit(X86_FEATURE_K7, c->x86_capability); + break; + } ++ if (c->x86 >= 6) ++ set_bit(X86_FEATURE_FXSAVE_LEAK, c->x86_capability); + + display_cacheinfo(c); + +diff -upr linux-2.6.16.orig/arch/i386/kernel/cpu/cpufreq/Kconfig linux-2.6.16-026test015/arch/i386/kernel/cpu/cpufreq/Kconfig +--- linux-2.6.16.orig/arch/i386/kernel/cpu/cpufreq/Kconfig 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/i386/kernel/cpu/cpufreq/Kconfig 2006-07-04 14:41:36.000000000 +0400 +@@ -203,6 +203,7 @@ config X86_LONGRUN + config X86_LONGHAUL + tristate "VIA Cyrix III Longhaul" + select CPU_FREQ_TABLE ++ depends on BROKEN + help + This adds the CPUFreq driver for VIA Samuel/CyrixIII, + VIA Cyrix Samuel/C3, VIA Cyrix Ezra and VIA Cyrix Ezra-T +diff -upr linux-2.6.16.orig/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c linux-2.6.16-026test015/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c +--- linux-2.6.16.orig/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c 2006-03-20 08:53:29.000000000 
+0300 ++++ linux-2.6.16-026test015/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c 2006-07-04 14:41:36.000000000 +0400 +@@ -244,7 +244,7 @@ static int cpufreq_p4_cpu_init(struct cp + for (i=1; (p4clockmod_table[i].frequency != CPUFREQ_TABLE_END); i++) { + if ((i<2) && (has_N44_O17_errata[policy->cpu])) + p4clockmod_table[i].frequency = CPUFREQ_ENTRY_INVALID; +- else if (has_N60_errata[policy->cpu] && p4clockmod_table[i].frequency < 2000000) ++ else if (has_N60_errata[policy->cpu] && ((stock_freq * i)/8) < 2000000) + p4clockmod_table[i].frequency = CPUFREQ_ENTRY_INVALID; + else + p4clockmod_table[i].frequency = (stock_freq * i)/8; +diff -upr linux-2.6.16.orig/arch/i386/kernel/cpu/cpufreq/speedstep-smi.c linux-2.6.16-026test015/arch/i386/kernel/cpu/cpufreq/speedstep-smi.c +--- linux-2.6.16.orig/arch/i386/kernel/cpu/cpufreq/speedstep-smi.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/i386/kernel/cpu/cpufreq/speedstep-smi.c 2006-07-04 14:41:36.000000000 +0400 +@@ -75,7 +75,9 @@ static int speedstep_smi_ownership (void + __asm__ __volatile__( + "out %%al, (%%dx)\n" + : "=D" (result) +- : "a" (command), "b" (function), "c" (0), "d" (smi_port), "D" (0), "S" (magic) ++ : "a" (command), "b" (function), "c" (0), "d" (smi_port), ++ "D" (0), "S" (magic) ++ : "memory" + ); + + dprintk("result is %x\n", result); +diff -upr linux-2.6.16.orig/arch/i386/kernel/cpu/mtrr/if.c linux-2.6.16-026test015/arch/i386/kernel/cpu/mtrr/if.c +--- linux-2.6.16.orig/arch/i386/kernel/cpu/mtrr/if.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/i386/kernel/cpu/mtrr/if.c 2006-07-04 14:41:38.000000000 +0400 +@@ -392,7 +392,7 @@ static int __init mtrr_if_init(void) + return -ENODEV; + + proc_root_mtrr = +- create_proc_entry("mtrr", S_IWUSR | S_IRUGO, &proc_root); ++ create_proc_entry("mtrr", S_IWUSR | S_IRUGO, NULL); + if (proc_root_mtrr) { + proc_root_mtrr->owner = THIS_MODULE; + proc_root_mtrr->proc_fops = &mtrr_fops; +diff -upr 
linux-2.6.16.orig/arch/i386/kernel/dmi_scan.c linux-2.6.16-026test015/arch/i386/kernel/dmi_scan.c +--- linux-2.6.16.orig/arch/i386/kernel/dmi_scan.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/i386/kernel/dmi_scan.c 2006-07-04 14:41:36.000000000 +0400 +@@ -106,7 +106,7 @@ static void __init dmi_save_devices(stru + struct dmi_device *dev; + + for (i = 0; i < count; i++) { +- char *d = ((char *) dm) + (i * 2); ++ char *d = (char *)(dm + 1) + (i * 2); + + /* Skip disabled device */ + if ((*d & 0x80) == 0) +diff -upr linux-2.6.16.orig/arch/i386/kernel/irq.c linux-2.6.16-026test015/arch/i386/kernel/irq.c +--- linux-2.6.16.orig/arch/i386/kernel/irq.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/i386/kernel/irq.c 2006-07-04 14:41:38.000000000 +0400 +@@ -59,7 +59,9 @@ fastcall unsigned int do_IRQ(struct pt_r + union irq_ctx *curctx, *irqctx; + u32 *isp; + #endif ++ struct ve_struct *ve; + ++ ve = set_exec_env(get_ve0()); + irq_enter(); + #ifdef CONFIG_DEBUG_STACKOVERFLOW + /* Debugging check for stack overflow: is there less than 1KB free? 
*/ +@@ -108,6 +110,7 @@ fastcall unsigned int do_IRQ(struct pt_r + __do_IRQ(irq, regs); + + irq_exit(); ++ (void)set_exec_env(ve); + + return 1; + } +diff -upr linux-2.6.16.orig/arch/i386/kernel/ldt.c linux-2.6.16-026test015/arch/i386/kernel/ldt.c +--- linux-2.6.16.orig/arch/i386/kernel/ldt.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/i386/kernel/ldt.c 2006-07-04 14:41:39.000000000 +0400 +@@ -13,6 +13,7 @@ + #include <linux/smp_lock.h> + #include <linux/vmalloc.h> + #include <linux/slab.h> ++#include <linux/module.h> + + #include <asm/uaccess.h> + #include <asm/system.h> +@@ -20,6 +21,8 @@ + #include <asm/desc.h> + #include <asm/mmu_context.h> + ++#include <ub/ub_mem.h> ++ + #ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ + static void flush_ldt(void *null) + { +@@ -39,9 +42,9 @@ static int alloc_ldt(mm_context_t *pc, i + oldsize = pc->size; + mincount = (mincount+511)&(~511); + if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) +- newldt = vmalloc(mincount*LDT_ENTRY_SIZE); ++ newldt = ub_vmalloc(mincount*LDT_ENTRY_SIZE); + else +- newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); ++ newldt = ub_kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); + + if (!newldt) + return -ENOMEM; +@@ -105,6 +108,7 @@ int init_new_context(struct task_struct + } + return retval; + } ++EXPORT_SYMBOL_GPL(init_new_context); + + /* + * No need to lock the MM as we are the last user +@@ -251,3 +255,5 @@ asmlinkage int sys_modify_ldt(int func, + } + return ret; + } ++ ++EXPORT_SYMBOL_GPL(default_ldt); +diff -upr linux-2.6.16.orig/arch/i386/kernel/nmi.c linux-2.6.16-026test015/arch/i386/kernel/nmi.c +--- linux-2.6.16.orig/arch/i386/kernel/nmi.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/i386/kernel/nmi.c 2006-07-04 14:41:37.000000000 +0400 +@@ -32,7 +32,13 @@ + + #include "mach_traps.h" + +-unsigned int nmi_watchdog = NMI_NONE; ++#ifdef CONFIG_NMI_WATCHDOG ++#define NMI_DEFAULT NMI_IO_APIC ++#else ++#define NMI_DEFAULT NMI_NONE 
++#endif ++ ++unsigned int nmi_watchdog = NMI_DEFAULT; + extern int unknown_nmi_panic; + static unsigned int nmi_hz = HZ; + static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ +@@ -521,7 +527,22 @@ void touch_nmi_watchdog (void) + + extern void die_nmi(struct pt_regs *, const char *msg); + +-void nmi_watchdog_tick (struct pt_regs * regs) ++void smp_show_regs(struct pt_regs *regs, void *info) ++{ ++ static DEFINE_SPINLOCK(show_regs_lock); ++ ++ if (regs == NULL) ++ return; ++ ++ bust_spinlocks(1); ++ spin_lock(&show_regs_lock); ++ printk("----------- IPI show regs -----------"); ++ show_regs(regs); ++ spin_unlock(&show_regs_lock); ++ bust_spinlocks(0); ++} ++ ++void nmi_watchdog_tick(struct pt_regs *regs) + { + + /* +diff -upr linux-2.6.16.orig/arch/i386/kernel/process.c linux-2.6.16-026test015/arch/i386/kernel/process.c +--- linux-2.6.16.orig/arch/i386/kernel/process.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/i386/kernel/process.c 2006-07-04 14:41:39.000000000 +0400 +@@ -59,6 +59,7 @@ + #include <asm/cpu.h> + + asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); ++EXPORT_SYMBOL_GPL(ret_from_fork); + + static int hlt_counter; + +@@ -289,11 +290,15 @@ __setup("idle=", idle_setup); + void show_regs(struct pt_regs * regs) + { + unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; ++ extern int die_counter; + + printk("\n"); +- printk("Pid: %d, comm: %20s\n", current->pid, current->comm); +- printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id()); +- print_symbol("EIP is at %s\n", regs->eip); ++ printk("Pid: %d, comm: %20s, oopses: %d\n", ++ current->pid, current->comm, die_counter); ++ printk("EIP: %04x:[<%08lx>] CPU: %d, VCPU: %d:%d\n",0xffff & regs->xcs,regs->eip, smp_processor_id(), ++ task_vsched_id(current), task_cpu(current)); ++ if (decode_call_traces) ++ print_symbol("EIP is at %s\n", regs->eip); + + if (user_mode(regs)) + printk(" ESP: %04x:%08lx",0xffff & 
regs->xss,regs->esp); +@@ -314,6 +319,8 @@ void show_regs(struct pt_regs * regs) + cr4 = read_cr4_safe(); + printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); + show_trace(NULL, ®s->esp); ++ if (!decode_call_traces) ++ printk(" EIP: [<%08lx>]\n",regs->eip); + } + + /* +@@ -339,6 +346,13 @@ int kernel_thread(int (*fn)(void *), voi + { + struct pt_regs regs; + ++ /* Don't allow kernel_thread() inside VE */ ++ if (!ve_is_super(get_exec_env())) { ++ printk("kernel_thread call inside VE\n"); ++ dump_stack(); ++ return -EPERM; ++ } ++ + memset(®s, 0, sizeof(regs)); + + regs.ebx = (unsigned long) fn; +diff -upr linux-2.6.16.orig/arch/i386/kernel/ptrace.c linux-2.6.16-026test015/arch/i386/kernel/ptrace.c +--- linux-2.6.16.orig/arch/i386/kernel/ptrace.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/i386/kernel/ptrace.c 2006-07-04 14:41:39.000000000 +0400 +@@ -706,7 +706,9 @@ int do_syscall_trace(struct pt_regs *reg + /* the 0x80 provides a way for the tracing parent to distinguish + between a syscall stop and SIGTRAP delivery */ + /* Note that the debugger could change the result of test_thread_flag!*/ ++ set_pn_state(current, entryexit ? PN_STOP_LEAVE : PN_STOP_ENTRY); + ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 
0x80:0)); ++ clear_pn_state(current); + + /* + * this isn't the same as continuing with a signal, but it will do +diff -upr linux-2.6.16.orig/arch/i386/kernel/signal.c linux-2.6.16-026test015/arch/i386/kernel/signal.c +--- linux-2.6.16.orig/arch/i386/kernel/signal.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/i386/kernel/signal.c 2006-07-04 14:41:36.000000000 +0400 +@@ -582,7 +582,7 @@ static void fastcall do_signal(struct pt + if (!user_mode(regs)) + return; + +- if (try_to_freeze()) ++ if (try_to_freeze() && !signal_pending(current)) + goto no_signal; + + if (test_thread_flag(TIF_RESTORE_SIGMASK)) +diff -upr linux-2.6.16.orig/arch/i386/kernel/smp.c linux-2.6.16-026test015/arch/i386/kernel/smp.c +--- linux-2.6.16.orig/arch/i386/kernel/smp.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/i386/kernel/smp.c 2006-07-04 14:41:37.000000000 +0400 +@@ -21,6 +21,7 @@ + #include <linux/cpu.h> + #include <linux/module.h> + ++#include <asm/nmi.h> + #include <asm/mtrr.h> + #include <asm/tlbflush.h> + #include <mach_apic.h> +@@ -566,6 +567,89 @@ int smp_call_function (void (*func) (voi + } + EXPORT_SYMBOL(smp_call_function); + ++static spinlock_t nmi_call_lock = SPIN_LOCK_UNLOCKED; ++static struct nmi_call_data_struct { ++ smp_nmi_function func; ++ void *info; ++ atomic_t started; ++ atomic_t finished; ++ cpumask_t cpus_called; ++ int wait; ++} *nmi_call_data; ++ ++static int smp_nmi_callback(struct pt_regs * regs, int cpu) ++{ ++ smp_nmi_function func; ++ void *info; ++ int wait; ++ ++ func = nmi_call_data->func; ++ info = nmi_call_data->info; ++ wait = nmi_call_data->wait; ++ ack_APIC_irq(); ++ /* prevent from calling func() multiple times */ ++ if (cpu_test_and_set(cpu, nmi_call_data->cpus_called)) ++ return 0; ++ /* ++ * notify initiating CPU that I've grabbed the data and am ++ * about to execute the function ++ */ ++ mb(); ++ atomic_inc(&nmi_call_data->started); ++ /* at this point the nmi_call_data structure is out of 
scope */ ++ irq_enter(); ++ func(regs, info); ++ irq_exit(); ++ if (wait) ++ atomic_inc(&nmi_call_data->finished); ++ ++ return 0; ++} ++ ++/* ++ * This function tries to call func(regs, info) on each cpu. ++ * Func must be fast and non-blocking. ++ * May be called with disabled interrupts and from any context. ++ */ ++int smp_nmi_call_function(smp_nmi_function func, void *info, int wait) ++{ ++ struct nmi_call_data_struct data; ++ int cpus; ++ ++ cpus = num_online_cpus() - 1; ++ if (!cpus) ++ return 0; ++ ++ data.func = func; ++ data.info = info; ++ data.wait = wait; ++ atomic_set(&data.started, 0); ++ atomic_set(&data.finished, 0); ++ cpus_clear(data.cpus_called); ++ /* prevent this cpu from calling func if NMI happens */ ++ cpu_set(smp_processor_id(), data.cpus_called); ++ ++ if (!spin_trylock(&nmi_call_lock)) ++ return -1; ++ ++ nmi_call_data = &data; ++ set_nmi_ipi_callback(smp_nmi_callback); ++ mb(); ++ ++ /* Send a message to all other CPUs and wait for them to respond */ ++ send_IPI_allbutself(APIC_DM_NMI); ++ while (atomic_read(&data.started) != cpus) ++ barrier(); ++ ++ unset_nmi_ipi_callback(); ++ if (wait) ++ while (atomic_read(&data.finished) != cpus) ++ barrier(); ++ spin_unlock(&nmi_call_lock); ++ ++ return 0; ++} ++ + static void stop_this_cpu (void * dummy) + { + /* +diff -upr linux-2.6.16.orig/arch/i386/kernel/smpboot.c linux-2.6.16-026test015/arch/i386/kernel/smpboot.c +--- linux-2.6.16.orig/arch/i386/kernel/smpboot.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/i386/kernel/smpboot.c 2006-07-04 14:41:38.000000000 +0400 +@@ -317,6 +317,10 @@ static void __init synchronize_tsc_bp (v + } + if (!buggy) + printk("passed.\n"); ++#ifdef CONFIG_VE ++ /* TSC reset. 
kill whatever might rely on old values */ ++ VE_TASK_INFO(current)->wakeup_stamp = 0; ++#endif + } + + static void __init synchronize_tsc_ap (void) +@@ -342,6 +346,10 @@ static void __init synchronize_tsc_ap (v + atomic_inc(&tsc_count_stop); + while (atomic_read(&tsc_count_stop) != num_booting_cpus()) mb(); + } ++#ifdef CONFIG_VE ++ /* TSC reset. kill whatever might rely on old values */ ++ VE_TASK_INFO(current)->wakeup_stamp = 0; ++#endif + } + #undef NR_LOOPS + +@@ -908,6 +916,13 @@ static int __devinit do_boot_cpu(int api + if (IS_ERR(idle)) + panic("failed fork for CPU %d", cpu); + idle->thread.eip = (unsigned long) start_secondary; ++ ++#ifdef CONFIG_VE ++ /* Cosmetic: sleep_time won't be changed afterwards for the idle ++ * thread; keep it 0 rather than -cycles. */ ++ VE_TASK_INFO(idle)->sleep_time = 0; ++#endif ++ + /* start_eip had better be page-aligned! */ + start_eip = setup_trampoline(); + +diff -upr linux-2.6.16.orig/arch/i386/kernel/sys_i386.c linux-2.6.16-026test015/arch/i386/kernel/sys_i386.c +--- linux-2.6.16.orig/arch/i386/kernel/sys_i386.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/i386/kernel/sys_i386.c 2006-07-04 14:41:38.000000000 +0400 +@@ -217,7 +217,7 @@ asmlinkage int sys_uname(struct old_utsn + if (!name) + return -EFAULT; + down_read(&uts_sem); +- err=copy_to_user(name, &system_utsname, sizeof (*name)); ++ err=copy_to_user(name, &ve_utsname, sizeof (*name)); + up_read(&uts_sem); + return err?-EFAULT:0; + } +@@ -233,15 +233,15 @@ asmlinkage int sys_olduname(struct oldol + + down_read(&uts_sem); + +- error = __copy_to_user(&name->sysname,&system_utsname.sysname,__OLD_UTS_LEN); ++ error = __copy_to_user(name->sysname,ve_utsname.sysname,__OLD_UTS_LEN); + error |= __put_user(0,name->sysname+__OLD_UTS_LEN); +- error |= __copy_to_user(&name->nodename,&system_utsname.nodename,__OLD_UTS_LEN); ++ error |= __copy_to_user(name->nodename,ve_utsname.nodename,__OLD_UTS_LEN); + error |= 
__put_user(0,name->nodename+__OLD_UTS_LEN); +- error |= __copy_to_user(&name->release,&system_utsname.release,__OLD_UTS_LEN); ++ error |= __copy_to_user(name->release,ve_utsname.release,__OLD_UTS_LEN); + error |= __put_user(0,name->release+__OLD_UTS_LEN); +- error |= __copy_to_user(&name->version,&system_utsname.version,__OLD_UTS_LEN); ++ error |= __copy_to_user(name->version,ve_utsname.version,__OLD_UTS_LEN); + error |= __put_user(0,name->version+__OLD_UTS_LEN); +- error |= __copy_to_user(&name->machine,&system_utsname.machine,__OLD_UTS_LEN); ++ error |= __copy_to_user(name->machine,ve_utsname.machine,__OLD_UTS_LEN); + error |= __put_user(0,name->machine+__OLD_UTS_LEN); + + up_read(&uts_sem); +diff -upr linux-2.6.16.orig/arch/i386/kernel/syscall_table.S linux-2.6.16-026test015/arch/i386/kernel/syscall_table.S +--- linux-2.6.16.orig/arch/i386/kernel/syscall_table.S 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/i386/kernel/syscall_table.S 2006-07-04 14:41:39.000000000 +0400 +@@ -310,3 +310,21 @@ ENTRY(sys_call_table) + .long sys_pselect6 + .long sys_ppoll + .long sys_unshare /* 310 */ ++ ++ .rept 500-(.-sys_call_table)/4 ++ .long sys_ni_syscall ++ .endr ++ .long sys_fairsched_mknod /* 500 */ ++ .long sys_fairsched_rmnod ++ .long sys_fairsched_chwt ++ .long sys_fairsched_mvpr ++ .long sys_fairsched_rate ++ ++ .rept 510-(.-sys_call_table)/4 ++ .long sys_ni_syscall ++ .endr ++ ++ .long sys_getluid /* 510 */ ++ .long sys_setluid ++ .long sys_setublimit ++ .long sys_ubstat +diff -upr linux-2.6.16.orig/arch/i386/kernel/timers/timer_tsc.c linux-2.6.16-026test015/arch/i386/kernel/timers/timer_tsc.c +--- linux-2.6.16.orig/arch/i386/kernel/timers/timer_tsc.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/i386/kernel/timers/timer_tsc.c 2006-07-04 14:41:38.000000000 +0400 +@@ -94,7 +94,7 @@ static int count2; /* counter for mark_o + * Equal to 2^32 * (1 / (clocks per usec) ). + * Initialized in time_init. 
+ */ +-static unsigned long fast_gettimeoffset_quotient; ++unsigned long fast_gettimeoffset_quotient; + + static unsigned long get_offset_tsc(void) + { +diff -upr linux-2.6.16.orig/arch/i386/kernel/traps.c linux-2.6.16-026test015/arch/i386/kernel/traps.c +--- linux-2.6.16.orig/arch/i386/kernel/traps.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/i386/kernel/traps.c 2006-07-04 14:41:39.000000000 +0400 +@@ -116,8 +116,10 @@ static void print_addr_and_symbol(unsign + { + printk(log_lvl); + printk(" [<%08lx>] ", addr); +- print_symbol("%s", addr); +- printk("\n"); ++ if (decode_call_traces) { ++ print_symbol("%s", addr); ++ printk("\n"); ++ } + } + + static inline unsigned long print_context_stack(struct thread_info *tinfo, +@@ -167,7 +169,10 @@ static void show_trace_log_lvl(struct ta + if (!stack) + break; + printk(log_lvl); +- printk(" =======================\n"); ++ if (decode_call_traces) ++ printk(" =======================\n"); ++ else ++ printk(" =<ctx>= "); + } + } + +@@ -203,8 +208,13 @@ static void show_stack_log_lvl(struct ta + } + printk("\n"); + printk(log_lvl); +- printk("Call Trace:\n"); ++ if (decode_call_traces) ++ printk("Call Trace:\n"); ++ else ++ printk("Call Trace: "); + show_trace_log_lvl(task, esp, log_lvl); ++ if (!decode_call_traces) ++ printk("\n"); + } + + void show_stack(struct task_struct *task, unsigned long *esp) +@@ -220,6 +230,8 @@ void dump_stack(void) + unsigned long stack; + + show_trace(current, &stack); ++ if (!decode_call_traces) ++ printk("\n"); + } + + EXPORT_SYMBOL(dump_stack); +@@ -239,9 +251,10 @@ void show_registers(struct pt_regs *regs + ss = regs->xss & 0xffff; + } + print_modules(); +- printk(KERN_EMERG "CPU: %d\nEIP: %04x:[<%08lx>] %s VLI\n" ++ printk(KERN_EMERG "CPU: %d, VCPU: %d:%d\nEIP: %04x:[<%08lx>] %s VLI\n" + "EFLAGS: %08lx (%s %.*s) \n", +- smp_processor_id(), 0xffff & regs->xcs, regs->eip, ++ smp_processor_id(), task_vsched_id(current), task_cpu(current), ++ 0xffff & regs->xcs, 
regs->eip, + print_tainted(), regs->eflags, system_utsname.release, + (int)strcspn(system_utsname.version, " "), + system_utsname.version); +@@ -252,8 +265,11 @@ void show_registers(struct pt_regs *regs + regs->esi, regs->edi, regs->ebp, esp); + printk(KERN_EMERG "ds: %04x es: %04x ss: %04x\n", + regs->xds & 0xffff, regs->xes & 0xffff, ss); +- printk(KERN_EMERG "Process %s (pid: %d, threadinfo=%p task=%p)", +- current->comm, current->pid, current_thread_info(), current); ++ printk(KERN_EMERG "Process %s (pid: %d, veid=%d, threadinfo=%p task=%p)", ++ current->comm, current->pid, ++ VEID(VE_TASK_INFO(current)->owner_env), ++ current_thread_info(), current); ++ + /* + * When in-kernel, we also print out the stack and code at the + * time of the fault.. +@@ -299,9 +315,9 @@ static void handle_BUG(struct pt_regs *r + goto no_bug; + if (ud2 != 0x0b0f) + goto no_bug; +- if (__get_user(line, (unsigned short __user *)(eip + 2))) ++ if (__get_user(line, (unsigned short __user *)(eip + 4))) + goto bug; +- if (__get_user(file, (char * __user *)(eip + 4)) || ++ if (__get_user(file, (char * __user *)(eip + 7)) || + (unsigned long)file < PAGE_OFFSET || __get_user(c, file)) + file = "<bad filename>"; + +@@ -316,6 +332,15 @@ bug: + printk(KERN_EMERG "Kernel BUG\n"); + } + ++int die_counter = 0; ++ ++static void inline check_kernel_csum_bug(void) ++{ ++ if (kernel_text_csum_broken) ++ printk("Kernel code checksum mismatch detected %d times\n", ++ kernel_text_csum_broken); ++} ++ + /* This is gone through when something in the kernel + * has done something bad and is about to be terminated. 
+ */ +@@ -330,7 +355,6 @@ void die(const char * str, struct pt_reg + .lock_owner = -1, + .lock_owner_depth = 0 + }; +- static int die_counter; + unsigned long flags; + + if (die.lock_owner != raw_smp_processor_id()) { +@@ -370,6 +394,7 @@ void die(const char * str, struct pt_reg + } else + printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); + ++ check_kernel_csum_bug(); + bust_spinlocks(0); + die.lock_owner = -1; + spin_unlock_irqrestore(&die.lock, flags); +@@ -597,12 +622,27 @@ static void unknown_nmi_error(unsigned c + printk("Do you have a strange power saving mode enabled?\n"); + } + +-static DEFINE_SPINLOCK(nmi_print_lock); ++/* ++ * Voyager doesn't implement these ++ */ ++void __attribute__((weak)) smp_show_regs(struct pt_regs *regs, void *info) ++{ ++} ++ ++#ifdef CONFIG_SMP ++int __attribute__((weak)) ++smp_nmi_call_function(smp_nmi_function func, void *info, int wait) ++{ ++ return 0; ++} ++#endif + + void die_nmi (struct pt_regs *regs, const char *msg) + { ++ static DEFINE_SPINLOCK(nmi_print_lock); ++ + if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 0, SIGINT) == +- NOTIFY_STOP) ++ NOTIFY_STOP) + return; + + spin_lock(&nmi_print_lock); +@@ -615,7 +655,11 @@ void die_nmi (struct pt_regs *regs, cons + printk(" on CPU%d, eip %08lx, registers:\n", + smp_processor_id(), regs->eip); + show_registers(regs); +- printk(KERN_EMERG "console shuts up ...\n"); ++ smp_nmi_call_function(smp_show_regs, NULL, 1); ++ bust_spinlocks(1); ++ /* current CPU messages should go bottom */ ++ if (!decode_call_traces) ++ smp_show_regs(regs, NULL); + console_silent(); + spin_unlock(&nmi_print_lock); + bust_spinlocks(0); +@@ -631,6 +675,14 @@ void die_nmi (struct pt_regs *regs, cons + do_exit(SIGSEGV); + } + ++static int dummy_nmi_callback(struct pt_regs * regs, int cpu) ++{ ++ return 0; ++} ++ ++static nmi_callback_t nmi_callback = dummy_nmi_callback; ++static nmi_callback_t nmi_ipi_callback = dummy_nmi_callback; ++ + static void default_do_nmi(struct pt_regs * regs) + 
{ + unsigned char reason = 0; +@@ -653,6 +705,9 @@ static void default_do_nmi(struct pt_reg + return; + } + #endif ++ if (nmi_ipi_callback != dummy_nmi_callback) ++ return; ++ + unknown_nmi_error(reason, regs); + return; + } +@@ -669,13 +724,6 @@ static void default_do_nmi(struct pt_reg + reassert_nmi(); + } + +-static int dummy_nmi_callback(struct pt_regs * regs, int cpu) +-{ +- return 0; +-} +- +-static nmi_callback_t nmi_callback = dummy_nmi_callback; +- + fastcall void do_nmi(struct pt_regs * regs, long error_code) + { + int cpu; +@@ -689,9 +737,20 @@ fastcall void do_nmi(struct pt_regs * re + if (!rcu_dereference(nmi_callback)(regs, cpu)) + default_do_nmi(regs); + ++ nmi_ipi_callback(regs, cpu); + nmi_exit(); + } + ++void set_nmi_ipi_callback(nmi_callback_t callback) ++{ ++ nmi_ipi_callback = callback; ++} ++ ++void unset_nmi_ipi_callback(void) ++{ ++ nmi_ipi_callback = dummy_nmi_callback; ++} ++ + void set_nmi_callback(nmi_callback_t callback) + { + rcu_assign_pointer(nmi_callback, callback); +diff -upr linux-2.6.16.orig/arch/i386/kernel/vm86.c linux-2.6.16-026test015/arch/i386/kernel/vm86.c +--- linux-2.6.16.orig/arch/i386/kernel/vm86.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/i386/kernel/vm86.c 2006-07-04 14:41:36.000000000 +0400 +@@ -43,6 +43,7 @@ + #include <linux/smp_lock.h> + #include <linux/highmem.h> + #include <linux/ptrace.h> ++#include <linux/audit.h> + + #include <asm/uaccess.h> + #include <asm/io.h> +@@ -252,6 +253,7 @@ out: + static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk) + { + struct tss_struct *tss; ++ long eax; + /* + * make sure the vm86() system call doesn't try to do anything silly + */ +@@ -305,13 +307,19 @@ static void do_sys_vm86(struct kernel_vm + tsk->thread.screen_bitmap = info->screen_bitmap; + if (info->flags & VM86_SCREEN_BITMAP) + mark_screen_rdonly(tsk->mm); ++ __asm__ __volatile__("xorl %eax,%eax; movl %eax,%fs; movl %eax,%gs\n\t"); ++ __asm__ __volatile__("movl 
%%eax, %0\n" :"=r"(eax)); ++ ++ /*call audit_syscall_exit since we do not exit via the normal paths */ ++ if (unlikely(current->audit_context)) ++ audit_syscall_exit(current, AUDITSC_RESULT(eax), eax); ++ + __asm__ __volatile__( +- "xorl %%eax,%%eax; movl %%eax,%%fs; movl %%eax,%%gs\n\t" + "movl %0,%%esp\n\t" + "movl %1,%%ebp\n\t" + "jmp resume_userspace" + : /* no outputs */ +- :"r" (&info->regs), "r" (task_thread_info(tsk)) : "ax"); ++ :"r" (&info->regs), "r" (task_thread_info(tsk))); + /* we never return here */ + } + +diff -upr linux-2.6.16.orig/arch/i386/mm/fault.c linux-2.6.16-026test015/arch/i386/mm/fault.c +--- linux-2.6.16.orig/arch/i386/mm/fault.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/i386/mm/fault.c 2006-07-04 14:41:37.000000000 +0400 +@@ -31,32 +31,6 @@ + extern void die(const char *,struct pt_regs *,long); + + /* +- * Unlock any spinlocks which will prevent us from getting the +- * message out +- */ +-void bust_spinlocks(int yes) +-{ +- int loglevel_save = console_loglevel; +- +- if (yes) { +- oops_in_progress = 1; +- return; +- } +-#ifdef CONFIG_VT +- unblank_screen(); +-#endif +- oops_in_progress = 0; +- /* +- * OK, the message is on the console. Now we call printk() +- * without oops_in_progress set so that printk will give klogd +- * a poke. Hold onto your hats... +- */ +- console_loglevel = 15; /* NMI oopser may have shut the console up */ +- printk(" "); +- console_loglevel = loglevel_save; +-} +- +-/* + * Return EIP plus the CS segment base. The segment limit is also + * adjusted, clamped to the kernel/user address space (whichever is + * appropriate), and returned in *eip_limit. 
+@@ -347,7 +321,6 @@ good_area: + goto bad_area; + } + +- survive: + /* + * If for any reason at all we couldn't handle the fault, + * make sure we exit gracefully rather than endlessly redo +@@ -485,14 +458,14 @@ no_context: + */ + out_of_memory: + up_read(&mm->mmap_sem); +- if (tsk->pid == 1) { +- yield(); +- down_read(&mm->mmap_sem); +- goto survive; ++ if (error_code & 4) { ++ /* ++ * 0-order allocation always success if something really ++ * fatal not happen: beancounter overdraft or OOM. ++ */ ++ force_sig(SIGKILL, tsk); ++ return; + } +- printk("VM: killing process %s\n", tsk->comm); +- if (error_code & 4) +- do_exit(SIGKILL); + goto no_context; + + do_sigbus: +diff -upr linux-2.6.16.orig/arch/i386/mm/hugetlbpage.c linux-2.6.16-026test015/arch/i386/mm/hugetlbpage.c +--- linux-2.6.16.orig/arch/i386/mm/hugetlbpage.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/i386/mm/hugetlbpage.c 2006-07-04 14:41:39.000000000 +0400 +@@ -14,6 +14,7 @@ + #include <linux/slab.h> + #include <linux/err.h> + #include <linux/sysctl.h> ++#include <linux/module.h> + #include <asm/mman.h> + #include <asm/tlb.h> + #include <asm/tlbflush.h> +@@ -110,6 +111,7 @@ int pmd_huge(pmd_t pmd) + { + return !!(pmd_val(pmd) & _PAGE_PSE); + } ++EXPORT_SYMBOL(pmd_huge); + + struct page * + follow_huge_pmd(struct mm_struct *mm, unsigned long address, +diff -upr linux-2.6.16.orig/arch/i386/mm/init.c linux-2.6.16-026test015/arch/i386/mm/init.c +--- linux-2.6.16.orig/arch/i386/mm/init.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/i386/mm/init.c 2006-07-04 14:41:37.000000000 +0400 +@@ -677,7 +677,7 @@ void __init pgtable_cache_init(void) + pmd_cache = kmem_cache_create("pmd", + PTRS_PER_PMD*sizeof(pmd_t), + PTRS_PER_PMD*sizeof(pmd_t), +- 0, ++ SLAB_UBC, + pmd_ctor, + NULL); + if (!pmd_cache) +@@ -686,7 +686,7 @@ void __init pgtable_cache_init(void) + pgd_cache = kmem_cache_create("pgd", + PTRS_PER_PGD*sizeof(pgd_t), + PTRS_PER_PGD*sizeof(pgd_t), +- 0, ++ 
SLAB_UBC, + pgd_ctor, + PTRS_PER_PMD == 1 ? pgd_dtor : NULL); + if (!pgd_cache) +diff -upr linux-2.6.16.orig/arch/i386/mm/pgtable.c linux-2.6.16-026test015/arch/i386/mm/pgtable.c +--- linux-2.6.16.orig/arch/i386/mm/pgtable.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/i386/mm/pgtable.c 2006-07-04 14:41:38.000000000 +0400 +@@ -5,8 +5,10 @@ + #include <linux/config.h> + #include <linux/sched.h> + #include <linux/kernel.h> ++#include <linux/module.h> + #include <linux/errno.h> + #include <linux/mm.h> ++#include <linux/vmalloc.h> + #include <linux/swap.h> + #include <linux/smp.h> + #include <linux/highmem.h> +@@ -64,7 +66,9 @@ void show_mem(void) + printk(KERN_INFO "%lu pages mapped\n", ps.nr_mapped); + printk(KERN_INFO "%lu pages slab\n", ps.nr_slab); + printk(KERN_INFO "%lu pages pagetables\n", ps.nr_page_table_pages); ++ vprintstat(); + } ++EXPORT_SYMBOL(show_mem); + + /* + * Associate a virtual page frame with a given physical page frame +@@ -159,9 +163,11 @@ struct page *pte_alloc_one(struct mm_str + struct page *pte; + + #ifdef CONFIG_HIGHPTE +- pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0); ++ pte = alloc_pages(GFP_KERNEL_UBC|__GFP_SOFT_UBC|__GFP_HIGHMEM| ++ __GFP_REPEAT|__GFP_ZERO, 0); + #else +- pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); ++ pte = alloc_pages(GFP_KERNEL_UBC|__GFP_SOFT_UBC| ++ __GFP_REPEAT|__GFP_ZERO, 0); + #endif + return pte; + } +diff -upr linux-2.6.16.orig/arch/ia64/Kconfig linux-2.6.16-026test015/arch/ia64/Kconfig +--- linux-2.6.16.orig/arch/ia64/Kconfig 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/ia64/Kconfig 2006-07-04 14:41:39.000000000 +0400 +@@ -283,6 +283,8 @@ config PREEMPT + Say Y here if you are building a kernel for a desktop, embedded + or real-time system. Say N if you are unsure. 
+ ++source "kernel/Kconfig.fairsched" ++ + source "mm/Kconfig" + + config ARCH_SELECT_MEMORY_MODEL +@@ -464,6 +466,10 @@ endmenu + + source "arch/ia64/Kconfig.debug" + ++source "kernel/Kconfig.openvz" ++ + source "security/Kconfig" + + source "crypto/Kconfig" ++ ++source "kernel/ub/Kconfig" +diff -upr linux-2.6.16.orig/arch/ia64/ia32/binfmt_elf32.c linux-2.6.16-026test015/arch/ia64/ia32/binfmt_elf32.c +--- linux-2.6.16.orig/arch/ia64/ia32/binfmt_elf32.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/ia64/ia32/binfmt_elf32.c 2006-07-04 14:41:37.000000000 +0400 +@@ -136,6 +136,12 @@ ia64_elf32_init (struct pt_regs *regs) + up_write(¤t->mm->mmap_sem); + } + ++ if (ub_memory_charge(current->mm, PAGE_ALIGN(IA32_LDT_ENTRIES * ++ IA32_LDT_ENTRY_SIZE), ++ VM_READ|VM_WRITE|VM_MAYREAD|VM_MAYWRITE, ++ NULL, UB_SOFT)) ++ goto skip; ++ + /* + * Install LDT as anonymous memory. This gives us all-zero segment descriptors + * until a task modifies them via modify_ldt(). +@@ -157,7 +163,12 @@ ia64_elf32_init (struct pt_regs *regs) + } + } + up_write(¤t->mm->mmap_sem); +- } ++ } else ++ ub_memory_uncharge(current->mm, PAGE_ALIGN(IA32_LDT_ENTRIES * ++ IA32_LDT_ENTRY_SIZE), ++ VM_READ|VM_WRITE|VM_MAYREAD|VM_MAYWRITE, NULL); ++ ++skip: + + ia64_psr(regs)->ac = 0; /* turn off alignment checking */ + regs->loadrs = 0; +@@ -212,9 +223,15 @@ ia32_setup_arg_pages (struct linux_binpr + bprm->loader += stack_base; + bprm->exec += stack_base; + ++ ret = -ENOMEM; ++ if (ub_memory_charge(mm, IA32_STACK_TOP - ++ (PAGE_MASK & (unsigned long)bprm->p), ++ VM_STACK_FLAGS, NULL, UB_SOFT)) ++ goto err_charge; ++ + mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!mpnt) +- return -ENOMEM; ++ goto err_alloc; + + memset(mpnt, 0, sizeof(*mpnt)); + +@@ -231,11 +248,8 @@ ia32_setup_arg_pages (struct linux_binpr + mpnt->vm_flags = VM_STACK_FLAGS; + mpnt->vm_page_prot = (mpnt->vm_flags & VM_EXEC)? 
+ PAGE_COPY_EXEC: PAGE_COPY; +- if ((ret = insert_vm_struct(current->mm, mpnt))) { +- up_write(¤t->mm->mmap_sem); +- kmem_cache_free(vm_area_cachep, mpnt); +- return ret; +- } ++ if ((ret = insert_vm_struct(current->mm, mpnt))) ++ goto err_insert; + current->mm->stack_vm = current->mm->total_vm = vma_pages(mpnt); + } + +@@ -254,6 +268,16 @@ ia32_setup_arg_pages (struct linux_binpr + current->thread.ppl = ia32_init_pp_list(); + + return 0; ++ ++err_insert: ++ up_write(¤t->mm->mmap_sem); ++ kmem_cache_free(vm_area_cachep, mpnt); ++err_alloc: ++ ub_memory_uncharge(mm, IA32_STACK_TOP - ++ (PAGE_MASK & (unsigned long)bprm->p), ++ VM_STACK_FLAGS, NULL); ++err_charge: ++ return ret; + } + + static void +diff -upr linux-2.6.16.orig/arch/ia64/kernel/asm-offsets.c linux-2.6.16-026test015/arch/ia64/kernel/asm-offsets.c +--- linux-2.6.16.orig/arch/ia64/kernel/asm-offsets.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/ia64/kernel/asm-offsets.c 2006-07-04 14:41:38.000000000 +0400 +@@ -44,11 +44,21 @@ void foo(void) + DEFINE(IA64_TASK_CLEAR_CHILD_TID_OFFSET,offsetof (struct task_struct, clear_child_tid)); + DEFINE(IA64_TASK_GROUP_LEADER_OFFSET, offsetof (struct task_struct, group_leader)); + DEFINE(IA64_TASK_PENDING_OFFSET,offsetof (struct task_struct, pending)); ++#ifdef CONFIG_VE ++ DEFINE(IA64_TASK_PID_OFFSET, offsetof ++ (struct task_struct, pids[PIDTYPE_PID].vnr)); ++#else + DEFINE(IA64_TASK_PID_OFFSET, offsetof (struct task_struct, pid)); ++#endif + DEFINE(IA64_TASK_REAL_PARENT_OFFSET, offsetof (struct task_struct, real_parent)); + DEFINE(IA64_TASK_SIGHAND_OFFSET,offsetof (struct task_struct, sighand)); + DEFINE(IA64_TASK_SIGNAL_OFFSET,offsetof (struct task_struct, signal)); ++#ifdef CONFIG_VE ++ DEFINE(IA64_TASK_TGID_OFFSET, offsetof ++ (struct task_struct, pids[PIDTYPE_TGID].vnr)); ++#else + DEFINE(IA64_TASK_TGID_OFFSET, offsetof (struct task_struct, tgid)); ++#endif + DEFINE(IA64_TASK_THREAD_KSP_OFFSET, offsetof (struct task_struct, thread.ksp)); 
+ DEFINE(IA64_TASK_THREAD_ON_USTACK_OFFSET, offsetof (struct task_struct, thread.on_ustack)); + +diff -upr linux-2.6.16.orig/arch/ia64/kernel/entry.S linux-2.6.16-026test015/arch/ia64/kernel/entry.S +--- linux-2.6.16.orig/arch/ia64/kernel/entry.S 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/ia64/kernel/entry.S 2006-07-04 14:41:39.000000000 +0400 +@@ -1620,4 +1620,17 @@ sys_call_table: + data8 sys_ni_syscall // 1295 reserved for ppoll + data8 sys_unshare + ++.rept 1500-1297 ++ data8 sys_ni_syscall ++.endr ++ data8 sys_fairsched_mknod // 1500 ++ data8 sys_fairsched_rmnod ++ data8 sys_fairsched_chwt ++ data8 sys_fairsched_mvpr ++ data8 sys_fairsched_rate ++ data8 sys_getluid // 1505 ++ data8 sys_setluid ++ data8 sys_setublimit ++ data8 sys_ubstat ++ + .org sys_call_table + 8*NR_syscalls // guard against failures to increase NR_syscalls +diff -upr linux-2.6.16.orig/arch/ia64/kernel/fsys.S linux-2.6.16-026test015/arch/ia64/kernel/fsys.S +--- linux-2.6.16.orig/arch/ia64/kernel/fsys.S 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/ia64/kernel/fsys.S 2006-07-04 14:41:38.000000000 +0400 +@@ -72,6 +72,7 @@ ENTRY(fsys_getpid) + FSYS_RETURN + END(fsys_getpid) + ++#ifndef CONFIG_VE + ENTRY(fsys_getppid) + .prologue + .altrp b6 +@@ -118,6 +119,7 @@ ENTRY(fsys_getppid) + #endif + FSYS_RETURN + END(fsys_getppid) ++#endif + + ENTRY(fsys_set_tid_address) + .prologue +@@ -665,7 +667,11 @@ fsyscall_table: + data8 0 // chown + data8 0 // lseek // 1040 + data8 fsys_getpid // getpid ++#ifdef CONFIG_VE ++ data8 0 ++#else + data8 fsys_getppid // getppid ++#endif + data8 0 // mount + data8 0 // umount + data8 0 // setuid // 1045 +diff -upr linux-2.6.16.orig/arch/ia64/kernel/irq.c linux-2.6.16-026test015/arch/ia64/kernel/irq.c +--- linux-2.6.16.orig/arch/ia64/kernel/irq.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/ia64/kernel/irq.c 2006-07-04 14:41:38.000000000 +0400 +@@ -163,7 +163,9 @@ void fixup_irqs(void) + { + 
unsigned int irq; + extern void ia64_process_pending_intr(void); ++ struct ve_struct *ve; + ++ ve = set_exec_env(get_ve0()); + ia64_set_itv(1<<16); + /* + * Phase 1: Locate irq's bound to this cpu and +@@ -197,5 +199,6 @@ void fixup_irqs(void) + */ + max_xtp(); + local_irq_disable(); ++ (void)set_exec_env(ve); + } + #endif +diff -upr linux-2.6.16.orig/arch/ia64/kernel/irq_ia64.c linux-2.6.16-026test015/arch/ia64/kernel/irq_ia64.c +--- linux-2.6.16.orig/arch/ia64/kernel/irq_ia64.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/ia64/kernel/irq_ia64.c 2006-07-04 14:41:38.000000000 +0400 +@@ -103,6 +103,7 @@ void + ia64_handle_irq (ia64_vector vector, struct pt_regs *regs) + { + unsigned long saved_tpr; ++ struct ve_struct *ve; + + #if IRQ_DEBUG + { +@@ -139,6 +140,7 @@ ia64_handle_irq (ia64_vector vector, str + * 16 (without this, it would be ~240, which could easily lead + * to kernel stack overflows). + */ ++ ve = set_exec_env(get_ve0()); + irq_enter(); + saved_tpr = ia64_getreg(_IA64_REG_CR_TPR); + ia64_srlz_d(); +@@ -164,6 +166,7 @@ ia64_handle_irq (ia64_vector vector, str + * come through until ia64_eoi() has been done. 
+ 	 */
+ 	irq_exit();
++	(void)set_exec_env(ve);
+ }
+ 
+ #ifdef CONFIG_HOTPLUG_CPU
+@@ -176,9 +179,11 @@ void ia64_process_pending_intr(void)
+ 	ia64_vector vector;
+ 	unsigned long saved_tpr;
+ 	extern unsigned int vectors_in_migration[NR_IRQS];
++	struct ve_struct *ve;
+ 
+ 	vector = ia64_get_ivr();
+ 
++	ve = set_exec_env(get_ve0());
+ 	irq_enter();
+ 	saved_tpr = ia64_getreg(_IA64_REG_CR_TPR);
+ 	ia64_srlz_d();
+@@ -210,6 +215,7 @@ void ia64_process_pending_intr(void)
+ 		vector = ia64_get_ivr();
+ 	}
+ 	irq_exit();
++	(void)set_exec_env(ve);
+ }
+ #endif
+ 
+diff -upr linux-2.6.16.orig/arch/ia64/kernel/mca.c linux-2.6.16-026test015/arch/ia64/kernel/mca.c
+--- linux-2.6.16.orig/arch/ia64/kernel/mca.c	2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/ia64/kernel/mca.c	2006-07-04 14:41:38.000000000 +0400
+@@ -1241,10 +1241,10 @@ default_monarch_init_process(struct noti
+ 	}
+ 	printk("\n\n");
+ 	if (read_trylock(&tasklist_lock)) {
+-		do_each_thread (g, t) {
++		do_each_thread_all (g, t) {
+ 			printk("\nBacktrace of pid %d (%s)\n", t->pid, t->comm);
+ 			show_stack(t, NULL);
+-		} while_each_thread (g, t);
++		} while_each_thread_all (g, t);
+ 		read_unlock(&tasklist_lock);
+ 	}
+ 	return NOTIFY_DONE;
+diff -upr linux-2.6.16.orig/arch/ia64/kernel/perfmon.c linux-2.6.16-026test015/arch/ia64/kernel/perfmon.c
+--- linux-2.6.16.orig/arch/ia64/kernel/perfmon.c	2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/ia64/kernel/perfmon.c	2006-07-04 14:41:38.000000000 +0400
+@@ -2624,7 +2624,7 @@ pfm_get_task(pfm_context_t *ctx, pid_t p
+ 
+ 	read_lock(&tasklist_lock);
+ 
+-	p = find_task_by_pid(pid);
++	p = find_task_by_pid_ve(pid);
+ 
+ 	/* make sure task cannot go away while we operate on it */
+ 	if (p) get_task_struct(p);
+@@ -4188,12 +4188,12 @@ pfm_check_task_exist(pfm_context_t *ctx)
+ 
+ 	read_lock(&tasklist_lock);
+ 
+-	do_each_thread (g, t) {
++	do_each_thread_ve (g, t) {
+ 		if (t->thread.pfm_context == ctx) {
+ 			ret = 0;
+ 			break;
+ 		}
+-	} while_each_thread (g, 
t); ++ } while_each_thread_ve (g, t); + + read_unlock(&tasklist_lock); + +diff -upr linux-2.6.16.orig/arch/ia64/kernel/process.c linux-2.6.16-026test015/arch/ia64/kernel/process.c +--- linux-2.6.16.orig/arch/ia64/kernel/process.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/ia64/kernel/process.c 2006-07-04 14:41:39.000000000 +0400 +@@ -109,7 +109,8 @@ show_regs (struct pt_regs *regs) + unsigned long ip = regs->cr_iip + ia64_psr(regs)->ri; + + print_modules(); +- printk("\nPid: %d, CPU %d, comm: %20s\n", current->pid, smp_processor_id(), current->comm); ++ printk("\nPid: %d, CPU %d, VCPU %d:%d, comm: %20s\n", current->pid, smp_processor_id(), ++ task_vsched_id(current), task_cpu(current), current->comm); + printk("psr : %016lx ifs : %016lx ip : [<%016lx>] %s\n", + regs->cr_ipsr, regs->cr_ifs, ip, print_tainted()); + print_symbol("ip is at %s\n", ip); +@@ -681,6 +682,13 @@ kernel_thread (int (*fn)(void *), void * + struct pt_regs pt; + } regs; + ++ /* Don't allow kernel_thread() inside VE */ ++ if (!ve_is_super(get_exec_env())) { ++ printk("kernel_thread call inside VE\n"); ++ dump_stack(); ++ return -EPERM; ++ } ++ + memset(®s, 0, sizeof(regs)); + regs.pt.cr_iip = helper_fptr[0]; /* set entry point (IP) */ + regs.pt.r1 = helper_fptr[1]; /* set GP */ +diff -upr linux-2.6.16.orig/arch/ia64/kernel/ptrace.c linux-2.6.16-026test015/arch/ia64/kernel/ptrace.c +--- linux-2.6.16.orig/arch/ia64/kernel/ptrace.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/ia64/kernel/ptrace.c 2006-07-04 14:41:38.000000000 +0400 +@@ -1433,7 +1433,7 @@ sys_ptrace (long request, pid_t pid, uns + ret = -ESRCH; + read_lock(&tasklist_lock); + { +- child = find_task_by_pid(pid); ++ child = find_task_by_pid_ve(pid); + if (child) { + if (peek_or_poke) + child = find_thread_for_addr(child, addr); +diff -upr linux-2.6.16.orig/arch/ia64/kernel/signal.c linux-2.6.16-026test015/arch/ia64/kernel/signal.c +--- linux-2.6.16.orig/arch/ia64/kernel/signal.c 
2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/ia64/kernel/signal.c 2006-07-04 14:41:38.000000000 +0400 +@@ -270,7 +270,7 @@ ia64_rt_sigreturn (struct sigscratch *sc + si.si_signo = SIGSEGV; + si.si_errno = 0; + si.si_code = SI_KERNEL; +- si.si_pid = current->pid; ++ si.si_pid = virt_pid(current); + si.si_uid = current->uid; + si.si_addr = sc; + force_sig_info(SIGSEGV, &si, current); +@@ -375,7 +375,7 @@ force_sigsegv_info (int sig, void __user + si.si_signo = SIGSEGV; + si.si_errno = 0; + si.si_code = SI_KERNEL; +- si.si_pid = current->pid; ++ si.si_pid = virt_pid(current); + si.si_uid = current->uid; + si.si_addr = addr; + force_sig_info(SIGSEGV, &si, current); +@@ -641,7 +641,7 @@ set_sigdelayed(pid_t pid, int signo, int + for (i = 1; i <= 3; ++i) { + switch (i) { + case 1: +- t = find_task_by_pid(pid); ++ t = find_task_by_pid_ve(pid); + if (t) + start_time = start_time_ul(t); + break; +@@ -682,7 +682,7 @@ do_sigdelayed(void) + siginfo.si_code = current_thread_info()->sigdelayed.code; + siginfo.si_addr = current_thread_info()->sigdelayed.addr; + pid = current_thread_info()->sigdelayed.pid; +- t = find_task_by_pid(pid); ++ t = find_task_by_pid_ve(pid); + if (!t) + return; + if (current_thread_info()->sigdelayed.start_time != start_time_ul(t)) +diff -upr linux-2.6.16.orig/arch/ia64/kernel/traps.c linux-2.6.16-026test015/arch/ia64/kernel/traps.c +--- linux-2.6.16.orig/arch/ia64/kernel/traps.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/ia64/kernel/traps.c 2006-07-04 14:41:37.000000000 +0400 +@@ -54,34 +54,6 @@ trap_init (void) + fpswa_interface = __va(ia64_boot_param->fpswa); + } + +-/* +- * Unlock any spinlocks which will prevent us from getting the message out (timerlist_lock +- * is acquired through the console unblank code) +- */ +-void +-bust_spinlocks (int yes) +-{ +- int loglevel_save = console_loglevel; +- +- if (yes) { +- oops_in_progress = 1; +- return; +- } +- +-#ifdef CONFIG_VT +- unblank_screen(); 
+-#endif +- oops_in_progress = 0; +- /* +- * OK, the message is on the console. Now we call printk() without +- * oops_in_progress set so that printk will give klogd a poke. Hold onto +- * your hats... +- */ +- console_loglevel = 15; /* NMI oopser may have shut the console up */ +- printk(" "); +- console_loglevel = loglevel_save; +-} +- + void + die (const char *str, struct pt_regs *regs, long err) + { +diff -upr linux-2.6.16.orig/arch/ia64/kernel/unaligned.c linux-2.6.16-026test015/arch/ia64/kernel/unaligned.c +--- linux-2.6.16.orig/arch/ia64/kernel/unaligned.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/ia64/kernel/unaligned.c 2006-07-04 14:41:37.000000000 +0400 +@@ -1290,7 +1290,7 @@ within_logging_rate_limit (void) + { + static unsigned long count, last_time; + +- if (jiffies - last_time > 5*HZ) ++ if (jiffies - last_time > 60 * HZ) + count = 0; + if (count < 5) { + last_time = jiffies; +diff -upr linux-2.6.16.orig/arch/ia64/mm/contig.c linux-2.6.16-026test015/arch/ia64/mm/contig.c +--- linux-2.6.16.orig/arch/ia64/mm/contig.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/ia64/mm/contig.c 2006-07-04 14:41:38.000000000 +0400 +@@ -64,6 +64,7 @@ show_mem (void) + printk("%ld pages in page table cache\n", + pgtable_quicklist_total_size()); + } ++EXPORT_SYMBOL(show_mem); + + /* physical address where the bootmem map is located */ + unsigned long bootmap_start; +diff -upr linux-2.6.16.orig/arch/ia64/mm/discontig.c linux-2.6.16-026test015/arch/ia64/mm/discontig.c +--- linux-2.6.16.orig/arch/ia64/mm/discontig.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/ia64/mm/discontig.c 2006-07-04 14:41:38.000000000 +0400 +@@ -594,6 +594,7 @@ void show_mem(void) + pgtable_quicklist_total_size()); + printk("%d free buffer pages\n", nr_free_buffer_pages()); + } ++EXPORT_SYMBOL(show_mem); + + /** + * call_pernode_memory - use SRAT to call callback functions with node info +diff -upr 
linux-2.6.16.orig/arch/ia64/mm/fault.c linux-2.6.16-026test015/arch/ia64/mm/fault.c +--- linux-2.6.16.orig/arch/ia64/mm/fault.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/ia64/mm/fault.c 2006-07-04 14:41:37.000000000 +0400 +@@ -116,7 +116,6 @@ ia64_do_page_fault (unsigned long addres + if ((vma->vm_flags & mask) != mask) + goto bad_area; + +- survive: + /* + * If for any reason at all we couldn't handle the fault, make + * sure we exit gracefully rather than endlessly redo the +@@ -241,13 +240,13 @@ ia64_do_page_fault (unsigned long addres + + out_of_memory: + up_read(&mm->mmap_sem); +- if (current->pid == 1) { +- yield(); +- down_read(&mm->mmap_sem); +- goto survive; +- } +- printk(KERN_CRIT "VM: killing process %s\n", current->comm); +- if (user_mode(regs)) +- do_exit(SIGKILL); ++ if (user_mode(regs)) { ++ /* ++ * 0-order allocation always success if something really ++ * fatal not happen: beancounter overdraft or OOM. ++ */ ++ force_sig(SIGKILL, current); ++ return; ++ } + goto no_context; + } +diff -upr linux-2.6.16.orig/arch/ia64/mm/init.c linux-2.6.16-026test015/arch/ia64/mm/init.c +--- linux-2.6.16.orig/arch/ia64/mm/init.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/ia64/mm/init.c 2006-07-04 14:41:37.000000000 +0400 +@@ -37,6 +37,8 @@ + #include <asm/unistd.h> + #include <asm/mca.h> + ++#include <ub/ub_vmpages.h> ++ + DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); + + DEFINE_PER_CPU(unsigned long *, __pgtable_quicklist); +@@ -96,7 +98,7 @@ check_pgt_cache(void) + preempt_disable(); + while (unlikely((pages_to_free = min_pages_to_free()) > 0)) { + while (pages_to_free--) { +- free_page((unsigned long)pgtable_quicklist_alloc()); ++ free_page((unsigned long)pgtable_quicklist_alloc(0)); + } + preempt_enable(); + preempt_disable(); +@@ -146,6 +148,10 @@ ia64_init_addr_space (void) + + ia64_set_rbs_bot(); + ++ if (ub_memory_charge(current->mm, PAGE_SIZE, VM_DATA_DEFAULT_FLAGS, ++ NULL, UB_SOFT)) ++ goto skip; 
++ + /* + * If we're out of memory and kmem_cache_alloc() returns NULL, we simply ignore + * the problem. When the process attempts to write to the register backing store +@@ -166,8 +172,11 @@ ia64_init_addr_space (void) + return; + } + up_write(¤t->mm->mmap_sem); +- } ++ } else ++ ub_memory_uncharge(current->mm, PAGE_SIZE, ++ VM_DATA_DEFAULT_FLAGS, NULL); + ++skip: + /* map NaT-page at address zero to speed up speculative dereferencing of NULL: */ + if (!(current->personality & MMAP_PAGE_ZERO)) { + vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); +diff -upr linux-2.6.16.orig/arch/m32r/kernel/m32r_ksyms.c linux-2.6.16-026test015/arch/m32r/kernel/m32r_ksyms.c +--- linux-2.6.16.orig/arch/m32r/kernel/m32r_ksyms.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/m32r/kernel/m32r_ksyms.c 2006-07-04 14:41:36.000000000 +0400 +@@ -38,10 +38,6 @@ EXPORT_SYMBOL(__udelay); + EXPORT_SYMBOL(__delay); + EXPORT_SYMBOL(__const_udelay); + +-EXPORT_SYMBOL(__get_user_1); +-EXPORT_SYMBOL(__get_user_2); +-EXPORT_SYMBOL(__get_user_4); +- + EXPORT_SYMBOL(strpbrk); + EXPORT_SYMBOL(strstr); + +diff -upr linux-2.6.16.orig/arch/m32r/kernel/setup.c linux-2.6.16-026test015/arch/m32r/kernel/setup.c +--- linux-2.6.16.orig/arch/m32r/kernel/setup.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/m32r/kernel/setup.c 2006-07-04 14:41:36.000000000 +0400 +@@ -9,6 +9,7 @@ + + #include <linux/config.h> + #include <linux/init.h> ++#include <linux/kernel.h> + #include <linux/stddef.h> + #include <linux/fs.h> + #include <linux/sched.h> +@@ -218,8 +219,6 @@ static unsigned long __init setup_memory + extern unsigned long setup_memory(void); + #endif /* CONFIG_DISCONTIGMEM */ + +-#define M32R_PCC_PCATCR 0x00ef7014 /* will move to m32r.h */ +- + void __init setup_arch(char **cmdline_p) + { + ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV); +@@ -268,15 +267,14 @@ void __init setup_arch(char **cmdline_p) + paging_init(); + } + +-static struct cpu cpu[NR_CPUS]; ++static 
struct cpu cpu_devices[NR_CPUS]; + + static int __init topology_init(void) + { +- int cpu_id; ++ int i; + +- for (cpu_id = 0; cpu_id < NR_CPUS; cpu_id++) +- if (cpu_possible(cpu_id)) +- register_cpu(&cpu[cpu_id], cpu_id, NULL); ++ for_each_present_cpu(i) ++ register_cpu(&cpu_devices[i], i, NULL); + + return 0; + } +diff -upr linux-2.6.16.orig/arch/m32r/kernel/smpboot.c linux-2.6.16-026test015/arch/m32r/kernel/smpboot.c +--- linux-2.6.16.orig/arch/m32r/kernel/smpboot.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/m32r/kernel/smpboot.c 2006-07-04 14:41:36.000000000 +0400 +@@ -39,8 +39,10 @@ + * Martin J. Bligh : Added support for multi-quad systems + */ + ++#include <linux/module.h> + #include <linux/config.h> + #include <linux/init.h> ++#include <linux/kernel.h> + #include <linux/mm.h> + #include <linux/smp_lock.h> + #include <linux/irq.h> +@@ -72,11 +74,15 @@ physid_mask_t phys_cpu_present_map; + + /* Bitmask of currently online CPUs */ + cpumask_t cpu_online_map; ++EXPORT_SYMBOL(cpu_online_map); + + cpumask_t cpu_bootout_map; + cpumask_t cpu_bootin_map; +-cpumask_t cpu_callout_map; + static cpumask_t cpu_callin_map; ++cpumask_t cpu_callout_map; ++EXPORT_SYMBOL(cpu_callout_map); ++cpumask_t cpu_possible_map = CPU_MASK_ALL; ++EXPORT_SYMBOL(cpu_possible_map); + + /* Per CPU bogomips and other parameters */ + struct cpuinfo_m32r cpu_data[NR_CPUS] __cacheline_aligned; +@@ -110,7 +116,6 @@ static unsigned int calibration_result; + + void smp_prepare_boot_cpu(void); + void smp_prepare_cpus(unsigned int); +-static void smp_tune_scheduling(void); + static void init_ipi_lock(void); + static void do_boot_cpu(int); + int __cpu_up(unsigned int); +@@ -177,6 +182,9 @@ void __init smp_prepare_cpus(unsigned in + } + for (phys_id = 0 ; phys_id < nr_cpu ; phys_id++) + physid_set(phys_id, phys_cpu_present_map); ++#ifndef CONFIG_HOTPLUG_CPU ++ cpu_present_map = cpu_possible_map; ++#endif + + show_mp_info(nr_cpu); + +@@ -186,7 +194,6 @@ void __init 
smp_prepare_cpus(unsigned in + * Setup boot CPU information + */ + smp_store_cpu_info(0); /* Final full version of the data */ +- smp_tune_scheduling(); + + /* + * If SMP should be disabled, then really disable it! +@@ -230,11 +237,6 @@ smp_done: + Dprintk("Boot done.\n"); + } + +-static void __init smp_tune_scheduling(void) +-{ +- /* Nothing to do. */ +-} +- + /* + * init_ipi_lock : Initialize IPI locks. + */ +@@ -629,4 +631,3 @@ static void __init unmap_cpu_to_physid(i + physid_2_cpu[phys_id] = -1; + cpu_2_physid[cpu_id] = -1; + } +- +diff -upr linux-2.6.16.orig/arch/m32r/lib/Makefile linux-2.6.16-026test015/arch/m32r/lib/Makefile +--- linux-2.6.16.orig/arch/m32r/lib/Makefile 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/m32r/lib/Makefile 2006-07-04 14:41:36.000000000 +0400 +@@ -2,6 +2,6 @@ + # Makefile for M32R-specific library files.. + # + +-lib-y := checksum.o ashxdi3.o memset.o memcpy.o getuser.o \ +- putuser.o delay.o strlen.o usercopy.o csum_partial_copy.o ++lib-y := checksum.o ashxdi3.o memset.o memcpy.o \ ++ delay.o strlen.o usercopy.o csum_partial_copy.o + +diff -upr linux-2.6.16.orig/arch/mips/kernel/branch.c linux-2.6.16-026test015/arch/mips/kernel/branch.c +--- linux-2.6.16.orig/arch/mips/kernel/branch.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/mips/kernel/branch.c 2006-07-04 14:41:36.000000000 +0400 +@@ -184,7 +184,7 @@ int __compute_return_epc(struct pt_regs + bit = (insn.i_format.rt >> 2); + bit += (bit != 0); + bit += 23; +- switch (insn.i_format.rt) { ++ switch (insn.i_format.rt & 3) { + case 0: /* bc1f */ + case 2: /* bc1fl */ + if (~fcr31 & (1 << bit)) +diff -upr linux-2.6.16.orig/arch/mips/kernel/irixelf.c linux-2.6.16-026test015/arch/mips/kernel/irixelf.c +--- linux-2.6.16.orig/arch/mips/kernel/irixelf.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/mips/kernel/irixelf.c 2006-07-04 14:41:37.000000000 +0400 +@@ -432,7 +432,7 @@ static inline int 
look_for_irix_interpre + if (retval < 0) + goto out; + +- file = open_exec(*name); ++ file = open_exec(*name, bprm); + if (IS_ERR(file)) { + retval = PTR_ERR(file); + goto out; +diff -upr linux-2.6.16.orig/arch/mips/kernel/sysirix.c linux-2.6.16-026test015/arch/mips/kernel/sysirix.c +--- linux-2.6.16.orig/arch/mips/kernel/sysirix.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/mips/kernel/sysirix.c 2006-07-04 14:41:38.000000000 +0400 +@@ -110,7 +110,7 @@ asmlinkage int irix_prctl(unsigned optio + printk("irix_prctl[%s:%d]: Wants PR_ISBLOCKED\n", + current->comm, current->pid); + read_lock(&tasklist_lock); +- task = find_task_by_pid(va_arg(args, pid_t)); ++ task = find_task_by_pid_ve(va_arg(args, pid_t)); + error = -ESRCH; + if (error) + error = (task->run_list.next != NULL); +diff -upr linux-2.6.16.orig/arch/mips/mm/c-r4k.c linux-2.6.16-026test015/arch/mips/mm/c-r4k.c +--- linux-2.6.16.orig/arch/mips/mm/c-r4k.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/mips/mm/c-r4k.c 2006-07-04 14:41:36.000000000 +0400 +@@ -154,7 +154,8 @@ static inline void blast_icache32_r4600_ + + static inline void tx49_blast_icache32_page_indexed(unsigned long page) + { +- unsigned long start = page; ++ unsigned long indexmask = current_cpu_data.icache.waysize - 1; ++ unsigned long start = INDEX_BASE + (page & indexmask); + unsigned long end = start + PAGE_SIZE; + unsigned long ws_inc = 1UL << current_cpu_data.icache.waybit; + unsigned long ws_end = current_cpu_data.icache.ways << +diff -upr linux-2.6.16.orig/arch/powerpc/Kconfig linux-2.6.16-026test015/arch/powerpc/Kconfig +--- linux-2.6.16.orig/arch/powerpc/Kconfig 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/powerpc/Kconfig 2006-07-04 14:41:39.000000000 +0400 +@@ -517,6 +517,7 @@ config HIGHMEM + bool "High memory support" + depends on PPC32 + ++source "kernel/Kconfig.fairsched" + source kernel/Kconfig.hz + source kernel/Kconfig.preempt + source 
"fs/Kconfig.binfmt" +@@ -956,6 +957,8 @@ source "arch/powerpc/platforms/iseries/K + + source "lib/Kconfig" + ++source "kernel/ub/Kconfig" ++ + menu "Instrumentation Support" + depends on EXPERIMENTAL + +@@ -974,6 +977,8 @@ endmenu + + source "arch/powerpc/Kconfig.debug" + ++source "kernel/Kconfig.openvz" ++ + source "security/Kconfig" + + config KEYS_COMPAT +diff -upr linux-2.6.16.orig/arch/powerpc/kernel/irq.c linux-2.6.16-026test015/arch/powerpc/kernel/irq.c +--- linux-2.6.16.orig/arch/powerpc/kernel/irq.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/powerpc/kernel/irq.c 2006-07-04 14:41:38.000000000 +0400 +@@ -50,6 +50,8 @@ + #include <linux/profile.h> + #include <linux/bitops.h> + ++#include <ub/beancounter.h> ++ + #include <asm/uaccess.h> + #include <asm/system.h> + #include <asm/io.h> +@@ -189,7 +191,11 @@ void do_IRQ(struct pt_regs *regs) + #ifdef CONFIG_IRQSTACKS + struct thread_info *curtp, *irqtp; + #endif ++ struct ve_struct *ve; ++ struct user_beancounter *ub; + ++ ve = set_exec_env(get_ve0()); ++ ub = set_exec_ub(get_ub0()); + irq_enter(); + + #ifdef CONFIG_DEBUG_STACKOVERFLOW +@@ -236,6 +242,8 @@ void do_IRQ(struct pt_regs *regs) + ppc_spurious_interrupts++; + + irq_exit(); ++ (void)set_exec_ub(ub); ++ (void)set_exec_env(ve); + + #ifdef CONFIG_PPC_ISERIES + if (get_lppaca()->int_dword.fields.decr_int) { +diff -upr linux-2.6.16.orig/arch/powerpc/kernel/misc_32.S linux-2.6.16-026test015/arch/powerpc/kernel/misc_32.S +--- linux-2.6.16.orig/arch/powerpc/kernel/misc_32.S 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/powerpc/kernel/misc_32.S 2006-07-04 14:41:37.000000000 +0400 +@@ -973,7 +973,7 @@ _GLOBAL(_get_SP) + * Create a kernel thread + * kernel_thread(fn, arg, flags) + */ +-_GLOBAL(kernel_thread) ++_GLOBAL(ppc_kernel_thread) + stwu r1,-16(r1) + stw r30,8(r1) + stw r31,12(r1) +diff -upr linux-2.6.16.orig/arch/powerpc/kernel/misc_64.S linux-2.6.16-026test015/arch/powerpc/kernel/misc_64.S +--- 
linux-2.6.16.orig/arch/powerpc/kernel/misc_64.S	2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/powerpc/kernel/misc_64.S	2006-07-04 14:41:37.000000000 +0400
+@@ -677,7 +677,7 @@ _GLOBAL(scom970_write)
+  * Create a kernel thread
+  * kernel_thread(fn, arg, flags)
+  */
+-_GLOBAL(kernel_thread)
++_GLOBAL(ppc_kernel_thread)
+ 	std	r29,-24(r1)
+ 	std	r30,-16(r1)
+ 	stdu	r1,-STACK_FRAME_OVERHEAD(r1)
+diff -upr linux-2.6.16.orig/arch/powerpc/kernel/pci_64.c linux-2.6.16-026test015/arch/powerpc/kernel/pci_64.c
+--- linux-2.6.16.orig/arch/powerpc/kernel/pci_64.c	2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/powerpc/kernel/pci_64.c	2006-07-04 14:41:36.000000000 +0400
+@@ -78,6 +78,7 @@ int global_phb_number;	/* Global phb co
+ 
+ /* Cached ISA bridge dev. */
+ struct pci_dev *ppc64_isabridge_dev = NULL;
++EXPORT_SYMBOL_GPL(ppc64_isabridge_dev);
+ 
+ static void fixup_broken_pcnet32(struct pci_dev* dev)
+ {
+diff -upr linux-2.6.16.orig/arch/powerpc/kernel/process.c linux-2.6.16-026test015/arch/powerpc/kernel/process.c
+--- linux-2.6.16.orig/arch/powerpc/kernel/process.c	2006-03-20 08:53:29.000000000 +0300
++++ linux-2.6.16-026test015/arch/powerpc/kernel/process.c	2006-07-04 14:41:39.000000000 +0400
+@@ -429,7 +429,7 @@ void show_regs(struct pt_regs * regs)
+ 	       current, current->pid, current->comm, task_thread_info(current));
+ 
+ #ifdef CONFIG_SMP
+-	printk(" CPU: %d", smp_processor_id());
++	printk(" CPU: %d VCPU %d:%d", smp_processor_id(), task_vsched_id(current), task_cpu(current));
+ #endif /* CONFIG_SMP */
+ 
+ 	for (i = 0; i < 32; i++) {
+@@ -774,12 +774,12 @@ static int validate_sp(unsigned long sp,
+ 		return 1;
+ 
+ #ifdef CONFIG_IRQSTACKS
+-	stack_page = (unsigned long) hardirq_ctx[task_cpu(p)];
++	stack_page = (unsigned long) hardirq_ctx[task_pcpu(p)];
+ 	if (sp >= stack_page + sizeof(struct thread_struct)
+ 	    && sp <= stack_page + THREAD_SIZE - nbytes)
+ 		return 1;
+ 
+-	stack_page = (unsigned long) softirq_ctx[task_cpu(p)];
++	stack_page 
= (unsigned long) softirq_ctx[task_pcpu(p)]; + if (sp >= stack_page + sizeof(struct thread_struct) + && sp <= stack_page + THREAD_SIZE - nbytes) + return 1; +@@ -889,6 +889,20 @@ void dump_stack(void) + } + EXPORT_SYMBOL(dump_stack); + ++long kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) ++{ ++ extern long ppc_kernel_thread(int (*fn)(void *), void *arg, ++ unsigned long flags); ++ ++ if (!ve_is_super(get_exec_env())) { ++ printk("kernel_thread call inside VE\n"); ++ dump_stack(); ++ return -EPERM; ++ } ++ ++ return ppc_kernel_thread(fn, arg, flags); ++} ++ + #ifdef CONFIG_PPC64 + void ppc64_runlatch_on(void) + { +diff -upr linux-2.6.16.orig/arch/powerpc/kernel/setup_64.c linux-2.6.16-026test015/arch/powerpc/kernel/setup_64.c +--- linux-2.6.16.orig/arch/powerpc/kernel/setup_64.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/powerpc/kernel/setup_64.c 2006-07-04 14:41:36.000000000 +0400 +@@ -256,12 +256,10 @@ void __init early_setup(unsigned long dt + /* + * Initialize stab / SLB management except on iSeries + */ +- if (!firmware_has_feature(FW_FEATURE_ISERIES)) { +- if (cpu_has_feature(CPU_FTR_SLB)) +- slb_initialize(); +- else +- stab_initialize(lpaca->stab_real); +- } ++ if (cpu_has_feature(CPU_FTR_SLB)) ++ slb_initialize(); ++ else if (!firmware_has_feature(FW_FEATURE_ISERIES)) ++ stab_initialize(lpaca->stab_real); + + DBG(" <- early_setup()\n"); + } +diff -upr linux-2.6.16.orig/arch/powerpc/kernel/signal_32.c linux-2.6.16-026test015/arch/powerpc/kernel/signal_32.c +--- linux-2.6.16.orig/arch/powerpc/kernel/signal_32.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/powerpc/kernel/signal_32.c 2006-07-04 14:41:36.000000000 +0400 +@@ -802,10 +802,13 @@ static int do_setcontext(struct ucontext + if (__get_user(cmcp, &ucp->uc_regs)) + return -EFAULT; + mcp = (struct mcontext __user *)(u64)cmcp; ++ /* no need to check access_ok(mcp), since mcp < 4GB */ + } + #else + if (__get_user(mcp, &ucp->uc_regs)) + 
return -EFAULT; ++ if (!access_ok(VERIFY_READ, mcp, sizeof(*mcp))) ++ return -EFAULT; + #endif + restore_sigmask(&set); + if (restore_user_regs(regs, mcp, sig)) +@@ -907,13 +910,14 @@ int sys_debug_setcontext(struct ucontext + { + struct sig_dbg_op op; + int i; ++ unsigned char tmp; + unsigned long new_msr = regs->msr; + #if defined(CONFIG_4xx) || defined(CONFIG_BOOKE) + unsigned long new_dbcr0 = current->thread.dbcr0; + #endif + + for (i=0; i<ndbg; i++) { +- if (__copy_from_user(&op, dbg, sizeof(op))) ++ if (copy_from_user(&op, dbg + i, sizeof(op))) + return -EFAULT; + switch (op.dbg_type) { + case SIG_DBG_SINGLE_STEPPING: +@@ -958,6 +962,11 @@ int sys_debug_setcontext(struct ucontext + current->thread.dbcr0 = new_dbcr0; + #endif + ++ if (!access_ok(VERIFY_READ, ctx, sizeof(*ctx)) ++ || __get_user(tmp, (u8 __user *) ctx) ++ || __get_user(tmp, (u8 __user *) (ctx + 1) - 1)) ++ return -EFAULT; ++ + /* + * If we get a fault copying the context into the kernel's + * image of the user's registers, we can't just return -EFAULT +diff -upr linux-2.6.16.orig/arch/powerpc/kernel/signal_64.c linux-2.6.16-026test015/arch/powerpc/kernel/signal_64.c +--- linux-2.6.16.orig/arch/powerpc/kernel/signal_64.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/powerpc/kernel/signal_64.c 2006-07-04 14:41:36.000000000 +0400 +@@ -183,6 +183,8 @@ static long restore_sigcontext(struct pt + err |= __get_user(msr, &sc->gp_regs[PT_MSR]); + if (err) + return err; ++ if (v_regs && !access_ok(VERIFY_READ, v_regs, 34 * sizeof(vector128))) ++ return -EFAULT; + /* Copy 33 vec registers (vr0..31 and vscr) from the stack */ + if (v_regs != 0 && (msr & MSR_VEC) != 0) + err |= __copy_from_user(current->thread.vr, v_regs, +@@ -213,7 +215,7 @@ static inline void __user * get_sigframe + /* Default to using normal stack */ + newsp = regs->gpr[1]; + +- if (ka->sa.sa_flags & SA_ONSTACK) { ++ if ((ka->sa.sa_flags & SA_ONSTACK) && current->sas_ss_size) { + if (! 
on_sig_stack(regs->gpr[1])) + newsp = (current->sas_ss_sp + current->sas_ss_size); + } +diff -upr linux-2.6.16.orig/arch/powerpc/kernel/syscalls.c linux-2.6.16-026test015/arch/powerpc/kernel/syscalls.c +--- linux-2.6.16.orig/arch/powerpc/kernel/syscalls.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/powerpc/kernel/syscalls.c 2006-07-04 14:41:38.000000000 +0400 +@@ -259,7 +259,7 @@ long ppc_newuname(struct new_utsname __u + int err = 0; + + down_read(&uts_sem); +- if (copy_to_user(name, &system_utsname, sizeof(*name))) ++ if (copy_to_user(name, &ve_utsname, sizeof(*name))) + err = -EFAULT; + up_read(&uts_sem); + if (!err) +@@ -272,7 +272,7 @@ int sys_uname(struct old_utsname __user + int err = 0; + + down_read(&uts_sem); +- if (copy_to_user(name, &system_utsname, sizeof(*name))) ++ if (copy_to_user(name, &ve_utsname, sizeof(*name))) + err = -EFAULT; + up_read(&uts_sem); + if (!err) +@@ -288,19 +288,19 @@ int sys_olduname(struct oldold_utsname _ + return -EFAULT; + + down_read(&uts_sem); +- error = __copy_to_user(&name->sysname, &system_utsname.sysname, ++ error = __copy_to_user(&name->sysname, &ve_utsname.sysname, + __OLD_UTS_LEN); + error |= __put_user(0, name->sysname + __OLD_UTS_LEN); +- error |= __copy_to_user(&name->nodename, &system_utsname.nodename, ++ error |= __copy_to_user(&name->nodename, &ve_utsname.nodename, + __OLD_UTS_LEN); + error |= __put_user(0, name->nodename + __OLD_UTS_LEN); +- error |= __copy_to_user(&name->release, &system_utsname.release, ++ error |= __copy_to_user(&name->release, &ve_utsname.release, + __OLD_UTS_LEN); + error |= __put_user(0, name->release + __OLD_UTS_LEN); +- error |= __copy_to_user(&name->version, &system_utsname.version, ++ error |= __copy_to_user(&name->version, &ve_utsname.version, + __OLD_UTS_LEN); + error |= __put_user(0, name->version + __OLD_UTS_LEN); +- error |= __copy_to_user(&name->machine, &system_utsname.machine, ++ error |= __copy_to_user(&name->machine, &ve_utsname.machine, + 
__OLD_UTS_LEN); + error |= override_machine(name->machine); + up_read(&uts_sem); +diff -upr linux-2.6.16.orig/arch/powerpc/kernel/systbl.S linux-2.6.16-026test015/arch/powerpc/kernel/systbl.S +--- linux-2.6.16.orig/arch/powerpc/kernel/systbl.S 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/powerpc/kernel/systbl.S 2006-07-04 14:41:37.000000000 +0400 +@@ -322,3 +322,12 @@ SYSCALL(spu_create) + COMPAT_SYS(pselect6) + COMPAT_SYS(ppoll) + SYSCALL(unshare) ++ ++.rept 410 - (. - sys_call_table)/8 ++SYSX(sys_ni_syscall, sys_ni_syscall, sys_ni_syscall) ++.endr ++ ++SYSX(sys_getluid, sys_ni_syscall, sys_getluid) ++SYSX(sys_setluid, sys_ni_syscall, sys_setluid) ++SYSX(sys_setublimit, sys_ni_syscall, sys_setublimit) ++SYSX(sys_ubstat, sys_ni_syscall, sys_ubstat) +diff -upr linux-2.6.16.orig/arch/powerpc/kernel/time.c linux-2.6.16-026test015/arch/powerpc/kernel/time.c +--- linux-2.6.16.orig/arch/powerpc/kernel/time.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/powerpc/kernel/time.c 2006-07-04 14:41:38.000000000 +0400 +@@ -431,12 +431,14 @@ void timer_interrupt(struct pt_regs * re + int next_dec; + int cpu = smp_processor_id(); + unsigned long ticks; ++ struct ve_struct *ve; + + #ifdef CONFIG_PPC32 + if (atomic_read(&ppc_n_lost_interrupts) != 0) + do_IRQ(regs); + #endif + ++ ve = set_exec_env(get_ve0()); + irq_enter(); + + profile_tick(CPU_PROFILING, regs); +@@ -496,6 +498,7 @@ void timer_interrupt(struct pt_regs * re + #endif + + irq_exit(); ++ (void)set_exec_env(ve); + } + + void wakeup_decrementer(void) +diff -upr linux-2.6.16.orig/arch/powerpc/mm/fault.c linux-2.6.16-026test015/arch/powerpc/mm/fault.c +--- linux-2.6.16.orig/arch/powerpc/mm/fault.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/powerpc/mm/fault.c 2006-07-04 14:41:37.000000000 +0400 +@@ -307,7 +307,6 @@ good_area: + * make sure we exit gracefully rather than endlessly redo + * the fault. 
+ */ +- survive: + switch (handle_mm_fault(mm, vma, address, is_write)) { + + case VM_FAULT_MINOR: +@@ -351,14 +350,12 @@ bad_area_nosemaphore: + */ + out_of_memory: + up_read(&mm->mmap_sem); +- if (current->pid == 1) { +- yield(); +- down_read(&mm->mmap_sem); +- goto survive; +- } +- printk("VM: killing process %s\n", current->comm); + if (user_mode(regs)) +- do_exit(SIGKILL); ++ /* ++ * 0-order allocation always success if something really ++ * fatal not happen: beancounter overdraft or OOM. Den ++ */ ++ force_sig(SIGKILL, current); + return SIGKILL; + + do_sigbus: +diff -upr linux-2.6.16.orig/arch/powerpc/mm/init_64.c linux-2.6.16-026test015/arch/powerpc/mm/init_64.c +--- linux-2.6.16.orig/arch/powerpc/mm/init_64.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/powerpc/mm/init_64.c 2006-07-04 14:41:37.000000000 +0400 +@@ -225,7 +225,8 @@ void pgtable_cache_init(void) + pgtable_cache[i] = kmem_cache_create(name, + size, size, + SLAB_HWCACHE_ALIGN | +- SLAB_MUST_HWCACHE_ALIGN, ++ SLAB_MUST_HWCACHE_ALIGN | ++ SLAB_UBC | SLAB_NO_CHARGE, + zero_ctor, + NULL); + if (! 
pgtable_cache[i]) +diff -upr linux-2.6.16.orig/arch/powerpc/mm/mem.c linux-2.6.16-026test015/arch/powerpc/mm/mem.c +--- linux-2.6.16.orig/arch/powerpc/mm/mem.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/powerpc/mm/mem.c 2006-07-04 14:41:38.000000000 +0400 +@@ -222,6 +222,7 @@ void show_mem(void) + printk("%ld pages shared\n", shared); + printk("%ld pages swap cached\n", cached); + } ++EXPORT_SYMBOL(show_mem); + + /* + * Initialize the bootmem system and give it all the memory we +diff -upr linux-2.6.16.orig/arch/powerpc/mm/pgtable_32.c linux-2.6.16-026test015/arch/powerpc/mm/pgtable_32.c +--- linux-2.6.16.orig/arch/powerpc/mm/pgtable_32.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/powerpc/mm/pgtable_32.c 2006-07-04 14:41:37.000000000 +0400 +@@ -85,7 +85,8 @@ pgd_t *pgd_alloc(struct mm_struct *mm) + { + pgd_t *ret; + +- ret = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, PGDIR_ORDER); ++ ret = (pgd_t *)__get_free_pages(GFP_KERNEL_UBC | __GFP_SOFT_UBC | ++ __GFP_ZERO, PGDIR_ORDER); + return ret; + } + +@@ -119,6 +120,7 @@ struct page *pte_alloc_one(struct mm_str + #else + gfp_t flags = GFP_KERNEL | __GFP_REPEAT; + #endif ++ flags |= (__GFP_UBC | __GFP_SOFT_UBC); + + ptepage = alloc_pages(flags, 0); + if (ptepage) +diff -upr linux-2.6.16.orig/arch/powerpc/platforms/powermac/setup.c linux-2.6.16-026test015/arch/powerpc/platforms/powermac/setup.c +--- linux-2.6.16.orig/arch/powerpc/platforms/powermac/setup.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/powerpc/platforms/powermac/setup.c 2006-07-04 14:41:36.000000000 +0400 +@@ -456,11 +456,23 @@ static int pmac_pm_finish(suspend_state_ + return 0; + } + ++static int pmac_pm_valid(suspend_state_t state) ++{ ++ switch (state) { ++ case PM_SUSPEND_DISK: ++ return 1; ++ /* can't do any other states via generic mechanism yet */ ++ default: ++ return 0; ++ } ++} ++ + static struct pm_ops pmac_pm_ops = { + .pm_disk_mode = PM_DISK_SHUTDOWN, + 
.prepare = pmac_pm_prepare, + .enter = pmac_pm_enter, + .finish = pmac_pm_finish, ++ .valid = pmac_pm_valid, + }; + + #endif /* CONFIG_SOFTWARE_SUSPEND */ +diff -upr linux-2.6.16.orig/arch/ppc/Kconfig linux-2.6.16-026test015/arch/ppc/Kconfig +--- linux-2.6.16.orig/arch/ppc/Kconfig 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/ppc/Kconfig 2006-07-04 14:41:39.000000000 +0400 +@@ -920,6 +920,7 @@ config NR_CPUS + config HIGHMEM + bool "High memory support" + ++source "kernel/Kconfig.fairsched" + source kernel/Kconfig.hz + source kernel/Kconfig.preempt + source "mm/Kconfig" +@@ -1394,6 +1395,10 @@ source "arch/powerpc/oprofile/Kconfig" + + source "arch/ppc/Kconfig.debug" + ++source "kernel/Kconfig.openvz" ++ + source "security/Kconfig" + ++source "kernel/ub/Kconfig" ++ + source "crypto/Kconfig" +diff -upr linux-2.6.16.orig/arch/ppc/kernel/misc.S linux-2.6.16-026test015/arch/ppc/kernel/misc.S +--- linux-2.6.16.orig/arch/ppc/kernel/misc.S 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/ppc/kernel/misc.S 2006-07-04 14:41:37.000000000 +0400 +@@ -1004,7 +1004,7 @@ _GLOBAL(_get_SP) + * Create a kernel thread + * kernel_thread(fn, arg, flags) + */ +-_GLOBAL(kernel_thread) ++_GLOBAL(ppc_kernel_thread) + stwu r1,-16(r1) + stw r30,8(r1) + stw r31,12(r1) +diff -upr linux-2.6.16.orig/arch/ppc/kernel/time.c linux-2.6.16-026test015/arch/ppc/kernel/time.c +--- linux-2.6.16.orig/arch/ppc/kernel/time.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/ppc/kernel/time.c 2006-07-04 14:41:38.000000000 +0400 +@@ -58,6 +58,8 @@ + #include <linux/init.h> + #include <linux/profile.h> + ++#include <ub/beancounter.h> ++ + #include <asm/io.h> + #include <asm/nvram.h> + #include <asm/cache.h> +@@ -136,10 +138,14 @@ void timer_interrupt(struct pt_regs * re + unsigned long cpu = smp_processor_id(); + unsigned jiffy_stamp = last_jiffy_stamp(cpu); + extern void do_IRQ(struct pt_regs *); ++ struct ve_struct *ve; ++ struct 
user_beancounter *ub; + + if (atomic_read(&ppc_n_lost_interrupts) != 0) + do_IRQ(regs); + ++ ve = set_exec_env(get_ve0()); ++ ub = set_exec_ub(get_ub0()); + irq_enter(); + + while ((next_dec = tb_ticks_per_jiffy - tb_delta(&jiffy_stamp)) <= 0) { +@@ -192,6 +198,8 @@ void timer_interrupt(struct pt_regs * re + ppc_md.heartbeat(); + + irq_exit(); ++ (void)set_exec_ub(ub); ++ (void)set_exec_env(ve); + } + + /* +diff -upr linux-2.6.16.orig/arch/ppc/mm/fault.c linux-2.6.16-026test015/arch/ppc/mm/fault.c +--- linux-2.6.16.orig/arch/ppc/mm/fault.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/ppc/mm/fault.c 2006-07-04 14:41:37.000000000 +0400 +@@ -247,7 +247,6 @@ good_area: + * make sure we exit gracefully rather than endlessly redo + * the fault. + */ +- survive: + switch (handle_mm_fault(mm, vma, address, is_write)) { + case VM_FAULT_MINOR: + current->min_flt++; +@@ -290,14 +289,12 @@ bad_area: + */ + out_of_memory: + up_read(&mm->mmap_sem); +- if (current->pid == 1) { +- yield(); +- down_read(&mm->mmap_sem); +- goto survive; +- } +- printk("VM: killing process %s\n", current->comm); + if (user_mode(regs)) +- do_exit(SIGKILL); ++ /* ++ * 0-order allocation always success if something really ++ * fatal not happen: beancounter overdraft or OOM. 
Den ++ */ ++ force_sig(SIGKILL, current); + return SIGKILL; + + do_sigbus: +diff -upr linux-2.6.16.orig/arch/ppc/mm/init.c linux-2.6.16-026test015/arch/ppc/mm/init.c +--- linux-2.6.16.orig/arch/ppc/mm/init.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/ppc/mm/init.c 2006-07-04 14:41:38.000000000 +0400 +@@ -132,6 +132,7 @@ void show_mem(void) + printk("%d pages shared\n",shared); + printk("%d pages swap cached\n",cached); + } ++EXPORT_SYMBOL(show_mem); + + /* Free up now-unused memory */ + static void free_sec(unsigned long start, unsigned long end, const char *name) +diff -upr linux-2.6.16.orig/arch/ppc/mm/pgtable.c linux-2.6.16-026test015/arch/ppc/mm/pgtable.c +--- linux-2.6.16.orig/arch/ppc/mm/pgtable.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/ppc/mm/pgtable.c 2006-07-04 14:41:37.000000000 +0400 +@@ -84,7 +84,8 @@ pgd_t *pgd_alloc(struct mm_struct *mm) + { + pgd_t *ret; + +- ret = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, PGDIR_ORDER); ++ ret = (pgd_t *)__get_free_pages(GFP_KERNEL_UBC | __GFP_SOFT_UBC | ++ __GFP_ZERO, PGDIR_ORDER); + return ret; + } + +@@ -118,6 +119,7 @@ struct page *pte_alloc_one(struct mm_str + #else + gfp_t flags = GFP_KERNEL | __GFP_REPEAT; + #endif ++ flags |= (__GFP_UBC | __GFP_SOFT_UBC); + + ptepage = alloc_pages(flags, 0); + if (ptepage) +diff -upr linux-2.6.16.orig/arch/s390/Kconfig linux-2.6.16-026test015/arch/s390/Kconfig +--- linux-2.6.16.orig/arch/s390/Kconfig 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/s390/Kconfig 2006-07-04 14:41:37.000000000 +0400 +@@ -472,8 +472,12 @@ source "arch/s390/oprofile/Kconfig" + + source "arch/s390/Kconfig.debug" + ++source "kernel/Kconfig.openvz" ++ + source "security/Kconfig" + + source "crypto/Kconfig" + + source "lib/Kconfig" ++ ++source "kernel/ub/Kconfig" +diff -upr linux-2.6.16.orig/arch/s390/kernel/process.c linux-2.6.16-026test015/arch/s390/kernel/process.c +--- linux-2.6.16.orig/arch/s390/kernel/process.c 
2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/s390/kernel/process.c 2006-07-04 14:41:38.000000000 +0400 +@@ -164,9 +164,10 @@ void show_regs(struct pt_regs *regs) + struct task_struct *tsk = current; + + printk("CPU: %d %s\n", task_thread_info(tsk)->cpu, print_tainted()); +- printk("Process %s (pid: %d, task: %p, ksp: %p)\n", +- current->comm, current->pid, (void *) tsk, +- (void *) tsk->thread.ksp); ++ printk("Process %s (pid: %d, veid: %d, task: %p, ksp: %p)\n", ++ current->comm, current->pid, ++ VEID(VE_TASK_INFO(current)->owner_env), ++ (void *) tsk, (void *) tsk->thread.ksp); + + show_registers(regs); + /* Show stack backtrace if pt_regs is from kernel mode */ +@@ -187,6 +188,13 @@ int kernel_thread(int (*fn)(void *), voi + { + struct pt_regs regs; + ++ if (!ve_is_super(get_exec_env())) { ++ /* Don't allow kernel_thread() inside VE */ ++ printk("kernel_thread call inside VE\n"); ++ dump_stack(); ++ return -EPERM; ++ } ++ + memset(®s, 0, sizeof(regs)); + regs.psw.mask = PSW_KERNEL_BITS | PSW_MASK_IO | PSW_MASK_EXT; + regs.psw.addr = (unsigned long) kernel_thread_starter | PSW_ADDR_AMODE; +diff -upr linux-2.6.16.orig/arch/s390/kernel/s390_ext.c linux-2.6.16-026test015/arch/s390/kernel/s390_ext.c +--- linux-2.6.16.orig/arch/s390/kernel/s390_ext.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/s390/kernel/s390_ext.c 2006-07-04 14:41:38.000000000 +0400 +@@ -114,7 +114,9 @@ void do_extint(struct pt_regs *regs, uns + { + ext_int_info_t *p; + int index; ++ struct ve_struct *envid; + ++ envid = set_exec_env(get_ve0()); + irq_enter(); + asm volatile ("mc 0,0"); + if (S390_lowcore.int_clock >= S390_lowcore.jiffy_timer) +@@ -132,6 +134,7 @@ void do_extint(struct pt_regs *regs, uns + } + } + irq_exit(); ++ (void)set_exec_env(envid); + } + + EXPORT_SYMBOL(register_external_interrupt); +diff -upr linux-2.6.16.orig/arch/s390/kernel/smp.c linux-2.6.16-026test015/arch/s390/kernel/smp.c +--- linux-2.6.16.orig/arch/s390/kernel/smp.c 
2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/s390/kernel/smp.c 2006-07-04 14:41:38.000000000 +0400 +@@ -526,6 +526,17 @@ int __devinit start_secondary(void *cpuv + { + /* Setup the cpu */ + cpu_init(); ++ ++#ifdef CONFIG_VE ++ /* TSC reset. kill whatever might rely on old values */ ++ VE_TASK_INFO(current)->wakeup_stamp = 0; ++ /* ++ * Cosmetic: sleep_time won't be changed afterwards for the idle ++ * thread; keep it 0 rather than -cycles. ++ */ ++ VE_TASK_INFO(idle)->sleep_time = 0; ++#endif ++ + preempt_disable(); + /* init per CPU timer */ + init_cpu_timer(); +@@ -834,6 +845,11 @@ void __init smp_prepare_cpus(unsigned in + for_each_cpu(cpu) + if (cpu != smp_processor_id()) + smp_create_idle(cpu); ++ ++#ifdef CONFIG_VE ++ /* TSC reset. kill whatever might rely on old values */ ++ VE_TASK_INFO(current)->wakeup_stamp = 0; ++#endif + } + + void __devinit smp_prepare_boot_cpu(void) +diff -upr linux-2.6.16.orig/arch/s390/kernel/syscalls.S linux-2.6.16-026test015/arch/s390/kernel/syscalls.S +--- linux-2.6.16.orig/arch/s390/kernel/syscalls.S 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/s390/kernel/syscalls.S 2006-07-04 14:41:37.000000000 +0400 +@@ -312,3 +312,12 @@ SYSCALL(sys_faccessat,sys_faccessat,sys_ + SYSCALL(sys_pselect6,sys_pselect6,compat_sys_pselect6_wrapper) + SYSCALL(sys_ppoll,sys_ppoll,compat_sys_ppoll_wrapper) + SYSCALL(sys_unshare,sys_unshare,sys_unshare_wrapper) ++ ++.rept 410-(.-sys_call_table)/4 ++ NI_SYSCALL ++.endr ++ ++SYSCALL(sys_getluid, sys_getluid, sys_ni_syscall) /* 410 */ ++SYSCALL(sys_setluid, sys_setluid, sys_ni_syscall) ++SYSCALL(sys_setublimit, sys_setublimit, sys_ni_syscall) ++SYSCALL(sys_ubstat, sys_ubstat, sys_ni_syscall) +diff -upr linux-2.6.16.orig/arch/s390/mm/fault.c linux-2.6.16-026test015/arch/s390/mm/fault.c +--- linux-2.6.16.orig/arch/s390/mm/fault.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/s390/mm/fault.c 2006-07-04 14:41:37.000000000 +0400 +@@ 
-61,17 +61,9 @@ void bust_spinlocks(int yes) + if (yes) { + oops_in_progress = 1; + } else { +- int loglevel_save = console_loglevel; + console_unblank(); + oops_in_progress = 0; +- /* +- * OK, the message is on the console. Now we call printk() +- * without oops_in_progress set so that printk will give klogd +- * a poke. Hold onto your hats... +- */ +- console_loglevel = 15; +- printk(" "); +- console_loglevel = loglevel_save; ++ wake_up_klogd(); + } + } + +diff -upr linux-2.6.16.orig/arch/s390/mm/init.c linux-2.6.16-026test015/arch/s390/mm/init.c +--- linux-2.6.16.orig/arch/s390/mm/init.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/s390/mm/init.c 2006-07-04 14:41:38.000000000 +0400 +@@ -89,6 +89,7 @@ void show_mem(void) + printk("%d pages shared\n",shared); + printk("%d pages swap cached\n",cached); + } ++EXPORT_SYMBOL(show_mem); + + /* References to section boundaries */ + +diff -upr linux-2.6.16.orig/arch/sh/kernel/kgdb_stub.c linux-2.6.16-026test015/arch/sh/kernel/kgdb_stub.c +--- linux-2.6.16.orig/arch/sh/kernel/kgdb_stub.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/sh/kernel/kgdb_stub.c 2006-07-04 14:41:38.000000000 +0400 +@@ -412,7 +412,7 @@ static struct task_struct *get_thread(in + if (pid == PID_MAX) pid = 0; + + /* First check via PID */ +- thread = find_task_by_pid(pid); ++ thread = find_task_by_pid_all(pid); + + if (thread) + return thread; +diff -upr linux-2.6.16.orig/arch/sh64/kernel/process.c linux-2.6.16-026test015/arch/sh64/kernel/process.c +--- linux-2.6.16.orig/arch/sh64/kernel/process.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/sh64/kernel/process.c 2006-07-04 14:41:38.000000000 +0400 +@@ -906,7 +906,7 @@ asids_proc_info(char *buf, char **start, + int len=0; + struct task_struct *p; + read_lock(&tasklist_lock); +- for_each_process(p) { ++ for_each_process_ve(p) { + int pid = p->pid; + struct mm_struct *mm; + if (!pid) continue; +diff -upr 
linux-2.6.16.orig/arch/sparc64/kernel/pci_iommu.c linux-2.6.16-026test015/arch/sparc64/kernel/pci_iommu.c +--- linux-2.6.16.orig/arch/sparc64/kernel/pci_iommu.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/sparc64/kernel/pci_iommu.c 2006-07-04 14:41:36.000000000 +0400 +@@ -219,7 +219,7 @@ static inline void iommu_free_ctx(struct + * DMA for PCI device PDEV. Return non-NULL cpu-side address if + * successful and set *DMA_ADDRP to the PCI side dma address. + */ +-void *pci_alloc_consistent(struct pci_dev *pdev, size_t size, dma_addr_t *dma_addrp) ++void *__pci_alloc_consistent(struct pci_dev *pdev, size_t size, dma_addr_t *dma_addrp, gfp_t gfp) + { + struct pcidev_cookie *pcp; + struct pci_iommu *iommu; +@@ -233,7 +233,7 @@ void *pci_alloc_consistent(struct pci_de + if (order >= 10) + return NULL; + +- first_page = __get_free_pages(GFP_ATOMIC, order); ++ first_page = __get_free_pages(gfp, order); + if (first_page == 0UL) + return NULL; + memset((char *)first_page, 0, PAGE_SIZE << order); +diff -upr linux-2.6.16.orig/arch/sparc64/kernel/setup.c linux-2.6.16-026test015/arch/sparc64/kernel/setup.c +--- linux-2.6.16.orig/arch/sparc64/kernel/setup.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/sparc64/kernel/setup.c 2006-07-04 14:41:38.000000000 +0400 +@@ -156,7 +156,7 @@ int prom_callback(long *args) + pte_t *ptep; + pte_t pte; + +- for_each_process(p) { ++ for_each_process_all(p) { + mm = p->mm; + if (CTX_NRBITS(mm->context) == ctx) + break; +diff -upr linux-2.6.16.orig/arch/sparc64/kernel/sparc64_ksyms.c linux-2.6.16-026test015/arch/sparc64/kernel/sparc64_ksyms.c +--- linux-2.6.16.orig/arch/sparc64/kernel/sparc64_ksyms.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/sparc64/kernel/sparc64_ksyms.c 2006-07-04 14:41:36.000000000 +0400 +@@ -221,7 +221,7 @@ EXPORT_SYMBOL(insl); + EXPORT_SYMBOL(ebus_chain); + EXPORT_SYMBOL(isa_chain); + EXPORT_SYMBOL(pci_memspace_mask); 
+-EXPORT_SYMBOL(pci_alloc_consistent); ++EXPORT_SYMBOL(__pci_alloc_consistent); + EXPORT_SYMBOL(pci_free_consistent); + EXPORT_SYMBOL(pci_map_single); + EXPORT_SYMBOL(pci_unmap_single); +diff -upr linux-2.6.16.orig/arch/sparc64/lib/checksum.S linux-2.6.16-026test015/arch/sparc64/lib/checksum.S +--- linux-2.6.16.orig/arch/sparc64/lib/checksum.S 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/sparc64/lib/checksum.S 2006-07-04 14:41:36.000000000 +0400 +@@ -165,8 +165,9 @@ csum_partial_end_cruft: + sll %g1, 8, %g1 + or %o5, %g1, %o4 + +-1: add %o2, %o4, %o2 ++1: addcc %o2, %o4, %o2 ++ addc %g0, %o2, %o2 + + csum_partial_finish: + retl +- mov %o2, %o0 ++ srl %o2, 0, %o0 +diff -upr linux-2.6.16.orig/arch/sparc64/lib/csum_copy.S linux-2.6.16-026test015/arch/sparc64/lib/csum_copy.S +--- linux-2.6.16.orig/arch/sparc64/lib/csum_copy.S 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/sparc64/lib/csum_copy.S 2006-07-04 14:41:36.000000000 +0400 +@@ -221,11 +221,12 @@ FUNC_NAME: /* %o0=src, %o1=dst, %o2=len + sll %g1, 8, %g1 + or %o5, %g1, %o4 + +-1: add %o3, %o4, %o3 ++1: addcc %o3, %o4, %o3 ++ addc %g0, %o3, %o3 + + 70: + retl +- mov %o3, %o0 ++ srl %o3, 0, %o0 + + 95: mov 0, GLOBAL_SPARE + brlez,pn %o2, 4f +diff -upr linux-2.6.16.orig/arch/um/drivers/mconsole_kern.c linux-2.6.16-026test015/arch/um/drivers/mconsole_kern.c +--- linux-2.6.16.orig/arch/um/drivers/mconsole_kern.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/um/drivers/mconsole_kern.c 2006-07-04 14:41:38.000000000 +0400 +@@ -600,7 +600,7 @@ static void do_stack_trace(struct mc_req + + from = current; + +- to = find_task_by_pid(pid_requested); ++ to = find_task_by_pid_all(pid_requested); + if((to == NULL) || (pid_requested == 0)) { + mconsole_reply(req, "Couldn't find that pid", 1, 0); + return; +diff -upr linux-2.6.16.orig/arch/um/kernel/skas/process_kern.c linux-2.6.16-026test015/arch/um/kernel/skas/process_kern.c +--- 
linux-2.6.16.orig/arch/um/kernel/skas/process_kern.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/um/kernel/skas/process_kern.c 2006-07-04 14:41:38.000000000 +0400 +@@ -197,7 +197,7 @@ void kill_off_processes_skas(void) + int pid, me; + + me = os_getpid(); +- for_each_process(p){ ++ for_each_process_all(p){ + if(p->mm == NULL) + continue; + +diff -upr linux-2.6.16.orig/arch/um/kernel/tt/process_kern.c linux-2.6.16-026test015/arch/um/kernel/tt/process_kern.c +--- linux-2.6.16.orig/arch/um/kernel/tt/process_kern.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/um/kernel/tt/process_kern.c 2006-07-04 14:41:38.000000000 +0400 +@@ -301,7 +301,7 @@ void kill_off_processes_tt(void) + int me; + + me = os_getpid(); +- for_each_process(p){ ++ for_each_process_all(p){ + if(p->thread.mode.tt.extern_pid != me) + os_kill_process(p->thread.mode.tt.extern_pid, 0); + } +@@ -444,7 +444,7 @@ int is_valid_pid(int pid) + struct task_struct *task; + + read_lock(&tasklist_lock); +- for_each_process(task){ ++ for_each_process_all(task){ + if(task->thread.mode.tt.extern_pid == pid){ + read_unlock(&tasklist_lock); + return(1); +diff -upr linux-2.6.16.orig/arch/x86_64/Kconfig linux-2.6.16-026test015/arch/x86_64/Kconfig +--- linux-2.6.16.orig/arch/x86_64/Kconfig 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/x86_64/Kconfig 2006-07-04 14:41:39.000000000 +0400 +@@ -246,6 +246,8 @@ config SCHED_SMT + cost of slightly increased overhead in some places. If unsure say + N here. 
+ ++source "kernel/Kconfig.fairsched" ++ + source "kernel/Kconfig.preempt" + + config NUMA +@@ -588,8 +590,12 @@ endmenu + + source "arch/x86_64/Kconfig.debug" + ++source "kernel/Kconfig.openvz" ++ + source "security/Kconfig" + + source "crypto/Kconfig" + + source "lib/Kconfig" ++ ++source "kernel/ub/Kconfig" +diff -upr linux-2.6.16.orig/arch/x86_64/boot/compressed/head.S linux-2.6.16-026test015/arch/x86_64/boot/compressed/head.S +--- linux-2.6.16.orig/arch/x86_64/boot/compressed/head.S 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/x86_64/boot/compressed/head.S 2006-07-04 14:41:39.000000000 +0400 +@@ -34,7 +34,7 @@ + startup_32: + cld + cli +- movl $(__KERNEL_DS),%eax ++ movl $(__BOOT_DS),%eax + movl %eax,%ds + movl %eax,%es + movl %eax,%fs +@@ -76,7 +76,7 @@ startup_32: + jnz 3f + addl $8,%esp + xorl %ebx,%ebx +- ljmp $(__KERNEL_CS), $__PHYSICAL_START ++ ljmp $(__BOOT_CS), $__PHYSICAL_START + + /* + * We come here, if we were loaded high. +@@ -104,7 +104,7 @@ startup_32: + popl %eax # hcount + movl $__PHYSICAL_START,%edi + cli # make sure we don't get interrupted +- ljmp $(__KERNEL_CS), $0x1000 # and jump to the move routine ++ ljmp $(__BOOT_CS), $0x1000 # and jump to the move routine + + /* + * Routine (template) for moving the decompressed kernel in place, +@@ -127,7 +127,7 @@ move_routine_start: + movsl + movl %ebx,%esi # Restore setup pointer + xorl %ebx,%ebx +- ljmp $(__KERNEL_CS), $__PHYSICAL_START ++ ljmp $(__BOOT_CS), $__PHYSICAL_START + move_routine_end: + + +@@ -137,5 +137,5 @@ user_stack: + .fill 4096,4,0 + stack_start: + .long user_stack+4096 +- .word __KERNEL_DS ++ .word __BOOT_DS + +diff -upr linux-2.6.16.orig/arch/x86_64/boot/setup.S linux-2.6.16-026test015/arch/x86_64/boot/setup.S +--- linux-2.6.16.orig/arch/x86_64/boot/setup.S 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/x86_64/boot/setup.S 2006-07-04 14:41:39.000000000 +0400 +@@ -729,7 +729,7 @@ flush_instr: + subw $DELTA_INITSEG, %si + shll $4, 
%esi # Convert to 32-bit pointer + # NOTE: For high loaded big kernels we need a +-# jmpi 0x100000,__KERNEL_CS ++# jmpi 0x100000,__BOOT_CS + # + # but we yet haven't reloaded the CS register, so the default size + # of the target offset still is 16 bit. +@@ -740,7 +740,7 @@ flush_instr: + .byte 0x66, 0xea # prefix + jmpi-opcode + code32: .long 0x1000 # will be set to 0x100000 + # for big kernels +- .word __KERNEL_CS ++ .word __BOOT_CS + + # Here's a bunch of information about your current kernel.. + kernel_version: .ascii UTS_RELEASE +diff -upr linux-2.6.16.orig/arch/x86_64/ia32/Makefile linux-2.6.16-026test015/arch/x86_64/ia32/Makefile +--- linux-2.6.16.orig/arch/x86_64/ia32/Makefile 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/x86_64/ia32/Makefile 2006-07-04 14:41:36.000000000 +0400 +@@ -27,5 +27,5 @@ $(obj)/vsyscall-sysenter.so $(obj)/vsysc + $(obj)/vsyscall-%.so: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE + $(call if_changed,syscall) + +-AFLAGS_vsyscall-sysenter.o = -m32 +-AFLAGS_vsyscall-syscall.o = -m32 ++AFLAGS_vsyscall-sysenter.o = -m32 -Wa,-32 ++AFLAGS_vsyscall-syscall.o = -m32 -Wa,-32 +diff -upr linux-2.6.16.orig/arch/x86_64/ia32/ia32_aout.c linux-2.6.16-026test015/arch/x86_64/ia32/ia32_aout.c +--- linux-2.6.16.orig/arch/x86_64/ia32/ia32_aout.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/x86_64/ia32/ia32_aout.c 2006-07-04 14:41:38.000000000 +0400 +@@ -347,14 +347,14 @@ static int load_aout_binary(struct linux + if ((ex.a_text & 0xfff || ex.a_data & 0xfff) && + (N_MAGIC(ex) != NMAGIC) && (jiffies-error_time2) > 5*HZ) + { +- printk(KERN_NOTICE "executable not page aligned\n"); ++ ve_printk(VE_LOG, KERN_NOTICE "executable not page aligned\n"); + error_time2 = jiffies; + } + + if ((fd_offset & ~PAGE_MASK) != 0 && + (jiffies-error_time) > 5*HZ) + { +- printk(KERN_WARNING ++ ve_printk(VE_LOG, KERN_WARNING + "fd_offset is not page aligned. 
Please convert program: %s\n", + bprm->file->f_dentry->d_name.name); + error_time = jiffies; +@@ -467,7 +467,7 @@ static int load_aout_library(struct file + static unsigned long error_time; + if ((jiffies-error_time) > 5*HZ) + { +- printk(KERN_WARNING ++ ve_printk(VE_LOG, KERN_WARNING + "N_TXTOFF is not page aligned. Please convert library: %s\n", + file->f_dentry->d_name.name); + error_time = jiffies; +diff -upr linux-2.6.16.orig/arch/x86_64/ia32/ia32_binfmt.c linux-2.6.16-026test015/arch/x86_64/ia32/ia32_binfmt.c +--- linux-2.6.16.orig/arch/x86_64/ia32/ia32_binfmt.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/x86_64/ia32/ia32_binfmt.c 2006-07-04 14:41:39.000000000 +0400 +@@ -27,12 +27,14 @@ + #include <asm/ia32.h> + #include <asm/vsyscall32.h> + ++#include <ub/ub_vmpages.h> ++ + #define ELF_NAME "elf/i386" + + #define AT_SYSINFO 32 + #define AT_SYSINFO_EHDR 33 + +-int sysctl_vsyscall32 = 1; ++int sysctl_vsyscall32 = 0; + + #define ARCH_DLINFO do { \ + if (sysctl_vsyscall32) { \ +@@ -347,9 +349,15 @@ int ia32_setup_arg_pages(struct linux_bi + bprm->loader += stack_base; + bprm->exec += stack_base; + ++ ret = -ENOMEM; ++ if (ub_memory_charge(mm, IA32_STACK_TOP - ++ (PAGE_MASK & (unsigned long)bprm->p), ++ VM_STACK_FLAGS, NULL, UB_SOFT)) ++ goto err_charge; ++ + mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!mpnt) +- return -ENOMEM; ++ goto err_alloc; + + memset(mpnt, 0, sizeof(*mpnt)); + +@@ -366,11 +374,8 @@ int ia32_setup_arg_pages(struct linux_bi + mpnt->vm_flags = VM_STACK_FLAGS; + mpnt->vm_page_prot = (mpnt->vm_flags & VM_EXEC) ? 
+ PAGE_COPY_EXEC : PAGE_COPY; +- if ((ret = insert_vm_struct(mm, mpnt))) { +- up_write(&mm->mmap_sem); +- kmem_cache_free(vm_area_cachep, mpnt); +- return ret; +- } ++ if ((ret = insert_vm_struct(mm, mpnt))) ++ goto err_insert; + mm->stack_vm = mm->total_vm = vma_pages(mpnt); + } + +@@ -385,6 +390,16 @@ int ia32_setup_arg_pages(struct linux_bi + up_write(&mm->mmap_sem); + + return 0; ++ ++err_insert: ++ up_write(&mm->mmap_sem); ++ kmem_cache_free(vm_area_cachep, mpnt); ++err_alloc: ++ ub_memory_uncharge(mm, IA32_STACK_TOP - ++ (PAGE_MASK & (unsigned long)bprm->p), ++ VM_STACK_FLAGS, NULL); ++err_charge: ++ return ret; + } + EXPORT_SYMBOL(ia32_setup_arg_pages); + +diff -upr linux-2.6.16.orig/arch/x86_64/ia32/ia32_signal.c linux-2.6.16-026test015/arch/x86_64/ia32/ia32_signal.c +--- linux-2.6.16.orig/arch/x86_64/ia32/ia32_signal.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/x86_64/ia32/ia32_signal.c 2006-07-04 14:41:39.000000000 +0400 +@@ -39,7 +39,6 @@ + + #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) + +-asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset); + void signal_fault(struct pt_regs *regs, void __user *frame, char *where); + + int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from) +@@ -118,22 +117,17 @@ asmlinkage long + sys32_sigsuspend(int history0, int history1, old_sigset_t mask, + struct pt_regs *regs) + { +- sigset_t saveset; +- + mask &= _BLOCKABLE; + spin_lock_irq(¤t->sighand->siglock); +- saveset = current->blocked; ++ current->saved_sigmask = current->blocked; + siginitset(¤t->blocked, mask); + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); + +- regs->rax = -EINTR; +- while (1) { +- current->state = TASK_INTERRUPTIBLE; +- schedule(); +- if (do_signal(regs, &saveset)) +- return -EINTR; +- } ++ current->state = TASK_INTERRUPTIBLE; ++ schedule(); ++ set_thread_flag(TIF_RESTORE_SIGMASK); ++ return -ERESTARTNOHAND; + } + + asmlinkage long +@@ -510,11 +504,11 @@ int 
ia32_setup_frame(int sig, struct k_s + current->comm, current->pid, frame, regs->rip, frame->pretcode); + #endif + +- return 1; ++ return 0; + + give_sigsegv: + force_sigsegv(sig, current); +- return 0; ++ return -EFAULT; + } + + int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, +@@ -606,9 +600,9 @@ int ia32_setup_rt_frame(int sig, struct + current->comm, current->pid, frame, regs->rip, frame->pretcode); + #endif + +- return 1; ++ return 0; + + give_sigsegv: + force_sigsegv(sig, current); +- return 0; ++ return -EFAULT; + } +diff -upr linux-2.6.16.orig/arch/x86_64/ia32/sys_ia32.c linux-2.6.16-026test015/arch/x86_64/ia32/sys_ia32.c +--- linux-2.6.16.orig/arch/x86_64/ia32/sys_ia32.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/x86_64/ia32/sys_ia32.c 2006-07-04 14:41:38.000000000 +0400 +@@ -527,7 +527,7 @@ int sys32_ni_syscall(int call) + static char lastcomm[sizeof(me->comm)]; + + if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) { +- printk(KERN_INFO "IA32 syscall %d from %s not implemented\n", ++ ve_printk(VE_LOG, KERN_INFO "IA32 syscall %d from %s not implemented\n", + call, me->comm); + strncpy(lastcomm, me->comm, sizeof(lastcomm)); + } +@@ -890,13 +890,13 @@ asmlinkage long sys32_olduname(struct ol + + down_read(&uts_sem); + +- error = __copy_to_user(&name->sysname,&system_utsname.sysname,__OLD_UTS_LEN); ++ error = __copy_to_user(&name->sysname,&ve_utsname.sysname,__OLD_UTS_LEN); + __put_user(0,name->sysname+__OLD_UTS_LEN); +- __copy_to_user(&name->nodename,&system_utsname.nodename,__OLD_UTS_LEN); ++ __copy_to_user(&name->nodename,&ve_utsname.nodename,__OLD_UTS_LEN); + __put_user(0,name->nodename+__OLD_UTS_LEN); +- __copy_to_user(&name->release,&system_utsname.release,__OLD_UTS_LEN); ++ __copy_to_user(&name->release,&ve_utsname.release,__OLD_UTS_LEN); + __put_user(0,name->release+__OLD_UTS_LEN); +- __copy_to_user(&name->version,&system_utsname.version,__OLD_UTS_LEN); ++ 
__copy_to_user(&name->version,&ve_utsname.version,__OLD_UTS_LEN); + __put_user(0,name->version+__OLD_UTS_LEN); + { + char *arch = "x86_64"; +@@ -919,7 +919,7 @@ long sys32_uname(struct old_utsname __us + if (!name) + return -EFAULT; + down_read(&uts_sem); +- err=copy_to_user(name, &system_utsname, sizeof (*name)); ++ err=copy_to_user(name, &ve_utsname, sizeof (*name)); + up_read(&uts_sem); + if (personality(current->personality) == PER_LINUX32) + err |= copy_to_user(&name->machine, "i686", 5); +@@ -1005,7 +1005,7 @@ long sys32_vm86_warning(void) + struct task_struct *me = current; + static char lastcomm[sizeof(me->comm)]; + if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) { +- printk(KERN_INFO "%s: vm86 mode not supported on 64 bit kernel\n", ++ ve_printk(VE_LOG, KERN_INFO "%s: vm86 mode not supported on 64 bit kernel\n", + me->comm); + strncpy(lastcomm, me->comm, sizeof(lastcomm)); + } +diff -upr linux-2.6.16.orig/arch/x86_64/ia32/syscall32.c linux-2.6.16-026test015/arch/x86_64/ia32/syscall32.c +--- linux-2.6.16.orig/arch/x86_64/ia32/syscall32.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/x86_64/ia32/syscall32.c 2006-07-04 14:41:37.000000000 +0400 +@@ -14,6 +14,8 @@ + #include <asm/tlbflush.h> + #include <asm/ia32_unistd.h> + ++#include <ub/ub_vmpages.h> ++ + extern unsigned char syscall32_syscall[], syscall32_syscall_end[]; + extern unsigned char syscall32_sysenter[], syscall32_sysenter_end[]; + extern int sysctl_vsyscall32; +@@ -47,32 +49,45 @@ int syscall32_setup_pages(struct linux_b + int npages = (VSYSCALL32_END - VSYSCALL32_BASE) >> PAGE_SHIFT; + struct vm_area_struct *vma; + struct mm_struct *mm = current->mm; ++ unsigned long flags; + int ret; + ++ flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC | VM_MAYWRITE | ++ mm->def_flags; ++ ++ ret = -ENOMEM; ++ if (ub_memory_charge(mm, VSYSCALL32_END - VSYSCALL32_BASE, ++ flags, NULL, UB_SOFT)) ++ goto err_charge; ++ + vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!vma)
+- return -ENOMEM; ++ goto err_alloc; + + memset(vma, 0, sizeof(struct vm_area_struct)); + /* Could randomize here */ + vma->vm_start = VSYSCALL32_BASE; + vma->vm_end = VSYSCALL32_END; + /* MAYWRITE to allow gdb to COW and set breakpoints */ +- vma->vm_flags = VM_READ|VM_EXEC|VM_MAYREAD|VM_MAYEXEC|VM_MAYWRITE; +- vma->vm_flags |= mm->def_flags; ++ vma->vm_flags = flags; + vma->vm_page_prot = protection_map[vma->vm_flags & 7]; + vma->vm_ops = &syscall32_vm_ops; + vma->vm_mm = mm; + + down_write(&mm->mmap_sem); +- if ((ret = insert_vm_struct(mm, vma))) { +- up_write(&mm->mmap_sem); +- kmem_cache_free(vm_area_cachep, vma); +- return ret; +- } ++ if ((ret = insert_vm_struct(mm, vma))) ++ goto err_ins; + mm->total_vm += npages; + up_write(&mm->mmap_sem); + return 0; ++ ++err_ins: ++ up_write(&mm->mmap_sem); ++ kmem_cache_free(vm_area_cachep, vma); ++err_alloc: ++ ub_memory_uncharge(mm, VSYSCALL32_END - VSYSCALL32_BASE, flags, NULL); ++err_charge: ++ return ret; + } + + static int __init init_syscall32(void) +diff -upr linux-2.6.16.orig/arch/x86_64/kernel/acpi/wakeup.S linux-2.6.16-026test015/arch/x86_64/kernel/acpi/wakeup.S +--- linux-2.6.16.orig/arch/x86_64/kernel/acpi/wakeup.S 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/x86_64/kernel/acpi/wakeup.S 2006-07-04 14:41:39.000000000 +0400 +@@ -77,7 +77,7 @@ wakeup_code: + + .byte 0x66, 0xea # prefix + jmpi-opcode + .long wakeup_32 - __START_KERNEL_map +- .word __KERNEL_CS ++ .word __BOOT_CS + + .code32 + wakeup_32: +@@ -96,13 +96,13 @@ wakeup_32: + jnc bogus_cpu + movl %edx,%edi + +- movw $__KERNEL_DS, %ax ++ movw $__BOOT_DS, %ax + movw %ax, %ds + movw %ax, %es + movw %ax, %fs + movw %ax, %gs + +- movw $__KERNEL_DS, %ax ++ movw $__BOOT_DS, %ax + movw %ax, %ss + + mov $(wakeup_stack - __START_KERNEL_map), %esp +@@ -187,7 +187,7 @@ reach_compatibility_mode: + + wakeup_jumpvector: + .long wakeup_long64 - __START_KERNEL_map +- .word __KERNEL_CS ++ .word __BOOT_CS + + .code64 + +diff -upr 
linux-2.6.16.orig/arch/x86_64/kernel/apic.c linux-2.6.16-026test015/arch/x86_64/kernel/apic.c +--- linux-2.6.16.orig/arch/x86_64/kernel/apic.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/x86_64/kernel/apic.c 2006-07-04 14:41:38.000000000 +0400 +@@ -941,6 +941,7 @@ void smp_local_timer_interrupt(struct pt + */ + void smp_apic_timer_interrupt(struct pt_regs *regs) + { ++ struct ve_struct *ve; + /* + * the NMI deadlock-detector uses this. + */ +@@ -957,9 +958,11 @@ void smp_apic_timer_interrupt(struct pt_ + * interrupt lock, which is the WrongThing (tm) to do. + */ + exit_idle(); ++ ve = set_exec_env(get_ve0()); + irq_enter(); + smp_local_timer_interrupt(regs); + irq_exit(); ++ (void)set_exec_env(ve); + } + + /* +diff -upr linux-2.6.16.orig/arch/x86_64/kernel/entry.S linux-2.6.16-026test015/arch/x86_64/kernel/entry.S +--- linux-2.6.16.orig/arch/x86_64/kernel/entry.S 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/x86_64/kernel/entry.S 2006-07-04 14:41:39.000000000 +0400 +@@ -180,6 +180,10 @@ rff_trace: + * + * XXX if we had a free scratch register we could save the RSP into the stack frame + * and report it properly in ps. Unfortunately we haven't. ++ * ++ * When user can change the frames always force IRET. That is because ++ * it deals with uncanonical addresses better. SYSRET has trouble ++ * with them due to bugs in both AMD and Intel CPUs. + */ + + ENTRY(system_call) +@@ -244,7 +248,7 @@ sysret_careful: + /* Handle a signal */ + sysret_signal: + sti +- testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx ++ testl $(_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx + jz 1f + + /* Really a signal */ +@@ -254,7 +258,10 @@ sysret_signal: + xorl %esi,%esi # oldset -> arg2 + call ptregscall_common + 1: movl $_TIF_NEED_RESCHED,%edi +- jmp sysret_check ++ /* Use IRET because user could have changed frame. This ++ works because ptregscall_common has called FIXUP_TOP_OF_STACK. 
*/ ++ cli ++ jmp int_with_check + + badsys: + movq $-ENOSYS,RAX-ARGOFFSET(%rsp) +@@ -274,13 +281,9 @@ tracesys: + ja 1f + movq %r10,%rcx /* fixup for C */ + call *sys_call_table(,%rax,8) +- movq %rax,RAX-ARGOFFSET(%rsp) +-1: SAVE_REST +- movq %rsp,%rdi +- call syscall_trace_leave +- RESTORE_TOP_OF_STACK %rbx +- RESTORE_REST +- jmp ret_from_sys_call ++1: movq %rax,RAX-ARGOFFSET(%rsp) ++ /* Use IRET because user could have changed frame */ ++ jmp int_ret_from_sys_call + CFI_ENDPROC + + /* +@@ -350,7 +353,7 @@ int_very_careful: + jmp int_restore_rest + + int_signal: +- testl $(_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_SINGLESTEP),%edx ++ testl $(_TIF_NOTIFY_RESUME|_TIF_RESTORE_SIGMASK|_TIF_SIGPENDING|_TIF_SINGLESTEP),%edx + jz 1f + movq %rsp,%rdi # &ptregs -> arg1 + xorl %esi,%esi # oldset -> arg2 +@@ -408,25 +411,9 @@ ENTRY(stub_execve) + CFI_ADJUST_CFA_OFFSET -8 + CFI_REGISTER rip, r11 + SAVE_REST +- movq %r11, %r15 +- CFI_REGISTER rip, r15 + FIXUP_TOP_OF_STACK %r11 + call sys_execve +- GET_THREAD_INFO(%rcx) +- bt $TIF_IA32,threadinfo_flags(%rcx) +- CFI_REMEMBER_STATE +- jc exec_32bit + RESTORE_TOP_OF_STACK %r11 +- movq %r15, %r11 +- CFI_REGISTER rip, r11 +- RESTORE_REST +- pushq %r11 +- CFI_ADJUST_CFA_OFFSET 8 +- CFI_REL_OFFSET rip, 0 +- ret +- +-exec_32bit: +- CFI_RESTORE_STATE + movq %rax,RAX(%rsp) + RESTORE_REST + jmp int_ret_from_sys_call +@@ -574,7 +561,7 @@ retint_careful: + jmp retint_check + + retint_signal: +- testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx ++ testl $(_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx + jz retint_swapgs + sti + SAVE_REST +@@ -845,7 +832,7 @@ ENTRY(kernel_thread) + xorl %r9d,%r9d + + # clone now +- call do_fork ++ call do_fork_kthread + movq %rax,RAX(%rsp) + xorl %edi,%edi + +diff -upr linux-2.6.16.orig/arch/x86_64/kernel/head.S linux-2.6.16-026test015/arch/x86_64/kernel/head.S +--- linux-2.6.16.orig/arch/x86_64/kernel/head.S 2006-03-20 08:53:29.000000000 +0300 ++++ 
linux-2.6.16-026test015/arch/x86_64/kernel/head.S 2006-07-04 14:41:39.000000000 +0400 +@@ -40,7 +40,7 @@ startup_32: + */ + + /* Initialize the %ds segment register */ +- movl $__KERNEL_DS,%eax ++ movl $__BOOT_DS,%eax + movl %eax,%ds + + /* Load new GDT with the 64bit segments using 32bit descriptor */ +@@ -183,7 +183,14 @@ startup_64: + /* esi is pointer to real mode structure with interesting info. + pass it to C */ + movl %esi, %edi +- ++ ++ /* Switch to __KERNEL_CS. The segment is the same, but selector ++ * is different. */ ++ pushq $__KERNEL_CS ++ pushq $switch_cs ++ lretq ++switch_cs: ++ + /* Finally jump to run C code and to be on real kernel address + * Since we are running on identity-mapped space we have to jump + * to the full 64bit address , this is only possible as indirect +@@ -243,7 +250,7 @@ pGDT32: + .org 0xf10 + ljumpvector: + .long startup_64-__START_KERNEL_map +- .word __KERNEL_CS ++ .word __BOOT_CS + + ENTRY(stext) + ENTRY(_stext) +@@ -355,21 +362,30 @@ gdt: + .align PAGE_SIZE + + /* The TLS descriptors are currently at a different place compared to i386. +- Hopefully nobody expects them at a fixed place (Wine?) */ ++ Hopefully nobody expects them at a fixed place (Wine?) ++ Descriptors rearranged to place 32bit and TLS selectors in the same ++ places, because it is really necessary. sysret/exit mandates order ++ of kernel/user cs/ds, so we have to extend gdt.
++*/ + + ENTRY(cpu_gdt_table) +- .quad 0x0000000000000000 /* NULL descriptor */ +- .quad 0x0 /* unused */ +- .quad 0x00af9a000000ffff /* __KERNEL_CS */ +- .quad 0x00cf92000000ffff /* __KERNEL_DS */ +- .quad 0x00cffa000000ffff /* __USER32_CS */ +- .quad 0x00cff2000000ffff /* __USER_DS, __USER32_DS */ +- .quad 0x00affa000000ffff /* __USER_CS */ +- .quad 0x00cf9a000000ffff /* __KERNEL32_CS */ +- .quad 0,0 /* TSS */ +- .quad 0,0 /* LDT */ +- .quad 0,0,0 /* three TLS descriptors */ +- .quad 0 /* unused */ ++ .quad 0x0000000000000000 /* 0 NULL descriptor */ ++ .quad 0x0 /* 1 unused */ ++ .quad 0x00af9a000000ffff /* 2 __BOOT_CS */ ++ .quad 0x00cf92000000ffff /* 3 __BOOT_DS */ ++ .quad 0,0 /* 4,5 TSS */ ++ .quad 0,0,0 /* 6-8 three TLS descriptors */ ++ .quad 0,0 /* 9,10 LDT */ ++ .quad 0x00cf9a000000ffff /* 11 __KERNEL32_CS */ ++ .quad 0x00af9a000000ffff /* 12 __KERNEL_CS */ ++ .quad 0x00cf92000000ffff /* 13 __KERNEL_DS */ ++ .quad 0x00cffa000000ffff /* 14 __USER32_CS */ ++ .quad 0x00cff2000000ffff /* 15 __USER_DS, __USER32_DS */ ++ .quad 0x00affa000000ffff /* 16 __USER_CS */ ++ .quad 0x0 /* 17 unused */ ++ .quad 0,0,0,0,0,0 ++ .quad 0,0,0,0,0,0,0,0 ++ + gdt_end: + /* asm/segment.h:GDT_ENTRIES must match this */ + /* This should be a multiple of the cache line size */ +diff -upr linux-2.6.16.orig/arch/x86_64/kernel/irq.c linux-2.6.16-026test015/arch/x86_64/kernel/irq.c +--- linux-2.6.16.orig/arch/x86_64/kernel/irq.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/x86_64/kernel/irq.c 2006-07-04 14:41:38.000000000 +0400 +@@ -98,12 +98,15 @@ asmlinkage unsigned int do_IRQ(struct pt + { + /* high bits used in ret_from_ code */ + unsigned irq = regs->orig_rax & 0xff; ++ struct ve_struct *ve; + + exit_idle(); ++ ve = set_exec_env(get_ve0()); + irq_enter(); + + __do_IRQ(irq, regs); + irq_exit(); ++ (void)set_exec_env(ve); + + return 1; + } +diff -upr linux-2.6.16.orig/arch/x86_64/kernel/ldt.c linux-2.6.16-026test015/arch/x86_64/kernel/ldt.c +--- 
linux-2.6.16.orig/arch/x86_64/kernel/ldt.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/x86_64/kernel/ldt.c 2006-07-04 14:41:39.000000000 +0400 +@@ -16,6 +16,7 @@ + #include <linux/smp_lock.h> + #include <linux/vmalloc.h> + #include <linux/slab.h> ++#include <linux/module.h> + + #include <asm/uaccess.h> + #include <asm/system.h> +@@ -23,6 +24,8 @@ + #include <asm/desc.h> + #include <asm/proto.h> + ++#include <ub/ub_mem.h> ++ + #ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ + static void flush_ldt(void *null) + { +@@ -42,9 +45,9 @@ static int alloc_ldt(mm_context_t *pc, u + oldsize = pc->size; + mincount = (mincount+511)&(~511); + if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) +- newldt = vmalloc(mincount*LDT_ENTRY_SIZE); ++ newldt = ub_vmalloc(mincount*LDT_ENTRY_SIZE); + else +- newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); ++ newldt = ub_kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); + + if (!newldt) + return -ENOMEM; +@@ -109,6 +112,7 @@ int init_new_context(struct task_struct + } + return retval; + } ++EXPORT_SYMBOL_GPL(init_new_context); + + /* + * +diff -upr linux-2.6.16.orig/arch/x86_64/kernel/nmi.c linux-2.6.16-026test015/arch/x86_64/kernel/nmi.c +--- linux-2.6.16.orig/arch/x86_64/kernel/nmi.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/x86_64/kernel/nmi.c 2006-07-04 14:41:37.000000000 +0400 +@@ -522,6 +522,7 @@ static __kprobes int dummy_nmi_callback( + } + + static nmi_callback_t nmi_callback = dummy_nmi_callback; ++static nmi_callback_t nmi_ipi_callback = dummy_nmi_callback; + + asmlinkage __kprobes void do_nmi(struct pt_regs * regs, long error_code) + { +@@ -531,9 +532,21 @@ asmlinkage __kprobes void do_nmi(struct + add_pda(__nmi_count,1); + if (!rcu_dereference(nmi_callback)(regs, cpu)) + default_do_nmi(regs); ++ ++ nmi_ipi_callback(regs, cpu); + nmi_exit(); + } + ++void set_nmi_ipi_callback(nmi_callback_t callback) ++{ ++ nmi_ipi_callback = callback; ++} ++ ++void 
unset_nmi_ipi_callback(void) ++{ ++ nmi_ipi_callback = dummy_nmi_callback; ++} ++ + void set_nmi_callback(nmi_callback_t callback) + { + rcu_assign_pointer(nmi_callback, callback); +diff -upr linux-2.6.16.orig/arch/x86_64/kernel/pci-gart.c linux-2.6.16-026test015/arch/x86_64/kernel/pci-gart.c +--- linux-2.6.16.orig/arch/x86_64/kernel/pci-gart.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/x86_64/kernel/pci-gart.c 2006-07-04 14:41:36.000000000 +0400 +@@ -114,10 +114,6 @@ static unsigned long alloc_iommu(int siz + static void free_iommu(unsigned long offset, int size) + { + unsigned long flags; +- if (size == 1) { +- clear_bit(offset, iommu_gart_bitmap); +- return; +- } + spin_lock_irqsave(&iommu_bitmap_lock, flags); + __clear_bit_string(iommu_gart_bitmap, offset, size); + spin_unlock_irqrestore(&iommu_bitmap_lock, flags); +diff -upr linux-2.6.16.orig/arch/x86_64/kernel/process.c linux-2.6.16-026test015/arch/x86_64/kernel/process.c +--- linux-2.6.16.orig/arch/x86_64/kernel/process.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/x86_64/kernel/process.c 2006-07-04 14:41:39.000000000 +0400 +@@ -54,6 +54,11 @@ + #include <asm/idle.h> + + asmlinkage extern void ret_from_fork(void); ++asmlinkage extern void int_ret_from_sys_call(void); ++asmlinkage extern void execve(void); ++EXPORT_SYMBOL_GPL(ret_from_fork); ++EXPORT_SYMBOL_GPL(int_ret_from_sys_call); ++EXPORT_SYMBOL_GPL(execve); + + unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED; + +@@ -303,7 +308,8 @@ void __show_regs(struct pt_regs * regs) + (int)strcspn(system_utsname.version, " "), + system_utsname.version); + printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip); +- printk_address(regs->rip); ++ if (decode_call_traces) ++ printk_address(regs->rip); + printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, + regs->eflags); + printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", +@@ -340,11 +346,26 @@ void __show_regs(struct pt_regs * regs) 
+ + void show_regs(struct pt_regs *regs) + { +- printk("CPU %d:", smp_processor_id()); ++ printk("CPU %d, VCPU %d:%d", smp_processor_id(), task_vsched_id(current), task_cpu(current)); + __show_regs(regs); + show_trace(&regs->rsp); + } + ++void smp_show_regs(struct pt_regs *regs, void *data) ++{ ++ static DEFINE_SPINLOCK(show_regs_lock); ++ ++ if (regs == NULL) ++ return; ++ ++ bust_spinlocks(1); ++ spin_lock(&show_regs_lock); ++ printk("----------- IPI show regs -----------\n"); ++ show_regs(regs); ++ spin_unlock(&show_regs_lock); ++ bust_spinlocks(0); ++} ++ + /* + * Free current thread data structures etc.. + */ +@@ -527,8 +548,6 @@ __switch_to(struct task_struct *prev_p, + int cpu = smp_processor_id(); + struct tss_struct *tss = &per_cpu(init_tss, cpu); + +- unlazy_fpu(prev_p); +- + /* + * Reload esp0, LDT and the page table pointer: + */ +@@ -591,6 +610,12 @@ __switch_to(struct task_struct *prev_p, + prev->userrsp = read_pda(oldrsp); + write_pda(oldrsp, next->userrsp); + write_pda(pcurrent, next_p); ++ ++ /* This must be here to ensure both math_state_restore() and ++ kernel_fpu_begin() work consistently. ++ And the AMD workaround requires it to be after DS reload.
*/ ++ unlazy_fpu(prev_p); ++ + write_pda(kernelstack, + task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); + +@@ -841,3 +866,20 @@ unsigned long arch_align_stack(unsigned + sp -= get_random_int() % 8192; + return sp & ~0xf; + } ++ ++long do_fork_kthread(unsigned long clone_flags, ++ unsigned long stack_start, ++ struct pt_regs *regs, ++ unsigned long stack_size, ++ int __user *parent_tidptr, ++ int __user *child_tidptr) ++{ ++ if (ve_is_super(get_exec_env())) ++ return do_fork(clone_flags, stack_start, regs, stack_size, ++ parent_tidptr, child_tidptr); ++ ++ /* Don't allow kernel_thread() inside VE */ ++ printk("kernel_thread call inside VE\n"); ++ dump_stack(); ++ return -EPERM; ++} +diff -upr linux-2.6.16.orig/arch/x86_64/kernel/ptrace.c linux-2.6.16-026test015/arch/x86_64/kernel/ptrace.c +--- linux-2.6.16.orig/arch/x86_64/kernel/ptrace.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/x86_64/kernel/ptrace.c 2006-07-04 14:41:39.000000000 +0400 +@@ -300,6 +300,15 @@ static unsigned long getreg(struct task_ + return child->thread.fs; + case offsetof(struct user_regs_struct, gs_base): + return child->thread.gs; ++ case offsetof(struct user_regs_struct, cs): ++ if (test_tsk_thread_flag(child, TIF_SYSCALL_TRACE)) { ++ val = get_stack_long(child, regno - sizeof(struct pt_regs)); ++ if (val == __USER_CS) ++ return 0x33; ++ if (val == __USER32_CS) ++ return 0x23; ++ } ++ /* fall through */ + default: + regno = regno - sizeof(struct pt_regs); + val = get_stack_long(child, regno); +@@ -581,8 +590,10 @@ static void syscall_trace(struct pt_regs + current_thread_info()->flags, current->ptrace); + #endif + ++ set_pn_state(current, (regs->rax != -ENOSYS) ? PN_STOP_LEAVE : PN_STOP_ENTRY); + ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) + ? 0x80 : 0)); ++ clear_pn_state(current); + /* + * this isn't the same as continuing with a signal, but it will do + * for normal use. 
strace only continues with a signal if the +diff -upr linux-2.6.16.orig/arch/x86_64/kernel/setup.c linux-2.6.16-026test015/arch/x86_64/kernel/setup.c +--- linux-2.6.16.orig/arch/x86_64/kernel/setup.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/x86_64/kernel/setup.c 2006-07-04 14:41:36.000000000 +0400 +@@ -909,6 +909,10 @@ static int __init init_amd(struct cpuinf + if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)) + set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); + ++ /* Enable workaround for FXSAVE leak */ ++ if (c->x86 >= 6) ++ set_bit(X86_FEATURE_FXSAVE_LEAK, &c->x86_capability); ++ + r = get_model_name(c); + if (!r) { + switch (c->x86) { +diff -upr linux-2.6.16.orig/arch/x86_64/kernel/setup64.c linux-2.6.16-026test015/arch/x86_64/kernel/setup64.c +--- linux-2.6.16.orig/arch/x86_64/kernel/setup64.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/x86_64/kernel/setup64.c 2006-07-04 14:41:39.000000000 +0400 +@@ -290,3 +290,5 @@ void __cpuinit cpu_init (void) + + fpu_init(); + } ++ ++EXPORT_SYMBOL_GPL(cpu_gdt_descr); +diff -upr linux-2.6.16.orig/arch/x86_64/kernel/signal.c linux-2.6.16-026test015/arch/x86_64/kernel/signal.c +--- linux-2.6.16.orig/arch/x86_64/kernel/signal.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/x86_64/kernel/signal.c 2006-07-04 14:41:39.000000000 +0400 +@@ -40,37 +40,6 @@ int ia32_setup_frame(int sig, struct k_s + sigset_t *set, struct pt_regs * regs); + + asmlinkage long +-sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize, struct pt_regs *regs) +-{ +- sigset_t saveset, newset; +- +- /* XXX: Don't preclude handling different sized sigset_t's. 
*/ +- if (sigsetsize != sizeof(sigset_t)) +- return -EINVAL; +- +- if (copy_from_user(&newset, unewset, sizeof(newset))) +- return -EFAULT; +- sigdelsetmask(&newset, ~_BLOCKABLE); +- +- spin_lock_irq(¤t->sighand->siglock); +- saveset = current->blocked; +- current->blocked = newset; +- recalc_sigpending(); +- spin_unlock_irq(¤t->sighand->siglock); +-#ifdef DEBUG_SIG +- printk("rt_sigsuspend savset(%lx) newset(%lx) regs(%p) rip(%lx)\n", +- saveset, newset, regs, regs->rip); +-#endif +- regs->rax = -EINTR; +- while (1) { +- current->state = TASK_INTERRUPTIBLE; +- schedule(); +- if (do_signal(regs, &saveset)) +- return -EINTR; +- } +-} +- +-asmlinkage long + sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, + struct pt_regs *regs) + { +@@ -344,11 +313,11 @@ static int setup_rt_frame(int sig, struc + current->comm, current->pid, frame, regs->rip, frame->pretcode); + #endif + +- return 1; ++ return 0; + + give_sigsegv: + force_sigsegv(sig, current); +- return 0; ++ return -EFAULT; + } + + /* +@@ -411,7 +380,7 @@ handle_signal(unsigned long sig, siginfo + #endif + ret = setup_rt_frame(sig, ka, info, oldset, regs); + +- if (ret) { ++ if (ret == 0) { + spin_lock_irq(¤t->sighand->siglock); + sigorsets(¤t->blocked,¤t->blocked,&ka->sa.sa_mask); + if (!(ka->sa.sa_flags & SA_NODEFER)) +@@ -428,9 +397,10 @@ handle_signal(unsigned long sig, siginfo + * want to handle. Thus you cannot kill init even with a SIGKILL even by + * mistake. + */ +-int do_signal(struct pt_regs *regs, sigset_t *oldset) ++static void do_signal(struct pt_regs *regs) + { + struct k_sigaction ka; ++ sigset_t *oldset; + siginfo_t info; + int signr; + +@@ -441,12 +411,14 @@ int do_signal(struct pt_regs *regs, sigs + * if so. 
+ */ + if (!user_mode(regs)) +- return 1; ++ return; + +- if (try_to_freeze()) ++ if (try_to_freeze() && !signal_pending(current)) + goto no_signal; + +- if (!oldset) ++ if (test_thread_flag(TIF_RESTORE_SIGMASK)) ++ oldset = ¤t->saved_sigmask; ++ else + oldset = ¤t->blocked; + + signr = get_signal_to_deliver(&info, &ka, regs, NULL); +@@ -460,7 +432,15 @@ int do_signal(struct pt_regs *regs, sigs + set_debugreg(current->thread.debugreg7, 7); + + /* Whee! Actually deliver the signal. */ +- return handle_signal(signr, &info, &ka, oldset, regs); ++ if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { ++ /* a signal was successfully delivered; the saved ++ * sigmask will have been stored in the signal frame, ++ * and will be restored by sigreturn, so we can simply ++ * clear the TIF_RESTORE_SIGMASK flag */ ++ if (test_thread_flag(TIF_RESTORE_SIGMASK)) ++ clear_thread_flag(TIF_RESTORE_SIGMASK); ++ } ++ return; + } + + no_signal: +@@ -481,10 +461,16 @@ int do_signal(struct pt_regs *regs, sigs + regs->rip -= 2; + } + } +- return 0; ++ ++ /* if there's no signal to deliver, we just put the saved sigmask ++ * back */ ++ if (test_thread_flag(TIF_RESTORE_SIGMASK)) { ++ clear_thread_flag(TIF_RESTORE_SIGMASK); ++ sigprocmask(SIG_SETMASK, ¤t->saved_sigmask, NULL); ++ } + } + +-void do_notify_resume(struct pt_regs *regs, sigset_t *oldset, __u32 thread_info_flags) ++void do_notify_resume(struct pt_regs *regs, sigset_t *unused, __u32 thread_info_flags) + { + #ifdef DEBUG_SIG + printk("do_notify_resume flags:%x rip:%lx rsp:%lx caller:%lx pending:%lx\n", +@@ -498,8 +484,8 @@ void do_notify_resume(struct pt_regs *re + } + + /* deal with pending signal delivery */ +- if (thread_info_flags & _TIF_SIGPENDING) +- do_signal(regs,oldset); ++ if (thread_info_flags & (_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK)) ++ do_signal(regs); + } + + void signal_fault(struct pt_regs *regs, void __user *frame, char *where) +diff -upr linux-2.6.16.orig/arch/x86_64/kernel/smp.c 
linux-2.6.16-026test015/arch/x86_64/kernel/smp.c +--- linux-2.6.16.orig/arch/x86_64/kernel/smp.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/x86_64/kernel/smp.c 2006-07-04 14:41:37.000000000 +0400 +@@ -28,6 +28,7 @@ + #include <asm/proto.h> + #include <asm/apicdef.h> + #include <asm/idle.h> ++#include <asm/nmi.h> + + /* + * Smarter SMP flushing macros. +@@ -444,6 +445,84 @@ int smp_call_function (void (*func) (voi + return 0; + } + ++static spinlock_t nmi_call_lock = SPIN_LOCK_UNLOCKED; ++static struct nmi_call_data_struct { ++ smp_nmi_function func; ++ void *info; ++ atomic_t started; ++ atomic_t finished; ++ cpumask_t cpus_called; ++ int wait; ++} *nmi_call_data; ++ ++static int smp_nmi_callback(struct pt_regs * regs, int cpu) ++{ ++ smp_nmi_function func; ++ void *info; ++ int wait; ++ ++ func = nmi_call_data->func; ++ info = nmi_call_data->info; ++ wait = nmi_call_data->wait; ++ ack_APIC_irq(); ++ /* prevent from calling func() multiple times */ ++ if (cpu_test_and_set(cpu, nmi_call_data->cpus_called)) ++ return 0; ++ /* ++ * notify initiating CPU that I've grabbed the data and am ++ * about to execute the function ++ */ ++ mb(); ++ atomic_inc(&nmi_call_data->started); ++ /* at this point the nmi_call_data structure is out of scope */ ++ irq_enter(); ++ func(regs, info); ++ irq_exit(); ++ if (wait) ++ atomic_inc(&nmi_call_data->finished); ++ ++ return 0; ++} ++ ++int smp_nmi_call_function(smp_nmi_function func, void *info, int wait) ++{ ++ struct nmi_call_data_struct data; ++ int cpus; ++ ++ cpus = num_online_cpus() - 1; ++ if (!cpus) ++ return 0; ++ ++ data.func = func; ++ data.info = info; ++ data.wait = wait; ++ atomic_set(&data.started, 0); ++ atomic_set(&data.finished, 0); ++ cpus_clear(data.cpus_called); ++ /* prevent this cpu from calling func if NMI happens */ ++ cpu_set(smp_processor_id(), data.cpus_called); ++ ++ if (!spin_trylock(&nmi_call_lock)) ++ return -1; ++ ++ nmi_call_data = &data; ++ 
set_nmi_ipi_callback(smp_nmi_callback); ++ mb(); ++ ++ /* Send a message to all other CPUs and wait for them to respond */ ++ send_IPI_allbutself(APIC_DM_NMI); ++ while (atomic_read(&data.started) != cpus) ++ barrier(); ++ ++ unset_nmi_ipi_callback(); ++ if (wait) ++ while (atomic_read(&data.finished) != cpus) ++ barrier(); ++ spin_unlock(&nmi_call_lock); ++ ++ return 0; ++} ++ + void smp_stop_cpu(void) + { + unsigned long flags; +diff -upr linux-2.6.16.orig/arch/x86_64/kernel/sys_x86_64.c linux-2.6.16-026test015/arch/x86_64/kernel/sys_x86_64.c +--- linux-2.6.16.orig/arch/x86_64/kernel/sys_x86_64.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/x86_64/kernel/sys_x86_64.c 2006-07-04 14:41:38.000000000 +0400 +@@ -148,7 +148,7 @@ asmlinkage long sys_uname(struct new_uts + { + int err; + down_read(&uts_sem); +- err = copy_to_user(name, &system_utsname, sizeof (*name)); ++ err = copy_to_user(name, &ve_utsname, sizeof (*name)); + up_read(&uts_sem); + if (personality(current->personality) == PER_LINUX32) + err |= copy_to_user(&name->machine, "i686", 5); +diff -upr linux-2.6.16.orig/arch/x86_64/kernel/time.c linux-2.6.16-026test015/arch/x86_64/kernel/time.c +--- linux-2.6.16.orig/arch/x86_64/kernel/time.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/x86_64/kernel/time.c 2006-07-04 14:41:38.000000000 +0400 +@@ -66,6 +66,8 @@ unsigned long vxtime_hz = PIT_TICK_RATE; + int report_lost_ticks; /* command line option */ + unsigned long long monotonic_base; + ++EXPORT_SYMBOL(cpu_khz); ++ + struct vxtime_data __vxtime __section_vxtime; /* for vsyscalls */ + + volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; +diff -upr linux-2.6.16.orig/arch/x86_64/kernel/traps.c linux-2.6.16-026test015/arch/x86_64/kernel/traps.c +--- linux-2.6.16.orig/arch/x86_64/kernel/traps.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/x86_64/kernel/traps.c 2006-07-04 14:41:38.000000000 +0400 +@@ -30,6 +30,7 @@ + 
#include <linux/moduleparam.h> + #include <linux/nmi.h> + #include <linux/kprobes.h> ++#include <linux/kexec.h> + + #include <asm/system.h> + #include <asm/uaccess.h> +@@ -116,6 +117,9 @@ int printk_address(unsigned long address + char *delim = ":"; + char namebuf[128]; + ++ if (!decode_call_traces) ++ return printk("[<%016lx>]", address); ++ + symname = kallsyms_lookup(address, &symsize, &offset, &modname, namebuf); + if (!symname) + return printk("[<%016lx>]", address); +@@ -208,7 +212,7 @@ void show_trace(unsigned long *stack) + do while (cond) { \ + unsigned long addr = *stack++; \ + if (kernel_text_address(addr)) { \ +- if (i > 50) { \ ++ if (i > 50 && decode_call_traces) { \ + printk("\n "); \ + i = 0; \ + } \ +@@ -290,7 +294,7 @@ void show_stack(struct task_struct *tsk, + if (((long) stack & (THREAD_SIZE-1)) == 0) + break; + } +- if (i && ((i % 4) == 0)) ++ if (i && ((i % 4) == 0) && decode_call_traces) + printk("\n "); + printk("%016lx ", *stack++); + touch_nmi_watchdog(); +@@ -319,10 +323,12 @@ void show_registers(struct pt_regs *regs + + rsp = regs->rsp; + +- printk("CPU %d ", cpu); ++ printk("CPU: %d ", cpu); + __show_regs(regs); +- printk("Process %s (pid: %d, threadinfo %p, task %p)\n", +- cur->comm, cur->pid, task_thread_info(cur), cur); ++ printk("Process %s (pid: %d, veid=%d, threadinfo %p, task %p)\n", ++ cur->comm, cur->pid, ++ VEID(VE_TASK_INFO(current)->owner_env), ++ task_thread_info(cur), cur); + + /* + * When in-kernel, we also print out the stack and code at the +@@ -434,6 +440,8 @@ void __kprobes __die(const char * str, s + printk(KERN_ALERT "RIP "); + printk_address(regs->rip); + printk(" RSP <%016lx>\n", regs->rsp); ++ if (kexec_should_crash(current)) ++ crash_kexec(regs); + } + + void die(const char * str, struct pt_regs * regs, long err) +@@ -456,8 +464,11 @@ void __kprobes die_nmi(char *str, struct + */ + printk(str, safe_smp_processor_id()); + show_registers(regs); ++ if (kexec_should_crash(current)) ++ crash_kexec(regs); + if 
(panic_on_timeout || panic_on_oops) + panic("nmi watchdog"); ++ smp_nmi_call_function(smp_show_regs, NULL, 1); + printk("console shuts up ...\n"); + oops_end(flags); + do_exit(SIGSEGV); +diff -upr linux-2.6.16.orig/arch/x86_64/mm/fault.c linux-2.6.16-026test015/arch/x86_64/mm/fault.c +--- linux-2.6.16.orig/arch/x86_64/mm/fault.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/x86_64/mm/fault.c 2006-07-04 14:41:38.000000000 +0400 +@@ -41,27 +41,6 @@ + #define PF_RSVD (1<<3) + #define PF_INSTR (1<<4) + +-void bust_spinlocks(int yes) +-{ +- int loglevel_save = console_loglevel; +- if (yes) { +- oops_in_progress = 1; +- } else { +-#ifdef CONFIG_VT +- unblank_screen(); +-#endif +- oops_in_progress = 0; +- /* +- * OK, the message is on the console. Now we call printk() +- * without oops_in_progress set so that printk will give klogd +- * a poke. Hold onto your hats... +- */ +- console_loglevel = 15; /* NMI oopser may have shut the console up */ +- printk(" "); +- console_loglevel = loglevel_save; +- } +-} +- + /* Sometimes the CPU reports invalid exceptions on prefetch. + Check that here and ignore. + Opcode checker based on code by Richard Brunner */ +@@ -293,7 +272,7 @@ static int vmalloc_fault(unsigned long a + } + + int page_fault_trace = 0; +-int exception_trace = 1; ++int exception_trace = 0; + + /* + * This routine handles page faults. 
It determines the address, +@@ -322,7 +301,7 @@ asmlinkage void __kprobes do_page_fault( + local_irq_enable(); + + if (unlikely(page_fault_trace)) +- printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n", ++ ve_printk(VE_LOG, "pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n", + regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code); + + tsk = current; +@@ -372,7 +351,6 @@ asmlinkage void __kprobes do_page_fault( + if (unlikely(in_atomic() || !mm)) + goto bad_area_nosemaphore; + +- again: + /* When running in the kernel we expect faults to occur only to + * addresses in user space. All other faults represent errors in the + * kernel and should generate an OOPS. Unfortunatly, in the case of an +@@ -476,7 +454,7 @@ bad_area_nosemaphore: + return; + + if (exception_trace && unhandled_signal(tsk, SIGSEGV)) { +- printk( ++ ve_printk(VE_LOG, + "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n", + tsk->pid > 1 ? KERN_INFO : KERN_EMERG, + tsk->comm, tsk->pid, address, regs->rip, +@@ -526,8 +504,10 @@ no_context: + else + printk(KERN_ALERT "Unable to handle kernel paging request"); + printk(" at %016lx RIP: \n" KERN_ALERT,address); +- printk_address(regs->rip); +- printk("\n"); ++ if (decode_call_traces) { ++ printk_address(regs->rip); ++ printk("\n"); ++ } + dump_pagetable(address); + tsk->thread.cr2 = address; + tsk->thread.trap_no = 14; +@@ -544,13 +524,14 @@ no_context: + */ + out_of_memory: + up_read(&mm->mmap_sem); +- if (current->pid == 1) { +- yield(); +- goto again; +- } +- printk("VM: killing process %s\n", tsk->comm); +- if (error_code & 4) +- do_exit(SIGKILL); ++ if (error_code & 4) { ++ /* ++ * 0-order allocation always success if something really ++ * fatal not happen: beancounter overdraft or OOM. 
++ */ ++ force_sig(SIGKILL, tsk); ++ return; ++ } + goto no_context; + + do_sigbus: +diff -upr linux-2.6.16.orig/arch/x86_64/mm/init.c linux-2.6.16-026test015/arch/x86_64/mm/init.c +--- linux-2.6.16.orig/arch/x86_64/mm/init.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/arch/x86_64/mm/init.c 2006-07-04 14:41:38.000000000 +0400 +@@ -89,6 +89,7 @@ void show_mem(void) + printk(KERN_INFO "%lu pages shared\n",shared); + printk(KERN_INFO "%lu pages swap cached\n",cached); + } ++EXPORT_SYMBOL(show_mem); + + /* References to section boundaries */ + +diff -upr linux-2.6.16.orig/block/elevator.c linux-2.6.16-026test015/block/elevator.c +--- linux-2.6.16.orig/block/elevator.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/block/elevator.c 2006-07-04 14:41:38.000000000 +0400 +@@ -314,6 +314,7 @@ void elv_insert(request_queue_t *q, stru + { + struct list_head *pos; + unsigned ordseq; ++ int unplug_it = 1; + + rq->q = q; + +@@ -378,6 +379,11 @@ void elv_insert(request_queue_t *q, stru + } + + list_add_tail(&rq->queuelist, pos); ++ /* ++ * most requeues happen because of a busy condition, don't ++ * force unplug of the queue for that case. ++ */ ++ unplug_it = 0; + break; + + default: +@@ -386,7 +392,7 @@ void elv_insert(request_queue_t *q, stru + BUG(); + } + +- if (blk_queue_plugged(q)) { ++ if (unplug_it && blk_queue_plugged(q)) { + int nrq = q->rq.count[READ] + q->rq.count[WRITE] + - q->in_flight; + +@@ -676,7 +682,7 @@ void elv_unregister(struct elevator_type + * Iterate every thread in the process to remove the io contexts. 
+ */ + read_lock(&tasklist_lock); +- do_each_thread(g, p) { ++ do_each_thread_all(g, p) { + struct io_context *ioc = p->io_context; + if (ioc && ioc->cic) { + ioc->cic->exit(ioc->cic); +@@ -688,7 +694,7 @@ void elv_unregister(struct elevator_type + ioc->aic->dtor(ioc->aic); + ioc->aic = NULL; + } +- } while_each_thread(g, p); ++ } while_each_thread_all(g, p); + read_unlock(&tasklist_lock); + + spin_lock_irq(&elv_list_lock); +diff -upr linux-2.6.16.orig/block/genhd.c linux-2.6.16-026test015/block/genhd.c +--- linux-2.6.16.orig/block/genhd.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/block/genhd.c 2006-07-04 14:41:38.000000000 +0400 +@@ -16,9 +16,8 @@ + #include <linux/kobj_map.h> + #include <linux/buffer_head.h> + +-#define MAX_PROBE_HASH 255 /* random */ +- +-static struct subsystem block_subsys; ++struct subsystem block_subsys; ++EXPORT_SYMBOL(block_subsys); + + static DECLARE_MUTEX(block_subsys_sem); + +@@ -30,108 +29,29 @@ static struct blk_major_name { + struct blk_major_name *next; + int major; + char name[16]; +-} *major_names[MAX_PROBE_HASH]; ++} *major_names[BLKDEV_MAJOR_HASH_SIZE]; + + /* index in the above - for now: assume no multimajor ranges */ + static inline int major_to_index(int major) + { +- return major % MAX_PROBE_HASH; +-} +- +-struct blkdev_info { +- int index; +- struct blk_major_name *bd; +-}; +- +-/* +- * iterate over a list of blkdev_info structures. 
allows +- * the major_names array to be iterated over from outside this file +- * must be called with the block_subsys_sem held +- */ +-void *get_next_blkdev(void *dev) +-{ +- struct blkdev_info *info; +- +- if (dev == NULL) { +- info = kmalloc(sizeof(*info), GFP_KERNEL); +- if (!info) +- goto out; +- info->index=0; +- info->bd = major_names[info->index]; +- if (info->bd) +- goto out; +- } else { +- info = dev; +- } +- +- while (info->index < ARRAY_SIZE(major_names)) { +- if (info->bd) +- info->bd = info->bd->next; +- if (info->bd) +- goto out; +- /* +- * No devices on this chain, move to the next +- */ +- info->index++; +- info->bd = (info->index < ARRAY_SIZE(major_names)) ? +- major_names[info->index] : NULL; +- if (info->bd) +- goto out; +- } +- +-out: +- return info; +-} +- +-void *acquire_blkdev_list(void) +-{ +- down(&block_subsys_sem); +- return get_next_blkdev(NULL); +-} +- +-void release_blkdev_list(void *dev) +-{ +- up(&block_subsys_sem); +- kfree(dev); ++ return major % BLKDEV_MAJOR_HASH_SIZE; + } + ++#ifdef CONFIG_PROC_FS + +-/* +- * Count the number of records in the blkdev_list. +- * must be called with the block_subsys_sem held +- */ +-int count_blkdev_list(void) ++void blkdev_show(struct seq_file *f, off_t offset) + { +- struct blk_major_name *n; +- int i, count; +- +- count = 0; ++ struct blk_major_name *dp; + +- for (i = 0; i < ARRAY_SIZE(major_names); i++) { +- for (n = major_names[i]; n; n = n->next) +- count++; ++ if (offset < BLKDEV_MAJOR_HASH_SIZE) { ++ down(&block_subsys_sem); ++ for (dp = major_names[offset]; dp; dp = dp->next) ++ seq_printf(f, "%3d %s\n", dp->major, dp->name); ++ up(&block_subsys_sem); + } +- +- return count; +-} +- +-/* +- * extract the major and name values from a blkdev_info struct +- * passed in as a void to *dev. 
Must be called with +- * block_subsys_sem held +- */ +-int get_blkdev_info(void *dev, int *major, char **name) +-{ +- struct blkdev_info *info = dev; +- +- if (info->bd == NULL) +- return 1; +- +- *major = info->bd->major; +- *name = info->bd->name; +- return 0; + } + ++#endif /* CONFIG_PROC_FS */ + + int register_blkdev(unsigned int major, const char *name) + { +@@ -592,7 +512,7 @@ static struct kset_uevent_ops block_ueve + }; + + /* declare block_subsys. */ +-static decl_subsys(block, &ktype_block, &block_uevent_ops); ++decl_subsys(block, &ktype_block, &block_uevent_ops); + + + /* +diff -upr linux-2.6.16.orig/block/ll_rw_blk.c linux-2.6.16-026test015/block/ll_rw_blk.c +--- linux-2.6.16.orig/block/ll_rw_blk.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/block/ll_rw_blk.c 2006-07-04 14:41:36.000000000 +0400 +@@ -1719,8 +1719,21 @@ void blk_run_queue(struct request_queue + + spin_lock_irqsave(q->queue_lock, flags); + blk_remove_plug(q); +- if (!elv_queue_empty(q)) +- q->request_fn(q); ++ ++ /* ++ * Only recurse once to avoid overrunning the stack, let the unplug ++ * handling reinvoke the handler shortly if we already got there. 
++ */ ++ if (!elv_queue_empty(q)) { ++ if (!test_and_set_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) { ++ q->request_fn(q); ++ clear_bit(QUEUE_FLAG_REENTER, &q->queue_flags); ++ } else { ++ blk_plug_device(q); ++ kblockd_schedule_work(&q->unplug_work); ++ } ++ } ++ + spin_unlock_irqrestore(q->queue_lock, flags); + } + EXPORT_SYMBOL(blk_run_queue); +diff -upr linux-2.6.16.orig/drivers/acpi/processor_perflib.c linux-2.6.16-026test015/drivers/acpi/processor_perflib.c +--- linux-2.6.16.orig/drivers/acpi/processor_perflib.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/acpi/processor_perflib.c 2006-07-04 14:41:36.000000000 +0400 +@@ -577,6 +577,8 @@ acpi_processor_register_performance(stru + return_VALUE(-EBUSY); + } + ++ WARN_ON(!performance); ++ + pr->performance = performance; + + if (acpi_processor_get_performance_info(pr)) { +@@ -609,7 +611,8 @@ acpi_processor_unregister_performance(st + return_VOID; + } + +- kfree(pr->performance->states); ++ if (pr->performance) ++ kfree(pr->performance->states); + pr->performance = NULL; + + acpi_cpufreq_remove_file(pr); +diff -upr linux-2.6.16.orig/drivers/base/class.c linux-2.6.16-026test015/drivers/base/class.c +--- linux-2.6.16.orig/drivers/base/class.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/base/class.c 2006-07-04 14:41:38.000000000 +0400 +@@ -72,8 +72,13 @@ static struct kobj_type ktype_class = { + }; + + /* Hotplug events for classes go to the class_obj subsys */ +-static decl_subsys(class, &ktype_class, NULL); ++decl_subsys(class, &ktype_class, NULL); + ++#ifndef CONFIG_VE ++#define visible_class_subsys class_subsys ++#else ++#define visible_class_subsys (*get_exec_env()->class_subsys) ++#endif + + int class_create_file(struct class * cls, const struct class_attribute * attr) + { +@@ -148,7 +153,7 @@ int class_register(struct class * cls) + if (error) + return error; + +- subsys_set_kset(cls, class_subsys); ++ subsys_set_kset(cls, visible_class_subsys); + + error 
= subsystem_register(&cls->subsys); + if (!error) { +@@ -420,8 +425,13 @@ static struct kset_uevent_ops class_ueve + .uevent = class_uevent, + }; + +-static decl_subsys(class_obj, &ktype_class_device, &class_uevent_ops); ++decl_subsys(class_obj, &ktype_class_device, &class_uevent_ops); + ++#ifndef CONFIG_VE ++#define visible_class_obj_subsys class_obj_subsys ++#else ++#define visible_class_obj_subsys (*get_exec_env()->class_obj_subsys) ++#endif + + static int class_device_add_attrs(struct class_device * cd) + { +@@ -470,7 +480,7 @@ static ssize_t store_uevent(struct class + + void class_device_initialize(struct class_device *class_dev) + { +- kobj_set_kset_s(class_dev, class_obj_subsys); ++ kobj_set_kset_s(class_dev, visible_class_obj_subsys); + kobject_init(&class_dev->kobj); + INIT_LIST_HEAD(&class_dev->node); + } +@@ -805,12 +815,19 @@ void class_interface_unregister(struct c + class_put(parent); + } + +- ++void prepare_sysfs_classes(void) ++{ ++#ifdef CONFIG_VE ++ get_ve0()->class_subsys = &class_subsys; ++ get_ve0()->class_obj_subsys = &class_obj_subsys; ++#endif ++} + + int __init classes_init(void) + { + int retval; + ++ prepare_sysfs_classes(); + retval = subsystem_register(&class_subsys); + if (retval) + return retval; +@@ -848,3 +865,6 @@ EXPORT_SYMBOL_GPL(class_device_remove_bi + + EXPORT_SYMBOL_GPL(class_interface_register); + EXPORT_SYMBOL_GPL(class_interface_unregister); ++ ++EXPORT_SYMBOL(class_subsys); ++EXPORT_SYMBOL(class_obj_subsys); +diff -upr linux-2.6.16.orig/drivers/base/cpu.c linux-2.6.16-026test015/drivers/base/cpu.c +--- linux-2.6.16.orig/drivers/base/cpu.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/base/cpu.c 2006-07-04 14:41:36.000000000 +0400 +@@ -141,7 +141,7 @@ int __devinit register_cpu(struct cpu *c + return error; + } + +-struct sys_device *get_cpu_sysdev(int cpu) ++struct sys_device *get_cpu_sysdev(unsigned cpu) + { + if (cpu < NR_CPUS) + return cpu_sys_devices[cpu]; +diff -upr 
linux-2.6.16.orig/drivers/base/firmware_class.c linux-2.6.16-026test015/drivers/base/firmware_class.c +--- linux-2.6.16.orig/drivers/base/firmware_class.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/base/firmware_class.c 2006-07-04 14:41:36.000000000 +0400 +@@ -211,18 +211,20 @@ static int + fw_realloc_buffer(struct firmware_priv *fw_priv, int min_size) + { + u8 *new_data; ++ int new_size = fw_priv->alloc_size; + + if (min_size <= fw_priv->alloc_size) + return 0; + +- new_data = vmalloc(fw_priv->alloc_size + PAGE_SIZE); ++ new_size = ALIGN(min_size, PAGE_SIZE); ++ new_data = vmalloc(new_size); + if (!new_data) { + printk(KERN_ERR "%s: unable to alloc buffer\n", __FUNCTION__); + /* Make sure that we don't keep incomplete data */ + fw_load_abort(fw_priv); + return -ENOMEM; + } +- fw_priv->alloc_size += PAGE_SIZE; ++ fw_priv->alloc_size = new_size; + if (fw_priv->fw->data) { + memcpy(new_data, fw_priv->fw->data, fw_priv->fw->size); + vfree(fw_priv->fw->data); +diff -upr linux-2.6.16.orig/drivers/base/node.c linux-2.6.16-026test015/drivers/base/node.c +--- linux-2.6.16.orig/drivers/base/node.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/base/node.c 2006-07-04 14:41:36.000000000 +0400 +@@ -106,7 +106,7 @@ static ssize_t node_read_numastat(struct + other_node = 0; + for (i = 0; i < MAX_NR_ZONES; i++) { + struct zone *z = &pg->node_zones[i]; +- for (cpu = 0; cpu < NR_CPUS; cpu++) { ++ for_each_online_cpu(cpu) { + struct per_cpu_pageset *ps = zone_pcp(z,cpu); + numa_hit += ps->numa_hit; + numa_miss += ps->numa_miss; +diff -upr linux-2.6.16.orig/drivers/block/cciss.c linux-2.6.16-026test015/drivers/block/cciss.c +--- linux-2.6.16.orig/drivers/block/cciss.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/block/cciss.c 2006-07-04 14:41:36.000000000 +0400 +@@ -1181,6 +1181,53 @@ static int revalidate_allvol(ctlr_info_t + return 0; + } + ++static inline void complete_buffers(struct bio *bio, int 
status) ++{ ++ while (bio) { ++ struct bio *xbh = bio->bi_next; ++ int nr_sectors = bio_sectors(bio); ++ ++ bio->bi_next = NULL; ++ blk_finished_io(len); ++ bio_endio(bio, nr_sectors << 9, status ? 0 : -EIO); ++ bio = xbh; ++ } ++ ++} ++ ++static void cciss_softirq_done(struct request *rq) ++{ ++ CommandList_struct *cmd = rq->completion_data; ++ ctlr_info_t *h = hba[cmd->ctlr]; ++ unsigned long flags; ++ u64bit temp64; ++ int i, ddir; ++ ++ if (cmd->Request.Type.Direction == XFER_READ) ++ ddir = PCI_DMA_FROMDEVICE; ++ else ++ ddir = PCI_DMA_TODEVICE; ++ ++ /* command did not need to be retried */ ++ /* unmap the DMA mapping for all the scatter gather elements */ ++ for(i=0; i<cmd->Header.SGList; i++) { ++ temp64.val32.lower = cmd->SG[i].Addr.lower; ++ temp64.val32.upper = cmd->SG[i].Addr.upper; ++ pci_unmap_page(h->pdev, temp64.val, cmd->SG[i].Len, ddir); ++ } ++ ++ complete_buffers(rq->bio, rq->errors); ++ ++#ifdef CCISS_DEBUG ++ printk("Done with %p\n", rq); ++#endif /* CCISS_DEBUG */ ++ ++ spin_lock_irqsave(&h->lock, flags); ++ end_that_request_last(rq, rq->errors); ++ cmd_free(h, cmd,1); ++ spin_unlock_irqrestore(&h->lock, flags); ++} ++ + /* This function will check the usage_count of the drive to be updated/added. + * If the usage_count is zero then the drive information will be updated and + * the disk will be re-registered with the kernel. If not then it will be +@@ -1249,6 +1296,8 @@ static void cciss_update_drive_info(int + + blk_queue_max_sectors(disk->queue, 512); + ++ blk_queue_softirq_done(disk->queue, cciss_softirq_done); ++ + disk->queue->queuedata = hba[ctlr]; + + blk_queue_hardsect_size(disk->queue, +@@ -2148,20 +2197,6 @@ static void start_io( ctlr_info_t *h) + addQ (&(h->cmpQ), c); + } + } +- +-static inline void complete_buffers(struct bio *bio, int status) +-{ +- while (bio) { +- struct bio *xbh = bio->bi_next; +- int nr_sectors = bio_sectors(bio); +- +- bio->bi_next = NULL; +- blk_finished_io(len); +- bio_endio(bio, nr_sectors << 9, status ? 
0 : -EIO); +- bio = xbh; +- } +- +-} + /* Assumes that CCISS_LOCK(h->ctlr) is held. */ + /* Zeros out the error record and then resends the command back */ + /* to the controller */ +@@ -2179,39 +2214,6 @@ static inline void resend_cciss_cmd( ctl + start_io(h); + } + +-static void cciss_softirq_done(struct request *rq) +-{ +- CommandList_struct *cmd = rq->completion_data; +- ctlr_info_t *h = hba[cmd->ctlr]; +- unsigned long flags; +- u64bit temp64; +- int i, ddir; +- +- if (cmd->Request.Type.Direction == XFER_READ) +- ddir = PCI_DMA_FROMDEVICE; +- else +- ddir = PCI_DMA_TODEVICE; +- +- /* command did not need to be retried */ +- /* unmap the DMA mapping for all the scatter gather elements */ +- for(i=0; i<cmd->Header.SGList; i++) { +- temp64.val32.lower = cmd->SG[i].Addr.lower; +- temp64.val32.upper = cmd->SG[i].Addr.upper; +- pci_unmap_page(h->pdev, temp64.val, cmd->SG[i].Len, ddir); +- } +- +- complete_buffers(rq->bio, rq->errors); +- +-#ifdef CCISS_DEBUG +- printk("Done with %p\n", rq); +-#endif /* CCISS_DEBUG */ +- +- spin_lock_irqsave(&h->lock, flags); +- end_that_request_last(rq, rq->errors); +- cmd_free(h, cmd,1); +- spin_unlock_irqrestore(&h->lock, flags); +-} +- + /* checks the status of the job and calls complete buffers to mark all + * buffers for the completed job. Note that this function does not need + * to hold the hba/queue lock. 
+@@ -3269,8 +3271,8 @@ clean2: + unregister_blkdev(hba[i]->major, hba[i]->devname); + clean1: + release_io_mem(hba[i]); +- free_hba(i); + hba[i]->busy_initializing = 0; ++ free_hba(i); + return(-1); + } + +diff -upr linux-2.6.16.orig/drivers/block/ub.c linux-2.6.16-026test015/drivers/block/ub.c +--- linux-2.6.16.orig/drivers/block/ub.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/block/ub.c 2006-07-04 14:41:36.000000000 +0400 +@@ -704,6 +704,9 @@ static void ub_cleanup(struct ub_dev *sc + kfree(lun); + } + ++ usb_set_intfdata(sc->intf, NULL); ++ usb_put_intf(sc->intf); ++ usb_put_dev(sc->dev); + kfree(sc); + } + +@@ -2428,7 +2431,12 @@ static int ub_probe(struct usb_interface + // sc->ifnum = intf->cur_altsetting->desc.bInterfaceNumber; + usb_set_intfdata(intf, sc); + usb_get_dev(sc->dev); +- // usb_get_intf(sc->intf); /* Do we need this? */ ++ /* ++ * Since we give the interface struct to the block level through ++ * disk->driverfs_dev, we have to pin it. Otherwise, block_uevent ++ * oopses on close after a disconnect (kernels 2.6.16 and up). 
++ */ ++ usb_get_intf(sc->intf); + + snprintf(sc->name, 12, DRV_NAME "(%d.%d)", + sc->dev->bus->busnum, sc->dev->devnum); +@@ -2509,7 +2517,7 @@ static int ub_probe(struct usb_interface + err_diag: + err_dev_desc: + usb_set_intfdata(intf, NULL); +- // usb_put_intf(sc->intf); ++ usb_put_intf(sc->intf); + usb_put_dev(sc->dev); + kfree(sc); + err_core: +@@ -2688,12 +2696,6 @@ static void ub_disconnect(struct usb_int + */ + + device_remove_file(&sc->intf->dev, &dev_attr_diag); +- usb_set_intfdata(intf, NULL); +- // usb_put_intf(sc->intf); +- sc->intf = NULL; +- usb_put_dev(sc->dev); +- sc->dev = NULL; +- + ub_put(sc); + } + +diff -upr linux-2.6.16.orig/drivers/char/Kconfig linux-2.6.16-026test015/drivers/char/Kconfig +--- linux-2.6.16.orig/drivers/char/Kconfig 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/char/Kconfig 2006-07-04 14:41:36.000000000 +0400 +@@ -187,6 +187,7 @@ config MOXA_SMARTIO + config ISI + tristate "Multi-Tech multiport card support (EXPERIMENTAL)" + depends on SERIAL_NONSTANDARD ++ select FW_LOADER + help + This is a driver for the Multi-Tech cards which provide several + serial ports. The driver is experimental and can currently only be +diff -upr linux-2.6.16.orig/drivers/char/agp/efficeon-agp.c linux-2.6.16-026test015/drivers/char/agp/efficeon-agp.c +--- linux-2.6.16.orig/drivers/char/agp/efficeon-agp.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/char/agp/efficeon-agp.c 2006-07-04 14:41:36.000000000 +0400 +@@ -64,6 +64,12 @@ static struct gatt_mask efficeon_generic + {.mask = 0x00000001, .type = 0} + }; + ++/* This function does the same thing as mask_memory() for this chipset... 
*/ ++static inline unsigned long efficeon_mask_memory(unsigned long addr) ++{ ++ return addr | 0x00000001; ++} ++ + static struct aper_size_info_lvl2 efficeon_generic_sizes[4] = + { + {256, 65536, 0}, +@@ -251,7 +257,7 @@ static int efficeon_insert_memory(struct + last_page = NULL; + for (i = 0; i < count; i++) { + int index = pg_start + i; +- unsigned long insert = mem->memory[i]; ++ unsigned long insert = efficeon_mask_memory(mem->memory[i]); + + page = (unsigned int *) efficeon_private.l1_table[index >> 10]; + +diff -upr linux-2.6.16.orig/drivers/char/cs5535_gpio.c linux-2.6.16-026test015/drivers/char/cs5535_gpio.c +--- linux-2.6.16.orig/drivers/char/cs5535_gpio.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/char/cs5535_gpio.c 2006-07-04 14:41:36.000000000 +0400 +@@ -241,9 +241,10 @@ static int __init cs5535_gpio_init(void) + static void __exit cs5535_gpio_cleanup(void) + { + dev_t dev_id = MKDEV(major, 0); ++ ++ cdev_del(&cs5535_gpio_cdev); + unregister_chrdev_region(dev_id, CS5535_GPIO_COUNT); +- if (gpio_base != 0) +- release_region(gpio_base, CS5535_GPIO_SIZE); ++ release_region(gpio_base, CS5535_GPIO_SIZE); + } + + module_init(cs5535_gpio_init); +diff -upr linux-2.6.16.orig/drivers/char/ipmi/ipmi_bt_sm.c linux-2.6.16-026test015/drivers/char/ipmi/ipmi_bt_sm.c +--- linux-2.6.16.orig/drivers/char/ipmi/ipmi_bt_sm.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/char/ipmi/ipmi_bt_sm.c 2006-07-04 14:41:36.000000000 +0400 +@@ -165,7 +165,7 @@ static int bt_start_transaction(struct s + { + unsigned int i; + +- if ((size < 2) || (size > IPMI_MAX_MSG_LENGTH)) ++ if ((size < 2) || (size > (IPMI_MAX_MSG_LENGTH - 2))) + return -1; + + if ((bt->state != BT_STATE_IDLE) && (bt->state != BT_STATE_HOSED)) +diff -upr linux-2.6.16.orig/drivers/char/pcmcia/cm4000_cs.c linux-2.6.16-026test015/drivers/char/pcmcia/cm4000_cs.c +--- linux-2.6.16.orig/drivers/char/pcmcia/cm4000_cs.c 2006-03-20 08:53:29.000000000 +0300 ++++ 
linux-2.6.16-026test015/drivers/char/pcmcia/cm4000_cs.c 2006-07-04 14:41:36.000000000 +0400 +@@ -2010,10 +2010,6 @@ static int __init cmm_init(void) + if (!cmm_class) + return -1; + +- rc = pcmcia_register_driver(&cm4000_driver); +- if (rc < 0) +- return rc; +- + major = register_chrdev(0, DEVICE_NAME, &cm4000_fops); + if (major < 0) { + printk(KERN_WARNING MODULE_NAME +@@ -2021,6 +2017,12 @@ static int __init cmm_init(void) + return -1; + } + ++ rc = pcmcia_register_driver(&cm4000_driver); ++ if (rc < 0) { ++ unregister_chrdev(major, DEVICE_NAME); ++ return rc; ++ } ++ + return 0; + } + +diff -upr linux-2.6.16.orig/drivers/char/pcmcia/cm4040_cs.c linux-2.6.16-026test015/drivers/char/pcmcia/cm4040_cs.c +--- linux-2.6.16.orig/drivers/char/pcmcia/cm4040_cs.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/char/pcmcia/cm4040_cs.c 2006-07-04 14:41:36.000000000 +0400 +@@ -769,16 +769,19 @@ static int __init cm4040_init(void) + if (!cmx_class) + return -1; + +- rc = pcmcia_register_driver(&reader_driver); +- if (rc < 0) +- return rc; +- + major = register_chrdev(0, DEVICE_NAME, &reader_fops); + if (major < 0) { + printk(KERN_WARNING MODULE_NAME + ": could not get major number\n"); + return -1; + } ++ ++ rc = pcmcia_register_driver(&reader_driver); ++ if (rc < 0) { ++ unregister_chrdev(major, DEVICE_NAME); ++ return rc; ++ } ++ + return 0; + } + +diff -upr linux-2.6.16.orig/drivers/char/pty.c linux-2.6.16-026test015/drivers/char/pty.c +--- linux-2.6.16.orig/drivers/char/pty.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/char/pty.c 2006-07-04 14:41:38.000000000 +0400 +@@ -32,16 +32,30 @@ + #include <linux/bitops.h> + #include <linux/devpts_fs.h> + ++#include <ub/ub_misc.h> ++ + /* These are global because they are accessed in tty_io.c */ + #ifdef CONFIG_UNIX98_PTYS + struct tty_driver *ptm_driver; +-static struct tty_driver *pts_driver; ++struct tty_driver *pts_driver; ++EXPORT_SYMBOL(ptm_driver); 
++EXPORT_SYMBOL(pts_driver); ++ ++void prepare_pty(void) ++{ ++#ifdef CONFIG_VE ++ get_ve0()->ptm_driver = ptm_driver; ++ /* don't clean ptm_driver and co. here, they are used in vecalls.c */ ++#endif ++} + #endif + + static void pty_close(struct tty_struct * tty, struct file * filp) + { + if (!tty) + return; ++ ++ ub_pty_uncharge(tty); + if (tty->driver->subtype == PTY_TYPE_MASTER) { + if (tty->count > 1) + printk("master pty_close: count = %d!!\n", tty->count); +@@ -61,8 +75,12 @@ static void pty_close(struct tty_struct + if (tty->driver->subtype == PTY_TYPE_MASTER) { + set_bit(TTY_OTHER_CLOSED, &tty->flags); + #ifdef CONFIG_UNIX98_PTYS +- if (tty->driver == ptm_driver) ++ if (tty->driver->flags & TTY_DRIVER_DEVPTS_MEM) { ++ struct ve_struct *old_env; ++ old_env = set_exec_env(VE_OWNER_TTY(tty)); + devpts_pty_kill(tty->index); ++ (void)set_exec_env(old_env); ++ } + #endif + tty_vhangup(tty->link); + } +@@ -212,6 +230,10 @@ static int pty_open(struct tty_struct *t + if (tty->link->count != 1) + goto out; + ++ retval = -ENODEV; ++ if (ub_pty_charge(tty)) ++ goto out; ++ + clear_bit(TTY_OTHER_CLOSED, &tty->link->flags); + set_bit(TTY_THROTTLED, &tty->flags); + set_bit(TTY_DO_WRITE_WAKEUP, &tty->flags); +@@ -239,7 +261,9 @@ static struct tty_operations pty_ops = { + + /* Traditional BSD devices */ + #ifdef CONFIG_LEGACY_PTYS +-static struct tty_driver *pty_driver, *pty_slave_driver; ++struct tty_driver *pty_driver, *pty_slave_driver; ++EXPORT_SYMBOL(pty_driver); ++EXPORT_SYMBOL(pty_slave_driver); + + static int pty_bsd_ioctl(struct tty_struct *tty, struct file *file, + unsigned int cmd, unsigned long arg) +@@ -397,6 +421,7 @@ static void __init unix98_pty_init(void) + panic("Couldn't register Unix98 pts driver"); + + pty_table[1].data = &ptm_driver->refcount; ++ prepare_pty(); + } + #else + static inline void unix98_pty_init(void) { } +diff -upr linux-2.6.16.orig/drivers/char/snsc.c linux-2.6.16-026test015/drivers/char/snsc.c +--- 
linux-2.6.16.orig/drivers/char/snsc.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/char/snsc.c 2006-07-04 14:41:36.000000000 +0400 +@@ -391,7 +391,8 @@ scdrv_init(void) + format_module_id(devnamep, geo_module(geoid), + MODULE_FORMAT_BRIEF); + devnamep = devname + strlen(devname); +- sprintf(devnamep, "#%d", geo_slab(geoid)); ++ sprintf(devnamep, "^%d#%d", geo_slot(geoid), ++ geo_slab(geoid)); + + /* allocate sysctl device data */ + scd = kmalloc(sizeof (struct sysctl_data_s), +diff -upr linux-2.6.16.orig/drivers/char/snsc_event.c linux-2.6.16-026test015/drivers/char/snsc_event.c +--- linux-2.6.16.orig/drivers/char/snsc_event.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/char/snsc_event.c 2006-07-04 14:41:38.000000000 +0400 +@@ -206,7 +206,7 @@ scdrv_dispatch_event(char *event, int le + + /* first find init's task */ + read_lock(&tasklist_lock); +- for_each_process(p) { ++ for_each_process_all(p) { + if (p->pid == 1) + break; + } +diff -upr linux-2.6.16.orig/drivers/char/sonypi.c linux-2.6.16-026test015/drivers/char/sonypi.c +--- linux-2.6.16.orig/drivers/char/sonypi.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/char/sonypi.c 2006-07-04 14:41:36.000000000 +0400 +@@ -1341,6 +1341,9 @@ static int __devinit sonypi_probe(struct + else if ((pcidev = pci_get_device(PCI_VENDOR_ID_INTEL, + PCI_DEVICE_ID_INTEL_ICH6_1, NULL))) + sonypi_device.model = SONYPI_DEVICE_MODEL_TYPE3; ++ else if ((pcidev = pci_get_device(PCI_VENDOR_ID_INTEL, ++ PCI_DEVICE_ID_INTEL_ICH7_1, NULL))) ++ sonypi_device.model = SONYPI_DEVICE_MODEL_TYPE3; + else + sonypi_device.model = SONYPI_DEVICE_MODEL_TYPE2; + +diff -upr linux-2.6.16.orig/drivers/char/sysrq.c linux-2.6.16-026test015/drivers/char/sysrq.c +--- linux-2.6.16.orig/drivers/char/sysrq.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/char/sysrq.c 2006-07-04 14:41:39.000000000 +0400 +@@ -174,8 +174,13 @@ static struct sysrq_key_op 
sysrq_showloc + static void sysrq_handle_showregs(int key, struct pt_regs *pt_regs, + struct tty_struct *tty) + { ++ bust_spinlocks(1); + if (pt_regs) + show_regs(pt_regs); ++ bust_spinlocks(0); ++#if defined(__i386__) || defined(__x86_64__) ++ smp_nmi_call_function(smp_show_regs, NULL, 0); ++#endif + } + static struct sysrq_key_op sysrq_showregs_op = { + .handler = sysrq_handle_showregs, +@@ -221,7 +226,7 @@ static void send_sig_all(int sig) + { + struct task_struct *p; + +- for_each_process(p) { ++ for_each_process_all(p) { + if (p->mm && p->pid != 1) + /* Not swapper, init nor kernel thread */ + force_sig(sig, p); +@@ -272,6 +277,19 @@ static struct sysrq_key_op sysrq_kill_op + .enable_mask = SYSRQ_ENABLE_SIGNAL, + }; + ++#ifdef CONFIG_SCHED_VCPU ++static void sysrq_handle_vschedstate(int key, struct pt_regs *pt_regs, ++ struct tty_struct *tty) ++{ ++ show_vsched(); ++} ++static struct sysrq_key_op sysrq_vschedstate_op = { ++ .handler = sysrq_handle_vschedstate, ++ .help_msg = "vsced_stAte", ++ .action_msg = "Show Vsched", ++}; ++#endif ++ + /* END SIGNAL SYSRQ HANDLERS BLOCK */ + + static void sysrq_handle_unrt(int key, struct pt_regs *pt_regs, +@@ -300,9 +318,13 @@ static struct sysrq_key_op *sysrq_key_ta + /* 7 */ &sysrq_loglevel_op, + /* 8 */ &sysrq_loglevel_op, + /* 9 */ &sysrq_loglevel_op, ++#ifdef CONFIG_SCHED_VCPU ++/* a */ &sysrq_vschedstate_op, ++#else + /* a */ NULL, /* Don't use for system provided sysrqs, + it is handled specially on the sparc + and will never arrive */ ++#endif + /* b */ &sysrq_reboot_op, + #ifdef CONFIG_KEXEC + /* c */ &sysrq_crashdump_op, +diff -upr linux-2.6.16.orig/drivers/char/tipar.c linux-2.6.16-026test015/drivers/char/tipar.c +--- linux-2.6.16.orig/drivers/char/tipar.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/char/tipar.c 2006-07-04 14:41:36.000000000 +0400 +@@ -515,7 +515,7 @@ tipar_init_module(void) + err = PTR_ERR(tipar_class); + goto out_chrdev; + } +- if 
(parport_register_driver(&tipar_driver) || tp_count == 0) { ++ if (parport_register_driver(&tipar_driver)) { + printk(KERN_ERR "tipar: unable to register with parport\n"); + err = -EIO; + goto out_class; +diff -upr linux-2.6.16.orig/drivers/char/tlclk.c linux-2.6.16-026test015/drivers/char/tlclk.c +--- linux-2.6.16.orig/drivers/char/tlclk.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/char/tlclk.c 2006-07-04 14:41:36.000000000 +0400 +@@ -327,7 +327,7 @@ static ssize_t store_received_ref_clk3a( + return strnlen(buf, count); + } + +-static DEVICE_ATTR(received_ref_clk3a, S_IWUGO, NULL, ++static DEVICE_ATTR(received_ref_clk3a, (S_IWUSR|S_IWGRP), NULL, + store_received_ref_clk3a); + + +@@ -349,7 +349,7 @@ static ssize_t store_received_ref_clk3b( + return strnlen(buf, count); + } + +-static DEVICE_ATTR(received_ref_clk3b, S_IWUGO, NULL, ++static DEVICE_ATTR(received_ref_clk3b, (S_IWUSR|S_IWGRP), NULL, + store_received_ref_clk3b); + + +@@ -371,7 +371,7 @@ static ssize_t store_enable_clk3b_output + return strnlen(buf, count); + } + +-static DEVICE_ATTR(enable_clk3b_output, S_IWUGO, NULL, ++static DEVICE_ATTR(enable_clk3b_output, (S_IWUSR|S_IWGRP), NULL, + store_enable_clk3b_output); + + static ssize_t store_enable_clk3a_output(struct device *d, +@@ -392,7 +392,7 @@ static ssize_t store_enable_clk3a_output + return strnlen(buf, count); + } + +-static DEVICE_ATTR(enable_clk3a_output, S_IWUGO, NULL, ++static DEVICE_ATTR(enable_clk3a_output, (S_IWUSR|S_IWGRP), NULL, + store_enable_clk3a_output); + + static ssize_t store_enable_clkb1_output(struct device *d, +@@ -413,7 +413,7 @@ static ssize_t store_enable_clkb1_output + return strnlen(buf, count); + } + +-static DEVICE_ATTR(enable_clkb1_output, S_IWUGO, NULL, ++static DEVICE_ATTR(enable_clkb1_output, (S_IWUSR|S_IWGRP), NULL, + store_enable_clkb1_output); + + +@@ -435,7 +435,7 @@ static ssize_t store_enable_clka1_output + return strnlen(buf, count); + } + +-static DEVICE_ATTR(enable_clka1_output, 
S_IWUGO, NULL, ++static DEVICE_ATTR(enable_clka1_output, (S_IWUSR|S_IWGRP), NULL, + store_enable_clka1_output); + + static ssize_t store_enable_clkb0_output(struct device *d, +@@ -456,7 +456,7 @@ static ssize_t store_enable_clkb0_output + return strnlen(buf, count); + } + +-static DEVICE_ATTR(enable_clkb0_output, S_IWUGO, NULL, ++static DEVICE_ATTR(enable_clkb0_output, (S_IWUSR|S_IWGRP), NULL, + store_enable_clkb0_output); + + static ssize_t store_enable_clka0_output(struct device *d, +@@ -477,7 +477,7 @@ static ssize_t store_enable_clka0_output + return strnlen(buf, count); + } + +-static DEVICE_ATTR(enable_clka0_output, S_IWUGO, NULL, ++static DEVICE_ATTR(enable_clka0_output, (S_IWUSR|S_IWGRP), NULL, + store_enable_clka0_output); + + static ssize_t store_select_amcb2_transmit_clock(struct device *d, +@@ -519,7 +519,7 @@ static ssize_t store_select_amcb2_transm + return strnlen(buf, count); + } + +-static DEVICE_ATTR(select_amcb2_transmit_clock, S_IWUGO, NULL, ++static DEVICE_ATTR(select_amcb2_transmit_clock, (S_IWUSR|S_IWGRP), NULL, + store_select_amcb2_transmit_clock); + + static ssize_t store_select_amcb1_transmit_clock(struct device *d, +@@ -560,7 +560,7 @@ static ssize_t store_select_amcb1_transm + return strnlen(buf, count); + } + +-static DEVICE_ATTR(select_amcb1_transmit_clock, S_IWUGO, NULL, ++static DEVICE_ATTR(select_amcb1_transmit_clock, (S_IWUSR|S_IWGRP), NULL, + store_select_amcb1_transmit_clock); + + static ssize_t store_select_redundant_clock(struct device *d, +@@ -581,7 +581,7 @@ static ssize_t store_select_redundant_cl + return strnlen(buf, count); + } + +-static DEVICE_ATTR(select_redundant_clock, S_IWUGO, NULL, ++static DEVICE_ATTR(select_redundant_clock, (S_IWUSR|S_IWGRP), NULL, + store_select_redundant_clock); + + static ssize_t store_select_ref_frequency(struct device *d, +@@ -602,7 +602,7 @@ static ssize_t store_select_ref_frequenc + return strnlen(buf, count); + } + +-static DEVICE_ATTR(select_ref_frequency, S_IWUGO, NULL, ++static 
DEVICE_ATTR(select_ref_frequency, (S_IWUSR|S_IWGRP), NULL, + store_select_ref_frequency); + + static ssize_t store_filter_select(struct device *d, +@@ -623,7 +623,7 @@ static ssize_t store_filter_select(struc + return strnlen(buf, count); + } + +-static DEVICE_ATTR(filter_select, S_IWUGO, NULL, store_filter_select); ++static DEVICE_ATTR(filter_select, (S_IWUSR|S_IWGRP), NULL, store_filter_select); + + static ssize_t store_hardware_switching_mode(struct device *d, + struct device_attribute *attr, const char *buf, size_t count) +@@ -643,7 +643,7 @@ static ssize_t store_hardware_switching_ + return strnlen(buf, count); + } + +-static DEVICE_ATTR(hardware_switching_mode, S_IWUGO, NULL, ++static DEVICE_ATTR(hardware_switching_mode, (S_IWUSR|S_IWGRP), NULL, + store_hardware_switching_mode); + + static ssize_t store_hardware_switching(struct device *d, +@@ -664,7 +664,7 @@ static ssize_t store_hardware_switching( + return strnlen(buf, count); + } + +-static DEVICE_ATTR(hardware_switching, S_IWUGO, NULL, ++static DEVICE_ATTR(hardware_switching, (S_IWUSR|S_IWGRP), NULL, + store_hardware_switching); + + static ssize_t store_refalign (struct device *d, +@@ -684,7 +684,7 @@ static ssize_t store_refalign (struct de + return strnlen(buf, count); + } + +-static DEVICE_ATTR(refalign, S_IWUGO, NULL, store_refalign); ++static DEVICE_ATTR(refalign, (S_IWUSR|S_IWGRP), NULL, store_refalign); + + static ssize_t store_mode_select (struct device *d, + struct device_attribute *attr, const char *buf, size_t count) +@@ -704,7 +704,7 @@ static ssize_t store_mode_select (struct + return strnlen(buf, count); + } + +-static DEVICE_ATTR(mode_select, S_IWUGO, NULL, store_mode_select); ++static DEVICE_ATTR(mode_select, (S_IWUSR|S_IWGRP), NULL, store_mode_select); + + static ssize_t store_reset (struct device *d, + struct device_attribute *attr, const char *buf, size_t count) +@@ -724,7 +724,7 @@ static ssize_t store_reset (struct devic + return strnlen(buf, count); + } + +-static DEVICE_ATTR(reset, 
S_IWUGO, NULL, store_reset); ++static DEVICE_ATTR(reset, (S_IWUSR|S_IWGRP), NULL, store_reset); + + static struct attribute *tlclk_sysfs_entries[] = { + &dev_attr_current_ref.attr, +@@ -767,6 +767,7 @@ static int __init tlclk_init(void) + printk(KERN_ERR "tlclk: can't get major %d.\n", tlclk_major); + return ret; + } ++ tlclk_major = ret; + alarm_events = kzalloc( sizeof(struct tlclk_alarms), GFP_KERNEL); + if (!alarm_events) + goto out1; +diff -upr linux-2.6.16.orig/drivers/char/tty_io.c linux-2.6.16-026test015/drivers/char/tty_io.c +--- linux-2.6.16.orig/drivers/char/tty_io.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/char/tty_io.c 2006-07-04 14:41:38.000000000 +0400 +@@ -86,6 +86,7 @@ + #include <linux/string.h> + #include <linux/slab.h> + #include <linux/poll.h> ++#include <linux/ve_owner.h> + #include <linux/proc_fs.h> + #include <linux/init.h> + #include <linux/module.h> +@@ -105,6 +106,7 @@ + #include <linux/devfs_fs_kernel.h> + + #include <linux/kmod.h> ++#include <ub/ub_mem.h> + + #undef TTY_DEBUG_HANGUP + +@@ -122,11 +124,16 @@ struct termios tty_std_termios = { /* fo + + EXPORT_SYMBOL(tty_std_termios); + ++/* this lock protects tty_drivers list, this pretty guys do no locking */ ++rwlock_t tty_driver_guard = RW_LOCK_UNLOCKED; ++EXPORT_SYMBOL(tty_driver_guard); ++ + /* This list gets poked at by procfs and various bits of boot up code. This + could do with some rationalisation such as pulling the tty proc function + into this file */ + + LIST_HEAD(tty_drivers); /* linked list of tty drivers */ ++EXPORT_SYMBOL(tty_drivers); + + /* Semaphore to protect creating and releasing a tty. 
This is shared with + vt.c for deeply disgusting hack reasons */ +@@ -136,6 +143,15 @@ DECLARE_MUTEX(tty_sem); + extern struct tty_driver *ptm_driver; /* Unix98 pty masters; for /dev/ptmx */ + extern int pty_limit; /* Config limit on Unix98 ptys */ + static DEFINE_IDR(allocated_ptys); ++#ifdef CONFIG_VE ++#define __ve_allocated_ptys(ve) (*((ve)->allocated_ptys)) ++#define ve_allocated_ptys __ve_allocated_ptys(get_exec_env()) ++#define ve_ptm_driver (get_exec_env()->ptm_driver) ++#else ++#define __ve_allocated_ptys(ve) allocated_ptys ++#define ve_allocated_ptys allocated_ptys ++#define ve_ptm_driver ptm_driver ++#endif + static DECLARE_MUTEX(allocated_ptys_lock); + static int ptmx_open(struct inode *, struct file *); + #endif +@@ -156,11 +172,25 @@ static int tty_fasync(int fd, struct fil + static void release_mem(struct tty_struct *tty, int idx); + + ++DCL_VE_OWNER(TTYDRV, struct tty_driver, owner_env) ++DCL_VE_OWNER(TTY, struct tty_struct, owner_env) ++ ++void prepare_tty(void) ++{ ++#ifdef CONFIG_VE ++ get_ve0()->allocated_ptys = &allocated_ptys; ++ /* ++ * in this case, tty_register_driver() setups ++ * owner_env correctly right from the bootup ++ */ ++#endif ++} ++ + static struct tty_struct *alloc_tty_struct(void) + { + struct tty_struct *tty; + +- tty = kmalloc(sizeof(struct tty_struct), GFP_KERNEL); ++ tty = ub_kmalloc(sizeof(struct tty_struct), GFP_KERNEL); + if (tty) + memset(tty, 0, sizeof(struct tty_struct)); + return tty; +@@ -857,14 +887,37 @@ static struct tty_driver *get_tty_driver + { + struct tty_driver *p; + ++ read_lock(&tty_driver_guard); + list_for_each_entry(p, &tty_drivers, tty_drivers) { + dev_t base = MKDEV(p->major, p->minor_start); + if (device < base || device >= base + p->num) + continue; + *index = device - base; +- return p; ++#ifdef CONFIG_VE ++ if (in_interrupt()) ++ goto found; ++ if (p->major!=PTY_MASTER_MAJOR && p->major!=PTY_SLAVE_MAJOR ++#ifdef CONFIG_UNIX98_PTYS ++ && (p->major<UNIX98_PTY_MASTER_MAJOR || ++ 
p->major>UNIX98_PTY_MASTER_MAJOR+UNIX98_PTY_MAJOR_COUNT-1) && ++ (p->major<UNIX98_PTY_SLAVE_MAJOR || ++ p->major>UNIX98_PTY_SLAVE_MAJOR+UNIX98_PTY_MAJOR_COUNT-1) ++#endif ++ ) goto found; ++ if (ve_is_super(VE_OWNER_TTYDRV(p)) && ++ ve_is_super(get_exec_env())) ++ goto found; ++ if (!ve_accessible_strict(VE_OWNER_TTYDRV(p), get_exec_env())) ++ continue; ++#endif ++ goto found; + } ++ read_unlock(&tty_driver_guard); + return NULL; ++ ++found: ++ read_unlock(&tty_driver_guard); ++ return p; + } + + /* +@@ -1092,7 +1145,7 @@ static void do_tty_hangup(void *data) + + read_lock(&tasklist_lock); + if (tty->session > 0) { +- do_each_task_pid(tty->session, PIDTYPE_SID, p) { ++ do_each_task_pid_all(tty->session, PIDTYPE_SID, p) { + if (p->signal->tty == tty) + p->signal->tty = NULL; + if (!p->signal->leader) +@@ -1101,7 +1154,7 @@ static void do_tty_hangup(void *data) + send_group_sig_info(SIGCONT, SEND_SIG_PRIV, p); + if (tty->pgrp > 0) + p->signal->tty_old_pgrp = tty->pgrp; +- } while_each_task_pid(tty->session, PIDTYPE_SID, p); ++ } while_each_task_pid_all(tty->session, PIDTYPE_SID, p); + } + read_unlock(&tasklist_lock); + +@@ -1218,9 +1271,9 @@ void disassociate_ctty(int on_exit) + + /* Now clear signal->tty under the lock */ + read_lock(&tasklist_lock); +- do_each_task_pid(current->signal->session, PIDTYPE_SID, p) { ++ do_each_task_pid_all(current->signal->session, PIDTYPE_SID, p) { + p->signal->tty = NULL; +- } while_each_task_pid(current->signal->session, PIDTYPE_SID, p); ++ } while_each_task_pid_all(current->signal->session, PIDTYPE_SID, p); + read_unlock(&tasklist_lock); + up(&tty_sem); + unlock_kernel(); +@@ -1446,21 +1499,28 @@ static inline void tty_line_name(struct + * really quite straightforward. The semaphore locking can probably be + * relaxed for the (most common) case of reopening a tty. 
+ */ +-static int init_dev(struct tty_driver *driver, int idx, +- struct tty_struct **ret_tty) ++static int init_dev(struct tty_driver *driver, int idx, ++ struct tty_struct *i_tty, struct tty_struct **ret_tty) + { + struct tty_struct *tty, *o_tty; + struct termios *tp, **tp_loc, *o_tp, **o_tp_loc; + struct termios *ltp, **ltp_loc, *o_ltp, **o_ltp_loc; ++ struct ve_struct * owner; + int retval=0; + +- /* check whether we're reopening an existing tty */ +- if (driver->flags & TTY_DRIVER_DEVPTS_MEM) { +- tty = devpts_get_tty(idx); +- if (tty && driver->subtype == PTY_TYPE_MASTER) +- tty = tty->link; +- } else { +- tty = driver->ttys[idx]; ++ owner = VE_OWNER_TTYDRV(driver); ++ ++ if (i_tty) ++ tty = i_tty; ++ else { ++ /* check whether we're reopening an existing tty */ ++ if (driver->flags & TTY_DRIVER_DEVPTS_MEM) { ++ tty = devpts_get_tty(idx); ++ if (tty && driver->subtype == PTY_TYPE_MASTER) ++ tty = tty->link; ++ } else { ++ tty = driver->ttys[idx]; ++ } + } + if (tty) goto fast_track; + +@@ -1488,6 +1548,7 @@ static int init_dev(struct tty_driver *d + tty->driver = driver; + tty->index = idx; + tty_line_name(driver, idx, tty->name); ++ SET_VE_OWNER_TTY(tty, owner); + + if (driver->flags & TTY_DRIVER_DEVPTS_MEM) { + tp_loc = &tty->termios; +@@ -1498,7 +1559,7 @@ static int init_dev(struct tty_driver *d + } + + if (!*tp_loc) { +- tp = (struct termios *) kmalloc(sizeof(struct termios), ++ tp = (struct termios *) ub_kmalloc(sizeof(struct termios), + GFP_KERNEL); + if (!tp) + goto free_mem_out; +@@ -1506,7 +1567,7 @@ static int init_dev(struct tty_driver *d + } + + if (!*ltp_loc) { +- ltp = (struct termios *) kmalloc(sizeof(struct termios), ++ ltp = (struct termios *) ub_kmalloc(sizeof(struct termios), + GFP_KERNEL); + if (!ltp) + goto free_mem_out; +@@ -1521,6 +1582,7 @@ static int init_dev(struct tty_driver *d + o_tty->driver = driver->other; + o_tty->index = idx; + tty_line_name(driver->other, idx, o_tty->name); ++ SET_VE_OWNER_TTY(o_tty, owner); + + if 
(driver->flags & TTY_DRIVER_DEVPTS_MEM) { + o_tp_loc = &o_tty->termios; +@@ -1532,7 +1594,7 @@ static int init_dev(struct tty_driver *d + + if (!*o_tp_loc) { + o_tp = (struct termios *) +- kmalloc(sizeof(struct termios), GFP_KERNEL); ++ ub_kmalloc(sizeof(struct termios), GFP_KERNEL); + if (!o_tp) + goto free_mem_out; + *o_tp = driver->other->init_termios; +@@ -1540,7 +1602,7 @@ static int init_dev(struct tty_driver *d + + if (!*o_ltp_loc) { + o_ltp = (struct termios *) +- kmalloc(sizeof(struct termios), GFP_KERNEL); ++ ub_kmalloc(sizeof(struct termios), GFP_KERNEL); + if (!o_ltp) + goto free_mem_out; + memset(o_ltp, 0, sizeof(struct termios)); +@@ -1558,6 +1620,10 @@ static int init_dev(struct tty_driver *d + *o_ltp_loc = o_ltp; + o_tty->termios = *o_tp_loc; + o_tty->termios_locked = *o_ltp_loc; ++#ifdef CONFIG_VE ++ if (driver->other->refcount == 0) ++ (void)get_ve(owner); ++#endif + driver->other->refcount++; + if (driver->subtype == PTY_TYPE_MASTER) + o_tty->count++; +@@ -1582,6 +1648,10 @@ static int init_dev(struct tty_driver *d + *ltp_loc = ltp; + tty->termios = *tp_loc; + tty->termios_locked = *ltp_loc; ++#ifdef CONFIG_VE ++ if (driver->refcount == 0) ++ (void)get_ve(owner); ++#endif + driver->refcount++; + tty->count++; + +@@ -1692,6 +1762,10 @@ static void release_mem(struct tty_struc + } + o_tty->magic = 0; + o_tty->driver->refcount--; ++#ifdef CONFIG_VE ++ if (o_tty->driver->refcount == 0) ++ put_ve(VE_OWNER_TTY(o_tty)); ++#endif + file_list_lock(); + list_del_init(&o_tty->tty_files); + file_list_unlock(); +@@ -1714,6 +1788,10 @@ static void release_mem(struct tty_struc + + tty->magic = 0; + tty->driver->refcount--; ++#ifdef CONFIG_VE ++ if (tty->driver->refcount == 0) ++ put_ve(VE_OWNER_TTY(tty)); ++#endif + file_list_lock(); + list_del_init(&tty->tty_files); + file_list_unlock(); +@@ -1737,7 +1815,10 @@ static void release_dev(struct file * fi + int idx; + char buf[64]; + unsigned long flags; +- ++#ifdef CONFIG_UNIX98_PTYS ++ struct idr *idr_alloced; 
++#endif ++ + tty = (struct tty_struct *)filp->private_data; + if (tty_paranoia_check(tty, filp->f_dentry->d_inode, "release_dev")) + return; +@@ -1752,6 +1833,9 @@ static void release_dev(struct file * fi + devpts = (tty->driver->flags & TTY_DRIVER_DEVPTS_MEM) != 0; + devpts_master = pty_master && devpts; + o_tty = tty->link; ++#ifdef CONFIG_UNIX98_PTYS ++ idr_alloced = &__ve_allocated_ptys(tty->owner_env); ++#endif + + #ifdef TTY_PARANOIA_CHECK + if (idx < 0 || idx >= tty->driver->num) { +@@ -1924,13 +2008,13 @@ static void release_dev(struct file * fi + struct task_struct *p; + + read_lock(&tasklist_lock); +- do_each_task_pid(tty->session, PIDTYPE_SID, p) { ++ do_each_task_pid_all(tty->session, PIDTYPE_SID, p) { + p->signal->tty = NULL; +- } while_each_task_pid(tty->session, PIDTYPE_SID, p); ++ } while_each_task_pid_all(tty->session, PIDTYPE_SID, p); + if (o_tty) +- do_each_task_pid(o_tty->session, PIDTYPE_SID, p) { ++ do_each_task_pid_all(o_tty->session, PIDTYPE_SID, p) { + p->signal->tty = NULL; +- } while_each_task_pid(o_tty->session, PIDTYPE_SID, p); ++ } while_each_task_pid_all(o_tty->session, PIDTYPE_SID, p); + read_unlock(&tasklist_lock); + } + +@@ -2005,7 +2089,7 @@ static void release_dev(struct file * fi + /* Make this pty number available for reallocation */ + if (devpts) { + down(&allocated_ptys_lock); +- idr_remove(&allocated_ptys, idx); ++ idr_remove(idr_alloced, idx); + up(&allocated_ptys_lock); + } + #endif +@@ -2026,7 +2110,7 @@ static void release_dev(struct file * fi + */ + static int tty_open(struct inode * inode, struct file * filp) + { +- struct tty_struct *tty; ++ struct tty_struct *tty, *c_tty; + int noctty, retval; + struct tty_driver *driver; + int index; +@@ -2039,6 +2123,7 @@ retry_open: + noctty = filp->f_flags & O_NOCTTY; + index = -1; + retval = 0; ++ c_tty = NULL; + + down(&tty_sem); + +@@ -2049,6 +2134,7 @@ retry_open: + } + driver = current->signal->tty->driver; + index = current->signal->tty->index; ++ c_tty = 
current->signal->tty; + filp->f_flags |= O_NONBLOCK; /* Don't let /dev/tty block */ + /* noctty = 1; */ + goto got_driver; +@@ -2056,6 +2142,12 @@ retry_open: + #ifdef CONFIG_VT + if (device == MKDEV(TTY_MAJOR,0)) { + extern struct tty_driver *console_driver; ++#ifdef CONFIG_VE ++ if (!ve_is_super(get_exec_env())) { ++ up(&tty_sem); ++ return -ENODEV; ++ } ++#endif + driver = console_driver; + index = fg_console; + noctty = 1; +@@ -2063,6 +2155,12 @@ retry_open: + } + #endif + if (device == MKDEV(TTYAUX_MAJOR,1)) { ++#ifdef CONFIG_VE ++ if (!ve_is_super(get_exec_env())) { ++ up(&tty_sem); ++ return -ENODEV; ++ } ++#endif + driver = console_device(&index); + if (driver) { + /* Don't let /dev/console block */ +@@ -2080,7 +2178,7 @@ retry_open: + return -ENODEV; + } + got_driver: +- retval = init_dev(driver, index, &tty); ++ retval = init_dev(driver, index, c_tty, &tty); + up(&tty_sem); + if (retval) + return retval; +@@ -2149,11 +2247,11 @@ static int ptmx_open(struct inode * inod + + /* find a device that is not in use. 
*/ + down(&allocated_ptys_lock); +- if (!idr_pre_get(&allocated_ptys, GFP_KERNEL)) { ++ if (!idr_pre_get(&ve_allocated_ptys, GFP_KERNEL)) { + up(&allocated_ptys_lock); + return -ENOMEM; + } +- idr_ret = idr_get_new(&allocated_ptys, NULL, &index); ++ idr_ret = idr_get_new(&ve_allocated_ptys, NULL, &index); + if (idr_ret < 0) { + up(&allocated_ptys_lock); + if (idr_ret == -EAGAIN) +@@ -2161,14 +2259,14 @@ static int ptmx_open(struct inode * inod + return -EIO; + } + if (index >= pty_limit) { +- idr_remove(&allocated_ptys, index); ++ idr_remove(&ve_allocated_ptys, index); + up(&allocated_ptys_lock); + return -EIO; + } + up(&allocated_ptys_lock); + + down(&tty_sem); +- retval = init_dev(ptm_driver, index, &tty); ++ retval = init_dev(ve_ptm_driver, index, NULL, &tty); + up(&tty_sem); + + if (retval) +@@ -2183,14 +2281,14 @@ static int ptmx_open(struct inode * inod + goto out1; + + check_tty_count(tty, "tty_open"); +- retval = ptm_driver->open(tty, filp); ++ retval = ve_ptm_driver->open(tty, filp); + if (!retval) + return 0; + out1: + release_dev(filp); + out: + down(&allocated_ptys_lock); +- idr_remove(&allocated_ptys, index); ++ idr_remove(&ve_allocated_ptys, index); + up(&allocated_ptys_lock); + return retval; + } +@@ -2303,6 +2401,8 @@ static int tioccons(struct file *file) + { + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; ++ if (!ve_is_super(get_exec_env())) ++ return -EACCES; + if (file->f_op->write == redirected_tty_write) { + struct file *f; + spin_lock(&redirect_lock); +@@ -2363,9 +2463,9 @@ static int tiocsctty(struct tty_struct * + */ + + read_lock(&tasklist_lock); +- do_each_task_pid(tty->session, PIDTYPE_SID, p) { ++ do_each_task_pid_all(tty->session, PIDTYPE_SID, p) { + p->signal->tty = NULL; +- } while_each_task_pid(tty->session, PIDTYPE_SID, p); ++ } while_each_task_pid_all(tty->session, PIDTYPE_SID, p); + read_unlock(&tasklist_lock); + } else + return -EPERM; +@@ -2387,7 +2487,7 @@ static int tiocgpgrp(struct tty_struct * + */ + if (tty == real_tty && 
current->signal->tty != real_tty) + return -ENOTTY; +- return put_user(real_tty->pgrp, p); ++ return put_user(pid_type_to_vpid(PIDTYPE_PGID, real_tty->pgrp), p); + } + + static int tiocspgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t __user *p) +@@ -2407,6 +2507,9 @@ static int tiocspgrp(struct tty_struct * + return -EFAULT; + if (pgrp < 0) + return -EINVAL; ++ pgrp = vpid_to_pid(pgrp); ++ if (pgrp < 0) ++ return -EPERM; + if (session_of_pgrp(pgrp) != current->signal->session) + return -EPERM; + real_tty->pgrp = pgrp; +@@ -2423,7 +2526,7 @@ static int tiocgsid(struct tty_struct *t + return -ENOTTY; + if (real_tty->session <= 0) + return -ENOTTY; +- return put_user(real_tty->session, p); ++ return put_user(pid_type_to_vpid(PIDTYPE_SID, real_tty->session), p); + } + + static int tiocsetd(struct tty_struct *tty, int __user *p) +@@ -2696,7 +2799,7 @@ static void __do_SAK(void *arg) + tty->driver->flush_buffer(tty); + + read_lock(&tasklist_lock); +- do_each_task_pid(session, PIDTYPE_SID, p) { ++ do_each_task_pid_all(session, PIDTYPE_SID, p) { + if (p->signal->tty == tty || session > 0) { + printk(KERN_NOTICE "SAK: killed process %d" + " (%s): p->signal->session==tty->session\n", +@@ -2706,7 +2809,11 @@ static void __do_SAK(void *arg) + } + task_lock(p); + if (p->files) { +- rcu_read_lock(); ++ /* ++ * We don't take a ref to the file, so we must ++ * hold ->file_lock instead. 
++ */ ++ spin_lock(&p->files->file_lock); + fdt = files_fdtable(p->files); + for (i=0; i < fdt->max_fds; i++) { + filp = fcheck_files(p->files, i); +@@ -2721,10 +2828,10 @@ static void __do_SAK(void *arg) + break; + } + } +- rcu_read_unlock(); ++ spin_unlock(&p->files->file_lock); + } + task_unlock(p); +- } while_each_task_pid(session, PIDTYPE_SID, p); ++ } while_each_task_pid_all(session, PIDTYPE_SID, p); + read_unlock(&tasklist_lock); + #endif + } +@@ -3095,8 +3202,11 @@ int tty_register_driver(struct tty_drive + + if (!driver->put_char) + driver->put_char = tty_default_put_char; +- ++ ++ SET_VE_OWNER_TTYDRV(driver, get_exec_env()); ++ write_lock_irq(&tty_driver_guard); + list_add(&driver->tty_drivers, &tty_drivers); ++ write_unlock_irq(&tty_driver_guard); + + if ( !(driver->flags & TTY_DRIVER_NO_DEVFS) ) { + for(i = 0; i < driver->num; i++) +@@ -3123,7 +3233,9 @@ int tty_unregister_driver(struct tty_dri + unregister_chrdev_region(MKDEV(driver->major, driver->minor_start), + driver->num); + ++ write_lock_irq(&tty_driver_guard); + list_del(&driver->tty_drivers); ++ write_unlock_irq(&tty_driver_guard); + + /* + * Free the termios and termios_locked structures because +@@ -3246,6 +3358,7 @@ static int __init tty_init(void) + + vty_init(); + #endif ++ prepare_tty(); + return 0; + } + module_init(tty_init); +diff -upr linux-2.6.16.orig/drivers/edac/Kconfig linux-2.6.16-026test015/drivers/edac/Kconfig +--- linux-2.6.16.orig/drivers/edac/Kconfig 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/edac/Kconfig 2006-07-04 14:41:36.000000000 +0400 +@@ -71,7 +71,7 @@ config EDAC_E7XXX + + config EDAC_E752X + tristate "Intel e752x (e7520, e7525, e7320)" +- depends on EDAC_MM_EDAC && PCI ++ depends on EDAC_MM_EDAC && PCI && HOTPLUG + help + Support for error detection and correction on the Intel + E7520, E7525, E7320 server chipsets. 
+diff -upr linux-2.6.16.orig/drivers/i2c/busses/i2c-i801.c linux-2.6.16-026test015/drivers/i2c/busses/i2c-i801.c +--- linux-2.6.16.orig/drivers/i2c/busses/i2c-i801.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/i2c/busses/i2c-i801.c 2006-07-04 14:41:36.000000000 +0400 +@@ -478,6 +478,11 @@ static s32 i801_access(struct i2c_adapte + ret = i801_transaction(); + } + ++ /* Some BIOSes don't like it when PEC is enabled at reboot or resume ++ time, so we forcibly disable it after every transaction. */ ++ if (hwpec) ++ outb_p(0, SMBAUXCTL); ++ + if(block) + return ret; + if(ret) +diff -upr linux-2.6.16.orig/drivers/i2c/busses/scx200_acb.c linux-2.6.16-026test015/drivers/i2c/busses/scx200_acb.c +--- linux-2.6.16.orig/drivers/i2c/busses/scx200_acb.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/i2c/busses/scx200_acb.c 2006-07-04 14:41:36.000000000 +0400 +@@ -440,7 +440,6 @@ static int __init scx200_acb_create(int + struct scx200_acb_iface *iface; + struct i2c_adapter *adapter; + int rc = 0; +- char description[64]; + + iface = kzalloc(sizeof(*iface), GFP_KERNEL); + if (!iface) { +@@ -459,8 +458,7 @@ static int __init scx200_acb_create(int + + init_MUTEX(&iface->sem); + +- snprintf(description, sizeof(description), "NatSemi SCx200 ACCESS.bus [%s]", adapter->name); +- if (request_region(base, 8, description) == 0) { ++ if (!request_region(base, 8, adapter->name)) { + dev_err(&adapter->dev, "can't allocate io 0x%x-0x%x\n", + base, base + 8-1); + rc = -EBUSY; +diff -upr linux-2.6.16.orig/drivers/i2c/chips/m41t00.c linux-2.6.16-026test015/drivers/i2c/chips/m41t00.c +--- linux-2.6.16.orig/drivers/i2c/chips/m41t00.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/i2c/chips/m41t00.c 2006-07-04 14:41:36.000000000 +0400 +@@ -129,13 +129,13 @@ m41t00_set_tlet(ulong arg) + if ((i2c_smbus_write_byte_data(save_client, 0, tm.tm_sec & 0x7f) < 0) + || (i2c_smbus_write_byte_data(save_client, 1, tm.tm_min & 0x7f) 
+ < 0) +- || (i2c_smbus_write_byte_data(save_client, 2, tm.tm_hour & 0x7f) ++ || (i2c_smbus_write_byte_data(save_client, 2, tm.tm_hour & 0x3f) + < 0) +- || (i2c_smbus_write_byte_data(save_client, 4, tm.tm_mday & 0x7f) ++ || (i2c_smbus_write_byte_data(save_client, 4, tm.tm_mday & 0x3f) + < 0) +- || (i2c_smbus_write_byte_data(save_client, 5, tm.tm_mon & 0x7f) ++ || (i2c_smbus_write_byte_data(save_client, 5, tm.tm_mon & 0x1f) + < 0) +- || (i2c_smbus_write_byte_data(save_client, 6, tm.tm_year & 0x7f) ++ || (i2c_smbus_write_byte_data(save_client, 6, tm.tm_year & 0xff) + < 0)) + + dev_warn(&save_client->dev,"m41t00: can't write to rtc chip\n"); +diff -upr linux-2.6.16.orig/drivers/ide/pci/alim15x3.c linux-2.6.16-026test015/drivers/ide/pci/alim15x3.c +--- linux-2.6.16.orig/drivers/ide/pci/alim15x3.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/ide/pci/alim15x3.c 2006-07-04 14:41:36.000000000 +0400 +@@ -731,6 +731,8 @@ static unsigned int __devinit ata66_ali1 + + if(m5229_revision <= 0x20) + tmpbyte = (tmpbyte & (~0x02)) | 0x01; ++ else if (m5229_revision == 0xc7) ++ tmpbyte |= 0x03; + else + tmpbyte |= 0x01; + +diff -upr linux-2.6.16.orig/drivers/ieee1394/ohci1394.c linux-2.6.16-026test015/drivers/ieee1394/ohci1394.c +--- linux-2.6.16.orig/drivers/ieee1394/ohci1394.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/ieee1394/ohci1394.c 2006-07-04 14:41:36.000000000 +0400 +@@ -2525,7 +2525,7 @@ static irqreturn_t ohci_irq_handler(int + if (phys_dma) { + reg_write(ohci,OHCI1394_PhyReqFilterHiSet, 0xffffffff); + reg_write(ohci,OHCI1394_PhyReqFilterLoSet, 0xffffffff); +- reg_write(ohci,OHCI1394_PhyUpperBound, 0xffff0000); ++ reg_write(ohci,OHCI1394_PhyUpperBound, 0x01000000); + } else { + reg_write(ohci,OHCI1394_PhyReqFilterHiSet, 0x00000000); + reg_write(ohci,OHCI1394_PhyReqFilterLoSet, 0x00000000); +diff -upr linux-2.6.16.orig/drivers/ieee1394/sbp2.c linux-2.6.16-026test015/drivers/ieee1394/sbp2.c +--- 
linux-2.6.16.orig/drivers/ieee1394/sbp2.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/ieee1394/sbp2.c 2006-07-04 14:41:36.000000000 +0400 +@@ -495,22 +495,17 @@ static struct sbp2_command_info *sbp2uti + /* + * This function finds the sbp2_command for a given outstanding SCpnt. + * Only looks at the inuse list. ++ * Must be called with scsi_id->sbp2_command_orb_lock held. + */ +-static struct sbp2_command_info *sbp2util_find_command_for_SCpnt(struct scsi_id_instance_data *scsi_id, void *SCpnt) ++static struct sbp2_command_info *sbp2util_find_command_for_SCpnt( ++ struct scsi_id_instance_data *scsi_id, void *SCpnt) + { + struct sbp2_command_info *command; +- unsigned long flags; + +- spin_lock_irqsave(&scsi_id->sbp2_command_orb_lock, flags); +- if (!list_empty(&scsi_id->sbp2_command_orb_inuse)) { +- list_for_each_entry(command, &scsi_id->sbp2_command_orb_inuse, list) { +- if (command->Current_SCpnt == SCpnt) { +- spin_unlock_irqrestore(&scsi_id->sbp2_command_orb_lock, flags); ++ if (!list_empty(&scsi_id->sbp2_command_orb_inuse)) ++ list_for_each_entry(command, &scsi_id->sbp2_command_orb_inuse, list) ++ if (command->Current_SCpnt == SCpnt) + return command; +- } +- } +- } +- spin_unlock_irqrestore(&scsi_id->sbp2_command_orb_lock, flags); + return NULL; + } + +@@ -579,17 +574,15 @@ static void sbp2util_free_command_dma(st + + /* + * This function moves a command to the completed orb list. ++ * Must be called with scsi_id->sbp2_command_orb_lock held. 
+ */ +-static void sbp2util_mark_command_completed(struct scsi_id_instance_data *scsi_id, +- struct sbp2_command_info *command) ++static void sbp2util_mark_command_completed( ++ struct scsi_id_instance_data *scsi_id, ++ struct sbp2_command_info *command) + { +- unsigned long flags; +- +- spin_lock_irqsave(&scsi_id->sbp2_command_orb_lock, flags); + list_del(&command->list); + sbp2util_free_command_dma(command); + list_add_tail(&command->list, &scsi_id->sbp2_command_orb_completed); +- spin_unlock_irqrestore(&scsi_id->sbp2_command_orb_lock, flags); + } + + /* +@@ -761,12 +754,17 @@ static struct scsi_id_instance_data *sbp + + /* Register the status FIFO address range. We could use the same FIFO + * for targets at different nodes. However we need different FIFOs per +- * target in order to support multi-unit devices. */ ++ * target in order to support multi-unit devices. ++ * The FIFO is located out of the local host controller's physical range ++ * but, if possible, within the posted write area. Status writes will ++ * then be performed as unified transactions. This slightly reduces ++ * bandwidth usage, and some Prolific based devices seem to require it. 
++ */ + scsi_id->status_fifo_addr = hpsb_allocate_and_register_addrspace( + &sbp2_highlevel, ud->ne->host, &sbp2_ops, + sizeof(struct sbp2_status_block), sizeof(quadlet_t), +- ~0ULL, ~0ULL); +- if (!scsi_id->status_fifo_addr) { ++ 0x010000000000ULL, CSR1212_ALL_SPACE_END); ++ if (scsi_id->status_fifo_addr == ~0ULL) { + SBP2_ERR("failed to allocate status FIFO address range"); + goto failed_alloc; + } +@@ -2177,7 +2175,9 @@ static int sbp2_handle_status_write(stru + * Matched status with command, now grab scsi command pointers and check status + */ + SCpnt = command->Current_SCpnt; ++ spin_lock_irqsave(&scsi_id->sbp2_command_orb_lock, flags); + sbp2util_mark_command_completed(scsi_id, command); ++ spin_unlock_irqrestore(&scsi_id->sbp2_command_orb_lock, flags); + + if (SCpnt) { + +@@ -2491,9 +2491,20 @@ static int sbp2scsi_slave_alloc(struct s + + static int sbp2scsi_slave_configure(struct scsi_device *sdev) + { ++ struct scsi_id_instance_data *scsi_id = ++ (struct scsi_id_instance_data *)sdev->host->hostdata[0]; ++ + blk_queue_dma_alignment(sdev->request_queue, (512 - 1)); + sdev->use_10_for_rw = 1; + sdev->use_10_for_ms = 1; ++ ++ if ((scsi_id->sbp2_firmware_revision & 0xffff00) == 0x0a2700 && ++ (scsi_id->ud->model_id == 0x000021 /* gen.4 iPod */ || ++ scsi_id->ud->model_id == 0x000023 /* iPod mini */ || ++ scsi_id->ud->model_id == 0x00007e /* iPod Photo */ )) { ++ SBP2_INFO("enabling iPod workaround: decrement disk capacity"); ++ sdev->fix_capacity = 1; ++ } + return 0; + } + +@@ -2513,6 +2524,7 @@ static int sbp2scsi_abort(struct scsi_cm + (struct scsi_id_instance_data *)SCpnt->device->host->hostdata[0]; + struct sbp2scsi_host_info *hi = scsi_id->hi; + struct sbp2_command_info *command; ++ unsigned long flags; + + SBP2_ERR("aborting sbp2 command"); + scsi_print_command(SCpnt); +@@ -2523,6 +2535,7 @@ static int sbp2scsi_abort(struct scsi_cm + * Right now, just return any matching command structures + * to the free pool. 
+ */ ++ spin_lock_irqsave(&scsi_id->sbp2_command_orb_lock, flags); + command = sbp2util_find_command_for_SCpnt(scsi_id, SCpnt); + if (command) { + SBP2_DEBUG("Found command to abort"); +@@ -2540,6 +2553,7 @@ static int sbp2scsi_abort(struct scsi_cm + command->Current_done(command->Current_SCpnt); + } + } ++ spin_unlock_irqrestore(&scsi_id->sbp2_command_orb_lock, flags); + + /* + * Initiate a fetch agent reset. +diff -upr linux-2.6.16.orig/drivers/input/mouse/psmouse-base.c linux-2.6.16-026test015/drivers/input/mouse/psmouse-base.c +--- linux-2.6.16.orig/drivers/input/mouse/psmouse-base.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/input/mouse/psmouse-base.c 2006-07-04 14:41:36.000000000 +0400 +@@ -300,8 +300,10 @@ static irqreturn_t psmouse_interrupt(str + * Check if this is a new device announcement (0xAA 0x00) + */ + if (unlikely(psmouse->packet[0] == PSMOUSE_RET_BAT && psmouse->pktcnt <= 2)) { +- if (psmouse->pktcnt == 1) ++ if (psmouse->pktcnt == 1) { ++ psmouse->last = jiffies; + goto out; ++ } + + if (psmouse->packet[1] == PSMOUSE_RET_ID) { + __psmouse_set_state(psmouse, PSMOUSE_IGNORE); +diff -upr linux-2.6.16.orig/drivers/macintosh/therm_adt746x.c linux-2.6.16-026test015/drivers/macintosh/therm_adt746x.c +--- linux-2.6.16.orig/drivers/macintosh/therm_adt746x.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/macintosh/therm_adt746x.c 2006-07-04 14:41:36.000000000 +0400 +@@ -627,8 +627,8 @@ thermostat_init(void) + if(therm_type == ADT7460) + device_create_file(&of_dev->dev, &dev_attr_sensor2_fan_speed); + +-#ifndef CONFIG_I2C_KEYWEST +- request_module("i2c-keywest"); ++#ifndef CONFIG_I2C_POWERMAC ++ request_module("i2c-powermac"); + #endif + + return i2c_add_driver(&thermostat_driver); +diff -upr linux-2.6.16.orig/drivers/md/dm-snap.c linux-2.6.16-026test015/drivers/md/dm-snap.c +--- linux-2.6.16.orig/drivers/md/dm-snap.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/md/dm-snap.c 
2006-07-04 14:41:36.000000000 +0400 +@@ -542,8 +542,12 @@ static void snapshot_dtr(struct dm_targe + { + struct dm_snapshot *s = (struct dm_snapshot *) ti->private; + ++ /* Prevent further origin writes from using this snapshot. */ ++ /* After this returns there can be no new kcopyd jobs. */ + unregister_snapshot(s); + ++ kcopyd_client_destroy(s->kcopyd_client); ++ + exit_exception_table(&s->pending, pending_cache); + exit_exception_table(&s->complete, exception_cache); + +@@ -552,7 +556,7 @@ static void snapshot_dtr(struct dm_targe + + dm_put_device(ti, s->origin); + dm_put_device(ti, s->cow); +- kcopyd_client_destroy(s->kcopyd_client); ++ + kfree(s); + } + +diff -upr linux-2.6.16.orig/drivers/md/dm.c linux-2.6.16-026test015/drivers/md/dm.c +--- linux-2.6.16.orig/drivers/md/dm.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/md/dm.c 2006-07-04 14:41:36.000000000 +0400 +@@ -533,30 +533,35 @@ static void __clone_and_map(struct clone + + } else { + /* +- * Create two copy bios to deal with io that has +- * been split across a target. ++ * Handle a bvec that must be split between two or more targets. 
+ */ + struct bio_vec *bv = bio->bi_io_vec + ci->idx; ++ sector_t remaining = to_sector(bv->bv_len); ++ unsigned int offset = 0; + +- clone = split_bvec(bio, ci->sector, ci->idx, +- bv->bv_offset, max); +- __map_bio(ti, clone, tio); +- +- ci->sector += max; +- ci->sector_count -= max; +- ti = dm_table_find_target(ci->map, ci->sector); +- +- len = to_sector(bv->bv_len) - max; +- clone = split_bvec(bio, ci->sector, ci->idx, +- bv->bv_offset + to_bytes(max), len); +- tio = alloc_tio(ci->md); +- tio->io = ci->io; +- tio->ti = ti; +- memset(&tio->info, 0, sizeof(tio->info)); +- __map_bio(ti, clone, tio); ++ do { ++ if (offset) { ++ ti = dm_table_find_target(ci->map, ci->sector); ++ max = max_io_len(ci->md, ci->sector, ti); ++ ++ tio = alloc_tio(ci->md); ++ tio->io = ci->io; ++ tio->ti = ti; ++ memset(&tio->info, 0, sizeof(tio->info)); ++ } ++ ++ len = min(remaining, max); ++ ++ clone = split_bvec(bio, ci->sector, ci->idx, ++ bv->bv_offset + offset, len); ++ ++ __map_bio(ti, clone, tio); ++ ++ ci->sector += len; ++ ci->sector_count -= len; ++ offset += to_bytes(len); ++ } while (remaining -= len); + +- ci->sector += len; +- ci->sector_count -= len; + ci->idx++; + } + } +@@ -1093,6 +1098,7 @@ int dm_suspend(struct mapped_device *md, + { + struct dm_table *map = NULL; + DECLARE_WAITQUEUE(wait, current); ++ struct bio *def; + int r = -EINVAL; + + down(&md->suspend_lock); +@@ -1152,9 +1158,11 @@ int dm_suspend(struct mapped_device *md, + /* were we interrupted ? 
*/ + r = -EINTR; + if (atomic_read(&md->pending)) { ++ clear_bit(DMF_BLOCK_IO, &md->flags); ++ def = bio_list_get(&md->deferred); ++ __flush_deferred_io(md, def); + up_write(&md->io_lock); + unlock_fs(md); +- clear_bit(DMF_BLOCK_IO, &md->flags); + goto out; + } + up_write(&md->io_lock); +diff -upr linux-2.6.16.orig/drivers/md/kcopyd.c linux-2.6.16-026test015/drivers/md/kcopyd.c +--- linux-2.6.16.orig/drivers/md/kcopyd.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/md/kcopyd.c 2006-07-04 14:41:36.000000000 +0400 +@@ -44,6 +44,9 @@ struct kcopyd_client { + struct page_list *pages; + unsigned int nr_pages; + unsigned int nr_free_pages; ++ ++ wait_queue_head_t destroyq; ++ atomic_t nr_jobs; + }; + + static struct page_list *alloc_pl(void) +@@ -293,10 +296,15 @@ static int run_complete_job(struct kcopy + int read_err = job->read_err; + unsigned int write_err = job->write_err; + kcopyd_notify_fn fn = job->fn; ++ struct kcopyd_client *kc = job->kc; + +- kcopyd_put_pages(job->kc, job->pages); ++ kcopyd_put_pages(kc, job->pages); + mempool_free(job, _job_pool); + fn(read_err, write_err, context); ++ ++ if (atomic_dec_and_test(&kc->nr_jobs)) ++ wake_up(&kc->destroyq); ++ + return 0; + } + +@@ -431,6 +439,7 @@ static void do_work(void *ignored) + */ + static void dispatch_job(struct kcopyd_job *job) + { ++ atomic_inc(&job->kc->nr_jobs); + push(&_pages_jobs, job); + wake(); + } +@@ -670,6 +679,9 @@ int kcopyd_client_create(unsigned int nr + return r; + } + ++ init_waitqueue_head(&kc->destroyq); ++ atomic_set(&kc->nr_jobs, 0); ++ + client_add(kc); + *result = kc; + return 0; +@@ -677,6 +689,9 @@ int kcopyd_client_create(unsigned int nr + + void kcopyd_client_destroy(struct kcopyd_client *kc) + { ++ /* Wait for completion of all jobs submitted by this client. 
*/ ++ wait_event(kc->destroyq, !atomic_read(&kc->nr_jobs)); ++ + dm_io_put(kc->nr_pages); + client_free_pages(kc); + client_del(kc); +diff -upr linux-2.6.16.orig/drivers/md/raid10.c linux-2.6.16-026test015/drivers/md/raid10.c +--- linux-2.6.16.orig/drivers/md/raid10.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/md/raid10.c 2006-07-04 14:41:36.000000000 +0400 +@@ -1436,9 +1436,9 @@ static void raid10d(mddev_t *mddev) + sl--; + d = r10_bio->devs[sl].devnum; + rdev = conf->mirrors[d].rdev; +- atomic_add(s, &rdev->corrected_errors); + if (rdev && + test_bit(In_sync, &rdev->flags)) { ++ atomic_add(s, &rdev->corrected_errors); + if (sync_page_io(rdev->bdev, + r10_bio->devs[sl].addr + + sect + rdev->data_offset, +diff -upr linux-2.6.16.orig/drivers/media/dvb/dvb-usb/cxusb.c linux-2.6.16-026test015/drivers/media/dvb/dvb-usb/cxusb.c +--- linux-2.6.16.orig/drivers/media/dvb/dvb-usb/cxusb.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/media/dvb/dvb-usb/cxusb.c 2006-07-04 14:41:36.000000000 +0400 +@@ -149,6 +149,15 @@ static int cxusb_power_ctrl(struct dvb_u + return cxusb_ctrl_msg(d, CMD_POWER_OFF, &b, 1, NULL, 0); + } + ++static int cxusb_bluebird_power_ctrl(struct dvb_usb_device *d, int onoff) ++{ ++ u8 b = 0; ++ if (onoff) ++ return cxusb_ctrl_msg(d, CMD_POWER_ON, &b, 1, NULL, 0); ++ else ++ return 0; ++} ++ + static int cxusb_streaming_ctrl(struct dvb_usb_device *d, int onoff) + { + u8 buf[2] = { 0x03, 0x00 }; +@@ -505,7 +514,7 @@ static struct dvb_usb_properties cxusb_b + .size_of_priv = sizeof(struct cxusb_state), + + .streaming_ctrl = cxusb_streaming_ctrl, +- .power_ctrl = cxusb_power_ctrl, ++ .power_ctrl = cxusb_bluebird_power_ctrl, + .frontend_attach = cxusb_lgdt3303_frontend_attach, + .tuner_attach = cxusb_lgh064f_tuner_attach, + +@@ -545,7 +554,7 @@ static struct dvb_usb_properties cxusb_b + .size_of_priv = sizeof(struct cxusb_state), + + .streaming_ctrl = cxusb_streaming_ctrl, +- .power_ctrl = 
cxusb_power_ctrl, ++ .power_ctrl = cxusb_bluebird_power_ctrl, + .frontend_attach = cxusb_dee1601_frontend_attach, + .tuner_attach = cxusb_dee1601_tuner_attach, + +@@ -594,7 +603,7 @@ static struct dvb_usb_properties cxusb_b + .size_of_priv = sizeof(struct cxusb_state), + + .streaming_ctrl = cxusb_streaming_ctrl, +- .power_ctrl = cxusb_power_ctrl, ++ .power_ctrl = cxusb_bluebird_power_ctrl, + .frontend_attach = cxusb_mt352_frontend_attach, + .tuner_attach = cxusb_lgz201_tuner_attach, + +@@ -634,7 +643,7 @@ static struct dvb_usb_properties cxusb_b + .size_of_priv = sizeof(struct cxusb_state), + + .streaming_ctrl = cxusb_streaming_ctrl, +- .power_ctrl = cxusb_power_ctrl, ++ .power_ctrl = cxusb_bluebird_power_ctrl, + .frontend_attach = cxusb_mt352_frontend_attach, + .tuner_attach = cxusb_dtt7579_tuner_attach, + +diff -upr linux-2.6.16.orig/drivers/media/video/Kconfig linux-2.6.16-026test015/drivers/media/video/Kconfig +--- linux-2.6.16.orig/drivers/media/video/Kconfig 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/media/video/Kconfig 2006-07-04 14:41:36.000000000 +0400 +@@ -349,6 +349,7 @@ config VIDEO_AUDIO_DECODER + config VIDEO_DECODER + tristate "Add support for additional video chipsets" + depends on VIDEO_DEV && I2C && EXPERIMENTAL ++ select FW_LOADER + ---help--- + Say Y here to compile drivers for SAA7115, SAA7127 and CX25840 + video decoders. 
+diff -upr linux-2.6.16.orig/drivers/media/video/saa7127.c linux-2.6.16-026test015/drivers/media/video/saa7127.c +--- linux-2.6.16.orig/drivers/media/video/saa7127.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/media/video/saa7127.c 2006-07-04 14:41:36.000000000 +0400 +@@ -141,6 +141,7 @@ struct i2c_reg_value { + static const struct i2c_reg_value saa7129_init_config_extra[] = { + { SAA7127_REG_OUTPUT_PORT_CONTROL, 0x38 }, + { SAA7127_REG_VTRIG, 0xfa }, ++ { 0, 0 } + }; + + static const struct i2c_reg_value saa7127_init_config_common[] = { +diff -upr linux-2.6.16.orig/drivers/media/video/tuner-types.c linux-2.6.16-026test015/drivers/media/video/tuner-types.c +--- linux-2.6.16.orig/drivers/media/video/tuner-types.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/media/video/tuner-types.c 2006-07-04 14:41:36.000000000 +0400 +@@ -1087,8 +1087,8 @@ static struct tuner_params tuner_tnf_533 + /* ------------ TUNER_SAMSUNG_TCPN_2121P30A - Samsung NTSC ------------ */ + + static struct tuner_range tuner_samsung_tcpn_2121p30a_ntsc_ranges[] = { +- { 16 * 175.75 /*MHz*/, 0x01, }, +- { 16 * 410.25 /*MHz*/, 0x02, }, ++ { 16 * 130.00 /*MHz*/, 0x01, }, ++ { 16 * 364.50 /*MHz*/, 0x02, }, + { 16 * 999.99 , 0x08, }, + }; + +diff -upr linux-2.6.16.orig/drivers/message/i2o/exec-osm.c linux-2.6.16-026test015/drivers/message/i2o/exec-osm.c +--- linux-2.6.16.orig/drivers/message/i2o/exec-osm.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/message/i2o/exec-osm.c 2006-07-04 14:41:36.000000000 +0400 +@@ -55,6 +55,7 @@ struct i2o_exec_wait { + u32 m; /* message id */ + struct i2o_message *msg; /* pointer to the reply message */ + struct list_head list; /* node in global wait list */ ++ spinlock_t lock; /* lock before modifying */ + }; + + /* Exec OSM class handling definition */ +@@ -80,6 +81,7 @@ static struct i2o_exec_wait *i2o_exec_wa + return NULL; + + INIT_LIST_HEAD(&wait->list); ++ 
spin_lock_init(&wait->lock); + + return wait; + }; +@@ -118,6 +120,7 @@ int i2o_msg_post_wait_mem(struct i2o_con + DECLARE_WAIT_QUEUE_HEAD(wq); + struct i2o_exec_wait *wait; + static u32 tcntxt = 0x80000000; ++ long flags; + int rc = 0; + + wait = i2o_exec_wait_alloc(); +@@ -139,33 +142,28 @@ int i2o_msg_post_wait_mem(struct i2o_con + wait->tcntxt = tcntxt++; + msg->u.s.tcntxt = cpu_to_le32(wait->tcntxt); + ++ wait->wq = &wq; ++ /* ++ * we add elements to the head, because if a entry in the list will ++ * never be removed, we have to iterate over it every time ++ */ ++ list_add(&wait->list, &i2o_exec_wait_list); ++ + /* + * Post the message to the controller. At some point later it will + * return. If we time out before it returns then complete will be zero. + */ + i2o_msg_post(c, msg); + +- if (!wait->complete) { +- wait->wq = &wq; +- /* +- * we add elements add the head, because if a entry in the list +- * will never be removed, we have to iterate over it every time +- */ +- list_add(&wait->list, &i2o_exec_wait_list); +- +- wait_event_interruptible_timeout(wq, wait->complete, +- timeout * HZ); ++ wait_event_interruptible_timeout(wq, wait->complete, timeout * HZ); + +- wait->wq = NULL; +- } ++ spin_lock_irqsave(&wait->lock, flags); + +- barrier(); ++ wait->wq = NULL; + +- if (wait->complete) { ++ if (wait->complete) + rc = le32_to_cpu(wait->msg->body[0]) >> 24; +- i2o_flush_reply(c, wait->m); +- i2o_exec_wait_free(wait); +- } else { ++ else { + /* + * We cannot remove it now. This is important. 
When it does + * terminate (which it must do if the controller has not +@@ -179,6 +177,13 @@ int i2o_msg_post_wait_mem(struct i2o_con + rc = -ETIMEDOUT; + } + ++ spin_unlock_irqrestore(&wait->lock, flags); ++ ++ if (rc != -ETIMEDOUT) { ++ i2o_flush_reply(c, wait->m); ++ i2o_exec_wait_free(wait); ++ } ++ + return rc; + }; + +@@ -206,7 +211,6 @@ static int i2o_msg_post_wait_complete(st + { + struct i2o_exec_wait *wait, *tmp; + unsigned long flags; +- static spinlock_t lock = SPIN_LOCK_UNLOCKED; + int rc = 1; + + /* +@@ -216,23 +220,24 @@ static int i2o_msg_post_wait_complete(st + * already expired. Not much we can do about that except log it for + * debug purposes, increase timeout, and recompile. + */ +- spin_lock_irqsave(&lock, flags); + list_for_each_entry_safe(wait, tmp, &i2o_exec_wait_list, list) { + if (wait->tcntxt == context) { +- list_del(&wait->list); ++ spin_lock_irqsave(&wait->lock, flags); + +- spin_unlock_irqrestore(&lock, flags); ++ list_del(&wait->list); + + wait->m = m; + wait->msg = msg; + wait->complete = 1; + +- barrier(); +- +- if (wait->wq) { +- wake_up_interruptible(wait->wq); ++ if (wait->wq) + rc = 0; +- } else { ++ else ++ rc = -1; ++ ++ spin_unlock_irqrestore(&wait->lock, flags); ++ ++ if (rc) { + struct device *dev; + + dev = &c->pdev->dev; +@@ -241,15 +246,13 @@ static int i2o_msg_post_wait_complete(st + c->name); + i2o_dma_free(dev, &wait->dma); + i2o_exec_wait_free(wait); +- rc = -1; +- } ++ } else ++ wake_up_interruptible(wait->wq); + + return rc; + } + } + +- spin_unlock_irqrestore(&lock, flags); +- + osm_warn("%s: Bogus reply in POST WAIT (tr-context: %08x)!\n", c->name, + context); + +@@ -315,14 +318,9 @@ static DEVICE_ATTR(product_id, S_IRUGO, + static int i2o_exec_probe(struct device *dev) + { + struct i2o_device *i2o_dev = to_i2o_device(dev); +- struct i2o_controller *c = i2o_dev->iop; + + i2o_event_register(i2o_dev, &i2o_exec_driver, 0, 0xffffffff); + +- c->exec = i2o_dev; +- +- i2o_exec_lct_notify(c, c->lct->change_ind + 1); +- 
+ device_create_file(dev, &dev_attr_vendor_id); + device_create_file(dev, &dev_attr_product_id); + +@@ -510,6 +508,8 @@ static int i2o_exec_lct_notify(struct i2 + struct device *dev; + struct i2o_message *msg; + ++ down(&c->lct_lock); ++ + dev = &c->pdev->dev; + + if (i2o_dma_realloc +@@ -532,6 +532,8 @@ static int i2o_exec_lct_notify(struct i2 + + i2o_msg_post(c, msg); + ++ up(&c->lct_lock); ++ + return 0; + }; + +diff -upr linux-2.6.16.orig/drivers/message/i2o/iop.c linux-2.6.16-026test015/drivers/message/i2o/iop.c +--- linux-2.6.16.orig/drivers/message/i2o/iop.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/message/i2o/iop.c 2006-07-04 14:41:36.000000000 +0400 +@@ -804,8 +804,6 @@ void i2o_iop_remove(struct i2o_controlle + + /* Ask the IOP to switch to RESET state */ + i2o_iop_reset(c); +- +- put_device(&c->device); + } + + /** +@@ -1059,7 +1057,7 @@ struct i2o_controller *i2o_iop_alloc(voi + + snprintf(poolname, sizeof(poolname), "i2o_%s_msg_inpool", c->name); + if (i2o_pool_alloc +- (&c->in_msg, poolname, I2O_INBOUND_MSG_FRAME_SIZE * 4, ++ (&c->in_msg, poolname, I2O_INBOUND_MSG_FRAME_SIZE * 4 + sizeof(u32), + I2O_MSG_INPOOL_MIN)) { + kfree(c); + return ERR_PTR(-ENOMEM); +diff -upr linux-2.6.16.orig/drivers/mtd/nand/Kconfig linux-2.6.16-026test015/drivers/mtd/nand/Kconfig +--- linux-2.6.16.orig/drivers/mtd/nand/Kconfig 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/mtd/nand/Kconfig 2006-07-04 14:41:36.000000000 +0400 +@@ -178,17 +178,16 @@ config MTD_NAND_DISKONCHIP_BBTWRITE + Even if you leave this disabled, you can enable BBT writes at module + load time (assuming you build diskonchip as a module) with the module + parameter "inftl_bbt_write=1". 
+- +- config MTD_NAND_SHARPSL +- bool "Support for NAND Flash on Sharp SL Series (C7xx + others)" +- depends on MTD_NAND && ARCH_PXA +- +- config MTD_NAND_NANDSIM +- bool "Support for NAND Flash Simulator" +- depends on MTD_NAND && MTD_PARTITIONS + ++config MTD_NAND_SHARPSL ++ tristate "Support for NAND Flash on Sharp SL Series (C7xx + others)" ++ depends on MTD_NAND && ARCH_PXA ++ ++config MTD_NAND_NANDSIM ++ tristate "Support for NAND Flash Simulator" ++ depends on MTD_NAND && MTD_PARTITIONS + help + The simulator may simulate verious NAND flash chips for the + MTD nand layer. +- ++ + endmenu +diff -upr linux-2.6.16.orig/drivers/net/Makefile linux-2.6.16-026test015/drivers/net/Makefile +--- linux-2.6.16.orig/drivers/net/Makefile 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/net/Makefile 2006-07-04 14:41:39.000000000 +0400 +@@ -18,6 +18,12 @@ gianfar_driver-objs := gianfar.o \ + gianfar_mii.o \ + gianfar_sysfs.o + ++obj-$(CONFIG_VE_NETDEV) += vznetdev.o ++vznetdev-objs := open_vznet.o venet_core.o ++ ++obj-$(CONFIG_VE_ETHDEV) += vzethdev.o ++vzethdev-objs := veth.o ++ + # + # link order important here + # +diff -upr linux-2.6.16.orig/drivers/net/e1000/e1000_main.c linux-2.6.16-026test015/drivers/net/e1000/e1000_main.c +--- linux-2.6.16.orig/drivers/net/e1000/e1000_main.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/net/e1000/e1000_main.c 2006-07-04 14:41:36.000000000 +0400 +@@ -3851,6 +3851,7 @@ e1000_clean_rx_irq_ps(struct e1000_adapt + skb_shinfo(skb)->nr_frags++; + skb->len += length; + skb->data_len += length; ++ skb->truesize += length; + } + + e1000_rx_checksum(adapter, staterr, +diff -upr linux-2.6.16.orig/drivers/net/irda/irda-usb.c linux-2.6.16-026test015/drivers/net/irda/irda-usb.c +--- linux-2.6.16.orig/drivers/net/irda/irda-usb.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/net/irda/irda-usb.c 2006-07-04 14:41:36.000000000 +0400 +@@ -740,7 +740,7 @@ static void 
irda_usb_receive(struct urb + struct sk_buff *newskb; + struct sk_buff *dataskb; + struct urb *next_urb; +- int docopy; ++ unsigned int len, docopy; + + IRDA_DEBUG(2, "%s(), len=%d\n", __FUNCTION__, urb->actual_length); + +@@ -851,10 +851,11 @@ static void irda_usb_receive(struct urb + dataskb->dev = self->netdev; + dataskb->mac.raw = dataskb->data; + dataskb->protocol = htons(ETH_P_IRDA); ++ len = dataskb->len; + netif_rx(dataskb); + + /* Keep stats up to date */ +- self->stats.rx_bytes += dataskb->len; ++ self->stats.rx_bytes += len; + self->stats.rx_packets++; + self->netdev->last_rx = jiffies; + +diff -upr linux-2.6.16.orig/drivers/net/loopback.c linux-2.6.16-026test015/drivers/net/loopback.c +--- linux-2.6.16.orig/drivers/net/loopback.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/net/loopback.c 2006-07-04 14:41:39.000000000 +0400 +@@ -130,6 +130,11 @@ static int loopback_xmit(struct sk_buff + { + struct net_device_stats *lb_stats; + ++ if (unlikely(get_exec_env()->disable_net)) { ++ kfree_skb(skb); ++ return 0; ++ } ++ + skb_orphan(skb); + + skb->protocol = eth_type_trans(skb,dev); +@@ -198,6 +203,34 @@ static struct ethtool_ops loopback_ethto + .set_tso = ethtool_op_set_tso, + }; + ++static void loopback_destructor(struct net_device *dev) ++{ ++ kfree(dev->priv); ++ dev->priv = NULL; ++} ++ ++struct net_device templ_loopback_dev = { ++ .name = "lo", ++ .mtu = (16 * 1024) + 20 + 20 + 12, ++ .hard_start_xmit = loopback_xmit, ++ .hard_header = eth_header, ++ .hard_header_cache = eth_header_cache, ++ .header_cache_update = eth_header_cache_update, ++ .hard_header_len = ETH_HLEN, /* 14 */ ++ .addr_len = ETH_ALEN, /* 6 */ ++ .tx_queue_len = 0, ++ .type = ARPHRD_LOOPBACK, /* 0x0001*/ ++ .rebuild_header = eth_rebuild_header, ++ .flags = IFF_LOOPBACK, ++ .features = NETIF_F_SG|NETIF_F_FRAGLIST ++ |NETIF_F_NO_CSUM|NETIF_F_HIGHDMA ++ |NETIF_F_LLTX|NETIF_F_VIRTUAL, ++}; ++ ++#ifdef loopback_dev ++#undef loopback_dev ++#endif ++ + struct 
net_device loopback_dev = { + .name = "lo", + .mtu = (16 * 1024) + 20 + 20 + 12, +@@ -231,9 +264,13 @@ int __init loopback_init(void) + memset(stats, 0, sizeof(struct net_device_stats)); + loopback_dev.priv = stats; + loopback_dev.get_stats = &get_stats; ++ loopback_dev.destructor = &loopback_destructor; + } +- ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++ get_ve0()->_loopback_dev = &loopback_dev; ++#endif + return register_netdev(&loopback_dev); + }; + + EXPORT_SYMBOL(loopback_dev); ++EXPORT_SYMBOL(templ_loopback_dev); +diff -upr linux-2.6.16.orig/drivers/net/open_vznet.c linux-2.6.16-026test015/drivers/net/open_vznet.c +--- linux-2.6.16.orig/drivers/net/open_vznet.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/drivers/net/open_vznet.c 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,227 @@ ++/* ++ * open_vznet.c ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++/* ++ * Virtual Networking device used to change VE ownership on packets ++ */ ++ ++#include <linux/kernel.h> ++#include <linux/module.h> ++#include <linux/seq_file.h> ++ ++#include <linux/inet.h> ++#include <net/ip.h> ++#include <linux/skbuff.h> ++#include <linux/venet.h> ++ ++void veip_stop(struct ve_struct *ve) ++{ ++ struct list_head *p, *tmp; ++ ++ write_lock_irq(&veip_hash_lock); ++ if (ve->veip == NULL) ++ goto unlock; ++ list_for_each_safe(p, tmp, &ve->veip->ip_lh) { ++ struct ip_entry_struct *ptr; ++ ptr = list_entry(p, struct ip_entry_struct, ve_list); ++ ptr->active_env = NULL; ++ list_del(&ptr->ve_list); ++ list_del(&ptr->ip_hash); ++ kfree(ptr); ++ } ++ veip_put(ve->veip); ++ ve->veip = NULL; ++unlock: ++ write_unlock_irq(&veip_hash_lock); ++} ++ ++int veip_start(struct ve_struct *ve) ++{ ++ int err; ++ ++ err = 0; ++ write_lock_irq(&veip_hash_lock); ++ ve->veip = veip_findcreate(ve->veid); ++ if (ve->veip == NULL) ++ err = -ENOMEM; ++ 
write_unlock_irq(&veip_hash_lock); ++ return err; ++} ++ ++int veip_entry_add(struct ve_struct *ve, struct sockaddr *addr) ++{ ++ struct ip_entry_struct *entry, *found; ++ int err; ++ ++ entry = kmalloc(sizeof(struct ip_entry_struct), GFP_KERNEL); ++ if (entry == NULL) ++ return -ENOMEM; ++ ++ memset(entry, 0, sizeof(struct ip_entry_struct)); ++ entry->family = addr->sa_family; ++ if (addr->sa_family == AF_INET) { ++ entry->key[3] = ((struct sockaddr_in*)addr)->sin_addr.s_addr; ++ } else if (addr->sa_family == AF_INET6) { ++ memcpy(entry->key, &((struct sockaddr_in6*)addr)->sin6_addr, 16); ++ } else { ++ kfree(entry); ++ return -EAFNOSUPPORT; ++ } ++ ++ write_lock_irq(&veip_hash_lock); ++ err = -EADDRINUSE; ++ found = venet_entry_lookup(entry->key, entry->family); ++ if (found != NULL) ++ goto out_unlock; ++ else { ++ ip_entry_hash(entry, ve->veip); ++ found = entry; ++ entry = NULL; ++ } ++ err = 0; ++ found->active_env = ve; ++out_unlock: ++ write_unlock_irq(&veip_hash_lock); ++ if (entry != NULL) ++ kfree(entry); ++ return err; ++} ++ ++int veip_entry_del(envid_t veid, struct sockaddr *addr) ++{ ++ struct ip_entry_struct *found; ++ u32 key[4]; ++ int err; ++ ++ if (addr->sa_family == AF_INET) { ++ memset(key, 0, sizeof(key)); ++ key[3] = ((struct sockaddr_in*)addr)->sin_addr.s_addr; ++ } else if (addr->sa_family == AF_INET6) { ++ memcpy(key, &((struct sockaddr_in6*)addr)->sin6_addr, 16); ++ } else { ++ return -EAFNOSUPPORT; ++ } ++ ++ err = -EADDRNOTAVAIL; ++ write_lock_irq(&veip_hash_lock); ++ found = venet_entry_lookup(key, addr->sa_family); ++ if (found == NULL) ++ goto out; ++ if (found->active_env->veid != veid) ++ goto out; ++ ++ err = 0; ++ found->active_env = NULL; ++ ++ list_del(&found->ip_hash); ++ list_del(&found->ve_list); ++ kfree(found); ++out: ++ write_unlock_irq(&veip_hash_lock); ++ return err; ++} ++ ++static struct ve_struct *venet_find_ve(struct sk_buff *skb, int dir) ++{ ++ struct ip_entry_struct *entry; ++ ++ if (skb->protocol == 
__constant_htons(ETH_P_IP)) { ++ entry = ip_entry_lookup(dir ? skb->nh.iph->daddr : ++ skb->nh.iph->saddr); ++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) ++ } else if (skb->protocol == __constant_htons(ETH_P_IPV6)) { ++ entry = venet_entry_lookup(dir ? skb->nh.ipv6h->daddr.s6_addr32 : ++ skb->nh.ipv6h->saddr.s6_addr32, AF_INET6); ++#endif ++ } else { ++ return NULL; ++ } ++ if (entry == NULL) ++ return NULL; ++ ++ return entry->active_env; ++} ++ ++int venet_change_skb_owner(struct sk_buff *skb) ++{ ++ struct ve_struct *ve, *ve_old; ++ ++ ve_old = skb->owner_env; ++ ++ read_lock(&veip_hash_lock); ++ if (!ve_is_super(ve_old)) { ++ /* from VE to host */ ++ ve = venet_find_ve(skb, 0); ++ if (ve == NULL) ++ goto out_drop; ++ if (!ve_accessible_strict(ve, ve_old)) ++ goto out_source; ++ skb->owner_env = get_ve0(); ++ } else { ++ /* from host to VE */ ++ ve = venet_find_ve(skb, 1); ++ if (ve == NULL) ++ goto out_drop; ++ skb->owner_env = ve; ++ } ++ read_unlock(&veip_hash_lock); ++ ++ return 0; ++ ++out_drop: ++ read_unlock(&veip_hash_lock); ++ return -ESRCH; ++ ++out_source: ++ read_unlock(&veip_hash_lock); ++ if (net_ratelimit() && skb->protocol == __constant_htons(ETH_P_IP)) { ++ printk(KERN_WARNING "Dropped packet, source wrong " ++ "veid=%u src-IP=%u.%u.%u.%u " ++ "dst-IP=%u.%u.%u.%u\n", ++ skb->owner_env->veid, ++ NIPQUAD(skb->nh.iph->saddr), ++ NIPQUAD(skb->nh.iph->daddr)); ++ } ++ return -EACCES; ++} ++ ++#ifdef CONFIG_PROC_FS ++int veip_seq_show(struct seq_file *m, void *v) ++{ ++ struct list_head *p; ++ struct ip_entry_struct *entry; ++ char s[40]; ++ ++ p = (struct list_head *)v; ++ if (p == ip_entry_hash_table) { ++ seq_puts(m, "Version: 2.5\n"); ++ return 0; ++ } ++ entry = list_entry(p, struct ip_entry_struct, ip_hash); ++ if (entry->family == AF_INET) ++ sprintf(s, "%u.%u.%u.%u", NIPQUAD(entry->key[3])); ++ else ++ sprintf(s, "%x:%x:%x:%x:%x:%x:%x:%x", ++ ntohl(entry->key[0])>>16, ++ ntohl(entry->key[0])&0xFFFF, ++ ntohl(entry->key[1])>>16, ++ 
ntohl(entry->key[1])&0xFFFF, ++ ntohl(entry->key[2])>>16, ++ ntohl(entry->key[2])&0xFFFF, ++ ntohl(entry->key[3])>>16, ++ ntohl(entry->key[3])&0xFFFF); ++ seq_printf(m, "%39s %10u\n", s, 0); ++ return 0; ++} ++#endif ++ ++MODULE_AUTHOR("SWsoft <info@sw-soft.com>"); ++MODULE_DESCRIPTION("Virtuozzo Virtual Network Device"); ++MODULE_LICENSE("GPL v2"); +diff -upr linux-2.6.16.orig/drivers/net/sky2.c linux-2.6.16-026test015/drivers/net/sky2.c +--- linux-2.6.16.orig/drivers/net/sky2.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/net/sky2.c 2006-07-04 14:41:36.000000000 +0400 +@@ -579,8 +579,8 @@ static void sky2_mac_init(struct sky2_hw + reg = gma_read16(hw, port, GM_PHY_ADDR); + gma_write16(hw, port, GM_PHY_ADDR, reg | GM_PAR_MIB_CLR); + +- for (i = 0; i < GM_MIB_CNT_SIZE; i++) +- gma_read16(hw, port, GM_MIB_CNT_BASE + 8 * i); ++ for (i = GM_MIB_CNT_BASE; i <= GM_MIB_CNT_END; i += 4) ++ gma_read16(hw, port, i); + gma_write16(hw, port, GM_PHY_ADDR, reg); + + /* transmit control */ +diff -upr linux-2.6.16.orig/drivers/net/sky2.h linux-2.6.16-026test015/drivers/net/sky2.h +--- linux-2.6.16.orig/drivers/net/sky2.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/net/sky2.h 2006-07-04 14:41:36.000000000 +0400 +@@ -1380,6 +1380,7 @@ enum { + /* MIB Counters */ + #define GM_MIB_CNT_BASE 0x0100 /* Base Address of MIB Counters */ + #define GM_MIB_CNT_SIZE 44 /* Number of MIB Counters */ ++#define GM_MIB_CNT_END 0x025C /* Last MIB counter */ + + /* + * MIB Counters base address definitions (low word) - +diff -upr linux-2.6.16.orig/drivers/net/tg3.c linux-2.6.16-026test015/drivers/net/tg3.c +--- linux-2.6.16.orig/drivers/net/tg3.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/net/tg3.c 2006-07-04 14:41:36.000000000 +0400 +@@ -7368,21 +7368,23 @@ static int tg3_get_settings(struct net_d + cmd->supported |= (SUPPORTED_1000baseT_Half | + SUPPORTED_1000baseT_Full); + +- if (!(tp->tg3_flags2 & 
TG3_FLG2_ANY_SERDES)) ++ if (!(tp->tg3_flags2 & TG3_FLG2_ANY_SERDES)) { + cmd->supported |= (SUPPORTED_100baseT_Half | + SUPPORTED_100baseT_Full | + SUPPORTED_10baseT_Half | + SUPPORTED_10baseT_Full | + SUPPORTED_MII); +- else ++ cmd->port = PORT_TP; ++ } else { + cmd->supported |= SUPPORTED_FIBRE; ++ cmd->port = PORT_FIBRE; ++ } + + cmd->advertising = tp->link_config.advertising; + if (netif_running(dev)) { + cmd->speed = tp->link_config.active_speed; + cmd->duplex = tp->link_config.active_duplex; + } +- cmd->port = 0; + cmd->phy_address = PHY_ADDR; + cmd->transceiver = 0; + cmd->autoneg = tp->link_config.autoneg; +diff -upr linux-2.6.16.orig/drivers/net/tun.c linux-2.6.16-026test015/drivers/net/tun.c +--- linux-2.6.16.orig/drivers/net/tun.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/net/tun.c 2006-07-04 14:41:38.000000000 +0400 +@@ -62,6 +62,7 @@ + + #include <asm/system.h> + #include <asm/uaccess.h> ++#include <ub/beancounter.h> + + #ifdef TUN_DEBUG + static int debug; +@@ -90,6 +91,7 @@ static int tun_net_close(struct net_devi + static int tun_net_xmit(struct sk_buff *skb, struct net_device *dev) + { + struct tun_struct *tun = netdev_priv(dev); ++ struct user_beancounter *ub; + + DBG(KERN_INFO "%s: tun_net_xmit %d\n", tun->dev->name, skb->len); + +@@ -114,6 +116,18 @@ static int tun_net_xmit(struct sk_buff * + } + } + ++ ub = netdev_bc(dev)->exec_ub; ++ if (ub && (skb_bc(skb)->charged == 0)) { ++ unsigned long charge; ++ charge = skb_charge_fullsize(skb); ++ if (charge_beancounter(ub, UB_OTHERSOCKBUF, charge, 1)) ++ goto drop; ++ get_beancounter(ub); ++ skb_bc(skb)->ub = ub; ++ skb_bc(skb)->charged = charge; ++ skb_bc(skb)->resource = UB_OTHERSOCKBUF; ++ } ++ + /* Queue packet */ + skb_queue_tail(&tun->readq, skb); + dev->trans_start = jiffies; +@@ -410,12 +424,14 @@ static ssize_t tun_chr_readv(struct file + tun->dev->name, addr[0], addr[1], addr[2], + addr[3], addr[4], addr[5]); + ret = tun_put_user(tun, skb, (struct iovec *) iv, 
len); ++ /* skb will be uncharged in kfree_skb() */ + kfree_skb(skb); + break; + } else { + DBG(KERN_DEBUG "%s: tun_chr_readv: rejected: %x:%x:%x:%x:%x:%x\n", + tun->dev->name, addr[0], addr[1], addr[2], + addr[3], addr[4], addr[5]); ++ /* skb will be uncharged in kfree_skb() */ + kfree_skb(skb); + continue; + } +@@ -451,6 +467,7 @@ static void tun_setup(struct net_device + dev->get_stats = tun_net_stats; + dev->ethtool_ops = &tun_ethtool_ops; + dev->destructor = free_netdev; ++ dev->features |= NETIF_F_VIRTUAL; + } + + static struct tun_struct *tun_get_by_name(const char *name) +@@ -459,8 +476,9 @@ static struct tun_struct *tun_get_by_nam + + ASSERT_RTNL(); + list_for_each_entry(tun, &tun_dev_list, list) { +- if (!strncmp(tun->dev->name, name, IFNAMSIZ)) +- return tun; ++ if (ve_accessible_strict(tun->dev->owner_env, get_exec_env()) && ++ !strncmp(tun->dev->name, name, IFNAMSIZ)) ++ return tun; + } + + return NULL; +@@ -479,7 +497,8 @@ static int tun_set_iff(struct file *file + + /* Check permissions */ + if (tun->owner != -1 && +- current->euid != tun->owner && !capable(CAP_NET_ADMIN)) ++ current->euid != tun->owner && ++ !capable(CAP_NET_ADMIN) && !capable(CAP_VE_NET_ADMIN)) + return -EPERM; + } + else if (__dev_get_by_name(ifr->ifr_name)) +diff -upr linux-2.6.16.orig/drivers/net/venet_core.c linux-2.6.16-026test015/drivers/net/venet_core.c +--- linux-2.6.16.orig/drivers/net/venet_core.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/drivers/net/venet_core.c 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,675 @@ ++/* ++ * venet_core.c ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++/* ++ * Common part for Virtuozzo virtual network devices ++ */ ++ ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/interrupt.h> ++#include <linux/fs.h> ++#include <linux/types.h> ++#include <linux/string.h> ++#include <linux/socket.h> ++#include <linux/errno.h> ++#include <linux/fcntl.h> ++#include <linux/in.h> ++#include <linux/init.h> ++#include <linux/module.h> ++#include <linux/tcp.h> ++#include <linux/proc_fs.h> ++#include <linux/seq_file.h> ++#include <net/addrconf.h> ++ ++#include <asm/system.h> ++#include <asm/uaccess.h> ++#include <asm/io.h> ++#include <asm/unistd.h> ++ ++#include <linux/inet.h> ++#include <linux/netdevice.h> ++#include <linux/etherdevice.h> ++#include <net/ip.h> ++#include <linux/skbuff.h> ++#include <net/sock.h> ++#include <linux/if_ether.h> /* For the statistics structure. */ ++#include <linux/if_arp.h> /* For ARPHRD_ETHER */ ++#include <linux/venet.h> ++#include <linux/ve_proto.h> ++#include <linux/vzctl.h> ++#include <linux/vzctl_venet.h> ++ ++struct list_head ip_entry_hash_table[VEIP_HASH_SZ]; ++rwlock_t veip_hash_lock = RW_LOCK_UNLOCKED; ++LIST_HEAD(veip_lh); ++ ++#define ip_entry_hash_function(ip) (ntohl(ip) & (VEIP_HASH_SZ - 1)) ++ ++void ip_entry_hash(struct ip_entry_struct *entry, struct veip_struct *veip) ++{ ++ list_add(&entry->ip_hash, ++ ip_entry_hash_table + ip_entry_hash_function(entry->key[3])); ++ list_add(&entry->ve_list, &veip->ip_lh); ++} ++ ++void veip_put(struct veip_struct *veip) ++{ ++ if (!list_empty(&veip->ip_lh)) ++ return; ++ if (!list_empty(&veip->src_lh)) ++ return; ++ if (!list_empty(&veip->dst_lh)) ++ return; ++ ++ list_del(&veip->list); ++ kfree(veip); ++} ++ ++struct ip_entry_struct *ip_entry_lookup(u32 addr) ++{ ++ struct ip_entry_struct *entry; ++ struct list_head *tmp; ++ ++ list_for_each(tmp, ip_entry_hash_table + ip_entry_hash_function(addr)) { ++ entry = list_entry(tmp, struct ip_entry_struct, ip_hash); ++ if (entry->key[3] != addr || entry->family != 
AF_INET) ++ continue; ++ return entry; ++ } ++ return NULL; ++} ++ ++struct ip_entry_struct *venet_entry_lookup(u32 *addr, int family) ++{ ++ struct ip_entry_struct *entry; ++ struct list_head *tmp; ++ ++ list_for_each(tmp, ip_entry_hash_table + ip_entry_hash_function(addr[3])) { ++ entry = list_entry(tmp, struct ip_entry_struct, ip_hash); ++ if (memcmp(entry->key, addr, 16) != 0 ++ || entry->family != family) ++ continue; ++ return entry; ++ } ++ return NULL; ++} ++ ++struct veip_struct *veip_find(envid_t veid) ++{ ++ struct veip_struct *ptr; ++ list_for_each_entry(ptr, &veip_lh, list) { ++ if (ptr->veid != veid) ++ continue; ++ return ptr; ++ } ++ return NULL; ++} ++ ++struct veip_struct *veip_findcreate(envid_t veid) ++{ ++ struct veip_struct *ptr; ++ ++ ptr = veip_find(veid); ++ if (ptr != NULL) ++ return ptr; ++ ++ ptr = kmalloc(sizeof(struct veip_struct), GFP_ATOMIC); ++ if (ptr == NULL) ++ return NULL; ++ memset(ptr, 0, sizeof(struct veip_struct)); ++ INIT_LIST_HEAD(&ptr->ip_lh); ++ INIT_LIST_HEAD(&ptr->src_lh); ++ INIT_LIST_HEAD(&ptr->dst_lh); ++ list_add(&ptr->list, &veip_lh); ++ ptr->veid = veid; ++ return ptr; ++} ++ ++/* ++ * Device functions ++ */ ++ ++static int venet_open(struct net_device *dev) ++{ ++ if (!try_module_get(THIS_MODULE)) ++ return -EBUSY; ++ return 0; ++} ++ ++static int venet_close(struct net_device *master) ++{ ++ module_put(THIS_MODULE); ++ return 0; ++} ++ ++static void venet_destructor(struct net_device *dev) ++{ ++ kfree(dev->priv); ++ dev->priv = NULL; ++} ++ ++/* ++ * The higher levels take care of making this non-reentrant (it's ++ * called with bh's disabled). 
++ */ ++static int venet_xmit(struct sk_buff *skb, struct net_device *dev) ++{ ++ struct net_device_stats *stats = (struct net_device_stats *)dev->priv; ++ struct net_device *rcv = NULL; ++ int length; ++ ++ if (unlikely(get_exec_env()->disable_net)) ++ goto outf; ++ ++ /* ++ * Optimise so buffers with skb->free=1 are not copied but ++ * instead are lobbed from tx queue to rx queue ++ */ ++ if (atomic_read(&skb->users) != 1) { ++ struct sk_buff *skb2 = skb; ++ skb = skb_clone(skb, GFP_ATOMIC); /* Clone the buffer */ ++ if (skb == NULL) { ++ kfree_skb(skb2); ++ goto out; ++ } ++ kfree_skb(skb2); ++ } else ++ skb_orphan(skb); ++ ++ if (skb->protocol == __constant_htons(ETH_P_IP)) { ++ struct iphdr *iph; ++ iph = skb->nh.iph; ++ if (MULTICAST(iph->daddr)) ++ goto outf; ++ } else if (skb->protocol == __constant_htons(ETH_P_IPV6)) { ++ struct ipv6hdr *ip6h; ++ ip6h = skb->nh.ipv6h; ++ if (ipv6_addr_is_multicast(&ip6h->daddr)) ++ goto outf; ++ } else { ++ goto outf; ++ } ++ ++ if (venet_change_skb_owner(skb) < 0) ++ goto outf; ++ ++ if (unlikely(VE_OWNER_SKB(skb)->disable_net)) ++ goto outf; ++ ++ rcv = VE_OWNER_SKB(skb)->_venet_dev; ++ if (!rcv) ++ /* VE going down */ ++ goto outf; ++ ++ dev_hold(rcv); ++ ++ if (!(rcv->flags & IFF_UP)) { ++ /* Target VE does not want to receive packets */ ++ dev_put(rcv); ++ goto outf; ++ } ++ ++ skb->pkt_type = PACKET_HOST; ++ skb->dev = rcv; ++ ++ skb->mac.raw = skb->data; ++ memset(skb->data - dev->hard_header_len, 0, dev->hard_header_len); ++ ++ dst_release(skb->dst); ++ skb->dst = NULL; ++#ifdef CONFIG_NETFILTER ++ nf_conntrack_put(skb->nfct); ++ skb->nfct = NULL; ++#ifdef CONFIG_NETFILTER_DEBUG ++ skb->nf_debug = 0; ++#endif ++#endif ++ length = skb->len; ++ ++ netif_rx(skb); ++ ++ stats->tx_bytes += length; ++ stats->tx_packets++; ++ if (rcv) { ++ struct net_device_stats *rcv_stats = ++ (struct net_device_stats *)rcv->priv; ++ rcv_stats->rx_bytes += length; ++ rcv_stats->rx_packets++; ++ dev_put(rcv); ++ } ++ ++ return 0; ++ 
++outf: ++ kfree_skb(skb); ++ ++stats->tx_dropped; ++out: ++ return 0; ++} ++ ++static struct net_device_stats *get_stats(struct net_device *dev) ++{ ++ return (struct net_device_stats *)dev->priv; ++} ++ ++/* Initialize the rest of the LOOPBACK device. */ ++int venet_init_dev(struct net_device *dev) ++{ ++ dev->hard_start_xmit = venet_xmit; ++ dev->priv = kmalloc(sizeof(struct net_device_stats), GFP_KERNEL); ++ if (dev->priv == NULL) ++ return -ENOMEM; ++ memset(dev->priv, 0, sizeof(struct net_device_stats)); ++ dev->get_stats = get_stats; ++ dev->open = venet_open; ++ dev->stop = venet_close; ++ dev->destructor = venet_destructor; ++ ++ /* ++ * Fill in the generic fields of the device structure. ++ */ ++ dev->type = ARPHRD_VOID; ++ dev->hard_header_len = ETH_HLEN; ++ dev->mtu = 1500; /* eth_mtu */ ++ dev->tx_queue_len = 0; ++ ++ memset(dev->broadcast, 0xFF, ETH_ALEN); ++ ++ /* New-style flags. */ ++ dev->flags = IFF_BROADCAST|IFF_NOARP|IFF_POINTOPOINT; ++ return 0; ++} ++ ++static void venet_setup(struct net_device *dev) ++{ ++ dev->init = venet_init_dev; ++ /* ++ * No other features, as they are: ++ * - checksumming is required, and nobody else will done our job ++ */ ++ dev->features |= NETIF_F_VENET | NETIF_F_VIRTUAL; ++} ++ ++#ifdef CONFIG_PROC_FS ++static int veinfo_seq_show(struct seq_file *m, void *v) ++{ ++ struct ve_struct *ve = (struct ve_struct *)v; ++ struct list_head *tmp; ++ ++ seq_printf(m, "%10u %5u %5u", ve->veid, ++ ve->class_id, atomic_read(&ve->pcounter)); ++ read_lock(&veip_hash_lock); ++ if (ve->veip == NULL) ++ goto unlock; ++ list_for_each(tmp, &ve->veip->ip_lh) { ++ char ip[40]; ++ struct ip_entry_struct *entry; ++ ++ entry = list_entry(tmp, struct ip_entry_struct, ve_list); ++ if (entry->active_env == NULL) ++ continue; ++ ++ if (entry->family == AF_INET) ++ sprintf(ip, "%u.%u.%u.%u", NIPQUAD(entry->key[3])); ++ else ++ sprintf(ip, "%x:%x:%x:%x:%x:%x:%x:%x", ++ ntohl(entry->key[0])>>16, ++ ntohl(entry->key[0])&0xFFFF, ++ 
ntohl(entry->key[1])>>16, ++ ntohl(entry->key[1])&0xFFFF, ++ ntohl(entry->key[2])>>16, ++ ntohl(entry->key[2])&0xFFFF, ++ ntohl(entry->key[3])>>16, ++ ntohl(entry->key[3])&0xFFFF); ++ seq_printf(m, " %39s", ip); ++ } ++unlock: ++ read_unlock(&veip_hash_lock); ++ seq_putc(m, '\n'); ++ return 0; ++} ++ ++static void *ve_seq_start(struct seq_file *m, loff_t *pos) ++{ ++ struct ve_struct *ve, *curve; ++ loff_t l; ++ ++ curve = get_exec_env(); ++ read_lock(&ve_list_guard); ++ if (!ve_is_super(curve)) { ++ if (*pos != 0) ++ return NULL; ++ return curve; ++ } ++ for (ve = ve_list_head, l = *pos; ++ ve != NULL && l > 0; ++ ve = ve->next, l--); ++ return ve; ++} ++ ++static void *ve_seq_next(struct seq_file *m, void *v, loff_t *pos) ++{ ++ struct ve_struct *ve = (struct ve_struct *)v; ++ ++ if (!ve_is_super(get_exec_env())) ++ return NULL; ++ (*pos)++; ++ return ve->next; ++} ++ ++static void ve_seq_stop(struct seq_file *m, void *v) ++{ ++ read_unlock(&ve_list_guard); ++} ++ ++ ++static struct seq_operations veinfo_seq_op = { ++ start: ve_seq_start, ++ next: ve_seq_next, ++ stop: ve_seq_stop, ++ show: veinfo_seq_show ++}; ++ ++static int veinfo_open(struct inode *inode, struct file *file) ++{ ++ return seq_open(file, &veinfo_seq_op); ++} ++ ++static struct file_operations proc_veinfo_operations = { ++ open: veinfo_open, ++ read: seq_read, ++ llseek: seq_lseek, ++ release: seq_release ++}; ++ ++static void *veip_seq_start(struct seq_file *m, loff_t *pos) ++{ ++ loff_t l; ++ struct list_head *p; ++ int i; ++ ++ l = *pos; ++ write_lock_irq(&veip_hash_lock); ++ if (l == 0) ++ return ip_entry_hash_table; ++ for (i = 0; i < VEIP_HASH_SZ; i++) { ++ list_for_each(p, ip_entry_hash_table + i) { ++ if (--l == 0) ++ return p; ++ } ++ } ++ return NULL; ++} ++ ++static void *veip_seq_next(struct seq_file *m, void *v, loff_t *pos) ++{ ++ struct list_head *p; ++ ++ p = (struct list_head *)v; ++ while (1) { ++ p = p->next; ++ if (p < ip_entry_hash_table || ++ p >= ip_entry_hash_table + 
VEIP_HASH_SZ) { ++ (*pos)++; ++ return p; ++ } ++ if (++p >= ip_entry_hash_table + VEIP_HASH_SZ) ++ return NULL; ++ } ++ return NULL; ++} ++ ++static void veip_seq_stop(struct seq_file *m, void *v) ++{ ++ write_unlock_irq(&veip_hash_lock); ++} ++ ++static struct seq_operations veip_seq_op = { ++ start: veip_seq_start, ++ next: veip_seq_next, ++ stop: veip_seq_stop, ++ show: veip_seq_show ++}; ++ ++static int veip_open(struct inode *inode, struct file *file) ++{ ++ return seq_open(file, &veip_seq_op); ++} ++ ++static struct file_operations proc_veip_operations = { ++ open: veip_open, ++ read: seq_read, ++ llseek: seq_lseek, ++ release: seq_release ++}; ++#endif ++ ++int real_ve_ip_map(envid_t veid, int op, struct sockaddr *uservaddr, int addrlen) ++{ ++ int err; ++ union { ++ struct sockaddr g; ++ struct sockaddr_in a4; ++ struct sockaddr_in6 a6; ++ } addr; ++ struct ve_struct *ve; ++ ++ err = -EPERM; ++ if (!capable(CAP_SETVEID)) ++ goto out; ++ ++ err = -EINVAL; ++ if (addrlen > sizeof(addr) || addrlen < sizeof(struct sockaddr_in)) ++ goto out; ++ ++ err = move_addr_to_kernel(uservaddr, addrlen, &addr); ++ if (err < 0) ++ goto out; ++ ++ err = -EINVAL; ++ if (addr.g.sa_family == AF_INET) { ++ if (addrlen != sizeof(struct sockaddr_in)) ++ goto out; ++ } else if (addr.g.sa_family == AF_INET6) { ++ if (addrlen != sizeof(struct sockaddr_in6)) ++ goto out; ++ } else { ++ err = -EAFNOSUPPORT; ++ goto out; ++ } ++ ++ switch (op) ++ { ++ case VE_IP_ADD: ++ ve = get_ve_by_id(veid); ++ err = -ESRCH; ++ if (!ve) ++ goto out; ++ ++ down_read(&ve->op_sem); ++ if (ve->is_running) ++ err = veip_entry_add(ve, &addr.g); ++ up_read(&ve->op_sem); ++ put_ve(ve); ++ break; ++ ++ case VE_IP_DEL: ++ err = veip_entry_del(veid, &addr.g); ++ break; ++ default: ++ err = -EINVAL; ++ } ++ ++out: ++ return err; ++} ++ ++int venet_ioctl(struct inode *ino, struct file *file, unsigned int cmd, ++ unsigned long arg) ++{ ++ int err; ++ ++ err = -ENOTTY; ++ switch(cmd) { ++ case VENETCTL_VE_IP_MAP: 
{ ++ struct vzctl_ve_ip_map s; ++ err = -EFAULT; ++ if (copy_from_user(&s, (void *)arg, sizeof(s))) ++ break; ++ err = real_ve_ip_map(s.veid, s.op, s.addr, s.addrlen); ++ } ++ break; ++ } ++ return err; ++} ++ ++static struct vzioctlinfo venetcalls = { ++ type: VENETCTLTYPE, ++ func: venet_ioctl, ++ owner: THIS_MODULE, ++}; ++ ++int venet_dev_start(struct ve_struct *env) ++{ ++ struct net_device *dev_venet; ++ int err; ++ ++ dev_venet = alloc_netdev(0, "venet%d", venet_setup); ++ if (!dev_venet) ++ return -ENOMEM; ++ err = dev_alloc_name(dev_venet, dev_venet->name); ++ if (err<0) ++ goto err; ++ if ((err = register_netdev(dev_venet)) != 0) ++ goto err; ++ env->_venet_dev = dev_venet; ++ return 0; ++err: ++ free_netdev(dev_venet); ++ printk(KERN_ERR "VENET initialization error err=%d\n", err); ++ return err; ++} ++ ++static int venet_start(unsigned int hooknum, void *data) ++{ ++ struct ve_struct *env; ++ int err; ++ ++ env = (struct ve_struct *)data; ++ if (env->veip) ++ return -EEXIST; ++ if (!ve_is_super(env) && !try_module_get(THIS_MODULE)) ++ return 0; ++ ++ err = veip_start(env); ++ if (err) ++ goto err; ++ ++ err = venet_dev_start(env); ++ if (err) ++ goto err_free; ++ return 0; ++ ++err_free: ++ veip_stop(env); ++err: ++ if (!ve_is_super(env)) ++ module_put(THIS_MODULE); ++ return err; ++} ++ ++static int venet_stop(unsigned int hooknum, void *data) ++{ ++ struct ve_struct *env; ++ ++ env = (struct ve_struct *)data; ++ veip_stop(env); ++ if (!ve_is_super(env)) ++ module_put(THIS_MODULE); ++ return 0; ++} ++ ++#define VE_HOOK_PRI_NET 0 ++ ++static struct ve_hook venet_ve_hook_init = { ++ hook: venet_start, ++ undo: venet_stop, ++ hooknum: VE_HOOK_INIT, ++ priority: VE_HOOK_PRI_NET ++}; ++ ++static struct ve_hook venet_ve_hook_fini = { ++ hook: venet_stop, ++ hooknum: VE_HOOK_FINI, ++ priority: VE_HOOK_PRI_NET ++}; ++ ++__init int venet_init(void) ++{ ++#ifdef CONFIG_PROC_FS ++ struct proc_dir_entry *de; ++#endif ++ int i, err; ++ ++ if (get_ve0()->_venet_dev 
!= NULL) ++ return -EEXIST; ++ ++ for (i = 0; i < VEIP_HASH_SZ; i++) ++ INIT_LIST_HEAD(ip_entry_hash_table + i); ++ ++ err = venet_start(VE_HOOK_INIT, (void *)get_ve0()); ++ if (err) ++ return err; ++ ++#ifdef CONFIG_PROC_FS ++ de = create_proc_glob_entry("vz/veinfo", ++ S_IFREG|S_IRUSR, NULL); ++ if (de) ++ de->proc_fops = &proc_veinfo_operations; ++ else ++ printk(KERN_WARNING "venet: can't make veinfo proc entry\n"); ++ ++ de = create_proc_entry("vz/veip", S_IFREG|S_IRUSR, NULL); ++ if (de) ++ de->proc_fops = &proc_veip_operations; ++ else ++ printk(KERN_WARNING "venet: can't make veip proc entry\n"); ++#endif ++ ++ ve_hook_register(&venet_ve_hook_init); ++ ve_hook_register(&venet_ve_hook_fini); ++ vzioctl_register(&venetcalls); ++ return 0; ++} ++ ++__exit void venet_exit(void) ++{ ++ struct net_device *dev_venet; ++ ++ vzioctl_unregister(&venetcalls); ++ ve_hook_unregister(&venet_ve_hook_fini); ++ ve_hook_unregister(&venet_ve_hook_init); ++#ifdef CONFIG_PROC_FS ++ remove_proc_entry("vz/veip", NULL); ++ remove_proc_entry("vz/veinfo", NULL); ++#endif ++ ++ dev_venet = get_ve0()->_venet_dev; ++ if (dev_venet != NULL) { ++ get_ve0()->_venet_dev = NULL; ++ unregister_netdev(dev_venet); ++ free_netdev(dev_venet); ++ } ++ veip_stop(get_ve0()); ++} ++ ++module_init(venet_init); ++module_exit(venet_exit); +diff -upr linux-2.6.16.orig/drivers/net/veth.c linux-2.6.16-026test015/drivers/net/veth.c +--- linux-2.6.16.orig/drivers/net/veth.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/drivers/net/veth.c 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,582 @@ ++/* ++ * veth.c ++ * ++ * Copyright (C) 2006 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++/* ++ * Virtual ethernet device used to change VE ownership on packets ++ */ ++ ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/interrupt.h> ++#include <linux/fs.h> ++#include <linux/types.h> ++#include <linux/string.h> ++#include <linux/socket.h> ++#include <linux/errno.h> ++#include <linux/fcntl.h> ++#include <linux/in.h> ++#include <linux/init.h> ++#include <linux/module.h> ++#include <linux/tcp.h> ++#include <linux/proc_fs.h> ++#include <linux/seq_file.h> ++ ++#include <asm/system.h> ++#include <asm/uaccess.h> ++#include <asm/io.h> ++#include <asm/unistd.h> ++ ++#include <linux/inet.h> ++#include <linux/netdevice.h> ++#include <linux/etherdevice.h> ++#include <net/ip.h> ++#include <linux/skbuff.h> ++#include <net/sock.h> ++#include <linux/if_ether.h> /* For the statistics structure. */ ++#include <linux/if_arp.h> /* For ARPHRD_ETHER */ ++#include <linux/ve_proto.h> ++#include <linux/vzctl.h> ++#include <linux/vzctl_veth.h> ++ ++#include <linux/list.h> ++#include <linux/spinlock.h> ++#include <linux/vzcalluser.h> ++ ++struct veth_struct ++{ ++ struct net_device_stats stats; ++ struct net_device *pair; ++ struct list_head hwaddr_list; ++}; ++ ++struct list_head veth_hwaddr_list; ++rwlock_t ve_hwaddr_lock = RW_LOCK_UNLOCKED; ++DECLARE_MUTEX(hwaddr_sem); ++ ++#define veth_from_netdev(dev) \ ++ ((struct veth_struct *)(netdev_priv(dev))) ++#define veth_to_netdev(veth) \ ++ ((struct net_device*)((char*)veth - \ ++ (unsigned long)netdev_priv(NULL))) ++ ++struct net_device * veth_dev_start(char *dev_addr, char *name); ++ ++struct veth_struct *hwaddr_entry_lookup(char *name) ++{ ++ struct veth_struct *entry; ++ struct list_head *tmp; ++ ++ list_for_each(tmp, &veth_hwaddr_list) { ++ entry = list_entry(tmp, struct veth_struct, hwaddr_list); ++ BUG_ON(entry->pair == NULL); ++ if (strncmp(name, entry->pair->name, IFNAMSIZ) == 0) ++ return entry; ++ } ++ return NULL; ++} ++ ++int veth_entry_add(struct ve_struct *ve, char *dev_addr, 
char *name, ++ char *dev_addr_ve, char *name_ve) ++{ ++ struct net_device *dev_ve; ++ struct net_device *dev_ve0; ++ struct ve_struct *old_env; ++ char dev_name[IFNAMSIZ]; ++ int err; ++ ++ down(&hwaddr_sem); ++ ++ if (name[0] == '\0') ++ snprintf(dev_name, sizeof(dev_name), "vz%d.%%d", ve->veid); ++ else { ++ memcpy(dev_name, name, IFNAMSIZ - 1); ++ dev_name[IFNAMSIZ - 1] = '\0'; ++ } ++ dev_ve0 = veth_dev_start(dev_addr, dev_name); ++ if (IS_ERR(dev_ve0)) { ++ err = PTR_ERR(dev_ve0); ++ goto err; ++ } ++ ++ old_env = set_exec_env(ve); ++ if (name_ve[0] == '\0') ++ sprintf(dev_name, "eth%%d"); ++ else { ++ memcpy(dev_name, name_ve, IFNAMSIZ - 1); ++ dev_name[IFNAMSIZ - 1] = '\0'; ++ } ++ dev_ve = veth_dev_start(dev_addr_ve, dev_name); ++ if (IS_ERR(dev_ve)) { ++ err = PTR_ERR(dev_ve); ++ goto err_ve; ++ } ++ set_exec_env(old_env); ++ veth_from_netdev(dev_ve)->pair = dev_ve0; ++ veth_from_netdev(dev_ve0)->pair = dev_ve; ++ ++ write_lock(&ve_hwaddr_lock); ++ list_add(&(veth_from_netdev(dev_ve)->hwaddr_list), &veth_hwaddr_list); ++ write_unlock(&ve_hwaddr_lock); ++ ++ up(&hwaddr_sem); ++ return 0; ++ ++err_ve: ++ set_exec_env(old_env); ++ unregister_netdev(dev_ve0); ++err: ++ up(&hwaddr_sem); ++ return err; ++} ++ ++int veth_entry_del(struct ve_struct *ve, char *name) ++{ ++ struct veth_struct *found; ++ struct ve_struct *old_env; ++ struct net_device *dev; ++ int err; ++ ++ err = -ENODEV; ++ down(&hwaddr_sem); ++ found = hwaddr_entry_lookup(name); ++ if (found == NULL) ++ goto out; ++ if (veth_to_netdev(found)->owner_env != ve) ++ goto out; ++ ++ write_lock(&ve_hwaddr_lock); ++ list_del(&found->hwaddr_list); ++ write_unlock(&ve_hwaddr_lock); ++ err = 0; ++ dev = found->pair; ++ BUG_ON(found->pair == NULL); ++ ++ old_env = get_exec_env(); ++ set_exec_env(ve); ++ unregister_netdev(veth_to_netdev(found)); ++ set_exec_env(old_env); ++ ++ unregister_netdev(dev); ++ ++out: ++ up(&hwaddr_sem); ++ return err; ++} ++ ++/* ++ * Device functions ++ */ ++ ++static int 
veth_open(struct net_device *dev) ++{ ++ return 0; ++} ++ ++static int veth_close(struct net_device *master) ++{ ++ return 0; ++} ++ ++static void veth_destructor(struct net_device *dev) ++{ ++ free_netdev(dev); ++} ++ ++static struct net_device_stats *get_stats(struct net_device *dev) ++{ ++ return &veth_from_netdev(dev)->stats; ++} ++ ++/* ++ * The higher levels take care of making this non-reentrant (it's ++ * called with bh's disabled). ++ */ ++static int veth_xmit(struct sk_buff *skb, struct net_device *dev) ++{ ++ struct net_device_stats *stats = get_stats(dev); ++ struct net_device *rcv = NULL; ++ struct veth_struct *entry; ++ int length; ++ ++ if (unlikely(get_exec_env()->disable_net)) ++ goto outf; ++ ++ skb_orphan(skb); ++ ++ entry = veth_from_netdev(dev); ++ rcv = entry->pair; ++ if (!rcv) ++ /* VE going down */ ++ goto outf; ++ ++ if (unlikely(rcv->owner_env->disable_net)) ++ goto outf; ++ ++ skb->owner_env = rcv->owner_env; ++ ++ if (!(rcv->flags & IFF_UP)) { ++ /* Target VE does not want to receive packets */ ++ goto outf; ++ } ++ ++ skb->dev = rcv; ++ skb->pkt_type = PACKET_HOST; ++ skb->protocol = eth_type_trans(skb, rcv); ++ ++ dst_release(skb->dst); ++ skb->dst = NULL; ++#ifdef CONFIG_NETFILTER ++ nf_conntrack_put(skb->nfct); ++ skb->nfct = NULL; ++#ifdef CONFIG_NETFILTER_DEBUG ++ skb->nf_debug = 0; ++#endif ++#endif ++ length = skb->len; ++ ++ netif_rx(skb); ++ ++ stats->tx_bytes += length; ++ stats->tx_packets++; ++ if (rcv) { ++ struct net_device_stats *rcv_stats = get_stats(rcv); ++ rcv_stats->rx_bytes += length; ++ rcv_stats->rx_packets++; ++ } ++ ++ return 0; ++ ++outf: ++ kfree_skb(skb); ++ stats->tx_dropped++; ++ return 0; ++} ++ ++int veth_init_dev(struct net_device *dev) ++{ ++ dev->hard_start_xmit = veth_xmit; ++ dev->get_stats = get_stats; ++ dev->open = veth_open; ++ dev->stop = veth_close; ++ dev->destructor = veth_destructor; ++ ++ ether_setup(dev); ++ ++ dev->tx_queue_len = 0; ++ return 0; ++} ++ ++static void veth_setup(struct 
net_device *dev) ++{ ++ dev->init = veth_init_dev; ++ /* ++ * No other features, as they are: ++ * - checksumming is required, and nobody else will done our job ++ */ ++ dev->features |= NETIF_F_VENET | NETIF_F_VIRTUAL; ++} ++ ++#ifdef CONFIG_PROC_FS ++#define ADDR_FMT "%02x:%02x:%02x:%02x:%02x:%02x" ++#define ADDR(x) (x)[0],(x)[1],(x)[2],(x)[3],(x)[4],(x)[5] ++static int vehwaddr_seq_show(struct seq_file *m, void *v) ++{ ++ struct list_head *p; ++ struct veth_struct *entry; ++ ++ p = (struct list_head *)v; ++ if (p == &veth_hwaddr_list) { ++ seq_puts(m, "Version: 1.0\n"); ++ return 0; ++ } ++ entry = list_entry(p, struct veth_struct, hwaddr_list); ++ seq_printf(m, ADDR_FMT " %16s ", ++ ADDR(entry->pair->dev_addr), entry->pair->name); ++ seq_printf(m, ADDR_FMT " %16s %10u\n", ++ ADDR(veth_to_netdev(entry)->dev_addr), ++ veth_to_netdev(entry)->name, ++ VEID(veth_to_netdev(entry)->owner_env)); ++ return 0; ++} ++ ++static void *vehwaddr_seq_start(struct seq_file *m, loff_t *pos) ++{ ++ loff_t l; ++ struct list_head *p; ++ ++ l = *pos; ++ read_lock(&ve_hwaddr_lock); ++ if (l == 0) ++ return &veth_hwaddr_list; ++ list_for_each(p, &veth_hwaddr_list) { ++ if (--l == 0) ++ return p; ++ } ++ return NULL; ++} ++ ++static void *vehwaddr_seq_next(struct seq_file *m, void *v, loff_t *pos) ++{ ++ struct list_head *p; ++ ++ p = (struct list_head *)v; ++ (*pos)++; ++ return p->next == &veth_hwaddr_list ? 
NULL : p->next; ++} ++ ++static void vehwaddr_seq_stop(struct seq_file *m, void *v) ++{ ++ read_unlock(&ve_hwaddr_lock); ++} ++ ++static struct seq_operations vehwaddr_seq_op = { ++ .start = vehwaddr_seq_start, ++ .next = vehwaddr_seq_next, ++ .stop = vehwaddr_seq_stop, ++ .show = vehwaddr_seq_show ++}; ++ ++static int vehwaddr_open(struct inode *inode, struct file *file) ++{ ++ return seq_open(file, &vehwaddr_seq_op); ++} ++ ++static struct file_operations proc_vehwaddr_operations = { ++ .open = vehwaddr_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release ++}; ++#endif ++ ++int real_ve_hwaddr(envid_t veid, int op, ++ unsigned char *dev_addr, int addrlen, char *name, ++ unsigned char *dev_addr_ve, int addrlen_ve, char *name_ve) ++{ ++ int err; ++ struct ve_struct *ve; ++ char ve_addr[ETH_ALEN]; ++ ++ err = -EPERM; ++ if (!capable(CAP_NET_ADMIN)) ++ goto out; ++ ++ err = -EINVAL; ++ switch (op) ++ { ++ case VE_ETH_ADD: ++ if (addrlen != ETH_ALEN) ++ goto out; ++ if (addrlen_ve != ETH_ALEN && addrlen_ve != 0) ++ goto out; ++ /* If ve addr is not set then we use dev_addr[3] & 0x80 for it */ ++ if (addrlen_ve == 0 && (dev_addr[3] & 0x80)) ++ goto out; ++ if (addrlen_ve == 0) { ++ memcpy(ve_addr, dev_addr, ETH_ALEN); ++ ve_addr[3] |= 0x80; ++ } else { ++ memcpy(ve_addr, dev_addr_ve, ETH_ALEN); ++ } ++ ++ ve = get_ve_by_id(veid); ++ err = -ESRCH; ++ if (!ve) ++ goto out; ++ ++ down_read(&ve->op_sem); ++ if (ve->is_running) ++ err = veth_entry_add(ve, dev_addr, name, ++ ve_addr, name_ve); ++ up_read(&ve->op_sem); ++ put_ve(ve); ++ break; ++ ++ case VE_ETH_DEL: ++ if (name[0] == '\0') ++ goto out; ++ ve = get_ve_by_id(veid); ++ err = -ESRCH; ++ if (!ve) ++ goto out; ++ ++ down_read(&ve->op_sem); ++ if (ve->is_running) ++ err = veth_entry_del(ve, name); ++ up_read(&ve->op_sem); ++ put_ve(ve); ++ break; ++ } ++ ++out: ++ return err; ++} ++ ++int veth_ioctl(struct inode *ino, struct file *file, unsigned int cmd, ++ unsigned long arg) ++{ ++ int err; 
++ ++ err = -ENOTTY; ++ switch(cmd) { ++ case VETHCTL_VE_HWADDR: { ++ struct vzctl_ve_hwaddr s; ++ err = -EFAULT; ++ if (copy_from_user(&s, (void *)arg, sizeof(s))) ++ break; ++ err = real_ve_hwaddr(s.veid, s.op, ++ s.dev_addr, s.addrlen, s.dev_name, ++ s.dev_addr_ve, s.addrlen_ve, s.dev_name_ve); ++ } ++ break; ++ } ++ return err; ++} ++ ++static struct vzioctlinfo vethcalls = { ++ .type = VETHCTLTYPE, ++ .func = veth_ioctl, ++ .owner = THIS_MODULE, ++}; ++ ++struct net_device * veth_dev_start(char *dev_addr, char *name) ++{ ++ struct net_device *dev; ++ int err; ++ ++ dev = alloc_netdev(sizeof(struct veth_struct), name, veth_setup); ++ if (!dev) ++ return ERR_PTR(-ENOMEM); ++ if (strchr(dev->name, '%')) { ++ err = dev_alloc_name(dev, dev->name); ++ if (err < 0) ++ goto err; ++ } ++ if ((err = register_netdev(dev)) != 0) ++ goto err; ++ ++ memcpy(dev->dev_addr, dev_addr, ETH_ALEN); ++ dev->addr_len = ETH_ALEN; ++ ++ return dev; ++err: ++ free_netdev(dev); ++ printk(KERN_ERR "%s initialization error err=%d\n", name, err); ++ return ERR_PTR(err); ++} ++ ++static int veth_stop(unsigned int hooknum, void *data) ++{ ++ struct ve_struct *old_env; ++ struct ve_struct *env; ++ struct list_head *tmp, *n; ++ ++ env = (struct ve_struct *)data; ++ down(&hwaddr_sem); ++ list_for_each_safe(tmp, n, &veth_hwaddr_list) { ++ struct veth_struct *entry; ++ struct net_device *dev; ++ entry = list_entry(tmp, struct veth_struct, hwaddr_list); ++ if (VEID(env) != VEID(veth_to_netdev(entry)->owner_env)) ++ continue; ++ ++ write_lock(&ve_hwaddr_lock); ++ list_del(&entry->hwaddr_list); ++ write_unlock(&ve_hwaddr_lock); ++ ++ dev = entry->pair; ++ BUG_ON(entry->pair == NULL); ++ old_env = set_exec_env(env); ++ unregister_netdev(veth_to_netdev(entry)); ++ set_exec_env(old_env); ++ ++ old_env = set_exec_env(get_ve0()); ++ unregister_netdev(dev); ++ set_exec_env(old_env); ++ } ++ up(&hwaddr_sem); ++ return 0; ++} ++ ++#define VE_HOOK_PRI_NET 0 ++ ++static struct ve_hook veth_ve_hook_fini = { ++ 
.hook = veth_stop, ++ .hooknum = VE_HOOK_FINI, ++ .priority = VE_HOOK_PRI_NET, ++ .owner = THIS_MODULE, ++}; ++ ++__init int veth_init(void) ++{ ++#ifdef CONFIG_PROC_FS ++ struct proc_dir_entry *de; ++#endif ++ ++ INIT_LIST_HEAD(&veth_hwaddr_list); ++ ++#ifdef CONFIG_PROC_FS ++ de = create_proc_glob_entry("vz/veth", ++ S_IFREG|S_IRUSR, NULL); ++ if (de) ++ de->proc_fops = &proc_vehwaddr_operations; ++ else ++ printk(KERN_WARNING "veth: can't make vehwaddr proc entry\n"); ++ ++#endif ++ ++ ve_hook_register(&veth_ve_hook_fini); ++ vzioctl_register(&vethcalls); ++ return 0; ++} ++ ++__exit void veth_exit(void) ++{ ++ struct veth_struct *entry; ++ struct list_head *tmp, *n; ++ struct ve_struct *ve; ++ struct ve_struct *old_env; ++ ++ vzioctl_unregister(&vethcalls); ++ ve_hook_unregister(&veth_ve_hook_fini); ++#ifdef CONFIG_PROC_FS ++ remove_proc_entry("vz/veth", NULL); ++#endif ++ ++ down(&hwaddr_sem); ++ list_for_each_safe(tmp, n, &veth_hwaddr_list) { ++ struct net_device *dev; ++ entry = list_entry(tmp, struct veth_struct, hwaddr_list); ++ ve = get_ve(veth_to_netdev(entry)->owner_env); ++ ++ write_lock(&ve_hwaddr_lock); ++ list_del(&entry->hwaddr_list); ++ write_unlock(&ve_hwaddr_lock); ++ ++ dev = entry->pair; ++ BUG_ON(entry->pair == NULL); ++ old_env = set_exec_env(ve); ++ unregister_netdev(veth_to_netdev(entry)); ++ set_exec_env(old_env); ++ ++ unregister_netdev(dev); ++ ++ put_ve(ve); ++ } ++ up(&hwaddr_sem); ++} ++ ++module_init(veth_init); ++module_exit(veth_exit); ++ ++MODULE_AUTHOR("Andrey Mirkin <amirkin@sw.ru>"); ++MODULE_DESCRIPTION("Virtuozzo Virtual Ethernet Device"); ++MODULE_LICENSE("GPL v2"); ++ +diff -upr linux-2.6.16.orig/drivers/net/via-rhine.c linux-2.6.16-026test015/drivers/net/via-rhine.c +--- linux-2.6.16.orig/drivers/net/via-rhine.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/net/via-rhine.c 2006-07-04 14:41:36.000000000 +0400 +@@ -129,6 +129,7 @@ + - Massive clean-up + - Rewrite PHY, media handling (remove 
options, full_duplex, backoff) + - Fix Tx engine race for good ++ - Craig Brind: Zero padded aligned buffers for short packets. + + */ + +@@ -1306,7 +1307,12 @@ static int rhine_start_tx(struct sk_buff + rp->stats.tx_dropped++; + return 0; + } ++ ++ /* Padding is not copied and so must be redone. */ + skb_copy_and_csum_dev(skb, rp->tx_buf[entry]); ++ if (skb->len < ETH_ZLEN) ++ memset(rp->tx_buf[entry] + skb->len, 0, ++ ETH_ZLEN - skb->len); + rp->tx_skbuff_dma[entry] = 0; + rp->tx_ring[entry].addr = cpu_to_le32(rp->tx_bufs_dma + + (rp->tx_buf[entry] - +diff -upr linux-2.6.16.orig/drivers/net/wireless/Kconfig linux-2.6.16-026test015/drivers/net/wireless/Kconfig +--- linux-2.6.16.orig/drivers/net/wireless/Kconfig 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/net/wireless/Kconfig 2006-07-04 14:41:36.000000000 +0400 +@@ -239,7 +239,8 @@ config IPW2200_DEBUG + + config AIRO + tristate "Cisco/Aironet 34X/35X/4500/4800 ISA and PCI cards" +- depends on NET_RADIO && ISA_DMA_API && CRYPTO && (PCI || BROKEN) ++ depends on NET_RADIO && ISA_DMA_API && (PCI || BROKEN) ++ select CRYPTO + ---help--- + This is the standard Linux driver to support Cisco/Aironet ISA and + PCI 802.11 wireless cards. +@@ -374,6 +375,7 @@ config PCMCIA_HERMES + config PCMCIA_SPECTRUM + tristate "Symbol Spectrum24 Trilogy PCMCIA card support" + depends on NET_RADIO && PCMCIA && HERMES ++ select FW_LOADER + ---help--- + + This is a driver for 802.11b cards using RAM-loadable Symbol +@@ -387,6 +389,7 @@ config PCMCIA_SPECTRUM + config AIRO_CS + tristate "Cisco/Aironet 34X/35X/4500/4800 PCMCIA cards" + depends on NET_RADIO && PCMCIA && (BROKEN || !M32R) ++ select CRYPTO + ---help--- + This is the standard Linux driver to support Cisco/Aironet PCMCIA + 802.11 wireless cards. 
This driver is the same as the Aironet +diff -upr linux-2.6.16.orig/drivers/net/wireless/hostap/hostap_80211_tx.c linux-2.6.16-026test015/drivers/net/wireless/hostap/hostap_80211_tx.c +--- linux-2.6.16.orig/drivers/net/wireless/hostap/hostap_80211_tx.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/net/wireless/hostap/hostap_80211_tx.c 2006-07-04 14:41:36.000000000 +0400 +@@ -469,7 +469,7 @@ int hostap_master_start_xmit(struct sk_b + } + + if (local->ieee_802_1x && meta->ethertype == ETH_P_PAE && tx.crypt && +- !(fc & IEEE80211_FCTL_VERS)) { ++ !(fc & IEEE80211_FCTL_PROTECTED)) { + no_encrypt = 1; + PDEBUG(DEBUG_EXTRA2, "%s: TX: IEEE 802.1X - passing " + "unencrypted EAPOL frame\n", dev->name); +diff -upr linux-2.6.16.orig/drivers/net/wireless/ipw2200.c linux-2.6.16-026test015/drivers/net/wireless/ipw2200.c +--- linux-2.6.16.orig/drivers/net/wireless/ipw2200.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/net/wireless/ipw2200.c 2006-07-04 14:41:36.000000000 +0400 +@@ -8391,20 +8391,28 @@ static int ipw_wx_get_range(struct net_d + + i = 0; + if (priv->ieee->mode & (IEEE_B | IEEE_G)) { +- for (j = 0; j < geo->bg_channels && i < IW_MAX_FREQUENCIES; +- i++, j++) { ++ for (j = 0; j < geo->bg_channels && i < IW_MAX_FREQUENCIES; j++) { ++ if ((priv->ieee->iw_mode == IW_MODE_ADHOC) && ++ (geo->bg[j].flags & IEEE80211_CH_PASSIVE_ONLY)) ++ continue; ++ + range->freq[i].i = geo->bg[j].channel; + range->freq[i].m = geo->bg[j].freq * 100000; + range->freq[i].e = 1; ++ i++; + } + } + + if (priv->ieee->mode & IEEE_A) { +- for (j = 0; j < geo->a_channels && i < IW_MAX_FREQUENCIES; +- i++, j++) { ++ for (j = 0; j < geo->a_channels && i < IW_MAX_FREQUENCIES; j++) { ++ if ((priv->ieee->iw_mode == IW_MODE_ADHOC) && ++ (geo->a[j].flags & IEEE80211_CH_PASSIVE_ONLY)) ++ continue; ++ + range->freq[i].i = geo->a[j].channel; + range->freq[i].m = geo->a[j].freq * 100000; + range->freq[i].e = 1; ++ i++; + } + } + +@@ -9956,9 +9964,8 @@ 
static int ipw_ethtool_set_eeprom(struct + return -EINVAL; + down(&p->sem); + memcpy(&p->eeprom[eeprom->offset], bytes, eeprom->len); +- for (i = IPW_EEPROM_DATA; +- i < IPW_EEPROM_DATA + IPW_EEPROM_IMAGE_SIZE; i++) +- ipw_write8(p, i, p->eeprom[i]); ++ for (i = 0; i < IPW_EEPROM_IMAGE_SIZE; i++) ++ ipw_write8(p, i + IPW_EEPROM_DATA, p->eeprom[i]); + up(&p->sem); + return 0; + } +diff -upr linux-2.6.16.orig/drivers/pci/pci-acpi.c linux-2.6.16-026test015/drivers/pci/pci-acpi.c +--- linux-2.6.16.orig/drivers/pci/pci-acpi.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/pci/pci-acpi.c 2006-07-04 14:41:36.000000000 +0400 +@@ -33,13 +33,10 @@ acpi_query_osc ( + acpi_status status; + struct acpi_object_list input; + union acpi_object in_params[4]; +- struct acpi_buffer output; +- union acpi_object out_obj; ++ struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL}; ++ union acpi_object *out_obj; + u32 osc_dw0; + +- /* Setting up output buffer */ +- output.length = sizeof(out_obj) + 3*sizeof(u32); +- output.pointer = &out_obj; + + /* Setting up input parameters */ + input.count = 4; +@@ -61,12 +58,15 @@ acpi_query_osc ( + "Evaluate _OSC Set fails. 
Status = 0x%04x\n", status); + return status; + } +- if (out_obj.type != ACPI_TYPE_BUFFER) { ++ out_obj = output.pointer; ++ ++ if (out_obj->type != ACPI_TYPE_BUFFER) { + printk(KERN_DEBUG + "Evaluate _OSC returns wrong type\n"); +- return AE_TYPE; ++ status = AE_TYPE; ++ goto query_osc_out; + } +- osc_dw0 = *((u32 *) out_obj.buffer.pointer); ++ osc_dw0 = *((u32 *) out_obj->buffer.pointer); + if (osc_dw0) { + if (osc_dw0 & OSC_REQUEST_ERROR) + printk(KERN_DEBUG "_OSC request fails\n"); +@@ -76,15 +76,21 @@ acpi_query_osc ( + printk(KERN_DEBUG "_OSC invalid revision\n"); + if (osc_dw0 & OSC_CAPABILITIES_MASK_ERROR) { + /* Update Global Control Set */ +- global_ctrlsets = *((u32 *)(out_obj.buffer.pointer+8)); +- return AE_OK; ++ global_ctrlsets = *((u32 *)(out_obj->buffer.pointer+8)); ++ status = AE_OK; ++ goto query_osc_out; + } +- return AE_ERROR; ++ status = AE_ERROR; ++ goto query_osc_out; + } + + /* Update Global Control Set */ +- global_ctrlsets = *((u32 *)(out_obj.buffer.pointer + 8)); +- return AE_OK; ++ global_ctrlsets = *((u32 *)(out_obj->buffer.pointer + 8)); ++ status = AE_OK; ++ ++query_osc_out: ++ kfree(output.pointer); ++ return status; + } + + +@@ -96,14 +102,10 @@ acpi_run_osc ( + acpi_status status; + struct acpi_object_list input; + union acpi_object in_params[4]; +- struct acpi_buffer output; +- union acpi_object out_obj; ++ struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL}; ++ union acpi_object *out_obj; + u32 osc_dw0; + +- /* Setting up output buffer */ +- output.length = sizeof(out_obj) + 3*sizeof(u32); +- output.pointer = &out_obj; +- + /* Setting up input parameters */ + input.count = 4; + input.pointer = in_params; +@@ -124,12 +126,14 @@ acpi_run_osc ( + "Evaluate _OSC Set fails. 
Status = 0x%04x\n", status); + return status; + } +- if (out_obj.type != ACPI_TYPE_BUFFER) { ++ out_obj = output.pointer; ++ if (out_obj->type != ACPI_TYPE_BUFFER) { + printk(KERN_DEBUG + "Evaluate _OSC returns wrong type\n"); +- return AE_TYPE; ++ status = AE_TYPE; ++ goto run_osc_out; + } +- osc_dw0 = *((u32 *) out_obj.buffer.pointer); ++ osc_dw0 = *((u32 *) out_obj->buffer.pointer); + if (osc_dw0) { + if (osc_dw0 & OSC_REQUEST_ERROR) + printk(KERN_DEBUG "_OSC request fails\n"); +@@ -139,11 +143,17 @@ acpi_run_osc ( + printk(KERN_DEBUG "_OSC invalid revision\n"); + if (osc_dw0 & OSC_CAPABILITIES_MASK_ERROR) { + printk(KERN_DEBUG "_OSC FW not grant req. control\n"); +- return AE_SUPPORT; ++ status = AE_SUPPORT; ++ goto run_osc_out; + } +- return AE_ERROR; ++ status = AE_ERROR; ++ goto run_osc_out; + } +- return AE_OK; ++ status = AE_OK; ++ ++run_osc_out: ++ kfree(output.pointer); ++ return status; + } + + /** +diff -upr linux-2.6.16.orig/drivers/pci/probe.c linux-2.6.16-026test015/drivers/pci/probe.c +--- linux-2.6.16.orig/drivers/pci/probe.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/pci/probe.c 2006-07-04 14:41:38.000000000 +0400 +@@ -21,6 +21,7 @@ LIST_HEAD(pci_root_buses); + EXPORT_SYMBOL(pci_root_buses); + + LIST_HEAD(pci_devices); ++EXPORT_SYMBOL(pci_devices); + + #ifdef HAVE_PCI_LEGACY + /** +diff -upr linux-2.6.16.orig/drivers/pci/quirks.c linux-2.6.16-026test015/drivers/pci/quirks.c +--- linux-2.6.16.orig/drivers/pci/quirks.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/pci/quirks.c 2006-07-04 14:41:36.000000000 +0400 +@@ -631,6 +631,9 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_V + * non-x86 architectures (yes Via exists on PPC among other places), + * we must mask the PCI_INTERRUPT_LINE value versus 0xf to get + * interrupts delivered properly. ++ * ++ * Some of the on-chip devices are actually '586 devices' so they are ++ * listed here. 
+ */ + static void quirk_via_irq(struct pci_dev *dev) + { +@@ -639,13 +642,19 @@ static void quirk_via_irq(struct pci_dev + new_irq = dev->irq & 0xf; + pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &irq); + if (new_irq != irq) { +- printk(KERN_INFO "PCI: Via IRQ fixup for %s, from %d to %d\n", ++ printk(KERN_INFO "PCI: VIA IRQ fixup for %s, from %d to %d\n", + pci_name(dev), irq, new_irq); + udelay(15); /* unknown if delay really needed */ + pci_write_config_byte(dev, PCI_INTERRUPT_LINE, new_irq); + } + } +-DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_VIA, PCI_ANY_ID, quirk_via_irq); ++DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_82C586_0, quirk_via_irq); ++DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_82C586_1, quirk_via_irq); ++DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_82C586_2, quirk_via_irq); ++DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_82C586_3, quirk_via_irq); ++DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_82C686, quirk_via_irq); ++DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_82C686_4, quirk_via_irq); ++DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_82C686_5, quirk_via_irq); + + /* + * VIA VT82C598 has its device ID settable and many BIOSes +@@ -861,6 +870,7 @@ static void __init quirk_eisa_bridge(str + } + DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82375, quirk_eisa_bridge ); + ++#ifndef CONFIG_ACPI_SLEEP + /* + * On ASUS P4B boards, the SMBus PCI Device within the ICH2/4 southbridge + * is not activated. The myth is that Asus said that they do not want the +@@ -872,8 +882,12 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_I + * bridge. Unfortunately, this device has no subvendor/subdevice ID. So it + * becomes necessary to do this tweak in two steps -- I've chosen the Host + * bridge as trigger. 
++ * ++ * Actually, leaving it unhidden and not redoing the quirk over suspend2ram ++ * will cause thermal management to break down, and causing machine to ++ * overheat. + */ +-static int __initdata asus_hides_smbus = 0; ++static int __initdata asus_hides_smbus; + + static void __init asus_hides_smbus_hostbridge(struct pci_dev *dev) + { +@@ -1008,6 +1022,8 @@ static void __init asus_hides_smbus_lpc_ + } + DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_1, asus_hides_smbus_lpc_ich6 ); + ++#endif ++ + /* + * SiS 96x south bridge: BIOS typically hides SMBus device... + */ +diff -upr linux-2.6.16.orig/drivers/pcmcia/ds.c linux-2.6.16-026test015/drivers/pcmcia/ds.c +--- linux-2.6.16.orig/drivers/pcmcia/ds.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/pcmcia/ds.c 2006-07-04 14:41:36.000000000 +0400 +@@ -546,7 +546,7 @@ static int pcmcia_device_query(struct pc + tmp = vers1->str + vers1->ofs[i]; + + length = strlen(tmp) + 1; +- if ((length < 3) || (length > 255)) ++ if ((length < 2) || (length > 255)) + continue; + + p_dev->prod_id[i] = kmalloc(sizeof(char) * length, +diff -upr linux-2.6.16.orig/drivers/s390/cio/cio.c linux-2.6.16-026test015/drivers/s390/cio/cio.c +--- linux-2.6.16.orig/drivers/s390/cio/cio.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/s390/cio/cio.c 2006-07-04 14:41:38.000000000 +0400 +@@ -610,7 +610,11 @@ do_IRQ (struct pt_regs *regs) + struct tpi_info *tpi_info; + struct subchannel *sch; + struct irb *irb; ++ struct ve_struct *ve; ++ struct user_beancounter *ub; + ++ ve = set_exec_env(get_ve0()); ++ ub = set_exec_ub(get_ub0()); + irq_enter (); + asm volatile ("mc 0,0"); + if (S390_lowcore.int_clock >= S390_lowcore.jiffy_timer) +@@ -657,6 +661,8 @@ do_IRQ (struct pt_regs *regs) + */ + } while (!MACHINE_IS_VM && tpi (NULL) != 0); + irq_exit (); ++ (void)set_exec_ub(ub); ++ (void)set_exec_env(ve); + } + + #ifdef CONFIG_CCW_CONSOLE +diff -upr 
linux-2.6.16.orig/drivers/scsi/3w-9xxx.c linux-2.6.16-026test015/drivers/scsi/3w-9xxx.c +--- linux-2.6.16.orig/drivers/scsi/3w-9xxx.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/scsi/3w-9xxx.c 2006-07-04 14:41:36.000000000 +0400 +@@ -85,7 +85,7 @@ + #include "3w-9xxx.h" + + /* Globals */ +-#define TW_DRIVER_VERSION "2.26.02.005" ++#define TW_DRIVER_VERSION "2.26.02.007" + static TW_Device_Extension *twa_device_extension_list[TW_MAX_SLOT]; + static unsigned int twa_device_extension_count; + static int twa_major = -1; +@@ -1944,9 +1944,13 @@ static void twa_scsiop_execute_scsi_comp + } + if (tw_dev->srb[request_id]->use_sg == 1) { + struct scatterlist *sg = (struct scatterlist *)tw_dev->srb[request_id]->request_buffer; +- char *buf = kmap_atomic(sg->page, KM_IRQ0) + sg->offset; ++ char *buf; ++ unsigned long flags = 0; ++ local_irq_save(flags); ++ buf = kmap_atomic(sg->page, KM_IRQ0) + sg->offset; + memcpy(buf, tw_dev->generic_buffer_virt[request_id], sg->length); + kunmap_atomic(buf - sg->offset, KM_IRQ0); ++ local_irq_restore(flags); + } + } + } /* End twa_scsiop_execute_scsi_complete() */ +diff -upr linux-2.6.16.orig/drivers/scsi/3w-xxxx.c linux-2.6.16-026test015/drivers/scsi/3w-xxxx.c +--- linux-2.6.16.orig/drivers/scsi/3w-xxxx.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/scsi/3w-xxxx.c 2006-07-04 14:41:36.000000000 +0400 +@@ -1508,10 +1508,12 @@ static void tw_transfer_internal(TW_Devi + struct scsi_cmnd *cmd = tw_dev->srb[request_id]; + void *buf; + unsigned int transfer_len; ++ unsigned long flags = 0; + + if (cmd->use_sg) { + struct scatterlist *sg = + (struct scatterlist *)cmd->request_buffer; ++ local_irq_save(flags); + buf = kmap_atomic(sg->page, KM_IRQ0) + sg->offset; + transfer_len = min(sg->length, len); + } else { +@@ -1526,6 +1528,7 @@ static void tw_transfer_internal(TW_Devi + + sg = (struct scatterlist *)cmd->request_buffer; + kunmap_atomic(buf - sg->offset, KM_IRQ0); ++ 
local_irq_restore(flags); + } + } + +diff -upr linux-2.6.16.orig/drivers/scsi/libata-core.c linux-2.6.16-026test015/drivers/scsi/libata-core.c +--- linux-2.6.16.orig/drivers/scsi/libata-core.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/scsi/libata-core.c 2006-07-04 14:41:36.000000000 +0400 +@@ -4293,6 +4293,7 @@ static int ata_start_drive(struct ata_po + int ata_device_resume(struct ata_port *ap, struct ata_device *dev) + { + if (ap->flags & ATA_FLAG_SUSPENDED) { ++ ata_busy_wait(ap, ATA_BUSY | ATA_DRQ, 200000); + ap->flags &= ~ATA_FLAG_SUSPENDED; + ata_set_mode(ap); + } +diff -upr linux-2.6.16.orig/drivers/scsi/sata_mv.c linux-2.6.16-026test015/drivers/scsi/sata_mv.c +--- linux-2.6.16.orig/drivers/scsi/sata_mv.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/scsi/sata_mv.c 2006-07-04 14:41:36.000000000 +0400 +@@ -1102,6 +1102,7 @@ static u8 mv_get_crpb_status(struct ata_ + void __iomem *port_mmio = mv_ap_base(ap); + struct mv_port_priv *pp = ap->private_data; + u32 out_ptr; ++ u8 ata_status; + + out_ptr = readl(port_mmio + EDMA_RSP_Q_OUT_PTR_OFS); + +@@ -1109,6 +1110,8 @@ static u8 mv_get_crpb_status(struct ata_ + assert(((out_ptr >> EDMA_RSP_Q_PTR_SHIFT) & MV_MAX_Q_DEPTH_MASK) == + pp->rsp_consumer); + ++ ata_status = pp->crpb[pp->rsp_consumer].flags >> CRPB_FLAG_STATUS_SHIFT; ++ + /* increment our consumer index... 
*/ + pp->rsp_consumer = mv_inc_q_index(&pp->rsp_consumer); + +@@ -1123,7 +1126,7 @@ static u8 mv_get_crpb_status(struct ata_ + writelfl(out_ptr, port_mmio + EDMA_RSP_Q_OUT_PTR_OFS); + + /* Return ATA status register for completed CRPB */ +- return (pp->crpb[pp->rsp_consumer].flags >> CRPB_FLAG_STATUS_SHIFT); ++ return ata_status; + } + + /** +@@ -1192,7 +1195,6 @@ static void mv_host_intr(struct ata_host + u32 hc_irq_cause; + int shift, port, port0, hard_port, handled; + unsigned int err_mask; +- u8 ata_status = 0; + + if (hc == 0) { + port0 = 0; +@@ -1210,6 +1212,7 @@ static void mv_host_intr(struct ata_host + hc,relevant,hc_irq_cause); + + for (port = port0; port < port0 + MV_PORTS_PER_HC; port++) { ++ u8 ata_status = 0; + ap = host_set->ports[port]; + hard_port = port & MV_PORT_MASK; /* range 0-3 */ + handled = 0; /* ensure ata_status is set if handled++ */ +diff -upr linux-2.6.16.orig/drivers/scsi/scsi_lib.c linux-2.6.16-026test015/drivers/scsi/scsi_lib.c +--- linux-2.6.16.orig/drivers/scsi/scsi_lib.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/scsi/scsi_lib.c 2006-07-04 14:41:36.000000000 +0400 +@@ -368,7 +368,7 @@ static int scsi_req_map_sg(struct reques + int nsegs, unsigned bufflen, gfp_t gfp) + { + struct request_queue *q = rq->q; +- int nr_pages = (bufflen + PAGE_SIZE - 1) >> PAGE_SHIFT; ++ int nr_pages = (bufflen + sgl[0].offset + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned int data_len = 0, len, bytes, off; + struct page *page; + struct bio *bio = NULL; +diff -upr linux-2.6.16.orig/drivers/sn/ioc3.c linux-2.6.16-026test015/drivers/sn/ioc3.c +--- linux-2.6.16.orig/drivers/sn/ioc3.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/sn/ioc3.c 2006-07-04 14:41:36.000000000 +0400 +@@ -677,7 +677,7 @@ static int ioc3_probe(struct pci_dev *pd + /* Track PCI-device specific data */ + pci_set_drvdata(pdev, idd); + down_write(&ioc3_devices_rwsem); +- list_add(&idd->list, &ioc3_devices); ++ list_add_tail(&idd->list, 
&ioc3_devices); + idd->id = ioc3_counter++; + up_write(&ioc3_devices_rwsem); + +diff -upr linux-2.6.16.orig/drivers/sn/ioc4.c linux-2.6.16-026test015/drivers/sn/ioc4.c +--- linux-2.6.16.orig/drivers/sn/ioc4.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/sn/ioc4.c 2006-07-04 14:41:36.000000000 +0400 +@@ -313,7 +313,7 @@ ioc4_probe(struct pci_dev *pdev, const s + idd->idd_serial_data = NULL; + pci_set_drvdata(idd->idd_pdev, idd); + down_write(&ioc4_devices_rwsem); +- list_add(&idd->idd_list, &ioc4_devices); ++ list_add_tail(&idd->idd_list, &ioc4_devices); + up_write(&ioc4_devices_rwsem); + + /* Add this IOC4 to all submodules */ +diff -upr linux-2.6.16.orig/drivers/usb/core/message.c linux-2.6.16-026test015/drivers/usb/core/message.c +--- linux-2.6.16.orig/drivers/usb/core/message.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/usb/core/message.c 2006-07-04 14:41:36.000000000 +0400 +@@ -1388,11 +1388,13 @@ free_interfaces: + if (dev->state != USB_STATE_ADDRESS) + usb_disable_device (dev, 1); // Skip ep0 + +- i = dev->bus_mA - cp->desc.bMaxPower * 2; +- if (i < 0) +- dev_warn(&dev->dev, "new config #%d exceeds power " +- "limit by %dmA\n", +- configuration, -i); ++ if (cp) { ++ i = dev->bus_mA - cp->desc.bMaxPower * 2; ++ if (i < 0) ++ dev_warn(&dev->dev, "new config #%d exceeds power " ++ "limit by %dmA\n", ++ configuration, -i); ++ } + + if ((ret = usb_control_msg(dev, usb_sndctrlpipe(dev, 0), + USB_REQ_SET_CONFIGURATION, 0, configuration, 0, +diff -upr linux-2.6.16.orig/drivers/usb/host/ehci-sched.c linux-2.6.16-026test015/drivers/usb/host/ehci-sched.c +--- linux-2.6.16.orig/drivers/usb/host/ehci-sched.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/usb/host/ehci-sched.c 2006-07-04 14:41:36.000000000 +0400 +@@ -707,6 +707,7 @@ iso_stream_init ( + } else { + u32 addr; + int think_time; ++ int hs_transfers; + + addr = dev->ttport << 24; + if (!ehci_is_TDI(ehci) +@@ -719,6 +720,7 @@ 
iso_stream_init ( + think_time = dev->tt ? dev->tt->think_time : 0; + stream->tt_usecs = NS_TO_US (think_time + usb_calc_bus_time ( + dev->speed, is_input, 1, maxp)); ++ hs_transfers = max (1u, (maxp + 187) / 188); + if (is_input) { + u32 tmp; + +@@ -727,12 +729,11 @@ iso_stream_init ( + stream->usecs = HS_USECS_ISO (1); + stream->raw_mask = 1; + +- /* pessimistic c-mask */ +- tmp = usb_calc_bus_time (USB_SPEED_FULL, 1, 0, maxp) +- / (125 * 1000); +- stream->raw_mask |= 3 << (tmp + 9); ++ /* c-mask as specified in USB 2.0 11.18.4 3.c */ ++ tmp = (1 << (hs_transfers + 2)) - 1; ++ stream->raw_mask |= tmp << (8 + 2); + } else +- stream->raw_mask = smask_out [maxp / 188]; ++ stream->raw_mask = smask_out [hs_transfers - 1]; + bandwidth = stream->usecs + stream->c_usecs; + bandwidth /= 1 << (interval + 2); + +diff -upr linux-2.6.16.orig/drivers/usb/serial/console.c linux-2.6.16-026test015/drivers/usb/serial/console.c +--- linux-2.6.16.orig/drivers/usb/serial/console.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/usb/serial/console.c 2006-07-04 14:41:36.000000000 +0400 +@@ -54,7 +54,7 @@ static struct console usbcons; + * serial.c code, except that the specifier is "ttyUSB" instead + * of "ttyS". 
+ */ +-static int __init usb_console_setup(struct console *co, char *options) ++static int usb_console_setup(struct console *co, char *options) + { + struct usbcons_info *info = &usbcons_info; + int baud = 9600; +diff -upr linux-2.6.16.orig/drivers/usb/serial/option.c linux-2.6.16-026test015/drivers/usb/serial/option.c +--- linux-2.6.16.orig/drivers/usb/serial/option.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/usb/serial/option.c 2006-07-04 14:41:36.000000000 +0400 +@@ -582,14 +582,14 @@ static void option_setup_urbs(struct usb + portdata = usb_get_serial_port_data(port); + + /* Do indat endpoints first */ +- for (j = 0; j <= N_IN_URB; ++j) { ++ for (j = 0; j < N_IN_URB; ++j) { + portdata->in_urbs[j] = option_setup_urb (serial, + port->bulk_in_endpointAddress, USB_DIR_IN, port, + portdata->in_buffer[j], IN_BUFLEN, option_indat_callback); + } + + /* outdat endpoints */ +- for (j = 0; j <= N_OUT_URB; ++j) { ++ for (j = 0; j < N_OUT_URB; ++j) { + portdata->out_urbs[j] = option_setup_urb (serial, + port->bulk_out_endpointAddress, USB_DIR_OUT, port, + portdata->out_buffer[j], OUT_BUFLEN, option_outdat_callback); +diff -upr linux-2.6.16.orig/drivers/usb/serial/whiteheat.c linux-2.6.16-026test015/drivers/usb/serial/whiteheat.c +--- linux-2.6.16.orig/drivers/usb/serial/whiteheat.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/usb/serial/whiteheat.c 2006-07-04 14:41:36.000000000 +0400 +@@ -388,7 +388,7 @@ static int whiteheat_attach (struct usb_ + if (ret) { + err("%s: Couldn't send command [%d]", serial->type->description, ret); + goto no_firmware; +- } else if (alen != sizeof(command)) { ++ } else if (alen != 2) { + err("%s: Send command incomplete [%d]", serial->type->description, alen); + goto no_firmware; + } +@@ -400,7 +400,7 @@ static int whiteheat_attach (struct usb_ + if (ret) { + err("%s: Couldn't get results [%d]", serial->type->description, ret); + goto no_firmware; +- } else if (alen != sizeof(result)) { 
++ } else if (alen != sizeof(*hw_info) + 1) { + err("%s: Get results incomplete [%d]", serial->type->description, alen); + goto no_firmware; + } else if (result[0] != command[0]) { +diff -upr linux-2.6.16.orig/drivers/usb/storage/Kconfig linux-2.6.16-026test015/drivers/usb/storage/Kconfig +--- linux-2.6.16.orig/drivers/usb/storage/Kconfig 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/usb/storage/Kconfig 2006-07-04 14:41:36.000000000 +0400 +@@ -48,7 +48,8 @@ config USB_STORAGE_FREECOM + + config USB_STORAGE_ISD200 + bool "ISD-200 USB/ATA Bridge support" +- depends on USB_STORAGE && BLK_DEV_IDE ++ depends on USB_STORAGE ++ depends on BLK_DEV_IDE=y || BLK_DEV_IDE=USB_STORAGE + ---help--- + Say Y here if you want to use USB Mass Store devices based + on the In-Systems Design ISD-200 USB/ATA bridge. +diff -upr linux-2.6.16.orig/drivers/video/cfbimgblt.c linux-2.6.16-026test015/drivers/video/cfbimgblt.c +--- linux-2.6.16.orig/drivers/video/cfbimgblt.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/video/cfbimgblt.c 2006-07-04 14:41:36.000000000 +0400 +@@ -169,7 +169,7 @@ static inline void slow_imageblit(const + + while (j--) { + l--; +- color = (*s & 1 << (FB_BIT_NR(l))) ? fgcolor : bgcolor; ++ color = (*s & (1 << l)) ? fgcolor : bgcolor; + val |= FB_SHIFT_HIGH(color, shift); + + /* Did the bitshift spill bits to the next long? 
*/ +diff -upr linux-2.6.16.orig/drivers/video/fbmem.c linux-2.6.16-026test015/drivers/video/fbmem.c +--- linux-2.6.16.orig/drivers/video/fbmem.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/video/fbmem.c 2006-07-04 14:41:36.000000000 +0400 +@@ -669,13 +669,19 @@ fb_write(struct file *file, const char _ + total_size = info->fix.smem_len; + + if (p > total_size) +- return 0; ++ return -EFBIG; + +- if (count >= total_size) ++ if (count > total_size) { ++ err = -EFBIG; + count = total_size; ++ } ++ ++ if (count + p > total_size) { ++ if (!err) ++ err = -ENOSPC; + +- if (count + p > total_size) + count = total_size - p; ++ } + + buffer = kmalloc((count > PAGE_SIZE) ? PAGE_SIZE : count, + GFP_KERNEL); +@@ -717,7 +723,7 @@ fb_write(struct file *file, const char _ + + kfree(buffer); + +- return (err) ? err : cnt; ++ return (cnt) ? cnt : err; + } + + #ifdef CONFIG_KMOD +diff -upr linux-2.6.16.orig/drivers/video/i810/i810_main.c linux-2.6.16-026test015/drivers/video/i810/i810_main.c +--- linux-2.6.16.orig/drivers/video/i810/i810_main.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/drivers/video/i810/i810_main.c 2006-07-04 14:41:36.000000000 +0400 +@@ -1508,7 +1508,7 @@ static int i810fb_cursor(struct fb_info + int size = ((cursor->image.width + 7) >> 3) * + cursor->image.height; + int i; +- u8 *data = kmalloc(64 * 8, GFP_KERNEL); ++ u8 *data = kmalloc(64 * 8, GFP_ATOMIC); + + if (data == NULL) + return -ENOMEM; +diff -upr linux-2.6.16.orig/fs/9p/vfs_inode.c linux-2.6.16-026test015/fs/9p/vfs_inode.c +--- linux-2.6.16.orig/fs/9p/vfs_inode.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/9p/vfs_inode.c 2006-07-04 14:41:36.000000000 +0400 +@@ -614,6 +614,7 @@ static struct dentry *v9fs_vfs_lookup(st + + sb = dir->i_sb; + v9ses = v9fs_inode2v9ses(dir); ++ dentry->d_op = &v9fs_dentry_operations; + dirfid = v9fs_fid_lookup(dentry->d_parent); + + if (!dirfid) { +@@ -681,8 +682,6 @@ static struct dentry 
*v9fs_vfs_lookup(st + goto FreeFcall; + + fid->qid = fcall->params.rstat.stat.qid; +- +- dentry->d_op = &v9fs_dentry_operations; + v9fs_stat2inode(&fcall->params.rstat.stat, inode, inode->i_sb); + + d_add(dentry, inode); +diff -upr linux-2.6.16.orig/fs/Kconfig linux-2.6.16-026test015/fs/Kconfig +--- linux-2.6.16.orig/fs/Kconfig 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/Kconfig 2006-07-04 14:41:39.000000000 +0400 +@@ -418,6 +418,15 @@ config QUOTA + with the quota tools. Probably the quota support is only useful for + multi user systems. If unsure, say N. + ++config QUOTA_COMPAT ++ bool "Compatibility with older quotactl interface" ++ depends on QUOTA ++ help ++ This option enables compatibility layer for older version ++ of quotactl interface with byte granularity (QUOTAON at 0x0100, ++ GETQUOTA at 0x0D00). Interface versions older than that one and ++ with block granularity are still not supported. ++ + config QFMT_V1 + tristate "Old quota format support" + depends on QUOTA +@@ -433,6 +442,38 @@ config QFMT_V2 + This quota format allows using quotas with 32-bit UIDs/GIDs. If you + need this functionality say Y here. + ++config SIM_FS ++ tristate "VPS filesystem" ++ depends on VZ_QUOTA ++ default m ++ help ++ This file system is a part of Virtuozzo. It intoduces a fake ++ superblock and blockdev to VE to hide real device and show ++ statfs results taken from quota. ++ ++config VZ_QUOTA ++ tristate "Virtuozzo Disk Quota support" ++ depends on QUOTA ++ default m ++ help ++ Virtuozzo Disk Quota imposes disk quota on directories with their ++ files and subdirectories in total. Such disk quota is used to ++ account and limit disk usage by Virtuozzo VPS, but also may be used ++ separately. ++ ++config VZ_QUOTA_UNLOAD ++ bool "Unloadable Virtuozzo Disk Quota module" ++ depends on VZ_QUOTA=m ++ default n ++ help ++ Make Virtuozzo Disk Quota module unloadable. ++ Doesn't work reliably now. 
++ ++config VZ_QUOTA_UGID ++ bool "Per-user and per-group quota in Virtuozzo quota partitions" ++ depends on VZ_QUOTA!=n ++ default y ++ + config QUOTACTL + bool + depends on XFS_QUOTA || QUOTA +diff -upr linux-2.6.16.orig/fs/Makefile linux-2.6.16-026test015/fs/Makefile +--- linux-2.6.16.orig/fs/Makefile 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/Makefile 2006-07-04 14:41:39.000000000 +0400 +@@ -39,9 +39,15 @@ obj-$(CONFIG_QUOTA) += dquot.o + obj-$(CONFIG_QFMT_V1) += quota_v1.o + obj-$(CONFIG_QFMT_V2) += quota_v2.o + obj-$(CONFIG_QUOTACTL) += quota.o ++obj-$(CONFIG_VZ_QUOTA) += vzdquota.o ++vzdquota-y += vzdquot.o vzdq_mgmt.o vzdq_ops.o vzdq_tree.o ++vzdquota-$(CONFIG_VZ_QUOTA_UGID) += vzdq_ugid.o ++vzdquota-$(CONFIG_VZ_QUOTA_UGID) += vzdq_file.o + + obj-$(CONFIG_DNOTIFY) += dnotify.o + ++obj-$(CONFIG_SIM_FS) += simfs.o ++ + obj-$(CONFIG_PROC_FS) += proc/ + obj-y += partitions/ + obj-$(CONFIG_SYSFS) += sysfs/ +diff -upr linux-2.6.16.orig/fs/aio.c linux-2.6.16-026test015/fs/aio.c +--- linux-2.6.16.orig/fs/aio.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/aio.c 2006-07-04 14:41:39.000000000 +0400 +@@ -41,13 +41,16 @@ + #endif + + /*------ sysctl variables----*/ +-static DEFINE_SPINLOCK(aio_nr_lock); ++DEFINE_SPINLOCK(aio_nr_lock); + unsigned long aio_nr; /* current system wide number of aio requests */ + unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */ ++EXPORT_SYMBOL_GPL(aio_nr_lock); ++EXPORT_SYMBOL_GPL(aio_nr); + /*----end sysctl variables---*/ + + static kmem_cache_t *kiocb_cachep; +-static kmem_cache_t *kioctx_cachep; ++kmem_cache_t *kioctx_cachep; ++EXPORT_SYMBOL_GPL(kioctx_cachep); + + static struct workqueue_struct *aio_wq; + +@@ -58,7 +61,7 @@ static DECLARE_WORK(fput_work, aio_fput_ + static DEFINE_SPINLOCK(fput_lock); + static LIST_HEAD(fput_head); + +-static void aio_kick_handler(void *); ++void aio_kick_handler(void *); + static void aio_queue_work(struct kioctx *); + + 
/* aio_setup +@@ -293,7 +296,7 @@ static void aio_cancel_all(struct kioctx + spin_unlock_irq(&ctx->ctx_lock); + } + +-static void wait_for_all_aios(struct kioctx *ctx) ++void wait_for_all_aios(struct kioctx *ctx) + { + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); +@@ -310,6 +313,7 @@ static void wait_for_all_aios(struct kio + __set_task_state(tsk, TASK_RUNNING); + remove_wait_queue(&ctx->wait, &wait); + } ++EXPORT_SYMBOL_GPL(wait_for_all_aios); + + /* wait_on_sync_kiocb: + * Waits on the given sync kiocb to complete. +@@ -856,7 +860,7 @@ static inline void aio_run_all_iocbs(str + * space. + * Run on aiod's context. + */ +-static void aio_kick_handler(void *data) ++void aio_kick_handler(void *data) + { + struct kioctx *ctx = data; + mm_segment_t oldfs = get_fs(); +@@ -875,6 +879,7 @@ static void aio_kick_handler(void *data) + if (requeue) + queue_work(aio_wq, &ctx->wq); + } ++EXPORT_SYMBOL_GPL(aio_kick_handler); + + + /* +diff -upr linux-2.6.16.orig/fs/autofs/autofs_i.h linux-2.6.16-026test015/fs/autofs/autofs_i.h +--- linux-2.6.16.orig/fs/autofs/autofs_i.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/autofs/autofs_i.h 2006-07-04 14:41:38.000000000 +0400 +@@ -124,7 +124,7 @@ static inline struct autofs_sb_info *aut + filesystem without "magic".) 
*/ + + static inline int autofs_oz_mode(struct autofs_sb_info *sbi) { +- return sbi->catatonic || process_group(current) == sbi->oz_pgrp; ++ return sbi->catatonic || virt_pgid(current) == sbi->oz_pgrp; + } + + /* Hash operations */ +diff -upr linux-2.6.16.orig/fs/autofs/init.c linux-2.6.16-026test015/fs/autofs/init.c +--- linux-2.6.16.orig/fs/autofs/init.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/autofs/init.c 2006-07-04 14:41:38.000000000 +0400 +@@ -25,6 +25,7 @@ static struct file_system_type autofs_fs + .name = "autofs", + .get_sb = autofs_get_sb, + .kill_sb = kill_anon_super, ++ .fs_flags = FS_VIRTUALIZED, + }; + + static int __init init_autofs_fs(void) +diff -upr linux-2.6.16.orig/fs/autofs/inode.c linux-2.6.16-026test015/fs/autofs/inode.c +--- linux-2.6.16.orig/fs/autofs/inode.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/autofs/inode.c 2006-07-04 14:41:38.000000000 +0400 +@@ -66,7 +66,7 @@ static int parse_options(char *options, + + *uid = current->uid; + *gid = current->gid; +- *pgrp = process_group(current); ++ *pgrp = virt_pgid(current); + + *minproto = *maxproto = AUTOFS_PROTO_VERSION; + +@@ -138,7 +138,7 @@ int autofs_fill_super(struct super_block + sbi->magic = AUTOFS_SBI_MAGIC; + sbi->catatonic = 0; + sbi->exp_timeout = 0; +- sbi->oz_pgrp = process_group(current); ++ sbi->oz_pgrp = virt_pgid(current); + autofs_initialize_hash(&sbi->dirhash); + sbi->queues = NULL; + memset(sbi->symlink_bitmap, 0, sizeof(long)*AUTOFS_SYMLINK_BITMAP_LEN); +diff -upr linux-2.6.16.orig/fs/autofs/root.c linux-2.6.16-026test015/fs/autofs/root.c +--- linux-2.6.16.orig/fs/autofs/root.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/autofs/root.c 2006-07-04 14:41:38.000000000 +0400 +@@ -354,7 +354,7 @@ static int autofs_root_unlink(struct ino + + /* This allows root to remove symlinks */ + lock_kernel(); +- if ( !autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) ) { ++ if ( !autofs_oz_mode(sbi) && 
!capable(CAP_SYS_ADMIN) && !capable(CAP_VE_SYS_ADMIN) ) { + unlock_kernel(); + return -EACCES; + } +@@ -541,7 +541,7 @@ static int autofs_root_ioctl(struct inod + _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT ) + return -ENOTTY; + +- if ( !autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) ) ++ if ( !autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_SYS_ADMIN) ) + return -EPERM; + + switch(cmd) { +diff -upr linux-2.6.16.orig/fs/autofs4/autofs_i.h linux-2.6.16-026test015/fs/autofs4/autofs_i.h +--- linux-2.6.16.orig/fs/autofs4/autofs_i.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/autofs4/autofs_i.h 2006-07-04 14:41:38.000000000 +0400 +@@ -122,7 +122,7 @@ static inline struct autofs_info *autofs + filesystem without "magic".) */ + + static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) { +- return sbi->catatonic || process_group(current) == sbi->oz_pgrp; ++ return sbi->catatonic || virt_pgid(current) == sbi->oz_pgrp; + } + + /* Does a dentry have some pending activity? 
*/ +diff -upr linux-2.6.16.orig/fs/autofs4/init.c linux-2.6.16-026test015/fs/autofs4/init.c +--- linux-2.6.16.orig/fs/autofs4/init.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/autofs4/init.c 2006-07-04 14:41:38.000000000 +0400 +@@ -25,6 +25,7 @@ static struct file_system_type autofs_fs + .name = "autofs", + .get_sb = autofs_get_sb, + .kill_sb = kill_anon_super, ++ .fs_flags = FS_VIRTUALIZED, + }; + + static int __init init_autofs4_fs(void) +diff -upr linux-2.6.16.orig/fs/autofs4/inode.c linux-2.6.16-026test015/fs/autofs4/inode.c +--- linux-2.6.16.orig/fs/autofs4/inode.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/autofs4/inode.c 2006-07-04 14:41:38.000000000 +0400 +@@ -179,7 +179,7 @@ static int parse_options(char *options, + + *uid = current->uid; + *gid = current->gid; +- *pgrp = process_group(current); ++ *pgrp = virt_pgid(current); + + *minproto = AUTOFS_MIN_PROTO_VERSION; + *maxproto = AUTOFS_MAX_PROTO_VERSION; +@@ -265,7 +265,7 @@ int autofs4_fill_super(struct super_bloc + sbi->root = NULL; + sbi->catatonic = 0; + sbi->exp_timeout = 0; +- sbi->oz_pgrp = process_group(current); ++ sbi->oz_pgrp = virt_pgid(current); + sbi->sb = s; + sbi->version = 0; + sbi->sub_version = 0; +diff -upr linux-2.6.16.orig/fs/autofs4/root.c linux-2.6.16-026test015/fs/autofs4/root.c +--- linux-2.6.16.orig/fs/autofs4/root.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/autofs4/root.c 2006-07-04 14:41:38.000000000 +0400 +@@ -592,7 +592,7 @@ static int autofs4_dir_unlink(struct ino + struct autofs_info *ino = autofs4_dentry_ino(dentry); + + /* This allows root to remove symlinks */ +- if ( !autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) ) ++ if ( !autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_SYS_ADMIN) ) + return -EACCES; + + dput(ino->dentry); +@@ -784,7 +784,7 @@ static int autofs4_root_ioctl(struct ino + _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT ) + return -ENOTTY; + +- 
if ( !autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) ) ++ if ( !autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_SYS_ADMIN) ) + return -EPERM; + + switch(cmd) { +diff -upr linux-2.6.16.orig/fs/binfmt_aout.c linux-2.6.16-026test015/fs/binfmt_aout.c +--- linux-2.6.16.orig/fs/binfmt_aout.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/binfmt_aout.c 2006-07-04 14:41:39.000000000 +0400 +@@ -446,9 +446,11 @@ beyond_if: + #endif + start_thread(regs, ex.a_entry, current->mm->start_stack); + if (unlikely(current->ptrace & PT_PTRACED)) { +- if (current->ptrace & PT_TRACE_EXEC) ++ if (current->ptrace & PT_TRACE_EXEC) { ++ set_pn_state(current, PN_STOP_EXEC); + ptrace_notify ((PTRACE_EVENT_EXEC << 8) | SIGTRAP); +- else ++ clear_pn_state(current); ++ } else + send_sig(SIGTRAP, current, 0); + } + return 0; +diff -upr linux-2.6.16.orig/fs/binfmt_elf.c linux-2.6.16-026test015/fs/binfmt_elf.c +--- linux-2.6.16.orig/fs/binfmt_elf.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/binfmt_elf.c 2006-07-04 14:41:39.000000000 +0400 +@@ -361,7 +361,7 @@ static unsigned long load_elf_interp(str + eppnt = elf_phdata; + for (i=0; i<interp_elf_ex->e_phnum; i++, eppnt++) { + if (eppnt->p_type == PT_LOAD) { +- int elf_type = MAP_PRIVATE | MAP_DENYWRITE; ++ int elf_type = MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECPRIO; + int elf_prot = 0; + unsigned long vaddr = 0; + unsigned long k, map_addr; +@@ -669,7 +669,7 @@ static int load_elf_binary(struct linux_ + */ + SET_PERSONALITY(loc->elf_ex, ibcs2_interpreter); + +- interpreter = open_exec(elf_interpreter); ++ interpreter = open_exec(elf_interpreter, NULL); + retval = PTR_ERR(interpreter); + if (IS_ERR(interpreter)) + goto out_free_interp; +@@ -834,7 +834,7 @@ static int load_elf_binary(struct linux_ + if (elf_ppnt->p_flags & PF_W) elf_prot |= PROT_WRITE; + if (elf_ppnt->p_flags & PF_X) elf_prot |= PROT_EXEC; + +- elf_flags = MAP_PRIVATE|MAP_DENYWRITE|MAP_EXECUTABLE; ++ elf_flags = 
MAP_PRIVATE|MAP_DENYWRITE|MAP_EXECUTABLE|MAP_EXECPRIO; + + vaddr = elf_ppnt->p_vaddr; + if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) { +@@ -1000,9 +1000,11 @@ static int load_elf_binary(struct linux_ + + start_thread(regs, elf_entry, bprm->p); + if (unlikely(current->ptrace & PT_PTRACED)) { +- if (current->ptrace & PT_TRACE_EXEC) ++ if (current->ptrace & PT_TRACE_EXEC) { ++ set_pn_state(current, PN_STOP_EXEC); + ptrace_notify ((PTRACE_EVENT_EXEC << 8) | SIGTRAP); +- else ++ clear_pn_state(current); ++ } else + send_sig(SIGTRAP, current, 0); + } + retval = 0; +@@ -1022,8 +1024,13 @@ out_free_file: + sys_close(elf_exec_fileno); + out_free_fh: + if (files) { +- put_files_struct(current->files); ++ struct files_struct *old; ++ ++ old = current->files; ++ task_lock(current); + current->files = files; ++ task_unlock(current); ++ put_files_struct(old); + } + out_free_ph: + kfree(elf_phdata); +@@ -1281,10 +1288,10 @@ static void fill_prstatus(struct elf_prs + prstatus->pr_info.si_signo = prstatus->pr_cursig = signr; + prstatus->pr_sigpend = p->pending.signal.sig[0]; + prstatus->pr_sighold = p->blocked.sig[0]; +- prstatus->pr_pid = p->pid; +- prstatus->pr_ppid = p->parent->pid; +- prstatus->pr_pgrp = process_group(p); +- prstatus->pr_sid = p->signal->session; ++ prstatus->pr_pid = virt_pid(p); ++ prstatus->pr_ppid = virt_pid(p->parent); ++ prstatus->pr_pgrp = virt_pgid(p); ++ prstatus->pr_sid = virt_sid(p); + if (thread_group_leader(p)) { + /* + * This is the record for the group leader. Add in the +@@ -1327,10 +1334,10 @@ static int fill_psinfo(struct elf_prpsin + psinfo->pr_psargs[i] = ' '; + psinfo->pr_psargs[len] = 0; + +- psinfo->pr_pid = p->pid; +- psinfo->pr_ppid = p->parent->pid; +- psinfo->pr_pgrp = process_group(p); +- psinfo->pr_sid = p->signal->session; ++ psinfo->pr_pid = virt_pid(p); ++ psinfo->pr_ppid = virt_pid(p->parent); ++ psinfo->pr_pgrp = virt_pgid(p); ++ psinfo->pr_sid = virt_sid(p); + + i = p->state ? 
ffz(~p->state) + 1 : 0; + psinfo->pr_state = i; +@@ -1463,7 +1470,7 @@ static int elf_core_dump(long signr, str + if (signr) { + struct elf_thread_status *tmp; + read_lock(&tasklist_lock); +- do_each_thread(g,p) ++ do_each_thread_ve(g,p) + if (current->mm == p->mm && current != p) { + tmp = kmalloc(sizeof(*tmp), GFP_ATOMIC); + if (!tmp) { +@@ -1475,7 +1482,7 @@ static int elf_core_dump(long signr, str + tmp->thread = p; + list_add(&tmp->list, &thread_list); + } +- while_each_thread(g,p); ++ while_each_thread_ve(g,p); + read_unlock(&tasklist_lock); + list_for_each(t, &thread_list) { + struct elf_thread_status *tmp; +diff -upr linux-2.6.16.orig/fs/binfmt_elf_fdpic.c linux-2.6.16-026test015/fs/binfmt_elf_fdpic.c +--- linux-2.6.16.orig/fs/binfmt_elf_fdpic.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/binfmt_elf_fdpic.c 2006-07-04 14:41:37.000000000 +0400 +@@ -205,7 +205,7 @@ static int load_elf_fdpic_binary(struct + kdebug("Using ELF interpreter %s", interpreter_name); + + /* replace the program with the interpreter */ +- interpreter = open_exec(interpreter_name); ++ interpreter = open_exec(interpreter_name, bprm); + retval = PTR_ERR(interpreter); + if (IS_ERR(interpreter)) { + interpreter = NULL; +diff -upr linux-2.6.16.orig/fs/binfmt_em86.c linux-2.6.16-026test015/fs/binfmt_em86.c +--- linux-2.6.16.orig/fs/binfmt_em86.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/binfmt_em86.c 2006-07-04 14:41:37.000000000 +0400 +@@ -82,7 +82,7 @@ static int load_em86(struct linux_binprm + * Note that we use open_exec() as the name is now in kernel + * space, and we don't need to copy it. 
+ */ +- file = open_exec(interp); ++ file = open_exec(interp, bprm); + if (IS_ERR(file)) + return PTR_ERR(file); + +diff -upr linux-2.6.16.orig/fs/binfmt_flat.c linux-2.6.16-026test015/fs/binfmt_flat.c +--- linux-2.6.16.orig/fs/binfmt_flat.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/binfmt_flat.c 2006-07-04 14:41:37.000000000 +0400 +@@ -774,7 +774,7 @@ static int load_flat_shared_library(int + + /* Open the file up */ + bprm.filename = buf; +- bprm.file = open_exec(bprm.filename); ++ bprm.file = open_exec(bprm.filename, bprm); + res = PTR_ERR(bprm.file); + if (IS_ERR(bprm.file)) + return res; +diff -upr linux-2.6.16.orig/fs/binfmt_misc.c linux-2.6.16-026test015/fs/binfmt_misc.c +--- linux-2.6.16.orig/fs/binfmt_misc.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/binfmt_misc.c 2006-07-04 14:41:37.000000000 +0400 +@@ -179,7 +179,7 @@ static int load_misc_binary(struct linux + + bprm->interp = iname; /* for binfmt_script */ + +- interp_file = open_exec (iname); ++ interp_file = open_exec (iname, bprm); + retval = PTR_ERR (interp_file); + if (IS_ERR (interp_file)) + goto _error; +@@ -216,8 +216,13 @@ _error: + bprm->interp_data = 0; + _unshare: + if (files) { +- put_files_struct(current->files); ++ struct files_struct *old; ++ ++ old = current->files; ++ task_lock(current); + current->files = files; ++ task_unlock(current); ++ put_files_struct(old); + } + goto _ret; + } +diff -upr linux-2.6.16.orig/fs/binfmt_script.c linux-2.6.16-026test015/fs/binfmt_script.c +--- linux-2.6.16.orig/fs/binfmt_script.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/binfmt_script.c 2006-07-04 14:41:37.000000000 +0400 +@@ -85,7 +85,7 @@ static int load_script(struct linux_binp + /* + * OK, now restart the process with the interpreter's dentry. 
+ */ +- file = open_exec(interp); ++ file = open_exec(interp, bprm); + if (IS_ERR(file)) + return PTR_ERR(file); + +diff -upr linux-2.6.16.orig/fs/block_dev.c linux-2.6.16-026test015/fs/block_dev.c +--- linux-2.6.16.orig/fs/block_dev.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/block_dev.c 2006-07-04 14:41:37.000000000 +0400 +@@ -561,9 +561,16 @@ static int do_open(struct block_device * + { + struct module *owner = NULL; + struct gendisk *disk; +- int ret = -ENXIO; ++ int ret; + int part; + ++#ifdef CONFIG_VE ++ ret = get_device_perms_ve(S_IFBLK, bdev->bd_dev, ++ file->f_mode&(FMODE_READ|FMODE_WRITE)); ++ if (ret) ++ return ret; ++#endif ++ ret = -ENXIO; + file->f_mapping = bdev->bd_inode->i_mapping; + lock_kernel(); + disk = get_gendisk(bdev->bd_dev, &part); +@@ -832,7 +839,7 @@ EXPORT_SYMBOL(ioctl_by_bdev); + * namespace if possible and return it. Return ERR_PTR(error) + * otherwise. + */ +-struct block_device *lookup_bdev(const char *path) ++struct block_device *lookup_bdev(const char *path, int mode) + { + struct block_device *bdev; + struct inode *inode; +@@ -850,6 +857,11 @@ struct block_device *lookup_bdev(const c + error = -ENOTBLK; + if (!S_ISBLK(inode->i_mode)) + goto fail; ++#ifdef CONFIG_VE ++ error = get_device_perms_ve(S_IFBLK, inode->i_rdev, mode); ++ if (error) ++ goto fail; ++#endif + error = -EACCES; + if (nd.mnt->mnt_flags & MNT_NODEV) + goto fail; +@@ -881,12 +893,13 @@ struct block_device *open_bdev_excl(cons + mode_t mode = FMODE_READ; + int error = 0; + +- bdev = lookup_bdev(path); ++ if (!(flags & MS_RDONLY)) ++ mode |= FMODE_WRITE; ++ ++ bdev = lookup_bdev(path, mode); + if (IS_ERR(bdev)) + return bdev; + +- if (!(flags & MS_RDONLY)) +- mode |= FMODE_WRITE; + error = blkdev_get(bdev, mode, 0); + if (error) + return ERR_PTR(error); +diff -upr linux-2.6.16.orig/fs/buffer.c linux-2.6.16-026test015/fs/buffer.c +--- linux-2.6.16.orig/fs/buffer.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/buffer.c 
2006-07-04 14:41:37.000000000 +0400 +@@ -1942,8 +1942,9 @@ static int __block_prepare_write(struct + if (err) + break; + if (buffer_new(bh)) { +- unmap_underlying_metadata(bh->b_bdev, +- bh->b_blocknr); ++ if (buffer_mapped(bh)) ++ unmap_underlying_metadata(bh->b_bdev, ++ bh->b_blocknr); + if (PageUptodate(page)) { + set_buffer_uptodate(bh); + continue; +diff -upr linux-2.6.16.orig/fs/char_dev.c linux-2.6.16-026test015/fs/char_dev.c +--- linux-2.6.16.orig/fs/char_dev.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/char_dev.c 2006-07-04 14:41:37.000000000 +0400 +@@ -15,6 +15,7 @@ + #include <linux/module.h> + #include <linux/smp_lock.h> + #include <linux/devfs_fs_kernel.h> ++#include <linux/seq_file.h> + + #include <linux/kobject.h> + #include <linux/kobj_map.h> +@@ -26,8 +27,6 @@ + + static struct kobj_map *cdev_map; + +-#define MAX_PROBE_HASH 255 /* random */ +- + static DECLARE_MUTEX(chrdevs_lock); + + static struct char_device_struct { +@@ -38,93 +37,29 @@ static struct char_device_struct { + char name[64]; + struct file_operations *fops; + struct cdev *cdev; /* will die */ +-} *chrdevs[MAX_PROBE_HASH]; ++} *chrdevs[CHRDEV_MAJOR_HASH_SIZE]; + + /* index in the above */ + static inline int major_to_index(int major) + { +- return major % MAX_PROBE_HASH; +-} +- +-struct chrdev_info { +- int index; +- struct char_device_struct *cd; +-}; +- +-void *get_next_chrdev(void *dev) +-{ +- struct chrdev_info *info; +- +- if (dev == NULL) { +- info = kmalloc(sizeof(*info), GFP_KERNEL); +- if (!info) +- goto out; +- info->index=0; +- info->cd = chrdevs[info->index]; +- if (info->cd) +- goto out; +- } else { +- info = dev; +- } +- +- while (info->index < ARRAY_SIZE(chrdevs)) { +- if (info->cd) +- info->cd = info->cd->next; +- if (info->cd) +- goto out; +- /* +- * No devices on this chain, move to the next +- */ +- info->index++; +- info->cd = (info->index < ARRAY_SIZE(chrdevs)) ? 
+- chrdevs[info->index] : NULL; +- if (info->cd) +- goto out; +- } +- +-out: +- return info; +-} +- +-void *acquire_chrdev_list(void) +-{ +- down(&chrdevs_lock); +- return get_next_chrdev(NULL); +-} +- +-void release_chrdev_list(void *dev) +-{ +- up(&chrdevs_lock); +- kfree(dev); ++ return major % CHRDEV_MAJOR_HASH_SIZE; + } + ++#ifdef CONFIG_PROC_FS + +-int count_chrdev_list(void) ++void chrdev_show(struct seq_file *f, off_t offset) + { + struct char_device_struct *cd; +- int i, count; + +- count = 0; +- +- for (i = 0; i < ARRAY_SIZE(chrdevs) ; i++) { +- for (cd = chrdevs[i]; cd; cd = cd->next) +- count++; ++ if (offset < CHRDEV_MAJOR_HASH_SIZE) { ++ down(&chrdevs_lock); ++ for (cd = chrdevs[offset]; cd; cd = cd->next) ++ seq_printf(f, "%3d %s\n", cd->major, cd->name); ++ up(&chrdevs_lock); + } +- +- return count; + } + +-int get_chrdev_info(void *dev, int *major, char **name) +-{ +- struct chrdev_info *info = dev; +- +- if (info->cd == NULL) +- return 1; +- +- *major = info->cd->major; +- *name = info->cd->name; +- return 0; +-} ++#endif /* CONFIG_PROC_FS */ + + /* + * Register a single major with a specified minor range. 
+@@ -342,6 +277,13 @@ int chrdev_open(struct inode * inode, st + struct cdev *new = NULL; + int ret = 0; + ++#ifdef CONFIG_VE ++ ret = get_device_perms_ve(S_IFCHR, inode->i_rdev, ++ filp->f_mode&(FMODE_READ|FMODE_WRITE)); ++ if (ret) ++ return ret; ++#endif ++ + spin_lock(&cdev_lock); + p = inode->i_cdev; + if (!p) { +diff -upr linux-2.6.16.orig/fs/cifs/cifsencrypt.c linux-2.6.16-026test015/fs/cifs/cifsencrypt.c +--- linux-2.6.16.orig/fs/cifs/cifsencrypt.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/cifs/cifsencrypt.c 2006-07-04 14:41:36.000000000 +0400 +@@ -56,9 +56,6 @@ int cifs_sign_smb(struct smb_hdr * cifs_ + int rc = 0; + char smb_signature[20]; + +- /* BB remember to initialize sequence number elsewhere and initialize mac_signing key elsewhere BB */ +- /* BB remember to add code to save expected sequence number in midQ entry BB */ +- + if((cifs_pdu == NULL) || (server == NULL)) + return -EINVAL; + +@@ -85,20 +82,33 @@ int cifs_sign_smb(struct smb_hdr * cifs_ + static int cifs_calc_signature2(const struct kvec * iov, int n_vec, + const char * key, char * signature) + { +- struct MD5Context context; +- +- if((iov == NULL) || (signature == NULL)) +- return -EINVAL; ++ struct MD5Context context; ++ int i; + +- MD5Init(&context); +- MD5Update(&context,key,CIFS_SESSION_KEY_SIZE+16); ++ if((iov == NULL) || (signature == NULL)) ++ return -EINVAL; + +-/* MD5Update(&context,cifs_pdu->Protocol,cifs_pdu->smb_buf_length); */ /* BB FIXME BB */ ++ MD5Init(&context); ++ MD5Update(&context,key,CIFS_SESSION_KEY_SIZE+16); ++ for(i=0;i<n_vec;i++) { ++ if(iov[i].iov_base == NULL) { ++ cERROR(1,("null iovec entry")); ++ return -EIO; ++ } else if(iov[i].iov_len == 0) ++ break; /* bail out if we are sent nothing to sign */ ++ /* The first entry includes a length field (which does not get ++ signed that occupies the first 4 bytes before the header */ ++ if(i==0) { ++ if (iov[0].iov_len <= 8 ) /* cmd field at offset 9 */ ++ break; /* nothing to sign or corrupt 
header */ ++ MD5Update(&context,iov[0].iov_base+4, iov[0].iov_len-4); ++ } else ++ MD5Update(&context,iov[i].iov_base, iov[i].iov_len); ++ } + +- MD5Final(signature,&context); ++ MD5Final(signature,&context); + +- return -EOPNOTSUPP; +-/* return 0; */ ++ return 0; + } + + +diff -upr linux-2.6.16.orig/fs/cifs/cifsfs.c linux-2.6.16-026test015/fs/cifs/cifsfs.c +--- linux-2.6.16.orig/fs/cifs/cifsfs.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/cifs/cifsfs.c 2006-07-04 14:41:37.000000000 +0400 +@@ -220,7 +220,8 @@ cifs_statfs(struct super_block *sb, stru + longer available? */ + } + +-static int cifs_permission(struct inode * inode, int mask, struct nameidata *nd) ++static int cifs_permission(struct inode * inode, int mask, struct nameidata *nd, ++ struct exec_perm *perm) + { + struct cifs_sb_info *cifs_sb; + +@@ -232,7 +233,7 @@ static int cifs_permission(struct inode + on the client (above and beyond ACL on servers) for + servers which do not support setting and viewing mode bits, + so allowing client to check permissions is useful */ +- return generic_permission(inode, mask, NULL); ++ return generic_permission(inode, mask, NULL, perm); + } + + static kmem_cache_t *cifs_inode_cachep; +diff -upr linux-2.6.16.orig/fs/cifs/dir.c linux-2.6.16-026test015/fs/cifs/dir.c +--- linux-2.6.16.orig/fs/cifs/dir.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/cifs/dir.c 2006-07-04 14:41:36.000000000 +0400 +@@ -441,6 +441,20 @@ cifs_lookup(struct inode *parent_dir_ino + cifs_sb = CIFS_SB(parent_dir_inode->i_sb); + pTcon = cifs_sb->tcon; + ++ /* ++ * Don't allow the separator character in a path component. ++ * The VFS will not allow "/", but "\" is allowed by posix. 
++ */ ++ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)) { ++ int i; ++ for (i = 0; i < direntry->d_name.len; i++) ++ if (direntry->d_name.name[i] == '\\') { ++ cFYI(1, ("Invalid file name")); ++ FreeXid(xid); ++ return ERR_PTR(-EINVAL); ++ } ++ } ++ + /* can not grab the rename sem here since it would + deadlock in the cases (beginning of sys_rename itself) + in which we already have the sb rename sem */ +diff -upr linux-2.6.16.orig/fs/coda/dir.c linux-2.6.16-026test015/fs/coda/dir.c +--- linux-2.6.16.orig/fs/coda/dir.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/coda/dir.c 2006-07-04 14:41:37.000000000 +0400 +@@ -151,7 +151,8 @@ exit: + } + + +-int coda_permission(struct inode *inode, int mask, struct nameidata *nd) ++int coda_permission(struct inode *inode, int mask, struct nameidata *nd, ++ struct exec_perm *perm) + { + int error = 0; + +diff -upr linux-2.6.16.orig/fs/coda/pioctl.c linux-2.6.16-026test015/fs/coda/pioctl.c +--- linux-2.6.16.orig/fs/coda/pioctl.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/coda/pioctl.c 2006-07-04 14:41:37.000000000 +0400 +@@ -25,7 +25,7 @@ + + /* pioctl ops */ + static int coda_ioctl_permission(struct inode *inode, int mask, +- struct nameidata *nd); ++ struct nameidata *nd, struct exec_perm *perm); + static int coda_pioctl(struct inode * inode, struct file * filp, + unsigned int cmd, unsigned long user_data); + +@@ -43,7 +43,7 @@ struct file_operations coda_ioctl_operat + + /* the coda pioctl inode ops */ + static int coda_ioctl_permission(struct inode *inode, int mask, +- struct nameidata *nd) ++ struct nameidata *nd, struct exec_perm *perm) + { + return 0; + } +diff -upr linux-2.6.16.orig/fs/compat.c linux-2.6.16-026test015/fs/compat.c +--- linux-2.6.16.orig/fs/compat.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/compat.c 2006-07-04 14:41:39.000000000 +0400 +@@ -197,6 +197,8 @@ asmlinkage long compat_sys_statfs(const + struct kstatfs tmp; + error 
= vfs_statfs(nd.dentry->d_inode->i_sb, &tmp); + if (!error) ++ error = faudit_statfs(nd.mnt->mnt_sb, &tmp); ++ if (!error) + error = put_compat_statfs(buf, &tmp); + path_release(&nd); + } +@@ -215,6 +217,8 @@ asmlinkage long compat_sys_fstatfs(unsig + goto out; + error = vfs_statfs(file->f_dentry->d_inode->i_sb, &tmp); + if (!error) ++ error = faudit_statfs(file->f_vfsmnt->mnt_sb, &tmp); ++ if (!error) + error = put_compat_statfs(buf, &tmp); + fput(file); + out: +@@ -265,6 +269,8 @@ asmlinkage long compat_sys_statfs64(cons + struct kstatfs tmp; + error = vfs_statfs(nd.dentry->d_inode->i_sb, &tmp); + if (!error) ++ error = faudit_statfs(nd.mnt->mnt_sb, &tmp); ++ if (!error) + error = put_compat_statfs64(buf, &tmp); + path_release(&nd); + } +@@ -286,6 +292,8 @@ asmlinkage long compat_sys_fstatfs64(uns + goto out; + error = vfs_statfs(file->f_dentry->d_inode->i_sb, &tmp); + if (!error) ++ error = faudit_statfs(file->f_vfsmnt->mnt_sb, &tmp); ++ if (!error) + error = put_compat_statfs64(buf, &tmp); + fput(file); + out: +@@ -1215,6 +1223,10 @@ static ssize_t compat_do_readv_writev(in + if (ret < 0) + goto out; + ++ ret = security_file_permission(file, type == READ ? 
MAY_READ:MAY_WRITE); ++ if (ret) ++ goto out; ++ + fnv = NULL; + if (type == READ) { + fn = file->f_op->read; +@@ -1479,7 +1491,7 @@ int compat_do_execve(char * filename, + goto out_ret; + memset(bprm, 0, sizeof(*bprm)); + +- file = open_exec(filename); ++ file = open_exec(filename, bprm); + retval = PTR_ERR(file); + if (IS_ERR(file)) + goto out_kfree; +@@ -1897,7 +1909,7 @@ asmlinkage long compat_sys_ppoll(struct + } + + if (sigmask) { +- if (sigsetsize |= sizeof(compat_sigset_t)) ++ if (sigsetsize != sizeof(compat_sigset_t)) + return -EINVAL; + if (copy_from_user(&ss32, sigmask, sizeof(ss32))) + return -EFAULT; +diff -upr linux-2.6.16.orig/fs/dcache.c linux-2.6.16-026test015/fs/dcache.c +--- linux-2.6.16.orig/fs/dcache.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/dcache.c 2006-07-04 14:41:38.000000000 +0400 +@@ -28,11 +28,16 @@ + #include <linux/module.h> + #include <linux/mount.h> + #include <linux/file.h> ++#include <linux/namei.h> + #include <asm/uaccess.h> + #include <linux/security.h> + #include <linux/seqlock.h> + #include <linux/swap.h> + #include <linux/bootmem.h> ++#include <linux/kernel_stat.h> ++#include <net/inet_sock.h> ++ ++#include <ub/ub_dcache.h> + + /* #define DCACHE_DEBUG 1 */ + +@@ -44,7 +49,7 @@ static seqlock_t rename_lock __cacheline + + EXPORT_SYMBOL(dcache_lock); + +-static kmem_cache_t *dentry_cache; ++kmem_cache_t *dentry_cache; + + #define DNAME_INLINE_LEN (sizeof(struct dentry)-offsetof(struct dentry,d_iname)) + +@@ -143,11 +148,8 @@ static void dentry_iput(struct dentry * + * no dcache lock, please. 
+ */ + +-void dput(struct dentry *dentry) ++static void dput_recursive(struct dentry *dentry) + { +- if (!dentry) +- return; +- + repeat: + if (atomic_read(&dentry->d_count) == 1) + might_sleep(); +@@ -206,6 +208,17 @@ kill_it: { + } + } + ++void dput(struct dentry *dentry) ++{ ++ if (!dentry) ++ return; ++ ++ spin_lock(&dcache_lock); ++ ub_dentry_uncharge(dentry); ++ spin_unlock(&dcache_lock); ++ dput_recursive(dentry); ++} ++ + /** + * d_invalidate - invalidate a dentry + * @dentry: dentry to invalidate +@@ -272,6 +285,8 @@ static inline struct dentry * __dget_loc + dentry_stat.nr_unused--; + list_del_init(&dentry->d_lru); + } ++ ++ ub_dentry_charge_nofail(dentry); + return dentry; + } + +@@ -373,13 +388,19 @@ static inline void prune_one_dentry(stru + parent = dentry->d_parent; + d_free(dentry); + if (parent != dentry) +- dput(parent); ++ /* ++ * dentry is not in use, only child (not outside) ++ * references change, so parent->d_inuse does not change ++ */ ++ dput_recursive(parent); + spin_lock(&dcache_lock); + } + + /** + * prune_dcache - shrink the dcache + * @count: number of entries to try and free ++ * @sb: if given, ignore dentries for other superblocks ++ * which are being unmounted. + * + * Shrink the dcache. This is done when we need + * more memory, or simply when we need to unmount +@@ -390,16 +411,29 @@ static inline void prune_one_dentry(stru + * all the dentries are in use. 
+ */ + +-static void prune_dcache(int count) ++static void prune_dcache(int count, struct super_block *sb) + { + spin_lock(&dcache_lock); + for (; count ; count--) { + struct dentry *dentry; + struct list_head *tmp; ++ struct rw_semaphore *s_umount; + + cond_resched_lock(&dcache_lock); + + tmp = dentry_unused.prev; ++ if (unlikely(sb)) { ++ /* Try to find a dentry for this sb, but don't try ++ * too hard, if they aren't near the tail they will ++ * be moved down again soon ++ */ ++ int skip = count; ++ while (skip && tmp != &dentry_unused && ++ list_entry(tmp, struct dentry, d_lru)->d_sb != sb) { ++ skip--; ++ tmp = tmp->prev; ++ } ++ } + if (tmp == &dentry_unused) + break; + list_del_init(tmp); +@@ -425,7 +459,45 @@ static void prune_dcache(int count) + spin_unlock(&dentry->d_lock); + continue; + } +- prune_one_dentry(dentry); ++ /* ++ * If the dentry is not DCACHED_REFERENCED, it is time ++ * to remove it from the dcache, provided the super block is ++ * NULL (which means we are trying to reclaim memory) ++ * or this dentry belongs to the same super block that ++ * we want to shrink. ++ */ ++ /* ++ * If this dentry is for "my" filesystem, then I can prune it ++ * without taking the s_umount lock (I already hold it). ++ */ ++ if (sb && dentry->d_sb == sb) { ++ prune_one_dentry(dentry); ++ continue; ++ } ++ /* ++ * ...otherwise we need to be sure this filesystem isn't being ++ * unmounted, otherwise we could race with ++ * generic_shutdown_super(), and end up holding a reference to ++ * an inode while the filesystem is unmounted. ++ * So we try to get s_umount, and make sure s_root isn't NULL. ++ * (Take a local copy of s_umount to avoid a use-after-free of ++ * `dentry'). 
++ */ ++ s_umount = &dentry->d_sb->s_umount; ++ if (down_read_trylock(s_umount)) { ++ if (dentry->d_sb->s_root != NULL) { ++ prune_one_dentry(dentry); ++ up_read(s_umount); ++ continue; ++ } ++ up_read(s_umount); ++ } ++ spin_unlock(&dentry->d_lock); ++ /* Cannot remove the first dentry, and it isn't appropriate ++ * to move it to the head of the list, so give up, and try ++ * later ++ */ ++ break; + } + spin_unlock(&dcache_lock); + } +@@ -486,6 +558,7 @@ repeat: + continue; + } + prune_one_dentry(dentry); ++ cond_resched_lock(&dcache_lock); + goto repeat; + } + spin_unlock(&dcache_lock); +@@ -635,7 +708,7 @@ void shrink_dcache_parent(struct dentry + int found; + + while ((found = select_parent(parent)) != 0) +- prune_dcache(found); ++ prune_dcache(found, parent->d_sb); + } + + /** +@@ -648,9 +721,10 @@ void shrink_dcache_parent(struct dentry + * done under dcache_lock. + * + */ +-void shrink_dcache_anon(struct hlist_head *head) ++void shrink_dcache_anon(struct super_block *sb) + { + struct hlist_node *lp; ++ struct hlist_head *head = &sb->s_anon; + int found; + do { + found = 0; +@@ -673,7 +747,7 @@ void shrink_dcache_anon(struct hlist_hea + } + } + spin_unlock(&dcache_lock); +- prune_dcache(found); ++ prune_dcache(found, sb); + } while(found); + } + +@@ -691,12 +765,18 @@ void shrink_dcache_anon(struct hlist_hea + */ + static int shrink_dcache_memory(int nr, gfp_t gfp_mask) + { ++ int res = -1; ++ ++ KSTAT_PERF_ENTER(shrink_dcache) + if (nr) { + if (!(gfp_mask & __GFP_FS)) +- return -1; +- prune_dcache(nr); ++ goto out; ++ prune_dcache(nr, NULL); + } +- return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; ++ res = (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; ++out: ++ KSTAT_PERF_LEAVE(shrink_dcache) ++ return res; + } + + /** +@@ -716,19 +796,20 @@ struct dentry *d_alloc(struct dentry * p + + dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL); + if (!dentry) +- return NULL; ++ goto err_alloc; + + if (name->len > DNAME_INLINE_LEN-1) { + 
dname = kmalloc(name->len + 1, GFP_KERNEL); +- if (!dname) { +- kmem_cache_free(dentry_cache, dentry); +- return NULL; +- } ++ if (!dname) ++ goto err_name; + } else { + dname = dentry->d_iname; + } + dentry->d_name.name = dname; + ++ if (ub_dentry_alloc(dentry)) ++ goto err_charge; ++ + dentry->d_name.len = name->len; + dentry->d_name.hash = name->hash; + memcpy(dname, name->name, name->len); +@@ -759,12 +840,23 @@ struct dentry *d_alloc(struct dentry * p + } + + spin_lock(&dcache_lock); +- if (parent) ++ if (parent) { + list_add(&dentry->d_u.d_child, &parent->d_subdirs); ++ if (parent->d_flags & DCACHE_VIRTUAL) ++ dentry->d_flags |= DCACHE_VIRTUAL; ++ } + dentry_stat.nr_dentry++; + spin_unlock(&dcache_lock); + + return dentry; ++ ++err_charge: ++ if (name->len > DNAME_INLINE_LEN - 1) ++ kfree(dname); ++err_name: ++ kmem_cache_free(dentry_cache, dentry); ++err_alloc: ++ return NULL; + } + + struct dentry *d_alloc_name(struct dentry *parent, const char *name) +@@ -1048,7 +1140,6 @@ struct dentry * __d_lookup(struct dentry + unsigned int hash = name->hash; + const unsigned char *str = name->name; + struct hlist_head *head = d_hash(parent,hash); +- struct dentry *found = NULL; + struct hlist_node *node; + struct dentry *dentry; + +@@ -1089,7 +1180,7 @@ struct dentry * __d_lookup(struct dentry + + if (!d_unhashed(dentry)) { + atomic_inc(&dentry->d_count); +- found = dentry; ++ goto found; + } + spin_unlock(&dentry->d_lock); + break; +@@ -1098,7 +1189,18 @@ next: + } + rcu_read_unlock(); + +- return found; ++ return NULL; ++ ++found: ++ /* ++ * d_lock and rcu_read_lock ++ * are dropped in ub_dentry_charge() ++ */ ++ if (ub_dentry_charge(dentry)) { ++ dput(dentry); ++ dentry = NULL; ++ } ++ return dentry; + } + + /** +@@ -1345,6 +1447,32 @@ already_unhashed: + } + + /** ++ * __d_path_add_deleted - prepend "(deleted) " text ++ * @end: a pointer to the character after free space at the beginning of the ++ * buffer ++ * @buflen: remaining free space ++ */ ++static inline 
char * __d_path_add_deleted(char * end, int buflen) ++{ ++ buflen -= 10; ++ if (buflen < 0) ++ return ERR_PTR(-ENAMETOOLONG); ++ end -= 10; ++ memcpy(end, "(deleted) ", 10); ++ return end; ++} ++ ++/** ++ * d_root_check - checks if dentry is accessible from current's fs root ++ * @dentry: dentry to be verified ++ * @vfsmnt: vfsmnt to which the dentry belongs ++ */ ++int d_root_check(struct dentry *dentry, struct vfsmount *vfsmnt) ++{ ++ return PTR_ERR(d_path(dentry, vfsmnt, NULL, 0)); ++} ++ ++/** + * d_path - return the path of a dentry + * @dentry: dentry to report + * @vfsmnt: vfsmnt to which the dentry belongs +@@ -1365,36 +1493,35 @@ static char * __d_path( struct dentry *d + char *buffer, int buflen) + { + char * end = buffer+buflen; +- char * retval; ++ char * retval = NULL; + int namelen; ++ int deleted; ++ struct vfsmount *oldvfsmnt; + +- *--end = '\0'; +- buflen--; +- if (!IS_ROOT(dentry) && d_unhashed(dentry)) { +- buflen -= 10; +- end -= 10; +- if (buflen < 0) ++ oldvfsmnt = vfsmnt; ++ deleted = (!IS_ROOT(dentry) && d_unhashed(dentry)); ++ if (buffer != NULL) { ++ *--end = '\0'; ++ buflen--; ++ ++ if (buflen < 1) + goto Elong; +- memcpy(end, " (deleted)", 10); ++ /* Get '/' right */ ++ retval = end-1; ++ *retval = '/'; + } + +- if (buflen < 1) +- goto Elong; +- /* Get '/' right */ +- retval = end-1; +- *retval = '/'; +- + for (;;) { + struct dentry * parent; + + if (dentry == root && vfsmnt == rootmnt) + break; + if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { +- /* Global root? */ ++ /* root of a tree? 
*/ + spin_lock(&vfsmount_lock); + if (vfsmnt->mnt_parent == vfsmnt) { + spin_unlock(&vfsmount_lock); +- goto global_root; ++ goto other_root; + } + dentry = vfsmnt->mnt_mountpoint; + vfsmnt = vfsmnt->mnt_parent; +@@ -1403,27 +1530,51 @@ static char * __d_path( struct dentry *d + } + parent = dentry->d_parent; + prefetch(parent); ++ if (buffer != NULL) { ++ namelen = dentry->d_name.len; ++ buflen -= namelen + 1; ++ if (buflen < 0) ++ goto Elong; ++ end -= namelen; ++ memcpy(end, dentry->d_name.name, namelen); ++ *--end = '/'; ++ retval = end; ++ } ++ dentry = parent; ++ } ++ /* the given root point is reached */ ++finish: ++ if (buffer != NULL && deleted) ++ retval = __d_path_add_deleted(end, buflen); ++ return retval; ++ ++other_root: ++ /* ++ * We traversed the tree upward and reached a root, but the given ++ * lookup terminal point wasn't encountered. It means either that the ++ * dentry is out of our scope or belongs to an abstract space like ++ * sock_mnt or pipe_mnt. Check for it. ++ * ++ * There are different options to check it. ++ * We may assume that any dentry tree is unreachable unless it's ++ * connected to `root' (defined as fs root of init aka child reaper) ++ * and expose all paths that are not connected to it. ++ * The other option is to allow exposing of known abstract spaces ++ * explicitly and hide the path information for other cases. ++ * This approach is more safe, let's take it. 
2001/04/22 SAW ++ */ ++ if (!(oldvfsmnt->mnt_sb->s_flags & MS_NOUSER)) ++ return ERR_PTR(-EINVAL); ++ if (buffer != NULL) { + namelen = dentry->d_name.len; +- buflen -= namelen + 1; ++ buflen -= namelen; + if (buflen < 0) + goto Elong; +- end -= namelen; +- memcpy(end, dentry->d_name.name, namelen); +- *--end = '/'; +- retval = end; +- dentry = parent; ++ retval -= namelen-1; /* hit the slash */ ++ memcpy(retval, dentry->d_name.name, namelen); + } ++ goto finish; + +- return retval; +- +-global_root: +- namelen = dentry->d_name.len; +- buflen -= namelen; +- if (buflen < 0) +- goto Elong; +- retval -= namelen-1; /* hit the slash */ +- memcpy(retval, dentry->d_name.name, namelen); +- return retval; + Elong: + return ERR_PTR(-ENAMETOOLONG); + } +@@ -1448,6 +1599,229 @@ char * d_path(struct dentry *dentry, str + return res; + } + ++#ifdef CONFIG_VE ++#include <net/sock.h> ++#include <linux/ip.h> ++#include <linux/file.h> ++#include <linux/namespace.h> ++#include <linux/vzratelimit.h> ++ ++static void mark_sub_tree_virtual(struct dentry *d) ++{ ++ struct dentry *orig_root; ++ ++ orig_root = d; ++ while (1) { ++ spin_lock(&d->d_lock); ++ d->d_flags |= DCACHE_VIRTUAL; ++ spin_unlock(&d->d_lock); ++ ++ if (!list_empty(&d->d_subdirs)) { ++ d = list_entry(d->d_subdirs.next, ++ struct dentry, d_u.d_child); ++ continue; ++ } ++ if (d == orig_root) ++ break; ++ while (d == list_entry(d->d_parent->d_subdirs.prev, ++ struct dentry, d_u.d_child)) { ++ d = d->d_parent; ++ if (d == orig_root) ++ goto out; ++ } ++ d = list_entry(d->d_u.d_child.next, ++ struct dentry, d_u.d_child); ++ } ++out: ++ return; ++} ++ ++void mark_tree_virtual(struct vfsmount *m, struct dentry *d) ++{ ++ struct vfsmount *orig_rootmnt; ++ ++ spin_lock(&dcache_lock); ++ spin_lock(&vfsmount_lock); ++ orig_rootmnt = m; ++ while (1) { ++ mark_sub_tree_virtual(d); ++ if (!list_empty(&m->mnt_mounts)) { ++ m = list_entry(m->mnt_mounts.next, ++ struct vfsmount, mnt_child); ++ d = m->mnt_root; ++ continue; ++ } ++ if 
(m == orig_rootmnt) ++ break; ++ while (m == list_entry(m->mnt_parent->mnt_mounts.prev, ++ struct vfsmount, mnt_child)) { ++ m = m->mnt_parent; ++ if (m == orig_rootmnt) ++ goto out; ++ } ++ m = list_entry(m->mnt_child.next, ++ struct vfsmount, mnt_child); ++ d = m->mnt_root; ++ } ++out: ++ spin_unlock(&vfsmount_lock); ++ spin_unlock(&dcache_lock); ++} ++EXPORT_SYMBOL(mark_tree_virtual); ++ ++static struct vz_rate_info area_ri = { 20, 10*HZ }; ++#define VE_AREA_ACC_CHECK 0x0001 ++#define VE_AREA_ACC_DENY 0x0002 ++#define VE_AREA_EXEC_CHECK 0x0010 ++#define VE_AREA_EXEC_DENY 0x0020 ++#define VE0_AREA_ACC_CHECK 0x0100 ++#define VE0_AREA_ACC_DENY 0x0200 ++#define VE0_AREA_EXEC_CHECK 0x1000 ++#define VE0_AREA_EXEC_DENY 0x2000 ++int ve_area_access_check = 0; ++ ++static void print_connection_info(struct task_struct *tsk) ++{ ++ struct files_struct *files; ++ struct fdtable *fdt; ++ int fd; ++ ++ files = get_files_struct(tsk); ++ if (!files) ++ return; ++ ++ spin_lock(&files->file_lock); ++ fdt = files_fdtable(files); ++ for (fd = 0; fd < fdt->max_fds; fd++) { ++ struct file *file; ++ struct inode *inode; ++ struct socket *socket; ++ struct sock *sk; ++ struct inet_sock *inet; ++ ++ file = fdt->fd[fd]; ++ if (file == NULL) ++ continue; ++ ++ inode = file->f_dentry->d_inode; ++ if (!S_ISSOCK(inode->i_mode)) ++ continue; ++ ++ socket = SOCKET_I(inode); ++ if (socket == NULL) ++ continue; ++ ++ sk = socket->sk; ++ if ((sk->sk_family != PF_INET && sk->sk_family != PF_INET6) ++ || sk->sk_type != SOCK_STREAM) ++ continue; ++ ++ inet = inet_sk(sk); ++ printk(KERN_ALERT "connection from %u.%u.%u.%u:%u to port %u\n", ++ NIPQUAD(inet->daddr), ntohs(inet->dport), ++ inet->num); ++ } ++ spin_unlock(&files->file_lock); ++ put_files_struct(files); ++} ++ ++static void check_alert(struct vfsmount *vfsmnt, struct dentry *dentry, ++ char *str) ++{ ++ struct task_struct *tsk; ++ unsigned long page; ++ struct super_block *sb; ++ char *p; ++ ++ if (!vz_ratelimit(&area_ri)) ++ return; ++ ++ 
tsk = current; ++ p = ERR_PTR(-ENOMEM); ++ page = __get_free_page(GFP_KERNEL); ++ if (page) { ++ spin_lock(&dcache_lock); ++ p = __d_path(dentry, vfsmnt, tsk->fs->root, tsk->fs->rootmnt, ++ (char *)page, PAGE_SIZE); ++ spin_unlock(&dcache_lock); ++ } ++ if (IS_ERR(p)) ++ p = "(undefined)"; ++ ++ sb = dentry->d_sb; ++ printk(KERN_ALERT "%s check alert! file:[%s] from %d/%s, dev%x\n" ++ "Task %d/%d[%s] from VE%d, execenv %d\n", ++ str, p, VE_OWNER_FSTYPE(sb->s_type)->veid, ++ sb->s_type->name, sb->s_dev, ++ tsk->pid, virt_pid(tsk), tsk->comm, ++ VE_TASK_INFO(tsk)->owner_env->veid, ++ get_exec_env()->veid); ++ ++ free_page(page); ++ ++ print_connection_info(tsk); ++ ++ read_lock(&tasklist_lock); ++ tsk = tsk->real_parent; ++ get_task_struct(tsk); ++ read_unlock(&tasklist_lock); ++ ++ printk(KERN_ALERT "Parent %d/%d[%s] from VE%d\n", ++ tsk->pid, virt_pid(tsk), tsk->comm, ++ VE_TASK_INFO(tsk)->owner_env->veid); ++ ++ print_connection_info(tsk); ++ put_task_struct(tsk); ++ dump_stack(); ++} ++#endif ++ ++int check_area_access_ve(struct dentry *dentry, struct vfsmount *mnt) ++{ ++#ifdef CONFIG_VE ++ int check, alert, deny; ++ ++ if (ve_is_super(get_exec_env())) { ++ check = ve_area_access_check & VE0_AREA_ACC_CHECK; ++ alert = dentry->d_flags & DCACHE_VIRTUAL; ++ deny = ve_area_access_check & VE0_AREA_ACC_DENY; ++ } else { ++ check = ve_area_access_check & VE_AREA_ACC_CHECK; ++ alert = !(dentry->d_flags & DCACHE_VIRTUAL); ++ deny = ve_area_access_check & VE_AREA_ACC_DENY; ++ } ++ ++ if (check && alert) ++ check_alert(mnt, dentry, "Access"); ++ if (deny && alert) ++ return -EACCES; ++#endif ++ return 0; ++} ++ ++int check_area_execute_ve(struct dentry *dentry, struct vfsmount *mnt) ++{ ++#ifdef CONFIG_VE ++ int check, alert, deny; ++ ++ if (ve_is_super(get_exec_env())) { ++ check = ve_area_access_check & VE0_AREA_EXEC_CHECK; ++ alert = dentry->d_flags & DCACHE_VIRTUAL; ++ deny = ve_area_access_check & VE0_AREA_EXEC_DENY; ++ } else { ++ check = ve_area_access_check & 
VE_AREA_EXEC_CHECK; ++ alert = !(dentry->d_flags & DCACHE_VIRTUAL); ++ deny = ve_area_access_check & VE_AREA_EXEC_DENY; ++ } ++ ++ if (check && alert) ++ check_alert(mnt, dentry, "Exec"); ++ if (deny && alert) ++ return -EACCES; ++#endif ++ return 0; ++} ++ + /* + * NOTE! The user-level library version returns a + * character pointer. The kernel system call just +@@ -1584,10 +1958,12 @@ resume: + goto repeat; + } + atomic_dec(&dentry->d_count); ++ ub_dentry_uncharge(dentry); + } + if (this_parent != root) { + next = this_parent->d_u.d_child.next; + atomic_dec(&this_parent->d_count); ++ ub_dentry_uncharge(this_parent); + this_parent = this_parent->d_parent; + goto resume; + } +@@ -1736,7 +2112,8 @@ void __init vfs_caches_init(unsigned lon + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); + + filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, +- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); ++ SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_UBC, ++ NULL, NULL); + + dcache_init(mempages); + inode_init(mempages); +diff -upr linux-2.6.16.orig/fs/devpts/inode.c linux-2.6.16-026test015/fs/devpts/inode.c +--- linux-2.6.16.orig/fs/devpts/inode.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/devpts/inode.c 2006-07-04 14:41:38.000000000 +0400 +@@ -12,6 +12,7 @@ + + #include <linux/module.h> + #include <linux/init.h> ++#include <linux/ve.h> + #include <linux/fs.h> + #include <linux/sched.h> + #include <linux/namei.h> +@@ -21,16 +22,17 @@ + + #define DEVPTS_SUPER_MAGIC 0x1cd1 + ++struct devpts_config devpts_config = {.mode = 0600}; ++ ++#ifndef CONFIG_VE + static struct vfsmount *devpts_mnt; + static struct dentry *devpts_root; +- +-static struct { +- int setuid; +- int setgid; +- uid_t uid; +- gid_t gid; +- umode_t mode; +-} config = {.mode = 0600}; ++#define config devpts_config ++#else ++#define devpts_mnt (get_exec_env()->devpts_mnt) ++#define devpts_root (get_exec_env()->devpts_root) ++#define config (*(get_exec_env()->devpts_config)) ++#endif + + 
static int devpts_remount(struct super_block *sb, int *flags, char *data) + { +@@ -56,7 +58,8 @@ static int devpts_remount(struct super_b + } else if (sscanf(this_char, "mode=%o%c", &n, &dummy) == 1) + mode = n & ~S_IFMT; + else { +- printk("devpts: called with bogus options\n"); ++ ve_printk(VE_LOG, ++ "devpts: called with bogus options\n"); + return -EINVAL; + } + } +@@ -114,13 +117,15 @@ static struct super_block *devpts_get_sb + return get_sb_single(fs_type, flags, data, devpts_fill_super); + } + +-static struct file_system_type devpts_fs_type = { ++struct file_system_type devpts_fs_type = { + .owner = THIS_MODULE, + .name = "devpts", + .get_sb = devpts_get_sb, + .kill_sb = kill_anon_super, + }; + ++EXPORT_SYMBOL(devpts_fs_type); ++ + /* + * The normal naming convention is simply /dev/pts/<number>; this conforms + * to the System V naming convention +@@ -212,6 +217,7 @@ static int __init init_devpts_fs(void) + + static void __exit exit_devpts_fs(void) + { ++ /* the code is never called, the argument is irrelevant */ + unregister_filesystem(&devpts_fs_type); + mntput(devpts_mnt); + } +diff -upr linux-2.6.16.orig/fs/eventpoll.c linux-2.6.16-026test015/fs/eventpoll.c +--- linux-2.6.16.orig/fs/eventpoll.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/eventpoll.c 2006-07-04 14:41:39.000000000 +0400 +@@ -105,11 +105,6 @@ + #define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ) + + +-struct epoll_filefd { +- struct file *file; +- int fd; +-}; +- + /* + * Node that is linked into the "wake_task_list" member of the "struct poll_safewake". + * It is used to keep track on all tasks that are currently inside the wake_up() code +@@ -132,36 +127,6 @@ struct poll_safewake { + spinlock_t lock; + }; + +-/* +- * This structure is stored inside the "private_data" member of the file +- * structure and rapresent the main data sructure for the eventpoll +- * interface. 
+- */ +-struct eventpoll { +- /* Protect the this structure access */ +- rwlock_t lock; +- +- /* +- * This semaphore is used to ensure that files are not removed +- * while epoll is using them. This is read-held during the event +- * collection loop and it is write-held during the file cleanup +- * path, the epoll file exit code and the ctl operations. +- */ +- struct rw_semaphore sem; +- +- /* Wait queue used by sys_epoll_wait() */ +- wait_queue_head_t wq; +- +- /* Wait queue used by file->poll() */ +- wait_queue_head_t poll_wait; +- +- /* List of ready file descriptors */ +- struct list_head rdllist; +- +- /* RB-Tree root used to store monitored fd structs */ +- struct rb_root rbr; +-}; +- + /* Wait structure used by the poll hooks */ + struct eppoll_entry { + /* List header used to link this structure to the "struct epitem" */ +@@ -180,51 +145,6 @@ struct eppoll_entry { + wait_queue_head_t *whead; + }; + +-/* +- * Each file descriptor added to the eventpoll interface will +- * have an entry of this type linked to the hash. +- */ +-struct epitem { +- /* RB-Tree node used to link this structure to the eventpoll rb-tree */ +- struct rb_node rbn; +- +- /* List header used to link this structure to the eventpoll ready list */ +- struct list_head rdllink; +- +- /* The file descriptor information this item refers to */ +- struct epoll_filefd ffd; +- +- /* Number of active wait queue attached to poll operations */ +- int nwait; +- +- /* List containing poll wait queues */ +- struct list_head pwqlist; +- +- /* The "container" of this item */ +- struct eventpoll *ep; +- +- /* The structure that describe the interested events and the source fd */ +- struct epoll_event event; +- +- /* +- * Used to keep track of the usage count of the structure. This avoids +- * that the structure will desappear from underneath our processing. 
+- */ +- atomic_t usecnt; +- +- /* List header used to link this item to the "struct file" items list */ +- struct list_head fllink; +- +- /* List header used to link the item to the transfer list */ +- struct list_head txlink; +- +- /* +- * This is used during the collection/transfer of events to userspace +- * to pin items empty events set. +- */ +- unsigned int revents; +-}; +- + /* Wrapper struct used by poll queueing */ + struct ep_pqueue { + poll_table pt; +@@ -239,14 +159,10 @@ static int ep_getfd(int *efd, struct ino + struct eventpoll *ep); + static int ep_alloc(struct eventpoll **pep); + static void ep_free(struct eventpoll *ep); +-static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd); + static void ep_use_epitem(struct epitem *epi); +-static void ep_release_epitem(struct epitem *epi); + static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, + poll_table *pt); + static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi); +-static int ep_insert(struct eventpoll *ep, struct epoll_event *event, +- struct file *tfile, int fd); + static int ep_modify(struct eventpoll *ep, struct epitem *epi, + struct epoll_event *event); + static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi); +@@ -274,7 +190,8 @@ static struct super_block *eventpollfs_g + /* + * This semaphore is used to serialize ep_free() and eventpoll_release_file(). 
+ */ +-static struct semaphore epsem; ++struct semaphore epsem; ++EXPORT_SYMBOL_GPL(epsem); + + /* Safe wake up implementation */ + static struct poll_safewake psw; +@@ -289,10 +206,11 @@ static kmem_cache_t *pwq_cache; + static struct vfsmount *eventpoll_mnt; + + /* File callbacks that implement the eventpoll file behaviour */ +-static struct file_operations eventpoll_fops = { ++struct file_operations eventpoll_fops = { + .release = ep_eventpoll_close, + .poll = ep_eventpoll_poll + }; ++EXPORT_SYMBOL_GPL(eventpoll_fops); + + /* + * This is used to register the virtual file system from where +@@ -542,7 +460,7 @@ eexit_1: + current, size, error)); + return error; + } +- ++EXPORT_SYMBOL_GPL(sys_epoll_create); + + /* + * The following function implements the controller interface for +@@ -852,7 +770,7 @@ static void ep_free(struct eventpoll *ep + * the returned item, so the caller must call ep_release_epitem() + * after finished using the "struct epitem". + */ +-static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) ++struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) + { + int kcmp; + unsigned long flags; +@@ -882,6 +800,7 @@ static struct epitem *ep_find(struct eve + + return epir; + } ++EXPORT_SYMBOL_GPL(ep_find); + + + /* +@@ -900,13 +819,13 @@ static void ep_use_epitem(struct epitem + * has finished using the structure. It might lead to freeing the + * structure itself if the count goes to zero. 
+ */ +-static void ep_release_epitem(struct epitem *epi) ++void ep_release_epitem(struct epitem *epi) + { + + if (atomic_dec_and_test(&epi->usecnt)) + kmem_cache_free(epi_cache, epi); + } +- ++EXPORT_SYMBOL_GPL(ep_release_epitem); + + /* + * This is the callback that is used to add our wait queue to the +@@ -952,7 +871,7 @@ static void ep_rbtree_insert(struct even + } + + +-static int ep_insert(struct eventpoll *ep, struct epoll_event *event, ++int ep_insert(struct eventpoll *ep, struct epoll_event *event, + struct file *tfile, int fd) + { + int error, revents, pwake = 0; +@@ -1044,6 +963,7 @@ eexit_2: + eexit_1: + return error; + } ++EXPORT_SYMBOL_GPL(ep_insert); + + + /* +diff -upr linux-2.6.16.orig/fs/exec.c linux-2.6.16-026test015/fs/exec.c +--- linux-2.6.16.orig/fs/exec.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/exec.c 2006-07-04 14:41:39.000000000 +0400 +@@ -53,6 +53,8 @@ + #include <asm/uaccess.h> + #include <asm/mmu_context.h> + ++#include <ub/ub_vmpages.h> ++ + #ifdef CONFIG_KMOD + #include <linux/kmod.h> + #endif +@@ -64,6 +66,8 @@ int suid_dumpable = 0; + EXPORT_SYMBOL(suid_dumpable); + /* The maximal length of core_pattern is also specified in sysctl.c */ + ++int sysctl_at_vsyscall; ++ + static struct linux_binfmt *formats; + static DEFINE_RWLOCK(binfmt_lock); + +@@ -135,7 +139,7 @@ asmlinkage long sys_uselib(const char __ + if (!S_ISREG(nd.dentry->d_inode->i_mode)) + goto exit; + +- error = vfs_permission(&nd, MAY_READ | MAY_EXEC); ++ error = vfs_permission(&nd, MAY_READ | MAY_EXEC, NULL); + if (error) + goto exit; + +@@ -308,6 +312,10 @@ void install_arg_page(struct vm_area_str + struct mm_struct *mm = vma->vm_mm; + pte_t * pte; + spinlock_t *ptl; ++ struct page_beancounter *pb; ++ ++ if (unlikely(pb_alloc(&pb))) ++ goto out_nopb; + + if (unlikely(anon_vma_prepare(vma))) + goto out; +@@ -321,15 +329,21 @@ void install_arg_page(struct vm_area_str + goto out; + } + inc_mm_counter(mm, anon_rss); ++ inc_vma_rss(vma); + 
lru_cache_add_active(page); + set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte( + page, vma->vm_page_prot)))); ++ pb_add_ref(page, mm, &pb); ++ ub_unused_privvm_dec(mm, vma); ++ pb_free(&pb); + page_add_new_anon_rmap(page, vma, address); + pte_unmap_unlock(pte, ptl); + + /* no need for flush_tlb */ + return; + out: ++ pb_free(&pb); ++out_nopb: + __free_page(page); + force_sig(SIGKILL, current); + } +@@ -404,9 +418,13 @@ int setup_arg_pages(struct linux_binprm + bprm->loader += stack_base; + bprm->exec += stack_base; + +- mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); ++ if (ub_memory_charge(mm, arg_size, VM_STACK_FLAGS | mm->def_flags, ++ NULL, UB_SOFT)) ++ goto fail_charge; ++ ++ mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL | __GFP_SOFT_UBC); + if (!mpnt) +- return -ENOMEM; ++ goto fail_alloc; + + memset(mpnt, 0, sizeof(*mpnt)); + +@@ -450,6 +468,11 @@ int setup_arg_pages(struct linux_binprm + up_write(&mm->mmap_sem); + + return 0; ++ ++fail_alloc: ++ ub_memory_uncharge(mm, arg_size, VM_STACK_FLAGS | mm->def_flags, NULL); ++fail_charge: ++ return -ENOMEM; + } + + EXPORT_SYMBOL(setup_arg_pages); +@@ -471,7 +494,7 @@ static inline void free_arg_pages(struct + + #endif /* CONFIG_MMU */ + +-struct file *open_exec(const char *name) ++struct file *open_exec(const char *name, struct linux_binprm *bprm) + { + struct nameidata nd; + int err; +@@ -485,9 +508,16 @@ struct file *open_exec(const char *name) + file = ERR_PTR(-EACCES); + if (!(nd.mnt->mnt_flags & MNT_NOEXEC) && + S_ISREG(inode->i_mode)) { +- int err = vfs_permission(&nd, MAY_EXEC); +- if (!err && !(inode->i_mode & 0111)) +- err = -EACCES; ++ int err; ++ struct exec_perm *perm; ++ ++ if (bprm != NULL) { ++ perm = &bprm->perm; ++ perm->set = 0; ++ } else ++ perm = NULL; ++ ++ err = vfs_permission(&nd, MAY_EXEC, perm); + file = ERR_PTR(err); + if (!err) { + file = nameidata_to_filp(&nd, O_RDONLY); +@@ -657,7 +687,7 @@ static int de_thread(struct task_struct + */ + if 
(!thread_group_leader(current)) { + struct task_struct *parent; +- struct dentry *proc_dentry1, *proc_dentry2; ++ struct dentry *proc_dentry1[2], *proc_dentry2[2]; + unsigned long ptrace; + + /* +@@ -671,8 +701,8 @@ static int de_thread(struct task_struct + + spin_lock(&leader->proc_lock); + spin_lock(¤t->proc_lock); +- proc_dentry1 = proc_pid_unhash(current); +- proc_dentry2 = proc_pid_unhash(leader); ++ proc_pid_unhash(current, proc_dentry1); ++ proc_pid_unhash(leader, proc_dentry2); + write_lock_irq(&tasklist_lock); + + BUG_ON(leader->tgid != current->tgid); +@@ -829,7 +859,7 @@ int flush_old_exec(struct linux_binprm * + { + char * name; + int i, ch, retval; +- struct files_struct *files; ++ struct files_struct *files, *old; + char tcomm[sizeof(current->comm)]; + + /* +@@ -897,6 +927,7 @@ int flush_old_exec(struct linux_binprm * + suid_keys(current); + current->mm->dumpable = suid_dumpable; + } ++ current->mm->vps_dumpable = 1; + + /* An exec changes our domain. We are no longer part of the thread + group */ +@@ -909,8 +940,11 @@ int flush_old_exec(struct linux_binprm * + return 0; + + mmap_failed: +- put_files_struct(current->files); ++ old = current->files; ++ task_lock(current); + current->files = files; ++ task_unlock(current); ++ put_files_struct(old); + out: + return retval; + } +@@ -927,13 +961,6 @@ int prepare_binprm(struct linux_binprm * + struct inode * inode = bprm->file->f_dentry->d_inode; + int retval; + +- mode = inode->i_mode; +- /* +- * Check execute perms again - if the caller has CAP_DAC_OVERRIDE, +- * generic_permission lets a non-executable through +- */ +- if (!(mode & 0111)) /* with at least _one_ execute bit set */ +- return -EACCES; + if (bprm->file->f_op == NULL) + return -EACCES; + +@@ -941,10 +968,24 @@ int prepare_binprm(struct linux_binprm * + bprm->e_gid = current->egid; + + if(!(bprm->file->f_vfsmnt->mnt_flags & MNT_NOSUID)) { ++ if (!bprm->perm.set) { ++ /* ++ * This piece of code creates a time window between ++ * MAY_EXEC 
permission check and setuid/setgid ++ * operations and may be considered as a security hole. ++ * This code is here for compatibility reasons, ++ * if the filesystem is unable to return info now. ++ */ ++ bprm->perm.mode = inode->i_mode; ++ bprm->perm.uid = inode->i_uid; ++ bprm->perm.gid = inode->i_gid; ++ } ++ mode = bprm->perm.mode; ++ + /* Set-uid? */ + if (mode & S_ISUID) { + current->personality &= ~PER_CLEAR_ON_SETID; +- bprm->e_uid = inode->i_uid; ++ bprm->e_uid = bprm->perm.uid; + } + + /* Set-gid? */ +@@ -955,7 +996,7 @@ int prepare_binprm(struct linux_binprm * + */ + if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { + current->personality &= ~PER_CLEAR_ON_SETID; +- bprm->e_gid = inode->i_gid; ++ bprm->e_gid = bprm->perm.gid; + } + } + +@@ -1054,7 +1095,7 @@ int search_binary_handler(struct linux_b + + loader = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *); + +- file = open_exec("/sbin/loader"); ++ file = open_exec("/sbin/loader", bprm); + retval = PTR_ERR(file); + if (IS_ERR(file)) + return retval; +@@ -1148,7 +1189,7 @@ int do_execve(char * filename, + goto out_ret; + memset(bprm, 0, sizeof(*bprm)); + +- file = open_exec(filename); ++ file = open_exec(filename, bprm); + retval = PTR_ERR(file); + if (IS_ERR(file)) + goto out_kfree; +@@ -1288,7 +1329,7 @@ static void format_corename(char *corena + case 'p': + pid_in_pattern = 1; + rc = snprintf(out_ptr, out_end - out_ptr, +- "%d", current->tgid); ++ "%d", virt_tgid(current)); + if (rc > out_end - out_ptr) + goto out; + out_ptr += rc; +@@ -1332,7 +1373,7 @@ static void format_corename(char *corena + case 'h': + down_read(&uts_sem); + rc = snprintf(out_ptr, out_end - out_ptr, +- "%s", system_utsname.nodename); ++ "%s", ve_utsname.nodename); + up_read(&uts_sem); + if (rc > out_end - out_ptr) + goto out; +@@ -1360,7 +1401,7 @@ static void format_corename(char *corena + if (!pid_in_pattern + && (core_uses_pid || atomic_read(¤t->mm->mm_users) != 1)) { + rc = snprintf(out_ptr, out_end - out_ptr, +- ".%d", 
current->tgid); ++ ".%d", virt_tgid(current)); + if (rc > out_end - out_ptr) + goto out; + out_ptr += rc; +@@ -1386,7 +1427,7 @@ static void zap_threads (struct mm_struc + } + + read_lock(&tasklist_lock); +- do_each_thread(g,p) ++ do_each_thread_ve(g,p) + if (mm == p->mm && p != tsk) { + force_sig_specific(SIGKILL, p); + mm->core_waiters++; +@@ -1394,7 +1435,7 @@ static void zap_threads (struct mm_struc + unlikely(p->parent->mm == mm)) + traced = 1; + } +- while_each_thread(g,p); ++ while_each_thread_ve(g,p); + + read_unlock(&tasklist_lock); + +@@ -1406,12 +1447,12 @@ static void zap_threads (struct mm_struc + * coredump to finish. Detach them so they can both die. + */ + write_lock_irq(&tasklist_lock); +- do_each_thread(g,p) { ++ do_each_thread_ve(g,p) { + if (mm == p->mm && p != tsk && + p->ptrace && p->parent->mm == mm) { + __ptrace_detach(p, 0); + } +- } while_each_thread(g,p); ++ } while_each_thread_ve(g,p); + write_unlock_irq(&tasklist_lock); + } + } +@@ -1447,7 +1488,8 @@ int do_coredump(long signr, int exit_cod + if (!binfmt || !binfmt->core_dump) + goto fail; + down_write(&mm->mmap_sem); +- if (!mm->dumpable) { ++ if (!mm->dumpable || ++ (!mm->vps_dumpable && !ve_is_super(get_exec_env()))) { + up_write(&mm->mmap_sem); + goto fail; + } +diff -upr linux-2.6.16.orig/fs/ext2/acl.c linux-2.6.16-026test015/fs/ext2/acl.c +--- linux-2.6.16.orig/fs/ext2/acl.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/ext2/acl.c 2006-07-04 14:41:37.000000000 +0400 +@@ -294,9 +294,10 @@ ext2_check_acl(struct inode *inode, int + } + + int +-ext2_permission(struct inode *inode, int mask, struct nameidata *nd) ++ext2_permission(struct inode *inode, int mask, struct nameidata *nd, ++ struct exec_perm *perm) + { +- return generic_permission(inode, mask, ext2_check_acl); ++ return generic_permission(inode, mask, ext2_check_acl, perm); + } + + /* +diff -upr linux-2.6.16.orig/fs/ext2/acl.h linux-2.6.16-026test015/fs/ext2/acl.h +--- linux-2.6.16.orig/fs/ext2/acl.h 
2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/ext2/acl.h 2006-07-04 14:41:37.000000000 +0400 +@@ -58,7 +58,8 @@ static inline int ext2_acl_count(size_t + #define EXT2_ACL_NOT_CACHED ((void *)-1) + + /* acl.c */ +-extern int ext2_permission (struct inode *, int, struct nameidata *); ++extern int ext2_permission (struct inode *, int, struct nameidata *, ++ struct exec_perm *); + extern int ext2_acl_chmod (struct inode *); + extern int ext2_init_acl (struct inode *, struct inode *); + +diff -upr linux-2.6.16.orig/fs/ext2/namei.c linux-2.6.16-026test015/fs/ext2/namei.c +--- linux-2.6.16.orig/fs/ext2/namei.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/ext2/namei.c 2006-07-04 14:41:39.000000000 +0400 +@@ -31,6 +31,7 @@ + */ + + #include <linux/pagemap.h> ++#include <linux/quotaops.h> + #include "ext2.h" + #include "xattr.h" + #include "acl.h" +@@ -273,6 +274,8 @@ static int ext2_unlink(struct inode * di + struct page * page; + int err = -ENOENT; + ++ DQUOT_INIT(inode); ++ + de = ext2_find_entry (dir, dentry, &page); + if (!de) + goto out; +@@ -315,6 +318,9 @@ static int ext2_rename (struct inode * o + struct ext2_dir_entry_2 * old_de; + int err = -ENOENT; + ++ if (new_inode) ++ DQUOT_INIT(new_inode); ++ + old_de = ext2_find_entry (old_dir, old_dentry, &old_page); + if (!old_de) + goto out; +diff -upr linux-2.6.16.orig/fs/ext2/super.c linux-2.6.16-026test015/fs/ext2/super.c +--- linux-2.6.16.orig/fs/ext2/super.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/ext2/super.c 2006-07-04 14:41:38.000000000 +0400 +@@ -996,7 +996,7 @@ static int ext2_remount (struct super_bl + es = sbi->s_es; + if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) != + (old_mount_opt & EXT2_MOUNT_XIP)) && +- invalidate_inodes(sb)) ++ invalidate_inodes(sb, 0)) + ext2_warning(sb, __FUNCTION__, "busy inodes while remounting "\ + "xip remain in cache (no functional problem)"); + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) +@@ -1205,7 
+1205,7 @@ static struct file_system_type ext2_fs_t + .name = "ext2", + .get_sb = ext2_get_sb, + .kill_sb = kill_block_super, +- .fs_flags = FS_REQUIRES_DEV, ++ .fs_flags = FS_REQUIRES_DEV | FS_VIRTUALIZED, + }; + + static int __init init_ext2_fs(void) +diff -upr linux-2.6.16.orig/fs/ext3/acl.c linux-2.6.16-026test015/fs/ext3/acl.c +--- linux-2.6.16.orig/fs/ext3/acl.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/ext3/acl.c 2006-07-04 14:41:37.000000000 +0400 +@@ -299,9 +299,10 @@ ext3_check_acl(struct inode *inode, int + } + + int +-ext3_permission(struct inode *inode, int mask, struct nameidata *nd) ++ext3_permission(struct inode *inode, int mask, struct nameidata *nd, ++ struct exec_perm *perm) + { +- return generic_permission(inode, mask, ext3_check_acl); ++ return generic_permission(inode, mask, ext3_check_acl, perm); + } + + /* +diff -upr linux-2.6.16.orig/fs/ext3/acl.h linux-2.6.16-026test015/fs/ext3/acl.h +--- linux-2.6.16.orig/fs/ext3/acl.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/ext3/acl.h 2006-07-04 14:41:37.000000000 +0400 +@@ -58,7 +58,8 @@ static inline int ext3_acl_count(size_t + #define EXT3_ACL_NOT_CACHED ((void *)-1) + + /* acl.c */ +-extern int ext3_permission (struct inode *, int, struct nameidata *); ++extern int ext3_permission (struct inode *, int, struct nameidata *, ++ struct exec_perm *); + extern int ext3_acl_chmod (struct inode *); + extern int ext3_init_acl (handle_t *, struct inode *, struct inode *); + +diff -upr linux-2.6.16.orig/fs/ext3/inode.c linux-2.6.16-026test015/fs/ext3/inode.c +--- linux-2.6.16.orig/fs/ext3/inode.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/ext3/inode.c 2006-07-04 14:41:37.000000000 +0400 +@@ -771,6 +771,7 @@ ext3_get_block_handle(handle_t *handle, + + set_buffer_new(bh_result); + got_it: ++ clear_buffer_delay(bh_result); + map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); + if (boundary) + 
set_buffer_boundary(bh_result); +@@ -964,11 +965,13 @@ static int walk_page_buffers( handle_t * + * and the commit_write(). So doing the journal_start at the start of + * prepare_write() is the right place. + * +- * Also, this function can nest inside ext3_writepage() -> +- * block_write_full_page(). In that case, we *know* that ext3_writepage() +- * has generated enough buffer credits to do the whole page. So we won't +- * block on the journal in that case, which is good, because the caller may +- * be PF_MEMALLOC. ++ * [2004/09/04 SAW] journal_start() in prepare_write() causes different ranking ++ * violations if copy_from_user() triggers a page fault (mmap_sem, may be page ++ * lock, plus __GFP_FS allocations). ++ * Now we read in not up-to-date buffers in prepare_write(), and do the rest ++ * including hole instantiation and inode extension in commit_write(). ++ * ++ * Other notes. + * + * By accident, ext3 can be reentered when a transaction is open via + * quota file writes. If we were to commit the transaction while thus +@@ -983,6 +986,27 @@ static int walk_page_buffers( handle_t * + * write. 
+ */ + ++static int ext3_get_block_delay(struct inode *inode, sector_t iblock, ++ struct buffer_head *bh, int create) ++{ ++ int ret; ++ ++ ret = ext3_get_block_handle(NULL, inode, iblock, bh, 0, 0); ++ if (ret) ++ return ret; ++ if (!buffer_mapped(bh)) { ++ set_buffer_delay(bh); ++ set_buffer_new(bh); ++ } ++ return ret; ++} ++ ++static int ext3_prepare_write(struct file *file, struct page *page, ++ unsigned from, unsigned to) ++{ ++ return block_prepare_write(page, from, to, ext3_get_block_delay); ++} ++ + static int do_journal_get_write_access(handle_t *handle, + struct buffer_head *bh) + { +@@ -991,8 +1015,52 @@ static int do_journal_get_write_access(h + return ext3_journal_get_write_access(handle, bh); + } + +-static int ext3_prepare_write(struct file *file, struct page *page, +- unsigned from, unsigned to) ++/* ++ * This function zeroes buffers not mapped to disk. ++ * We do it similarly to the error path in __block_prepare_write() to avoid ++ * keeping garbage in the page cache. ++ * Here we check BH_delay state. We know that if the buffer appears ++ * !buffer_mapped then ++ * - it was !buffer_mapped at the moment of ext3_prepare_write, and ++ * - ext3_get_block failed to map this buffer (e.g., ENOSPC). ++ * If this !mapped buffer is not up to date (it can be up to date if ++ * PageUptodate), then we zero its content. 
++ */ ++static void ext3_clear_delayed_buffers(struct page *page, ++ unsigned from, unsigned to) ++{ ++ struct buffer_head *bh, *head, *next; ++ unsigned block_start, block_end; ++ unsigned blocksize; ++ void *kaddr; ++ ++ head = page_buffers(page); ++ blocksize = head->b_size; ++ for ( bh = head, block_start = 0; ++ bh != head || !block_start; ++ block_start = block_end, bh = next) ++ { ++ next = bh->b_this_page; ++ block_end = block_start + blocksize; ++ if (block_end <= from || block_start >= to) ++ continue; ++ if (!buffer_delay(bh)) ++ continue; ++ J_ASSERT_BH(bh, !buffer_mapped(bh)); ++ clear_buffer_new(bh); ++ clear_buffer_delay(bh); ++ if (!buffer_uptodate(bh)) { ++ kaddr = kmap_atomic(page, KM_USER0); ++ memset(kaddr + block_start, 0, bh->b_size); ++ kunmap_atomic(kaddr, KM_USER0); ++ set_buffer_uptodate(bh); ++ mark_buffer_dirty(bh); ++ } ++ } ++} ++ ++static int ext3_map_write(struct file *file, struct page *page, ++ unsigned from, unsigned to) + { + struct inode *inode = page->mapping->host; + int ret, needed_blocks = ext3_writepage_trans_blocks(inode); +@@ -1009,18 +1077,17 @@ retry: + ret = nobh_prepare_write(page, from, to, ext3_get_block); + else + ret = block_prepare_write(page, from, to, ext3_get_block); +- if (ret) +- goto prepare_write_failed; +- +- if (ext3_should_journal_data(inode)) { ++ if (!ret && ext3_should_journal_data(inode)) { + ret = walk_page_buffers(handle, page_buffers(page), + from, to, NULL, do_journal_get_write_access); + } +-prepare_write_failed: +- if (ret) +- ext3_journal_stop(handle); ++ if (!ret) ++ goto out; ++ ++ ext3_journal_stop(handle); + if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) + goto retry; ++ ext3_clear_delayed_buffers(page, from, to); + out: + return ret; + } +@@ -1055,10 +1122,15 @@ static int commit_write_fn(handle_t *han + static int ext3_ordered_commit_write(struct file *file, struct page *page, + unsigned from, unsigned to) + { +- handle_t *handle = ext3_journal_current_handle(); 
++ handle_t *handle; + struct inode *inode = page->mapping->host; + int ret = 0, ret2; + ++ ret = ext3_map_write(file, page, from, to); ++ if (ret) ++ return ret; ++ handle = ext3_journal_current_handle(); ++ + ret = walk_page_buffers(handle, page_buffers(page), + from, to, NULL, ext3_journal_dirty_data); + +@@ -1084,11 +1156,15 @@ static int ext3_ordered_commit_write(str + static int ext3_writeback_commit_write(struct file *file, struct page *page, + unsigned from, unsigned to) + { +- handle_t *handle = ext3_journal_current_handle(); ++ handle_t *handle; + struct inode *inode = page->mapping->host; + int ret = 0, ret2; + loff_t new_i_size; + ++ ret = ext3_map_write(file, page, from, to); ++ if (ret) ++ return ret; ++ handle = ext3_journal_current_handle(); + new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; + if (new_i_size > EXT3_I(inode)->i_disksize) + EXT3_I(inode)->i_disksize = new_i_size; +@@ -1107,12 +1183,17 @@ static int ext3_writeback_commit_write(s + static int ext3_journalled_commit_write(struct file *file, + struct page *page, unsigned from, unsigned to) + { +- handle_t *handle = ext3_journal_current_handle(); ++ handle_t *handle; + struct inode *inode = page->mapping->host; + int ret = 0, ret2; + int partial = 0; + loff_t pos; + ++ ret = ext3_map_write(file, page, from, to); ++ if (ret) ++ return ret; ++ handle = ext3_journal_current_handle(); ++ + /* + * Here we duplicate the generic_commit_write() functionality + */ +diff -upr linux-2.6.16.orig/fs/ext3/ioctl.c linux-2.6.16-026test015/fs/ext3/ioctl.c +--- linux-2.6.16.orig/fs/ext3/ioctl.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/ext3/ioctl.c 2006-07-04 14:41:37.000000000 +0400 +@@ -69,7 +69,7 @@ int ext3_ioctl (struct inode * inode, st + * the relevant capability. 
+ */ + if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) { +- if (!capable(CAP_SYS_RESOURCE)) ++ if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + } + +diff -upr linux-2.6.16.orig/fs/ext3/resize.c linux-2.6.16-026test015/fs/ext3/resize.c +--- linux-2.6.16.orig/fs/ext3/resize.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/ext3/resize.c 2006-07-04 14:41:36.000000000 +0400 +@@ -974,6 +974,7 @@ int ext3_group_extend(struct super_block + if (o_blocks_count != le32_to_cpu(es->s_blocks_count)) { + ext3_warning(sb, __FUNCTION__, + "multiple resizers run on filesystem!"); ++ unlock_super(sb); + err = -EBUSY; + goto exit_put; + } +diff -upr linux-2.6.16.orig/fs/ext3/super.c linux-2.6.16-026test015/fs/ext3/super.c +--- linux-2.6.16.orig/fs/ext3/super.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/ext3/super.c 2006-07-04 14:41:38.000000000 +0400 +@@ -2661,7 +2661,7 @@ static struct file_system_type ext3_fs_t + .name = "ext3", + .get_sb = ext3_get_sb, + .kill_sb = kill_block_super, +- .fs_flags = FS_REQUIRES_DEV, ++ .fs_flags = FS_REQUIRES_DEV | FS_VIRTUALIZED, + }; + + static int __init init_ext3_fs(void) +diff -upr linux-2.6.16.orig/fs/fcntl.c linux-2.6.16-026test015/fs/fcntl.c +--- linux-2.6.16.orig/fs/fcntl.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/fcntl.c 2006-07-04 14:41:39.000000000 +0400 +@@ -18,6 +18,7 @@ + #include <linux/ptrace.h> + #include <linux/signal.h> + #include <linux/rcupdate.h> ++#include <linux/ve_owner.h> + + #include <asm/poll.h> + #include <asm/siginfo.h> +@@ -190,6 +191,7 @@ out_fput: + fput(file); + goto out; + } ++EXPORT_SYMBOL_GPL(sys_dup2); + + asmlinkage long sys_dup(unsigned int fildes) + { +@@ -254,6 +256,7 @@ static int setfl(int fd, struct file * f + static void f_modown(struct file *filp, unsigned long pid, + uid_t uid, uid_t euid, int force) + { ++ pid = comb_vpid_to_pid(pid); + write_lock_irq(&filp->f_owner.lock); + if (force || !filp->f_owner.pid) { + filp->f_owner.pid 
= pid; +@@ -320,7 +323,7 @@ static long do_fcntl(int fd, unsigned in + * current syscall conventions, the only way + * to fix this will be in libc. + */ +- err = filp->f_owner.pid; ++ err = comb_pid_to_vpid(filp->f_owner.pid); + force_successful_syscall_return(); + break; + case F_SETOWN: +@@ -472,23 +475,29 @@ static void send_sigio_to_task(struct ta + void send_sigio(struct fown_struct *fown, int fd, int band) + { + struct task_struct *p; ++ struct file *f; ++ struct ve_struct *ve; + int pid; + + read_lock(&fown->lock); + pid = fown->pid; + if (!pid) + goto out_unlock_fown; ++ ++ /* hack: fown's are always embedded in struct file */ ++ f = container_of(fown, struct file, f_owner); ++ ve = VE_OWNER_FILP(f); + + read_lock(&tasklist_lock); + if (pid > 0) { +- p = find_task_by_pid(pid); +- if (p) { ++ p = find_task_by_pid_all(pid); ++ if (p && ve_accessible(VE_TASK_INFO(p)->owner_env, ve)) { + send_sigio_to_task(p, fown, fd, band); + } + } else { +- do_each_task_pid(-pid, PIDTYPE_PGID, p) { ++ __do_each_task_pid_ve(-pid, PIDTYPE_PGID, p, ve) { + send_sigio_to_task(p, fown, fd, band); +- } while_each_task_pid(-pid, PIDTYPE_PGID, p); ++ } __while_each_task_pid_ve(-pid, PIDTYPE_PGID, p, ve); + } + read_unlock(&tasklist_lock); + out_unlock_fown: +@@ -505,6 +514,8 @@ static void send_sigurg_to_task(struct t + int send_sigurg(struct fown_struct *fown) + { + struct task_struct *p; ++ struct file *f; ++ struct ve_struct *ve; + int pid, ret = 0; + + read_lock(&fown->lock); +@@ -513,17 +524,19 @@ int send_sigurg(struct fown_struct *fown + goto out_unlock_fown; + + ret = 1; ++ f = container_of(fown, struct file, f_owner); ++ ve = VE_OWNER_FILP(f); + + read_lock(&tasklist_lock); + if (pid > 0) { +- p = find_task_by_pid(pid); +- if (p) { ++ p = find_task_by_pid_all(pid); ++ if (p && ve_accessible(VE_TASK_INFO(p)->owner_env, ve)) { + send_sigurg_to_task(p, fown); + } + } else { +- do_each_task_pid(-pid, PIDTYPE_PGID, p) { ++ __do_each_task_pid_ve(-pid, PIDTYPE_PGID, p, ve) { + 
send_sigurg_to_task(p, fown); +- } while_each_task_pid(-pid, PIDTYPE_PGID, p); ++ } __while_each_task_pid_ve(-pid, PIDTYPE_PGID, p, ve); + } + read_unlock(&tasklist_lock); + out_unlock_fown: +diff -upr linux-2.6.16.orig/fs/file.c linux-2.6.16-026test015/fs/file.c +--- linux-2.6.16.orig/fs/file.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/file.c 2006-07-04 14:41:39.000000000 +0400 +@@ -8,6 +8,7 @@ + + #include <linux/fs.h> + #include <linux/mm.h> ++#include <linux/module.h> + #include <linux/time.h> + #include <linux/slab.h> + #include <linux/vmalloc.h> +@@ -18,6 +19,8 @@ + #include <linux/rcupdate.h> + #include <linux/workqueue.h> + ++#include <ub/ub_mem.h> ++ + struct fdtable_defer { + spinlock_t lock; + struct work_struct wq; +@@ -44,9 +47,9 @@ struct file ** alloc_fd_array(int num) + int size = num * sizeof(struct file *); + + if (size <= PAGE_SIZE) +- new_fds = (struct file **) kmalloc(size, GFP_KERNEL); ++ new_fds = (struct file **) ub_kmalloc(size, GFP_KERNEL); + else +- new_fds = (struct file **) vmalloc(size); ++ new_fds = (struct file **) ub_vmalloc(size); + return new_fds; + } + +@@ -212,9 +215,9 @@ fd_set * alloc_fdset(int num) + int size = num / 8; + + if (size <= PAGE_SIZE) +- new_fdset = (fd_set *) kmalloc(size, GFP_KERNEL); ++ new_fdset = (fd_set *) ub_kmalloc(size, GFP_KERNEL); + else +- new_fdset = (fd_set *) vmalloc(size); ++ new_fdset = (fd_set *) ub_vmalloc(size); + return new_fdset; + } + +@@ -302,7 +305,7 @@ out: + * both fd array and fdset. It is expected to be called with the + * files_lock held. + */ +-static int expand_fdtable(struct files_struct *files, int nr) ++int expand_fdtable(struct files_struct *files, int nr) + __releases(files->file_lock) + __acquires(files->file_lock) + { +@@ -338,6 +341,7 @@ static int expand_fdtable(struct files_s + out: + return error; + } ++EXPORT_SYMBOL_GPL(expand_fdtable); + + /* + * Expand files. 
+diff -upr linux-2.6.16.orig/fs/file_table.c linux-2.6.16-026test015/fs/file_table.c +--- linux-2.6.16.orig/fs/file_table.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/file_table.c 2006-07-04 14:41:38.000000000 +0400 +@@ -9,6 +9,7 @@ + #include <linux/string.h> + #include <linux/slab.h> + #include <linux/file.h> ++#include <linux/ve_owner.h> + #include <linux/init.h> + #include <linux/module.h> + #include <linux/smp_lock.h> +@@ -25,6 +26,8 @@ + + #include <asm/atomic.h> + ++#include <ub/ub_misc.h> ++ + /* sysctl tunables... */ + struct files_stat_struct files_stat = { + .max_files = NR_FILE +@@ -38,6 +41,8 @@ static struct percpu_counter nr_files __ + static inline void file_free_rcu(struct rcu_head *head) + { + struct file *f = container_of(head, struct file, f_u.fu_rcuhead); ++ ub_file_uncharge(f); ++ put_ve(VE_OWNER_FILP(f)); + kmem_cache_free(filp_cachep, f); + } + +@@ -109,6 +114,12 @@ struct file *get_empty_filp(void) + + percpu_counter_inc(&nr_files); + memset(f, 0, sizeof(*f)); ++ ++ if (ub_file_charge(f)) ++ goto fail_ch; ++ ++ SET_VE_OWNER_FILP(f, get_ve(get_exec_env())); ++ + if (security_file_alloc(f)) + goto fail_sec; + +@@ -134,6 +145,10 @@ fail_sec: + file_free(f); + fail: + return NULL; ++ ++fail_ch: ++ kmem_cache_free(filp_cachep, f); ++ return NULL; + } + + EXPORT_SYMBOL(get_empty_filp); +diff -upr linux-2.6.16.orig/fs/filesystems.c linux-2.6.16-026test015/fs/filesystems.c +--- linux-2.6.16.orig/fs/filesystems.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/filesystems.c 2006-07-04 14:41:38.000000000 +0400 +@@ -13,6 +13,7 @@ + #include <linux/init.h> + #include <linux/module.h> + #include <linux/sched.h> /* for 'current' */ ++#include <linux/ve_owner.h> + #include <asm/uaccess.h> + + /* +@@ -22,8 +23,8 @@ + * During the unload module must call unregister_filesystem(). + * We can access the fields of list element if: + * 1) spinlock is held or +- * 2) we hold the reference to the module. 
+- * The latter can be guaranteed by call of try_module_get(); if it ++ * 2) we hold the reference to the element. ++ * The latter can be guaranteed by call of try_filesystem(); if it + * returned 0 we must skip the element, otherwise we got the reference. + * Once the reference is obtained we can drop the spinlock. + */ +@@ -31,23 +32,51 @@ + static struct file_system_type *file_systems; + static DEFINE_RWLOCK(file_systems_lock); + ++int try_get_filesystem(struct file_system_type *fs) ++{ ++ if (try_module_get(fs->owner)) { ++#ifdef CONFIG_VE ++ get_ve(VE_OWNER_FSTYPE(fs)); ++#endif ++ return 1; ++ } ++ return 0; ++} ++ + /* WARNING: This can be used only if we _already_ own a reference */ + void get_filesystem(struct file_system_type *fs) + { ++#ifdef CONFIG_VE ++ get_ve(VE_OWNER_FSTYPE(fs)); ++#endif + __module_get(fs->owner); + } + + void put_filesystem(struct file_system_type *fs) + { + module_put(fs->owner); ++#ifdef CONFIG_VE ++ put_ve(VE_OWNER_FSTYPE(fs)); ++#endif ++} ++ ++static inline int check_ve_fstype(struct file_system_type *p, ++ struct ve_struct *env) ++{ ++ return ((p->fs_flags & FS_VIRTUALIZED) || ++ ve_accessible_strict(VE_OWNER_FSTYPE(p), env)); + } + +-static struct file_system_type **find_filesystem(const char *name) ++static struct file_system_type **find_filesystem(const char *name, ++ struct ve_struct *env) + { + struct file_system_type **p; +- for (p=&file_systems; *p; p=&(*p)->next) ++ for (p=&file_systems; *p; p=&(*p)->next) { ++ if (!check_ve_fstype(*p, env)) ++ continue; + if (strcmp((*p)->name,name) == 0) + break; ++ } + return p; + } + +@@ -74,8 +103,10 @@ int register_filesystem(struct file_syst + if (fs->next) + return -EBUSY; + INIT_LIST_HEAD(&fs->fs_supers); ++ if (VE_OWNER_FSTYPE(fs) == NULL) ++ SET_VE_OWNER_FSTYPE(fs, get_ve0()); + write_lock(&file_systems_lock); +- p = find_filesystem(fs->name); ++ p = find_filesystem(fs->name, VE_OWNER_FSTYPE(fs)); + if (*p) + res = -EBUSY; + else +@@ -132,11 +163,14 @@ static int 
fs_index(const char __user * + + err = -EINVAL; + read_lock(&file_systems_lock); +- for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next, index++) { ++ for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next) { ++ if (!check_ve_fstype(tmp, get_exec_env())) ++ continue; + if (strcmp(tmp->name,name) == 0) { + err = index; + break; + } ++ index++; + } + read_unlock(&file_systems_lock); + putname(name); +@@ -149,9 +183,15 @@ static int fs_name(unsigned int index, c + int len, res; + + read_lock(&file_systems_lock); +- for (tmp = file_systems; tmp; tmp = tmp->next, index--) +- if (index <= 0 && try_module_get(tmp->owner)) +- break; ++ for (tmp = file_systems; tmp; tmp = tmp->next) { ++ if (!check_ve_fstype(tmp, get_exec_env())) ++ continue; ++ if (!index) { ++ if (try_get_filesystem(tmp)) ++ break; ++ } else ++ index--; ++ } + read_unlock(&file_systems_lock); + if (!tmp) + return -EINVAL; +@@ -169,8 +209,9 @@ static int fs_maxindex(void) + int index; + + read_lock(&file_systems_lock); +- for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next, index++) +- ; ++ for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next) ++ if (check_ve_fstype(tmp, get_exec_env())) ++ index++; + read_unlock(&file_systems_lock); + return index; + } +@@ -206,9 +247,10 @@ int get_filesystem_list(char * buf) + read_lock(&file_systems_lock); + tmp = file_systems; + while (tmp && len < PAGE_SIZE - 80) { +- len += sprintf(buf+len, "%s\t%s\n", +- (tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev", +- tmp->name); ++ if (check_ve_fstype(tmp, get_exec_env())) ++ len += sprintf(buf+len, "%s\t%s\n", ++ (tmp->fs_flags & FS_REQUIRES_DEV) ? 
"" : "nodev", ++ tmp->name); + tmp = tmp->next; + } + read_unlock(&file_systems_lock); +@@ -220,14 +262,14 @@ struct file_system_type *get_fs_type(con + struct file_system_type *fs; + + read_lock(&file_systems_lock); +- fs = *(find_filesystem(name)); +- if (fs && !try_module_get(fs->owner)) ++ fs = *(find_filesystem(name, get_exec_env())); ++ if (fs && !try_get_filesystem(fs)) + fs = NULL; + read_unlock(&file_systems_lock); + if (!fs && (request_module("%s", name) == 0)) { + read_lock(&file_systems_lock); +- fs = *(find_filesystem(name)); +- if (fs && !try_module_get(fs->owner)) ++ fs = *(find_filesystem(name, get_exec_env())); ++ if (fs && !try_get_filesystem(fs)) + fs = NULL; + read_unlock(&file_systems_lock); + } +@@ -235,3 +277,5 @@ struct file_system_type *get_fs_type(con + } + + EXPORT_SYMBOL(get_fs_type); ++EXPORT_SYMBOL(get_filesystem); ++EXPORT_SYMBOL(put_filesystem); +diff -upr linux-2.6.16.orig/fs/fuse/dir.c linux-2.6.16-026test015/fs/fuse/dir.c +--- linux-2.6.16.orig/fs/fuse/dir.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/fuse/dir.c 2006-07-04 14:41:37.000000000 +0400 +@@ -708,14 +708,15 @@ static int fuse_access(struct inode *ino + * access request is sent. Execute permission is still checked + * locally based on file mode. + */ +-static int fuse_permission(struct inode *inode, int mask, struct nameidata *nd) ++static int fuse_permission(struct inode *inode, int mask, struct nameidata *nd, ++ struct exec_perm *perm) + { + struct fuse_conn *fc = get_fuse_conn(inode); + + if (!fuse_allow_task(fc, current)) + return -EACCES; + else if (fc->flags & FUSE_DEFAULT_PERMISSIONS) { +- int err = generic_permission(inode, mask, NULL); ++ int err = generic_permission(inode, mask, NULL, perm); + + /* If permission is denied, try to refresh file + attributes. 
This is also needed, because the root +@@ -723,7 +724,7 @@ static int fuse_permission(struct inode + if (err == -EACCES) { + err = fuse_do_getattr(inode); + if (!err) +- err = generic_permission(inode, mask, NULL); ++ err = generic_permission(inode, mask, NULL, perm); + } + + /* Note: the opposite of the above test does not +diff -upr linux-2.6.16.orig/fs/fuse/file.c linux-2.6.16-026test015/fs/fuse/file.c +--- linux-2.6.16.orig/fs/fuse/file.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/fuse/file.c 2006-07-04 14:41:36.000000000 +0400 +@@ -397,8 +397,12 @@ static int fuse_readpages(struct file *f + return -EINTR; + + err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data); +- if (!err) +- fuse_send_readpages(data.req, file, inode); ++ if (!err) { ++ if (data.req->num_pages) ++ fuse_send_readpages(data.req, file, inode); ++ else ++ fuse_put_request(fc, data.req); ++ } + return err; + } + +diff -upr linux-2.6.16.orig/fs/hfs/inode.c linux-2.6.16-026test015/fs/hfs/inode.c +--- linux-2.6.16.orig/fs/hfs/inode.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/hfs/inode.c 2006-07-04 14:41:37.000000000 +0400 +@@ -520,11 +520,11 @@ void hfs_clear_inode(struct inode *inode + } + + static int hfs_permission(struct inode *inode, int mask, +- struct nameidata *nd) ++ struct nameidata *nd, struct exec_perm *perm) + { + if (S_ISREG(inode->i_mode) && mask & MAY_EXEC) + return 0; +- return generic_permission(inode, mask, NULL); ++ return generic_permission(inode, mask, NULL, perm); + } + + static int hfs_file_open(struct inode *inode, struct file *file) +diff -upr linux-2.6.16.orig/fs/hfsplus/inode.c linux-2.6.16-026test015/fs/hfsplus/inode.c +--- linux-2.6.16.orig/fs/hfsplus/inode.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/hfsplus/inode.c 2006-07-04 14:41:37.000000000 +0400 +@@ -237,7 +237,8 @@ static void hfsplus_set_perms(struct ino + perms->dev = cpu_to_be32(HFSPLUS_I(inode).dev); + } + +-static int 
hfsplus_permission(struct inode *inode, int mask, struct nameidata *nd) ++static int hfsplus_permission(struct inode *inode, int mask, struct nameidata *nd, ++ struct exec_perm *perm) + { + /* MAY_EXEC is also used for lookup, if no x bit is set allow lookup, + * open_exec has the same test, so it's still not executable, if a x bit +@@ -245,7 +246,7 @@ static int hfsplus_permission(struct ino + */ + if (S_ISREG(inode->i_mode) && mask & MAY_EXEC && !(inode->i_mode & 0111)) + return 0; +- return generic_permission(inode, mask, NULL); ++ return generic_permission(inode, mask, NULL, perm); + } + + +diff -upr linux-2.6.16.orig/fs/hostfs/hostfs_kern.c linux-2.6.16-026test015/fs/hostfs/hostfs_kern.c +--- linux-2.6.16.orig/fs/hostfs/hostfs_kern.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/hostfs/hostfs_kern.c 2006-07-04 14:41:37.000000000 +0400 +@@ -796,7 +796,8 @@ int hostfs_rename(struct inode *from_ino + return(err); + } + +-int hostfs_permission(struct inode *ino, int desired, struct nameidata *nd) ++int hostfs_permission(struct inode *ino, int desired, struct nameidata *nd, ++ struct exec_perm *perm) + { + char *name; + int r = 0, w = 0, x = 0, err; +@@ -814,7 +815,7 @@ int hostfs_permission(struct inode *ino, + err = access_file(name, r, w, x); + kfree(name); + if(!err) +- err = generic_permission(ino, desired, NULL); ++ err = generic_permission(ino, desired, NULL, perm); + return err; + } + +diff -upr linux-2.6.16.orig/fs/hpfs/namei.c linux-2.6.16-026test015/fs/hpfs/namei.c +--- linux-2.6.16.orig/fs/hpfs/namei.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/hpfs/namei.c 2006-07-04 14:41:37.000000000 +0400 +@@ -415,7 +415,7 @@ again: + d_drop(dentry); + spin_lock(&dentry->d_lock); + if (atomic_read(&dentry->d_count) > 1 || +- permission(inode, MAY_WRITE, NULL) || ++ permission(inode, MAY_WRITE, NULL, NULL) || + !S_ISREG(inode->i_mode) || + get_write_access(inode)) { + spin_unlock(&dentry->d_lock); +diff -upr 
linux-2.6.16.orig/fs/hugetlbfs/inode.c linux-2.6.16-026test015/fs/hugetlbfs/inode.c +--- linux-2.6.16.orig/fs/hugetlbfs/inode.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/hugetlbfs/inode.c 2006-07-04 14:41:39.000000000 +0400 +@@ -800,7 +800,7 @@ struct file *hugetlb_zero_setup(size_t s + struct inode *inode; + struct dentry *dentry, *root; + struct qstr quick_string; +- char buf[16]; ++ char buf[64]; + + if (!can_do_hugetlb_shm()) + return ERR_PTR(-EPERM); +@@ -812,7 +812,8 @@ struct file *hugetlb_zero_setup(size_t s + return ERR_PTR(-ENOMEM); + + root = hugetlbfs_vfsmount->mnt_root; +- snprintf(buf, 16, "%lu", hugetlbfs_counter()); ++ snprintf(buf, sizeof(buf), "VE%d-%lu", ++ VEID(get_exec_env()), hugetlbfs_counter()); + quick_string.name = buf; + quick_string.len = strlen(quick_string.name); + quick_string.hash = 0; +diff -upr linux-2.6.16.orig/fs/inode.c linux-2.6.16-026test015/fs/inode.c +--- linux-2.6.16.orig/fs/inode.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/inode.c 2006-07-04 14:41:39.000000000 +0400 +@@ -9,6 +9,7 @@ + #include <linux/mm.h> + #include <linux/dcache.h> + #include <linux/init.h> ++#include <linux/kernel_stat.h> + #include <linux/quotaops.h> + #include <linux/slab.h> + #include <linux/writeback.h> +@@ -98,13 +99,15 @@ DECLARE_MUTEX(iprune_sem); + */ + struct inodes_stat_t inodes_stat; + +-static kmem_cache_t * inode_cachep; ++kmem_cache_t *inode_cachep; ++ ++static struct address_space_operations vfs_empty_aops; ++struct inode_operations vfs_empty_iops; ++static struct file_operations vfs_empty_fops; ++EXPORT_SYMBOL(vfs_empty_iops); + + static struct inode *alloc_inode(struct super_block *sb) + { +- static struct address_space_operations empty_aops; +- static struct inode_operations empty_iops; +- static struct file_operations empty_fops; + struct inode *inode; + + if (sb->s_op->alloc_inode) +@@ -119,8 +122,8 @@ static struct inode *alloc_inode(struct + inode->i_blkbits = 
sb->s_blocksize_bits; + inode->i_flags = 0; + atomic_set(&inode->i_count, 1); +- inode->i_op = &empty_iops; +- inode->i_fop = &empty_fops; ++ inode->i_op = &vfs_empty_iops; ++ inode->i_fop = &vfs_empty_fops; + inode->i_nlink = 1; + atomic_set(&inode->i_writecount, 0); + inode->i_size = 0; +@@ -144,7 +147,7 @@ static struct inode *alloc_inode(struct + return NULL; + } + +- mapping->a_ops = &empty_aops; ++ mapping->a_ops = &vfs_empty_aops; + mapping->host = inode; + mapping->flags = 0; + mapping_set_gfp_mask(mapping, GFP_HIGHUSER); +@@ -303,13 +306,57 @@ static void dispose_list(struct list_hea + spin_unlock(&inode_lock); + } + ++static void show_header(struct inode *inode) ++{ ++ struct super_block *sb = inode->i_sb; ++ ++ printk("VFS: Busy inodes after unmount. " ++ "sb = %p, fs type = %s, sb count = %d, " ++ "sb->s_root = %s\n", sb, ++ (sb->s_type != NULL) ? sb->s_type->name : "", ++ sb->s_count, ++ (sb->s_root != NULL) ? ++ (char *)sb->s_root->d_name.name : ""); ++} ++ ++static void show_inode(struct list_head *tmp, struct inode *inode) ++{ ++ struct dentry *d; ++ int i; ++ ++ printk("inode = %p, inode->i_count = %d, " ++ "inode->i_nlink = %d, " ++ "inode->i_mode = %d, " ++ "inode->i_state = %ld, " ++ "inode->i_flags = %d, " ++ "inode->i_devices.next = %p, " ++ "inode->i_devices.prev = %p, " ++ "inode->i_ino = %ld\n", ++ tmp, ++ atomic_read(&inode->i_count), ++ inode->i_nlink, ++ inode->i_mode, ++ inode->i_state, ++ inode->i_flags, ++ inode->i_devices.next, ++ inode->i_devices.prev, ++ inode->i_ino); ++ printk("inode dump: "); ++ for (i = 0; i < sizeof(*tmp); i++) ++ printk("%2.2x ", *((u_char *)tmp + i)); ++ printk("\n"); ++ list_for_each_entry(d, &inode->i_dentry, d_alias) ++ printk(" d_alias %s\n", ++ d->d_name.name); ++} ++ + /* + * Invalidate all inodes for a device. 
+ */ +-static int invalidate_list(struct list_head *head, struct list_head *dispose) ++static int invalidate_list(struct list_head *head, struct list_head *dispose, int check) + { + struct list_head *next; +- int busy = 0, count = 0; ++ int busy = 0, count = 0, once = 1; + + next = head->next; + for (;;) { +@@ -336,6 +383,14 @@ static int invalidate_list(struct list_h + continue; + } + busy = 1; ++ ++ if (check) { ++ if (once) { ++ once = 0; ++ show_header(inode); ++ } ++ show_inode(tmp, inode); ++ } + } + /* only unused inodes may be cached with i_count zero */ + inodes_stat.nr_unused -= count; +@@ -350,7 +405,7 @@ static int invalidate_list(struct list_h + * fails because there are busy inodes then a non zero value is returned. + * If the discard is successful all the inodes have been discarded. + */ +-int invalidate_inodes(struct super_block * sb) ++int invalidate_inodes(struct super_block * sb, int check) + { + int busy; + LIST_HEAD(throw_away); +@@ -358,7 +413,7 @@ int invalidate_inodes(struct super_block + down(&iprune_sem); + spin_lock(&inode_lock); + inotify_unmount_inodes(&sb->s_inodes); +- busy = invalidate_list(&sb->s_inodes, &throw_away); ++ busy = invalidate_list(&sb->s_inodes, &throw_away, check); + spin_unlock(&inode_lock); + + dispose_list(&throw_away); +@@ -382,7 +437,7 @@ int __invalidate_device(struct block_dev + * hold). + */ + shrink_dcache_sb(sb); +- res = invalidate_inodes(sb); ++ res = invalidate_inodes(sb, 0); + drop_super(sb); + } + invalidate_bdev(bdev, 0); +@@ -478,6 +533,7 @@ static void prune_icache(int nr_to_scan) + */ + static int shrink_icache_memory(int nr, gfp_t gfp_mask) + { ++ KSTAT_PERF_ENTER(shrink_icache) + if (nr) { + /* + * Nasty deadlock avoidance. 
We may hold various FS locks, +@@ -488,6 +544,7 @@ static int shrink_icache_memory(int nr, + return -1; + prune_icache(nr); + } ++ KSTAT_PERF_LEAVE(shrink_icache) + return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; + } + +@@ -737,7 +794,7 @@ EXPORT_SYMBOL(iunique); + struct inode *igrab(struct inode *inode) + { + spin_lock(&inode_lock); +- if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) ++ if (inode && !(inode->i_state & (I_FREEING|I_WILL_FREE))) + __iget(inode); + else + /* +diff -upr linux-2.6.16.orig/fs/inotify.c linux-2.6.16-026test015/fs/inotify.c +--- linux-2.6.16.orig/fs/inotify.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/inotify.c 2006-07-04 14:41:37.000000000 +0400 +@@ -374,7 +374,7 @@ static int find_inode(const char __user + if (error) + return error; + /* you can only watch an inode if you have read permissions on it */ +- error = vfs_permission(nd, MAY_READ); ++ error = vfs_permission(nd, MAY_READ, NULL); + if (error) + path_release(nd); + return error; +diff -upr linux-2.6.16.orig/fs/ioprio.c linux-2.6.16-026test015/fs/ioprio.c +--- linux-2.6.16.orig/fs/ioprio.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/ioprio.c 2006-07-04 14:41:38.000000000 +0400 +@@ -53,6 +53,9 @@ asmlinkage long sys_ioprio_set(int which + struct user_struct *user; + int ret; + ++ if (!ve_is_super(get_exec_env())) ++ return -EPERM; ++ + switch (class) { + case IOPRIO_CLASS_RT: + if (!capable(CAP_SYS_ADMIN)) +@@ -78,18 +81,18 @@ asmlinkage long sys_ioprio_set(int which + if (!who) + p = current; + else +- p = find_task_by_pid(who); ++ p = find_task_by_pid_all(who); + if (p) + ret = set_task_ioprio(p, ioprio); + break; + case IOPRIO_WHO_PGRP: + if (!who) + who = process_group(current); +- do_each_task_pid(who, PIDTYPE_PGID, p) { ++ do_each_task_pid_all(who, PIDTYPE_PGID, p) { + ret = set_task_ioprio(p, ioprio); + if (ret) + break; +- } while_each_task_pid(who, PIDTYPE_PGID, p); ++ } while_each_task_pid_all(who, 
PIDTYPE_PGID, p); + break; + case IOPRIO_WHO_USER: + if (!who) +@@ -100,13 +103,13 @@ asmlinkage long sys_ioprio_set(int which + if (!user) + break; + +- do_each_thread(g, p) { ++ do_each_thread_all(g, p) { + if (p->uid != who) + continue; + ret = set_task_ioprio(p, ioprio); + if (ret) + break; +- } while_each_thread(g, p); ++ } while_each_thread_all(g, p); + + if (who) + free_uid(user); +@@ -131,19 +134,19 @@ asmlinkage long sys_ioprio_get(int which + if (!who) + p = current; + else +- p = find_task_by_pid(who); ++ p = find_task_by_pid_ve(who); + if (p) + ret = p->ioprio; + break; + case IOPRIO_WHO_PGRP: + if (!who) + who = process_group(current); +- do_each_task_pid(who, PIDTYPE_PGID, p) { ++ do_each_task_pid_ve(who, PIDTYPE_PGID, p) { + if (ret == -ESRCH) + ret = p->ioprio; + else + ret = ioprio_best(ret, p->ioprio); +- } while_each_task_pid(who, PIDTYPE_PGID, p); ++ } while_each_task_pid_ve(who, PIDTYPE_PGID, p); + break; + case IOPRIO_WHO_USER: + if (!who) +@@ -154,14 +157,14 @@ asmlinkage long sys_ioprio_get(int which + if (!user) + break; + +- do_each_thread(g, p) { ++ do_each_thread_ve(g, p) { + if (p->uid != user->uid) + continue; + if (ret == -ESRCH) + ret = p->ioprio; + else + ret = ioprio_best(ret, p->ioprio); +- } while_each_thread(g, p); ++ } while_each_thread_ve(g, p); + + if (who) + free_uid(user); +diff -upr linux-2.6.16.orig/fs/jbd/journal.c linux-2.6.16-026test015/fs/jbd/journal.c +--- linux-2.6.16.orig/fs/jbd/journal.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/jbd/journal.c 2006-07-04 14:41:37.000000000 +0400 +@@ -210,10 +210,16 @@ end_loop: + return 0; + } + +-static void journal_start_thread(journal_t *journal) ++static int journal_start_thread(journal_t *journal) + { +- kernel_thread(kjournald, journal, CLONE_VM|CLONE_FS|CLONE_FILES); ++ int err; ++ ++ err = kernel_thread(kjournald, journal, CLONE_VM|CLONE_FS|CLONE_FILES); ++ if (err < 0) ++ return err; ++ + wait_event(journal->j_wait_done_commit, journal->j_task != 
0); ++ return 0; + } + + static void journal_kill_thread(journal_t *journal) +@@ -839,8 +845,7 @@ static int journal_reset(journal_t *jour + + /* Add the dynamic fields and write it to disk. */ + journal_update_superblock(journal, 1); +- journal_start_thread(journal); +- return 0; ++ return journal_start_thread(journal); + } + + /** +diff -upr linux-2.6.16.orig/fs/jbd/transaction.c linux-2.6.16-026test015/fs/jbd/transaction.c +--- linux-2.6.16.orig/fs/jbd/transaction.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/jbd/transaction.c 2006-07-04 14:41:37.000000000 +0400 +@@ -1868,6 +1868,7 @@ zap_buffer_unlocked: + clear_buffer_mapped(bh); + clear_buffer_req(bh); + clear_buffer_new(bh); ++ clear_buffer_delay(bh); + bh->b_bdev = NULL; + return may_free; + } +diff -upr linux-2.6.16.orig/fs/jfs/acl.c linux-2.6.16-026test015/fs/jfs/acl.c +--- linux-2.6.16.orig/fs/jfs/acl.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/jfs/acl.c 2006-07-04 14:41:37.000000000 +0400 +@@ -140,9 +140,10 @@ static int jfs_check_acl(struct inode *i + return -EAGAIN; + } + +-int jfs_permission(struct inode *inode, int mask, struct nameidata *nd) ++int jfs_permission(struct inode *inode, int mask, struct nameidata *nd, ++ struct exec_perm *perm) + { +- return generic_permission(inode, mask, jfs_check_acl); ++ return generic_permission(inode, mask, jfs_check_acl, perm); + } + + int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir) +diff -upr linux-2.6.16.orig/fs/jfs/jfs_acl.h linux-2.6.16-026test015/fs/jfs/jfs_acl.h +--- linux-2.6.16.orig/fs/jfs/jfs_acl.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/jfs/jfs_acl.h 2006-07-04 14:41:37.000000000 +0400 +@@ -20,7 +20,7 @@ + + #ifdef CONFIG_JFS_POSIX_ACL + +-int jfs_permission(struct inode *, int, struct nameidata *); ++int jfs_permission(struct inode *, int, struct nameidata *, struct exec_perm *); + int jfs_init_acl(tid_t, struct inode *, struct inode *); + int 
jfs_setattr(struct dentry *, struct iattr *); + +diff -upr linux-2.6.16.orig/fs/jfs/jfs_metapage.c linux-2.6.16-026test015/fs/jfs/jfs_metapage.c +--- linux-2.6.16.orig/fs/jfs/jfs_metapage.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/jfs/jfs_metapage.c 2006-07-04 14:41:36.000000000 +0400 +@@ -543,7 +543,7 @@ add_failed: + static int metapage_releasepage(struct page *page, gfp_t gfp_mask) + { + struct metapage *mp; +- int busy = 0; ++ int ret = 1; + unsigned int offset; + + for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) { +@@ -553,30 +553,20 @@ static int metapage_releasepage(struct p + continue; + + jfs_info("metapage_releasepage: mp = 0x%p", mp); +- if (mp->count || mp->nohomeok) { ++ if (mp->count || mp->nohomeok || ++ test_bit(META_dirty, &mp->flag)) { + jfs_info("count = %ld, nohomeok = %d", mp->count, + mp->nohomeok); +- busy = 1; ++ ret = 0; + continue; + } +- wait_on_page_writeback(page); +- //WARN_ON(test_bit(META_dirty, &mp->flag)); +- if (test_bit(META_dirty, &mp->flag)) { +- dump_mem("dirty mp in metapage_releasepage", mp, +- sizeof(struct metapage)); +- dump_mem("page", page, sizeof(struct page)); +- dump_stack(); +- } + if (mp->lsn) + remove_from_logsync(mp); + remove_metapage(page, mp); + INCREMENT(mpStat.pagefree); + free_metapage(mp); + } +- if (busy) +- return -1; +- +- return 0; ++ return ret; + } + + static int metapage_invalidatepage(struct page *page, unsigned long offset) +diff -upr linux-2.6.16.orig/fs/lockd/clntproc.c linux-2.6.16-026test015/fs/lockd/clntproc.c +--- linux-2.6.16.orig/fs/lockd/clntproc.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/lockd/clntproc.c 2006-07-04 14:41:38.000000000 +0400 +@@ -130,10 +130,10 @@ static void nlmclnt_setlockargs(struct n + nlmclnt_next_cookie(&argp->cookie); + argp->state = nsm_local_state; + memcpy(&lock->fh, NFS_FH(fl->fl_file->f_dentry->d_inode), sizeof(struct nfs_fh)); +- lock->caller = system_utsname.nodename; ++ lock->caller = 
ve_utsname.nodename; + lock->oh.data = req->a_owner; + lock->oh.len = sprintf(req->a_owner, "%d@%s", +- current->pid, system_utsname.nodename); ++ current->pid, ve_utsname.nodename); + locks_copy_lock(&lock->fl, fl); + } + +@@ -154,7 +154,7 @@ nlmclnt_setgrantargs(struct nlm_rqst *ca + { + locks_copy_lock(&call->a_args.lock.fl, &lock->fl); + memcpy(&call->a_args.lock.fh, &lock->fh, sizeof(call->a_args.lock.fh)); +- call->a_args.lock.caller = system_utsname.nodename; ++ call->a_args.lock.caller = ve_utsname.nodename; + call->a_args.lock.oh.len = lock->oh.len; + + /* set default data area */ +diff -upr linux-2.6.16.orig/fs/lockd/mon.c linux-2.6.16-026test015/fs/lockd/mon.c +--- linux-2.6.16.orig/fs/lockd/mon.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/lockd/mon.c 2006-07-04 14:41:38.000000000 +0400 +@@ -147,7 +147,7 @@ xdr_encode_common(struct rpc_rqst *rqstp + */ + sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(argp->addr)); + if (!(p = xdr_encode_string(p, buffer)) +- || !(p = xdr_encode_string(p, system_utsname.nodename))) ++ || !(p = xdr_encode_string(p, ve_utsname.nodename))) + return ERR_PTR(-EIO); + *p++ = htonl(argp->prog); + *p++ = htonl(argp->vers); +diff -upr linux-2.6.16.orig/fs/locks.c linux-2.6.16-026test015/fs/locks.c +--- linux-2.6.16.orig/fs/locks.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/locks.c 2006-07-04 14:41:39.000000000 +0400 +@@ -129,6 +129,8 @@ + #include <asm/semaphore.h> + #include <asm/uaccess.h> + ++#include <ub/ub_misc.h> ++ + #define IS_POSIX(fl) (fl->fl_flags & FL_POSIX) + #define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK) + #define IS_LEASE(fl) (fl->fl_flags & FL_LEASE) +@@ -148,11 +150,28 @@ static LIST_HEAD(blocked_list); + static kmem_cache_t *filelock_cache; + + /* Allocate an empty lock structure. 
*/ +-static struct file_lock *locks_alloc_lock(void) ++static struct file_lock *locks_alloc_lock(int charge) + { +- return kmem_cache_alloc(filelock_cache, SLAB_KERNEL); ++ struct file_lock *fl; ++ ++ fl = kmem_cache_alloc(filelock_cache, SLAB_KERNEL); ++#ifdef CONFIG_USER_RESOURCE ++ if (fl == NULL) ++ goto out; ++ fl->fl_charged = 0; ++ if (!charge) ++ goto out; ++ if (!ub_flock_charge(fl, 1)) ++ goto out; ++ ++ kmem_cache_free(filelock_cache, fl); ++ fl = NULL; ++out: ++#endif ++ return fl; + } + ++ + /* Free a lock which is not in use. */ + static void locks_free_lock(struct file_lock *fl) + { +@@ -181,6 +200,7 @@ static void locks_free_lock(struct file_ + fl->fl_lmops = NULL; + } + ++ ub_flock_uncharge(fl); + kmem_cache_free(filelock_cache, fl); + } + +@@ -263,7 +283,7 @@ static int flock_make_lock(struct file * + if (type < 0) + return type; + +- fl = locks_alloc_lock(); ++ fl = locks_alloc_lock(type != F_UNLCK); + if (fl == NULL) + return -ENOMEM; + +@@ -432,15 +452,14 @@ static struct lock_manager_operations le + */ + static int lease_init(struct file *filp, int type, struct file_lock *fl) + { ++ if (assign_type(fl, type) != 0) ++ return -EINVAL; ++ + fl->fl_owner = current->files; + fl->fl_pid = current->tgid; + + fl->fl_file = filp; + fl->fl_flags = FL_LEASE; +- if (assign_type(fl, type) != 0) { +- locks_free_lock(fl); +- return -EINVAL; +- } + fl->fl_start = 0; + fl->fl_end = OFFSET_MAX; + fl->fl_ops = NULL; +@@ -451,17 +470,20 @@ static int lease_init(struct file *filp, + /* Allocate a file_lock initialised to this type of lease */ + static int lease_alloc(struct file *filp, int type, struct file_lock **flp) + { +- struct file_lock *fl = locks_alloc_lock(); +- int error; ++ struct file_lock *fl = locks_alloc_lock(1); ++ int error = -ENOMEM; + + if (fl == NULL) +- return -ENOMEM; ++ goto out; + + error = lease_init(filp, type, fl); +- if (error) +- return error; ++ if (error) { ++ locks_free_lock(fl); ++ fl = NULL; ++ } ++out: + *flp = fl; +- return 0; 
++ return error; + } + + /* Check if two locks overlap each other. +@@ -712,8 +734,9 @@ EXPORT_SYMBOL(posix_locks_deadlock); + * at the head of the list, but that's secret knowledge known only to + * flock_lock_file and posix_lock_file. + */ +-static int flock_lock_file(struct file *filp, struct file_lock *new_fl) ++static int flock_lock_file(struct file *filp, struct file_lock *request) + { ++ struct file_lock *new_fl = NULL; + struct file_lock **before; + struct inode * inode = filp->f_dentry->d_inode; + int error = 0; +@@ -728,44 +751,60 @@ static int flock_lock_file(struct file * + continue; + if (filp != fl->fl_file) + continue; +- if (new_fl->fl_type == fl->fl_type) ++ if (request->fl_type == fl->fl_type) + goto out; + found = 1; + locks_delete_lock(before); + break; + } +- unlock_kernel(); + +- if (new_fl->fl_type == F_UNLCK) +- return 0; ++ if (request->fl_type == F_UNLCK) ++ goto out; + + /* ++ * Nont F_UNLCK request must be already charged in ++ * flock_make_lock(). ++ * ++ * actually new_fl must be charged not the request, ++ * but we try to fail earlier ++ */ ++ error = -ENOMEM; ++ new_fl = locks_alloc_lock(0); ++ if (new_fl == NULL) ++ goto out; ++ /* + * If a higher-priority process was blocked on the old file lock, + * give it the opportunity to lock the file. 
+ */ + if (found) + cond_resched(); + +- lock_kernel(); + for_each_lock(inode, before) { + struct file_lock *fl = *before; + if (IS_POSIX(fl)) + break; + if (IS_LEASE(fl)) + continue; +- if (!flock_locks_conflict(new_fl, fl)) ++ if (!flock_locks_conflict(request, fl)) + continue; + error = -EAGAIN; +- if (new_fl->fl_flags & FL_SLEEP) { +- locks_insert_block(fl, new_fl); +- } ++ if (request->fl_flags & FL_SLEEP) ++ locks_insert_block(fl, request); + goto out; + } ++ ++ set_flock_charged(new_fl); ++ unset_flock_charged(request); ++ ++ locks_copy_lock(new_fl, request); + locks_insert_lock(&inode->i_flock, new_fl); ++ new_fl = NULL; + error = 0; + + out: + unlock_kernel(); ++ if (new_fl) ++ locks_free_lock(new_fl); + return error; + } + +@@ -784,8 +823,11 @@ static int __posix_lock_file(struct inod + * We may need two file_lock structures for this operation, + * so we get them in advance to avoid races. + */ +- new_fl = locks_alloc_lock(); +- new_fl2 = locks_alloc_lock(); ++ if (request->fl_type != F_UNLCK) ++ new_fl = locks_alloc_lock(1); ++ else ++ new_fl = NULL; ++ new_fl2 = locks_alloc_lock(0); + + lock_kernel(); + if (request->fl_type != F_UNLCK) { +@@ -813,7 +855,7 @@ static int __posix_lock_file(struct inod + goto out; + + error = -ENOLCK; /* "no luck" */ +- if (!(new_fl && new_fl2)) ++ if (!((request->fl_type == F_UNLCK || new_fl) && new_fl2)) + goto out; + + /* +@@ -919,19 +961,30 @@ static int __posix_lock_file(struct inod + if (!added) { + if (request->fl_type == F_UNLCK) + goto out; ++ error = -ENOLCK; ++ if (right && (left == right) && ub_flock_charge(new_fl, 1)) ++ goto out; + locks_copy_lock(new_fl, request); + locks_insert_lock(before, new_fl); + new_fl = NULL; ++ error = 0; + } + if (right) { + if (left == right) { + /* The new lock breaks the old one in two pieces, + * so we have to use the second new lock. 
+ */ ++ error = -ENOLCK; ++ if (added && ub_flock_charge(new_fl2, ++ request->fl_type != F_UNLCK)) ++ goto out; ++ /* FIXME move all fl_charged manipulations in ub code */ ++ set_flock_charged(new_fl2); + left = new_fl2; + new_fl2 = NULL; + locks_copy_lock(left, right); + locks_insert_lock(before, left); ++ error = 0; + } + right->fl_start = request->fl_end + 1; + locks_wake_up_blocks(right); +@@ -1337,6 +1390,7 @@ static int __setlease(struct file *filp, + goto out; + + if (my_before != NULL) { ++ *flp = *my_before; + error = lease->fl_lmops->fl_change(my_before, arg); + goto out; + } +@@ -1529,15 +1583,14 @@ asmlinkage long sys_flock(unsigned int f + error = flock_lock_file_wait(filp, lock); + + out_free: +- if (list_empty(&lock->fl_link)) { +- locks_free_lock(lock); +- } ++ locks_free_lock(lock); + + out_putf: + fput(filp); + out: + return error; + } ++EXPORT_SYMBOL_GPL(sys_flock); + + /* Report the first existing lock that would conflict with l. + * This implements the F_GETLK command of fcntl(). +@@ -1573,7 +1626,7 @@ int fcntl_getlk(struct file *filp, struc + + flock.l_type = F_UNLCK; + if (fl != NULL) { +- flock.l_pid = fl->fl_pid; ++ flock.l_pid = pid_type_to_vpid(PIDTYPE_TGID, fl->fl_pid); + #if BITS_PER_LONG == 32 + /* + * Make sure we can represent the posix lock via +@@ -1605,7 +1658,7 @@ out: + int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, + struct flock __user *l) + { +- struct file_lock *file_lock = locks_alloc_lock(); ++ struct file_lock *file_lock = locks_alloc_lock(0); + struct flock flock; + struct inode *inode; + int error; +@@ -1727,7 +1780,7 @@ int fcntl_getlk64(struct file *filp, str + + flock.l_type = F_UNLCK; + if (fl != NULL) { +- flock.l_pid = fl->fl_pid; ++ flock.l_pid = pid_type_to_vpid(PIDTYPE_TGID, fl->fl_pid); + flock.l_start = fl->fl_start; + flock.l_len = fl->fl_end == OFFSET_MAX ? 
0 : + fl->fl_end - fl->fl_start + 1; +@@ -1748,7 +1801,7 @@ out: + int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, + struct flock64 __user *l) + { +- struct file_lock *file_lock = locks_alloc_lock(); ++ struct file_lock *file_lock = locks_alloc_lock(0); + struct flock64 flock; + struct inode *inode; + int error; +@@ -1976,7 +2029,9 @@ EXPORT_SYMBOL(posix_unblock_lock); + static void lock_get_status(char* out, struct file_lock *fl, int id, char *pfx) + { + struct inode *inode = NULL; ++ unsigned int fl_pid; + ++ fl_pid = pid_type_to_vpid(PIDTYPE_TGID, fl->fl_pid); + if (fl->fl_file != NULL) + inode = fl->fl_file->f_dentry->d_inode; + +@@ -2018,16 +2073,16 @@ static void lock_get_status(char* out, s + } + if (inode) { + #ifdef WE_CAN_BREAK_LSLK_NOW +- out += sprintf(out, "%d %s:%ld ", fl->fl_pid, ++ out += sprintf(out, "%d %s:%ld ", fl_pid, + inode->i_sb->s_id, inode->i_ino); + #else + /* userspace relies on this representation of dev_t ;-( */ +- out += sprintf(out, "%d %02x:%02x:%ld ", fl->fl_pid, ++ out += sprintf(out, "%d %02x:%02x:%ld ", fl_pid, + MAJOR(inode->i_sb->s_dev), + MINOR(inode->i_sb->s_dev), inode->i_ino); + #endif + } else { +- out += sprintf(out, "%d <none>:0 ", fl->fl_pid); ++ out += sprintf(out, "%d <none>:0 ", fl_pid); + } + if (IS_POSIX(fl)) { + if (fl->fl_end == OFFSET_MAX) +@@ -2076,11 +2131,17 @@ int get_locks_status(char *buffer, char + char *q = buffer; + off_t pos = 0; + int i = 0; ++ struct ve_struct *env; + + lock_kernel(); ++ env = get_exec_env(); + list_for_each(tmp, &file_lock_list) { + struct list_head *btmp; + struct file_lock *fl = list_entry(tmp, struct file_lock, fl_link); ++ ++ if (!ve_accessible(VE_OWNER_FILP(fl->fl_file), env)) ++ continue; ++ + lock_get_status(q, fl, ++i, ""); + move_lock_status(&q, &pos, offset); + +@@ -2212,7 +2273,12 @@ void steal_locks(fl_owner_t from) + + lock_kernel(); + j = 0; +- rcu_read_lock(); ++ ++ /* ++ * We are not taking a ref to the file structures, so ++ * we need to 
acquire ->file_lock. ++ */ ++ spin_lock(&files->file_lock); + fdt = files_fdtable(files); + for (;;) { + unsigned long set; +@@ -2230,7 +2296,7 @@ void steal_locks(fl_owner_t from) + set >>= 1; + } + } +- rcu_read_unlock(); ++ spin_unlock(&files->file_lock); + unlock_kernel(); + } + EXPORT_SYMBOL(steal_locks); +@@ -2238,7 +2304,7 @@ EXPORT_SYMBOL(steal_locks); + static int __init filelock_init(void) + { + filelock_cache = kmem_cache_create("file_lock_cache", +- sizeof(struct file_lock), 0, SLAB_PANIC, ++ sizeof(struct file_lock), 0, SLAB_PANIC | SLAB_UBC, + init_once, NULL); + return 0; + } +diff -upr linux-2.6.16.orig/fs/namei.c linux-2.6.16-026test015/fs/namei.c +--- linux-2.6.16.orig/fs/namei.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/namei.c 2006-07-04 14:41:39.000000000 +0400 +@@ -179,7 +179,7 @@ EXPORT_SYMBOL(putname); + * for filesystem access without changing the "normal" uids which + * are used for other things.. + */ +-int generic_permission(struct inode *inode, int mask, ++static int __generic_permission(struct inode *inode, int mask, + int (*check_acl)(struct inode *inode, int mask)) + { + umode_t mode = inode->i_mode; +@@ -225,7 +225,26 @@ int generic_permission(struct inode *ino + return -EACCES; + } + +-int permission(struct inode *inode, int mask, struct nameidata *nd) ++int generic_permission(struct inode *inode, int mask, ++ int (*check_acl)(struct inode *inode, int mask), ++ struct exec_perm *perm) ++{ ++ int ret; ++ ++ if (perm == NULL) ++ return __generic_permission(inode, mask, check_acl); ++ ++ mutex_lock(&inode->i_mutex); ++ ret = __generic_permission(inode, mask, check_acl); ++ if (!ret) ++ set_exec_perm(perm, inode); ++ mutex_unlock(&inode->i_mutex); ++ return ret; ++} ++ ++ ++int permission(struct inode *inode, int mask, struct nameidata *nd, ++ struct exec_perm *perm) + { + int retval, submask; + +@@ -250,9 +269,9 @@ int permission(struct inode *inode, int + /* Ordinary permission routines do not understand 
MAY_APPEND. */ + submask = mask & ~MAY_APPEND; + if (inode->i_op && inode->i_op->permission) +- retval = inode->i_op->permission(inode, submask, nd); ++ retval = inode->i_op->permission(inode, submask, nd, perm); + else +- retval = generic_permission(inode, submask, NULL); ++ retval = generic_permission(inode, submask, NULL, perm); + if (retval) + return retval; + +@@ -269,9 +288,9 @@ int permission(struct inode *inode, int + * for filesystem access without changing the "normal" uids which + * are used for other things. + */ +-int vfs_permission(struct nameidata *nd, int mask) ++int vfs_permission(struct nameidata *nd, int mask, struct exec_perm *perm) + { +- return permission(nd->dentry->d_inode, mask, nd); ++ return permission(nd->dentry->d_inode, mask, nd, perm); + } + + /** +@@ -288,7 +307,7 @@ int vfs_permission(struct nameidata *nd, + */ + int file_permission(struct file *file, int mask) + { +- return permission(file->f_dentry->d_inode, mask, NULL); ++ return permission(file->f_dentry->d_inode, mask, NULL, NULL); + } + + /* +@@ -379,6 +398,21 @@ static struct dentry * cached_lookup(str + if (!dentry) + dentry = d_lookup(parent, name); + ++ /* ++ * The revalidation rules are simple: ++ * d_revalidate operation is called when we're about to use a cached ++ * dentry rather than call d_lookup. ++ * d_revalidate method may unhash the dentry itself or return FALSE, in ++ * which case if the dentry can be released d_lookup will be called. ++ * ++ * Additionally, by request of NFS people ++ * (http://linux.bkbits.net:8080/linux-2.4/cset@1.181?nav=index.html|src/|src/fs|related/fs/namei.c) ++ * d_revalidate is called when `/', `.' or `..' are looked up. ++ * Since re-lookup is impossible on them, we introduce a hack and ++ * return an error in this case. 
++ * ++ * 2003/02/19 SAW ++ */ + if (dentry && dentry->d_op && dentry->d_op->d_revalidate) { + if (!dentry->d_op->d_revalidate(dentry, nd) && !d_invalidate(dentry)) { + dput(dentry); +@@ -441,6 +475,7 @@ static struct dentry * real_lookup(struc + struct dentry * result; + struct inode *dir = parent->d_inode; + ++repeat: + mutex_lock(&dir->i_mutex); + /* + * First re-do the cached lookup just in case it was created +@@ -479,7 +514,7 @@ static struct dentry * real_lookup(struc + if (result->d_op && result->d_op->d_revalidate) { + if (!result->d_op->d_revalidate(result, nd) && !d_invalidate(result)) { + dput(result); +- result = ERR_PTR(-ENOENT); ++ goto repeat; + } + } + return result; +@@ -704,7 +739,14 @@ static __always_inline void follow_dotdo + read_unlock(¤t->fs->lock); + break; + } +- read_unlock(¤t->fs->lock); ++#ifdef CONFIG_VE ++ if (nd->dentry == get_exec_env()->fs_root && ++ nd->mnt == get_exec_env()->fs_rootmnt) { ++ read_unlock(¤t->fs->lock); ++ break; ++ } ++#endif ++ read_unlock(¤t->fs->lock); + spin_lock(&dcache_lock); + if (nd->dentry != nd->mnt->mnt_root) { + nd->dentry = dget(nd->dentry->d_parent); +@@ -745,6 +787,10 @@ static int do_lookup(struct nameidata *n + if (dentry->d_op && dentry->d_op->d_revalidate) + goto need_revalidate; + done: ++ if ((nd->flags & LOOKUP_STRICT) && d_mountpoint(dentry)) { ++ dput(dentry); ++ return -ENOENT; ++ } + path->mnt = mnt; + path->dentry = dentry; + __follow_mount(path); +@@ -780,6 +826,7 @@ static fastcall int __link_path_walk(con + { + struct path next; + struct inode *inode; ++ int real_components = 0; + int err; + unsigned int lookup_flags = nd->flags; + +@@ -801,7 +848,7 @@ static fastcall int __link_path_walk(con + nd->flags |= LOOKUP_CONTINUE; + err = exec_permission_lite(inode, nd); + if (err == -EAGAIN) +- err = vfs_permission(nd, MAY_EXEC); ++ err = vfs_permission(nd, MAY_EXEC, NULL); + if (err) + break; + +@@ -851,6 +898,7 @@ static fastcall int __link_path_walk(con + break; + } + /* This does the 
actual lookups.. */ ++ real_components++; + err = do_lookup(nd, &this, &next); + if (err) + break; +@@ -864,6 +912,9 @@ static fastcall int __link_path_walk(con + goto out_dput; + + if (inode->i_op->follow_link) { ++ err = -ENOENT; ++ if (lookup_flags & LOOKUP_STRICT) ++ goto out_dput; + err = do_follow_link(&next, nd); + if (err) + goto return_err; +@@ -911,6 +962,7 @@ last_component: + break; + inode = next.dentry->d_inode; + if ((lookup_flags & LOOKUP_FOLLOW) ++ && !(lookup_flags & LOOKUP_STRICT) + && inode && inode->i_op && inode->i_op->follow_link) { + err = do_follow_link(&next, nd); + if (err) +@@ -932,26 +984,40 @@ lookup_parent: + nd->last_type = LAST_NORM; + if (this.name[0] != '.') + goto return_base; +- if (this.len == 1) ++ if (this.len == 1) { + nd->last_type = LAST_DOT; +- else if (this.len == 2 && this.name[1] == '.') ++ goto return_reval; ++ } else if (this.len == 2 && this.name[1] == '.') { + nd->last_type = LAST_DOTDOT; +- else +- goto return_base; ++ goto return_reval; ++ } ++return_base: ++ if (!(nd->flags & LOOKUP_NOAREACHECK)) { ++ err = check_area_access_ve(nd->dentry, nd->mnt); ++ if (err) ++ break; ++ } ++ return 0; + return_reval: + /* + * We bypassed the ordinary revalidation routines. + * We may need to check the cached dentry for staleness. + */ +- if (nd->dentry && nd->dentry->d_sb && ++ if (!real_components && nd->dentry && nd->dentry->d_sb && + (nd->dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) { + err = -ESTALE; + /* Note: we do not d_invalidate() */ + if (!nd->dentry->d_op->d_revalidate(nd->dentry, nd)) ++ /* ++ * This lookup is for `/' or `.' or `..'. ++ * The filesystem unhashed the dentry itself ++ * inside d_revalidate (otherwise, d_invalidate ++ * wouldn't succeed). As a special courtesy to ++ * NFS we return an error. 
2003/02/19 SAW ++ */ + break; + } +-return_base: +- return 0; ++ goto return_base; + out_dput: + dput_path(&next, nd); + break; +@@ -1077,8 +1143,8 @@ static int fastcall do_path_lookup(int d + nd->flags = flags; + nd->depth = 0; + +- read_lock(¤t->fs->lock); + if (*name=='/') { ++ read_lock(¤t->fs->lock); + if (current->fs->altroot && !(nd->flags & LOOKUP_NOALT)) { + nd->mnt = mntget(current->fs->altrootmnt); + nd->dentry = dget(current->fs->altroot); +@@ -1089,33 +1155,35 @@ static int fastcall do_path_lookup(int d + } + nd->mnt = mntget(current->fs->rootmnt); + nd->dentry = dget(current->fs->root); ++ read_unlock(¤t->fs->lock); + } else if (dfd == AT_FDCWD) { ++ read_lock(¤t->fs->lock); + nd->mnt = mntget(current->fs->pwdmnt); + nd->dentry = dget(current->fs->pwd); ++ read_unlock(¤t->fs->lock); + } else { + struct dentry *dentry; + + file = fget_light(dfd, &fput_needed); + retval = -EBADF; + if (!file) +- goto unlock_fail; ++ goto out_fail; + + dentry = file->f_dentry; + + retval = -ENOTDIR; + if (!S_ISDIR(dentry->d_inode->i_mode)) +- goto fput_unlock_fail; ++ goto fput_fail; + + retval = file_permission(file, MAY_EXEC); + if (retval) +- goto fput_unlock_fail; ++ goto fput_fail; + + nd->mnt = mntget(file->f_vfsmnt); + nd->dentry = dget(dentry); + + fput_light(file, fput_needed); + } +- read_unlock(¤t->fs->lock); + current->total_link_count = 0; + retval = link_path_walk(name, nd); + out: +@@ -1124,13 +1192,12 @@ out: + nd->dentry->d_inode)) + audit_inode(name, nd->dentry->d_inode, flags); + } ++out_fail: + return retval; + +-fput_unlock_fail: ++fput_fail: + fput_light(file, fput_needed); +-unlock_fail: +- read_unlock(¤t->fs->lock); +- return retval; ++ goto out_fail; + } + + int fastcall path_lookup(const char *name, unsigned int flags, +@@ -1219,7 +1286,7 @@ static struct dentry * __lookup_hash(str + int err; + + inode = base->d_inode; +- err = permission(inode, MAY_EXEC, nd); ++ err = permission(inode, MAY_EXEC, nd, NULL); + dentry = ERR_PTR(err); + if (err) + 
goto out; +@@ -1354,7 +1421,7 @@ static int may_delete(struct inode *dir, + + BUG_ON(victim->d_parent->d_inode != dir); + +- error = permission(dir,MAY_WRITE | MAY_EXEC, NULL); ++ error = permission(dir,MAY_WRITE | MAY_EXEC, NULL, NULL); + if (error) + return error; + if (IS_APPEND(dir)) +@@ -1391,7 +1458,7 @@ static inline int may_create(struct inod + return -EEXIST; + if (IS_DEADDIR(dir)) + return -ENOENT; +- return permission(dir,MAY_WRITE | MAY_EXEC, nd); ++ return permission(dir,MAY_WRITE | MAY_EXEC, nd, NULL); + } + + /* +@@ -1491,7 +1558,7 @@ int may_open(struct nameidata *nd, int a + if (S_ISDIR(inode->i_mode) && (flag & FMODE_WRITE)) + return -EISDIR; + +- error = vfs_permission(nd, acc_mode); ++ error = vfs_permission(nd, acc_mode, NULL); + if (error) + return error; + +@@ -1628,6 +1695,12 @@ do_last: + goto exit; + } + ++ if (IS_ERR(nd->intent.open.file)) { ++ mutex_unlock(&dir->d_inode->i_mutex); ++ error = PTR_ERR(nd->intent.open.file); ++ goto exit_dput; ++ } ++ + /* Negative dentry, just create the file */ + if (!path.dentry->d_inode) { + if (!IS_POSIXACL(dir->d_inode)) +@@ -1851,6 +1924,7 @@ asmlinkage long sys_mknod(const char __u + { + return sys_mknodat(AT_FDCWD, filename, mode, dev); + } ++EXPORT_SYMBOL_GPL(sys_mknod); + + int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) + { +@@ -1909,6 +1983,7 @@ asmlinkage long sys_mkdir(const char __u + { + return sys_mkdirat(AT_FDCWD, pathname, mode); + } ++EXPORT_SYMBOL_GPL(sys_mkdir); + + /* + * We try to drop the dentry early: we should have +@@ -2016,6 +2091,7 @@ asmlinkage long sys_rmdir(const char __u + { + return do_rmdir(AT_FDCWD, pathname); + } ++EXPORT_SYMBOL_GPL(sys_rmdir); + + int vfs_unlink(struct inode *dir, struct dentry *dentry) + { +@@ -2115,6 +2191,7 @@ asmlinkage long sys_unlink(const char __ + { + return do_unlinkat(AT_FDCWD, pathname); + } ++EXPORT_SYMBOL_GPL(sys_unlink); + + int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname, int mode) + { +@@ 
-2313,7 +2390,7 @@ static int vfs_rename_dir(struct inode * + * we'll need to flip '..'. + */ + if (new_dir != old_dir) { +- error = permission(old_dentry->d_inode, MAY_WRITE, NULL); ++ error = permission(old_dentry->d_inode, MAY_WRITE, NULL, NULL); + if (error) + return error; + } +@@ -2380,6 +2457,9 @@ int vfs_rename(struct inode *old_dir, st + int is_dir = S_ISDIR(old_dentry->d_inode->i_mode); + const char *old_name; + ++ if (DQUOT_RENAME(old_dentry->d_inode, old_dir, new_dir)) ++ return -EXDEV; ++ + if (old_dentry->d_inode == new_dentry->d_inode) + return 0; + +diff -upr linux-2.6.16.orig/fs/namespace.c linux-2.6.16-026test015/fs/namespace.c +--- linux-2.6.16.orig/fs/namespace.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/namespace.c 2006-07-04 14:41:39.000000000 +0400 +@@ -40,13 +40,15 @@ static inline int sysfs_init(void) + + /* spinlock for vfsmount related operations, inplace of dcache_lock */ + __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock); ++EXPORT_SYMBOL(vfsmount_lock); + + static int event; + + static struct list_head *mount_hashtable; + static int hash_mask __read_mostly, hash_bits __read_mostly; + static kmem_cache_t *mnt_cache; +-static struct rw_semaphore namespace_sem; ++struct rw_semaphore namespace_sem; ++EXPORT_SYMBOL(namespace_sem); + + /* /sys/fs */ + decl_subsys(fs, NULL, NULL); +@@ -65,6 +67,7 @@ struct vfsmount *alloc_vfsmnt(const char + struct vfsmount *mnt = kmem_cache_alloc(mnt_cache, GFP_KERNEL); + if (mnt) { + memset(mnt, 0, sizeof(struct vfsmount)); ++ mnt->owner = VEID(get_exec_env()); + atomic_set(&mnt->mnt_count, 1); + INIT_LIST_HEAD(&mnt->mnt_hash); + INIT_LIST_HEAD(&mnt->mnt_child); +@@ -371,10 +374,32 @@ static int show_vfsmnt(struct seq_file * + { 0, NULL } + }; + struct proc_fs_info *fs_infop; ++ char *path_buf, *path; + +- mangle(m, mnt->mnt_devname ? 
mnt->mnt_devname : "none"); ++ /* skip FS_NOMOUNT mounts (rootfs) */ ++ if (mnt->mnt_sb->s_flags & MS_NOUSER) ++ return 0; ++ ++ path_buf = (char *) __get_free_page(GFP_KERNEL); ++ if (!path_buf) ++ return -ENOMEM; ++ path = d_path(mnt->mnt_root, mnt, path_buf, PAGE_SIZE); ++ if (IS_ERR(path)) { ++ free_page((unsigned long) path_buf); ++ /* ++ * This means that the file position will be incremented, i.e. ++ * the total number of "invisible" vfsmnt will leak. ++ */ ++ return 0; ++ } ++ ++ if (ve_is_super(get_exec_env())) ++ mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); ++ else ++ mangle(m, mnt->mnt_sb->s_type->name); + seq_putc(m, ' '); +- seq_path(m, mnt, mnt->mnt_root, " \t\n\\"); ++ mangle(m, path); ++ free_page((unsigned long) path_buf); + seq_putc(m, ' '); + mangle(m, mnt->mnt_sb->s_type->name); + seq_puts(m, mnt->mnt_sb->s_flags & MS_RDONLY ? " ro" : " rw"); +@@ -474,6 +499,7 @@ void release_mounts(struct list_head *he + mntput(mnt); + } + } ++EXPORT_SYMBOL(release_mounts); + + void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill) + { +@@ -498,6 +524,7 @@ void umount_tree(struct vfsmount *mnt, i + change_mnt_propagation(p, MS_PRIVATE); + } + } ++EXPORT_SYMBOL(umount_tree); + + static int do_umount(struct vfsmount *mnt, int flags) + { +@@ -608,7 +635,7 @@ asmlinkage long sys_umount(char __user * + goto dput_and_out; + + retval = -EPERM; +- if (!capable(CAP_SYS_ADMIN)) ++ if (!capable(CAP_VE_SYS_ADMIN)) + goto dput_and_out; + + retval = do_umount(nd.mnt, flags); +@@ -632,7 +659,7 @@ asmlinkage long sys_oldumount(char __use + + static int mount_is_safe(struct nameidata *nd) + { +- if (capable(CAP_SYS_ADMIN)) ++ if (capable(CAP_VE_SYS_ADMIN)) + return 0; + return -EPERM; + #ifdef notyet +@@ -642,7 +669,7 @@ static int mount_is_safe(struct nameidat + if (current->uid != nd->dentry->d_inode->i_uid) + return -EPERM; + } +- if (vfs_permission(nd, MAY_WRITE)) ++ if (vfs_permission(nd, MAY_WRITE, NULL)) + return -EPERM; + return 0; + 
#endif +@@ -848,6 +875,8 @@ static int do_change_type(struct nameida + + if (nd->dentry != nd->mnt->mnt_root) + return -EINVAL; ++ if (!ve_accessible_veid(nd->mnt->owner, get_exec_env()->veid)) ++ return -EPERM; + + down_write(&namespace_sem); + spin_lock(&vfsmount_lock); +@@ -917,7 +946,7 @@ static int do_remount(struct nameidata * + int err; + struct super_block *sb = nd->mnt->mnt_sb; + +- if (!capable(CAP_SYS_ADMIN)) ++ if (!capable(CAP_VE_SYS_ADMIN)) + return -EPERM; + + if (!check_mnt(nd->mnt)) +@@ -926,6 +955,9 @@ static int do_remount(struct nameidata * + if (nd->dentry != nd->mnt->mnt_root) + return -EINVAL; + ++ if (!ve_accessible_veid(nd->mnt->owner, get_exec_env()->veid)) ++ return -EPERM; ++ + down_write(&sb->s_umount); + err = do_remount_sb(sb, flags, data, 0); + if (!err) +@@ -951,7 +983,7 @@ static int do_move_mount(struct nameidat + struct nameidata old_nd, parent_nd; + struct vfsmount *p; + int err = 0; +- if (!capable(CAP_SYS_ADMIN)) ++ if (!capable(CAP_VE_SYS_ADMIN)) + return -EPERM; + if (!old_name || !*old_name) + return -EINVAL; +@@ -959,6 +991,10 @@ static int do_move_mount(struct nameidat + if (err) + return err; + ++ err = -EPERM; ++ if (!ve_accessible_veid(old_nd.mnt->owner, get_exec_env()->veid)) ++ goto out_nosem; ++ + down_write(&namespace_sem); + while (d_mountpoint(nd->dentry) && follow_down(&nd->mnt, &nd->dentry)) + ; +@@ -1014,6 +1050,7 @@ out: + up_write(&namespace_sem); + if (!err) + path_release(&parent_nd); ++out_nosem: + path_release(&old_nd); + return err; + } +@@ -1031,7 +1068,7 @@ static int do_new_mount(struct nameidata + return -EINVAL; + + /* we need capabilities... 
*/ +- if (!capable(CAP_SYS_ADMIN)) ++ if (!capable(CAP_VE_SYS_ADMIN)) + return -EPERM; + + mnt = do_kern_mount(type, flags, name, data); +@@ -1072,6 +1109,10 @@ int do_add_mount(struct vfsmount *newmnt + if ((err = graft_tree(newmnt, nd))) + goto unlock; + ++ if (newmnt->mnt_mountpoint->d_flags & DCACHE_VIRTUAL) ++ /* unaccessible yet - no lock */ ++ newmnt->mnt_root->d_flags |= DCACHE_VIRTUAL; ++ + if (fslist) { + /* add to the specified expiration list */ + spin_lock(&vfsmount_lock); +@@ -1469,6 +1510,7 @@ out1: + free_page(type_page); + return retval; + } ++EXPORT_SYMBOL_GPL(sys_mount); + + /* + * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. +@@ -1520,7 +1562,7 @@ static void chroot_fs_refs(struct nameid + struct fs_struct *fs; + + read_lock(&tasklist_lock); +- do_each_thread(g, p) { ++ do_each_thread_ve(g, p) { + task_lock(p); + fs = p->fs; + if (fs) { +@@ -1535,7 +1577,7 @@ static void chroot_fs_refs(struct nameid + put_fs_struct(fs); + } else + task_unlock(p); +- } while_each_thread(g, p); ++ } while_each_thread_ve(g, p); + read_unlock(&tasklist_lock); + } + +@@ -1688,10 +1730,10 @@ static void __init init_mount_tree(void) + + init_task.namespace = namespace; + read_lock(&tasklist_lock); +- do_each_thread(g, p) { ++ do_each_thread_all(g, p) { + get_namespace(namespace); + p->namespace = namespace; +- } while_each_thread(g, p); ++ } while_each_thread_all(g, p); + read_unlock(&tasklist_lock); + + set_fs_pwd(current->fs, namespace->root, namespace->root->mnt_root); +@@ -1707,7 +1749,8 @@ void __init mnt_init(unsigned long mempa + init_rwsem(&namespace_sem); + + mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct vfsmount), +- 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL, NULL); ++ 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_UBC, ++ NULL, NULL); + + mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC); + +@@ -1763,3 +1806,4 @@ void __put_namespace(struct namespace *n + release_mounts(&umount_list); + kfree(namespace); + } 
++EXPORT_SYMBOL_GPL(__put_namespace); +diff -upr linux-2.6.16.orig/fs/nfs/dir.c linux-2.6.16-026test015/fs/nfs/dir.c +--- linux-2.6.16.orig/fs/nfs/dir.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/nfs/dir.c 2006-07-04 14:41:37.000000000 +0400 +@@ -1635,7 +1635,8 @@ out: + return -EACCES; + } + +-int nfs_permission(struct inode *inode, int mask, struct nameidata *nd) ++int nfs_permission(struct inode *inode, int mask, struct nameidata *nd, ++ struct exec_perm *perm) + { + struct rpc_cred *cred; + int res = 0; +@@ -1683,7 +1684,7 @@ out: + out_notsup: + res = nfs_revalidate_inode(NFS_SERVER(inode), inode); + if (res == 0) +- res = generic_permission(inode, mask, NULL); ++ res = generic_permission(inode, mask, NULL, perm); + unlock_kernel(); + return res; + } +diff -upr linux-2.6.16.orig/fs/nfs/nfsroot.c linux-2.6.16-026test015/fs/nfs/nfsroot.c +--- linux-2.6.16.orig/fs/nfs/nfsroot.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/nfs/nfsroot.c 2006-07-04 14:41:38.000000000 +0400 +@@ -312,7 +312,7 @@ static int __init root_nfs_name(char *na + /* Override them by options set on kernel command-line */ + root_nfs_parse(name, buf); + +- cp = system_utsname.nodename; ++ cp = ve_utsname.nodename; + if (strlen(buf) + strlen(cp) > NFS_MAXPATHLEN) { + printk(KERN_ERR "Root-NFS: Pathname for remote directory too long.\n"); + return -1; +diff -upr linux-2.6.16.orig/fs/nfsd/nfs3proc.c linux-2.6.16-026test015/fs/nfsd/nfs3proc.c +--- linux-2.6.16.orig/fs/nfsd/nfs3proc.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/nfsd/nfs3proc.c 2006-07-04 14:41:36.000000000 +0400 +@@ -682,7 +682,7 @@ static struct svc_procedure nfsd_proced + PROC(lookup, dirop, dirop, fhandle2, RC_NOCACHE, ST+FH+pAT+pAT), + PROC(access, access, access, fhandle, RC_NOCACHE, ST+pAT+1), + PROC(readlink, readlink, readlink, fhandle, RC_NOCACHE, ST+pAT+1+NFS3_MAXPATHLEN/4), +- PROC(read, read, read, fhandle, RC_NOCACHE, ST+pAT+4+NFSSVC_MAXBLKSIZE), ++ 
PROC(read, read, read, fhandle, RC_NOCACHE, ST+pAT+4+NFSSVC_MAXBLKSIZE/4), + PROC(write, write, write, fhandle, RC_REPLBUFF, ST+WC+4), + PROC(create, create, create, fhandle2, RC_REPLBUFF, ST+(1+FH+pAT)+WC), + PROC(mkdir, mkdir, create, fhandle2, RC_REPLBUFF, ST+(1+FH+pAT)+WC), +diff -upr linux-2.6.16.orig/fs/nfsd/nfs4proc.c linux-2.6.16-026test015/fs/nfsd/nfs4proc.c +--- linux-2.6.16.orig/fs/nfsd/nfs4proc.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/nfsd/nfs4proc.c 2006-07-04 14:41:36.000000000 +0400 +@@ -975,7 +975,7 @@ struct nfsd4_voidargs { int dummy; }; + */ + static struct svc_procedure nfsd_procedures4[2] = { + PROC(null, void, void, void, RC_NOCACHE, 1), +- PROC(compound, compound, compound, compound, RC_NOCACHE, NFSD_BUFSIZE) ++ PROC(compound, compound, compound, compound, RC_NOCACHE, NFSD_BUFSIZE/4) + }; + + struct svc_version nfsd_version4 = { +diff -upr linux-2.6.16.orig/fs/nfsd/nfsfh.c linux-2.6.16-026test015/fs/nfsd/nfsfh.c +--- linux-2.6.16.orig/fs/nfsd/nfsfh.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/nfsd/nfsfh.c 2006-07-04 14:41:37.000000000 +0400 +@@ -56,7 +56,7 @@ static int nfsd_acceptable(void *expv, s + /* make sure parents give x permission to user */ + int err; + parent = dget_parent(tdentry); +- err = permission(parent->d_inode, MAY_EXEC, NULL); ++ err = permission(parent->d_inode, MAY_EXEC, NULL, NULL); + if (err < 0) { + dput(parent); + break; +diff -upr linux-2.6.16.orig/fs/nfsd/nfsproc.c linux-2.6.16-026test015/fs/nfsd/nfsproc.c +--- linux-2.6.16.orig/fs/nfsd/nfsproc.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/nfsd/nfsproc.c 2006-07-04 14:41:36.000000000 +0400 +@@ -553,7 +553,7 @@ static struct svc_procedure nfsd_proced + PROC(none, void, void, none, RC_NOCACHE, ST), + PROC(lookup, diropargs, diropres, fhandle, RC_NOCACHE, ST+FH+AT), + PROC(readlink, readlinkargs, readlinkres, none, RC_NOCACHE, ST+1+NFS_MAXPATHLEN/4), +- PROC(read, readargs, readres, fhandle, 
RC_NOCACHE, ST+AT+1+NFSSVC_MAXBLKSIZE), ++ PROC(read, readargs, readres, fhandle, RC_NOCACHE, ST+AT+1+NFSSVC_MAXBLKSIZE/4), + PROC(none, void, void, none, RC_NOCACHE, ST), + PROC(write, writeargs, attrstat, fhandle, RC_REPLBUFF, ST+AT), + PROC(create, createargs, diropres, fhandle, RC_REPLBUFF, ST+FH+AT), +diff -upr linux-2.6.16.orig/fs/nfsd/vfs.c linux-2.6.16-026test015/fs/nfsd/vfs.c +--- linux-2.6.16.orig/fs/nfsd/vfs.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/nfsd/vfs.c 2006-07-04 14:41:37.000000000 +0400 +@@ -1817,12 +1817,13 @@ nfsd_permission(struct svc_export *exp, + inode->i_uid == current->fsuid) + return 0; + +- err = permission(inode, acc & (MAY_READ|MAY_WRITE|MAY_EXEC), NULL); ++ err = permission(inode, acc & (MAY_READ|MAY_WRITE|MAY_EXEC), ++ NULL, NULL); + + /* Allow read access to binaries even when mode 111 */ + if (err == -EACCES && S_ISREG(inode->i_mode) && + acc == (MAY_READ | MAY_OWNER_OVERRIDE)) +- err = permission(inode, MAY_EXEC, NULL); ++ err = permission(inode, MAY_EXEC, NULL, NULL); + + return err? nfserrno(err) : 0; + } +diff -upr linux-2.6.16.orig/fs/ntfs/file.c linux-2.6.16-026test015/fs/ntfs/file.c +--- linux-2.6.16.orig/fs/ntfs/file.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/ntfs/file.c 2006-07-04 14:41:36.000000000 +0400 +@@ -1489,14 +1489,15 @@ static inline void ntfs_flush_dcache_pag + unsigned nr_pages) + { + BUG_ON(!nr_pages); ++ /* ++ * Warning: Do not do the decrement at the same time as the call to ++ * flush_dcache_page() because it is a NULL macro on i386 and hence the ++ * decrement never happens so the loop never terminates. ++ */ + do { +- /* +- * Warning: Do not do the decrement at the same time as the +- * call because flush_dcache_page() is a NULL macro on i386 +- * and hence the decrement never happens. 
+- */ ++ --nr_pages; + flush_dcache_page(pages[nr_pages]); +- } while (--nr_pages > 0); ++ } while (nr_pages > 0); + } + + /** +diff -upr linux-2.6.16.orig/fs/ntfs/super.c linux-2.6.16-026test015/fs/ntfs/super.c +--- linux-2.6.16.orig/fs/ntfs/super.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/ntfs/super.c 2006-07-04 14:41:37.000000000 +0400 +@@ -3033,7 +3033,7 @@ iput_tmp_ino_err_out_now: + * method again... FIXME: Do we need to do this twice now because of + * attribute inodes? I think not, so leave as is for now... (AIA) + */ +- if (invalidate_inodes(sb)) { ++ if (invalidate_inodes(sb, 0)) { + ntfs_error(sb, "Busy inodes left. This is most likely a NTFS " + "driver bug."); + /* Copied from fs/super.c. I just love this message. (-; */ +diff -upr linux-2.6.16.orig/fs/open.c linux-2.6.16-026test015/fs/open.c +--- linux-2.6.16.orig/fs/open.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/open.c 2006-07-04 14:41:39.000000000 +0400 +@@ -25,6 +25,7 @@ + #include <linux/fs.h> + #include <linux/personality.h> + #include <linux/pagemap.h> ++#include <linux/faudit.h> + #include <linux/syscalls.h> + #include <linux/rcupdate.h> + +@@ -51,7 +52,21 @@ int vfs_statfs(struct super_block *sb, s + + EXPORT_SYMBOL(vfs_statfs); + +-static int vfs_statfs_native(struct super_block *sb, struct statfs *buf) ++int faudit_statfs(struct super_block *sb, struct kstatfs *buf) ++{ ++ struct faudit_statfs_arg arg; ++ ++ arg.sb = sb; ++ arg.stat = buf; ++ ++ if (virtinfo_notifier_call(VITYPE_FAUDIT, VIRTINFO_FAUDIT_STATFS, &arg) ++ != NOTIFY_DONE) ++ return arg.err; ++ return 0; ++} ++ ++static int vfs_statfs_native(struct super_block *sb, struct vfsmount *mnt, ++ struct statfs *buf) + { + struct kstatfs st; + int retval; +@@ -60,6 +75,10 @@ static int vfs_statfs_native(struct supe + if (retval) + return retval; + ++ retval = faudit_statfs(mnt->mnt_sb, &st); ++ if (retval) ++ return retval; ++ + if (sizeof(*buf) == sizeof(st)) + memcpy(buf, &st, 
sizeof(st)); + else { +@@ -94,7 +113,8 @@ static int vfs_statfs_native(struct supe + return 0; + } + +-static int vfs_statfs64(struct super_block *sb, struct statfs64 *buf) ++static int vfs_statfs64(struct super_block *sb, struct vfsmount *mnt, ++ struct statfs64 *buf) + { + struct kstatfs st; + int retval; +@@ -103,6 +123,10 @@ static int vfs_statfs64(struct super_blo + if (retval) + return retval; + ++ retval = faudit_statfs(mnt->mnt_sb, &st); ++ if (retval) ++ return retval; ++ + if (sizeof(*buf) == sizeof(st)) + memcpy(buf, &st, sizeof(st)); + else { +@@ -129,7 +153,8 @@ asmlinkage long sys_statfs(const char __ + error = user_path_walk(path, &nd); + if (!error) { + struct statfs tmp; +- error = vfs_statfs_native(nd.dentry->d_inode->i_sb, &tmp); ++ error = vfs_statfs_native(nd.dentry->d_inode->i_sb, ++ nd.mnt, &tmp); + if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) + error = -EFAULT; + path_release(&nd); +@@ -148,7 +173,8 @@ asmlinkage long sys_statfs64(const char + error = user_path_walk(path, &nd); + if (!error) { + struct statfs64 tmp; +- error = vfs_statfs64(nd.dentry->d_inode->i_sb, &tmp); ++ error = vfs_statfs64(nd.dentry->d_inode->i_sb, ++ nd.mnt, &tmp); + if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) + error = -EFAULT; + path_release(&nd); +@@ -167,7 +193,8 @@ asmlinkage long sys_fstatfs(unsigned int + file = fget(fd); + if (!file) + goto out; +- error = vfs_statfs_native(file->f_dentry->d_inode->i_sb, &tmp); ++ error = vfs_statfs_native(file->f_dentry->d_inode->i_sb, ++ file->f_vfsmnt, &tmp); + if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) + error = -EFAULT; + fput(file); +@@ -188,7 +215,8 @@ asmlinkage long sys_fstatfs64(unsigned i + file = fget(fd); + if (!file) + goto out; +- error = vfs_statfs64(file->f_dentry->d_inode->i_sb, &tmp); ++ error = vfs_statfs64(file->f_dentry->d_inode->i_sb, ++ file->f_vfsmnt, &tmp); + if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) + error = -EFAULT; + fput(file); +@@ -243,7 +271,7 @@ static long 
do_sys_truncate(const char _ + if (!S_ISREG(inode->i_mode)) + goto dput_and_out; + +- error = vfs_permission(&nd, MAY_WRITE); ++ error = vfs_permission(&nd, MAY_WRITE, NULL); + if (error) + goto dput_and_out; + +@@ -330,7 +358,10 @@ out: + + asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length) + { +- return do_sys_ftruncate(fd, length, 1); ++ long ret = do_sys_ftruncate(fd, length, 1); ++ /* avoid REGPARM breakage on x86: */ ++ prevent_tail_call(ret); ++ return ret; + } + + /* LFS versions of truncate are only needed on 32 bit machines */ +@@ -342,7 +373,10 @@ asmlinkage long sys_truncate64(const cha + + asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length) + { +- return do_sys_ftruncate(fd, length, 0); ++ long ret = do_sys_ftruncate(fd, length, 0); ++ /* avoid REGPARM breakage on x86: */ ++ prevent_tail_call(ret); ++ return ret; + } + #endif + +@@ -397,7 +431,7 @@ asmlinkage long sys_utime(char __user * + goto dput_and_out; + + if (current->fsuid != inode->i_uid && +- (error = vfs_permission(&nd, MAY_WRITE)) != 0) ++ (error = vfs_permission(&nd, MAY_WRITE, NULL)) != 0) + goto dput_and_out; + } + mutex_lock(&inode->i_mutex); +@@ -450,7 +484,7 @@ long do_utimes(int dfd, char __user *fil + goto dput_and_out; + + if (current->fsuid != inode->i_uid && +- (error = vfs_permission(&nd, MAY_WRITE)) != 0) ++ (error = vfs_permission(&nd, MAY_WRITE, NULL)) != 0) + goto dput_and_out; + } + mutex_lock(&inode->i_mutex); +@@ -514,7 +548,7 @@ asmlinkage long sys_faccessat(int dfd, c + + res = __user_walk_fd(dfd, filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd); + if (!res) { +- res = vfs_permission(&nd, mode); ++ res = vfs_permission(&nd, mode, NULL); + /* SuS v2 requires we report a read only fs too */ + if(!res && (mode & S_IWOTH) && IS_RDONLY(nd.dentry->d_inode) + && !special_file(nd.dentry->d_inode->i_mode)) +@@ -543,7 +577,7 @@ asmlinkage long sys_chdir(const char __u + if (error) + goto out; + +- error = vfs_permission(&nd, MAY_EXEC); ++ error = 
vfs_permission(&nd, MAY_EXEC, NULL); + if (error) + goto dput_and_out; + +@@ -594,7 +628,7 @@ asmlinkage long sys_chroot(const char __ + if (error) + goto out; + +- error = vfs_permission(&nd, MAY_EXEC); ++ error = vfs_permission(&nd, MAY_EXEC, NULL); + if (error) + goto dput_and_out; + +@@ -733,6 +767,7 @@ asmlinkage long sys_chown(const char __u + } + return error; + } ++EXPORT_SYMBOL_GPL(sys_chown); + + asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user, + gid_t group, int flag) +@@ -1083,20 +1118,30 @@ long do_sys_open(int dfd, const char __u + + asmlinkage long sys_open(const char __user *filename, int flags, int mode) + { ++ long ret; ++ + if (force_o_largefile()) + flags |= O_LARGEFILE; + +- return do_sys_open(AT_FDCWD, filename, flags, mode); ++ ret = do_sys_open(AT_FDCWD, filename, flags, mode); ++ /* avoid REGPARM breakage on x86: */ ++ prevent_tail_call(ret); ++ return ret; + } + EXPORT_SYMBOL_GPL(sys_open); + + asmlinkage long sys_openat(int dfd, const char __user *filename, int flags, + int mode) + { ++ long ret; ++ + if (force_o_largefile()) + flags |= O_LARGEFILE; + +- return do_sys_open(dfd, filename, flags, mode); ++ ret = do_sys_open(dfd, filename, flags, mode); ++ /* avoid REGPARM breakage on x86: */ ++ prevent_tail_call(ret); ++ return ret; + } + EXPORT_SYMBOL_GPL(sys_openat); + +diff -upr linux-2.6.16.orig/fs/partitions/check.c linux-2.6.16-026test015/fs/partitions/check.c +--- linux-2.6.16.orig/fs/partitions/check.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/partitions/check.c 2006-07-04 14:41:38.000000000 +0400 +@@ -128,6 +128,7 @@ char *disk_name(struct gendisk *hd, int + + return buf; + } ++EXPORT_SYMBOL(disk_name); + + const char *bdevname(struct block_device *bdev, char *buf) + { +@@ -345,6 +346,7 @@ static char *make_block_name(struct gend + char *name; + static char *block_str = "block:"; + int size; ++ char *s; + + size = strlen(block_str) + strlen(disk->disk_name) + 1; + name = 
kmalloc(size, GFP_KERNEL); +@@ -352,6 +354,10 @@ static char *make_block_name(struct gend + return NULL; + strcpy(name, block_str); + strcat(name, disk->disk_name); ++ /* ewww... some of these buggers have / in name... */ ++ s = strchr(name, '/'); ++ if (s) ++ *s = '!'; + return name; + } + +diff -upr linux-2.6.16.orig/fs/pipe.c linux-2.6.16-026test015/fs/pipe.c +--- linux-2.6.16.orig/fs/pipe.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/pipe.c 2006-07-04 14:41:39.000000000 +0400 +@@ -19,6 +19,8 @@ + #include <asm/uaccess.h> + #include <asm/ioctls.h> + ++#include <ub/ub_mem.h> ++ + /* + * We use a start+len construction, which provides full use of the + * allocated memory. +@@ -284,7 +286,7 @@ pipe_writev(struct file *filp, const str + int error; + + if (!page) { +- page = alloc_page(GFP_HIGHUSER); ++ page = alloc_page(GFP_HIGHUSER | __GFP_UBC); + if (unlikely(!page)) { + ret = ret ? : -ENOMEM; + break; +@@ -662,7 +664,7 @@ struct inode* pipe_new(struct inode* ino + { + struct pipe_inode_info *info; + +- info = kmalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); ++ info = ub_kmalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); + if (!info) + goto fail_page; + memset(info, 0, sizeof(*info)); +@@ -797,6 +799,7 @@ close_f1: + no_files: + return error; + } ++EXPORT_SYMBOL_GPL(do_pipe); + + /* + * pipefs should _never_ be mounted by userland - too much of security hassle, +diff -upr linux-2.6.16.orig/fs/proc/array.c linux-2.6.16-026test015/fs/proc/array.c +--- linux-2.6.16.orig/fs/proc/array.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/proc/array.c 2006-07-04 14:41:39.000000000 +0400 +@@ -75,6 +75,9 @@ + #include <linux/times.h> + #include <linux/cpuset.h> + #include <linux/rcupdate.h> ++#include <linux/fairsched.h> ++ ++#include <ub/beancounter.h> + + #include <asm/uaccess.h> + #include <asm/pgtable.h> +@@ -161,8 +164,13 @@ static inline char * task_state(struct t + struct group_info *group_info; + int g; + struct 
fdtable *fdt = NULL; ++ pid_t pid, ppid, tgid; ++ ++ pid = get_task_pid(p); ++ tgid = get_task_tgid(p); + + read_lock(&tasklist_lock); ++ ppid = get_task_ppid(p); + buffer += sprintf(buffer, + "State:\t%s\n" + "SleepAVG:\t%lu%%\n" +@@ -170,13 +178,19 @@ static inline char * task_state(struct t + "Pid:\t%d\n" + "PPid:\t%d\n" + "TracerPid:\t%d\n" ++#ifdef CONFIG_FAIRSCHED ++ "FNid:\t%d\n" ++#endif + "Uid:\t%d\t%d\t%d\t%d\n" + "Gid:\t%d\t%d\t%d\t%d\n", + get_task_state(p), + (p->sleep_avg/1024)*100/(1020000000/1024), +- p->tgid, +- p->pid, pid_alive(p) ? p->group_leader->real_parent->tgid : 0, +- pid_alive(p) && p->ptrace ? p->parent->pid : 0, ++ tgid, ++ pid, ppid, ++ pid_alive(p) && p->ptrace ? get_task_pid(p->parent) : 0, ++#ifdef CONFIG_FAIRSCHED ++ task_fairsched_node_id(p), ++#endif + p->uid, p->euid, p->suid, p->fsuid, + p->gid, p->egid, p->sgid, p->fsgid); + read_unlock(&tasklist_lock); +@@ -199,6 +213,18 @@ static inline char * task_state(struct t + put_group_info(group_info); + + buffer += sprintf(buffer, "\n"); ++ ++#ifdef CONFIG_VE ++ buffer += sprintf(buffer, ++ "envID:\t%d\n" ++ "VPid:\t%d\n" ++ "PNState:\t%u\n" ++ "StopState:\t%u\n", ++ VE_TASK_INFO(p)->owner_env->veid, ++ virt_pid(p), ++ p->pn_state, ++ p->stopped_state); ++#endif + return buffer; + } + +@@ -244,7 +270,7 @@ static void collect_sigign_sigcatch(stru + + static inline char * task_sig(struct task_struct *p, char *buffer) + { +- sigset_t pending, shpending, blocked, ignored, caught; ++ sigset_t pending, shpending, blocked, ignored, caught, saved; + int num_threads = 0; + unsigned long qsize = 0; + unsigned long qlim = 0; +@@ -254,6 +280,7 @@ static inline char * task_sig(struct tas + sigemptyset(&blocked); + sigemptyset(&ignored); + sigemptyset(&caught); ++ sigemptyset(&saved); + + /* Gather all the data with the appropriate locks held */ + read_lock(&tasklist_lock); +@@ -262,6 +289,7 @@ static inline char * task_sig(struct tas + pending = p->pending.signal; + shpending = 
p->signal->shared_pending.signal; + blocked = p->blocked; ++ saved = p->saved_sigmask; + collect_sigign_sigcatch(p, &ignored, &caught); + num_threads = atomic_read(&p->signal->count); + qsize = atomic_read(&p->user->sigpending); +@@ -279,6 +307,7 @@ static inline char * task_sig(struct tas + buffer = render_sigset_t("SigBlk:\t", &blocked, buffer); + buffer = render_sigset_t("SigIgn:\t", &ignored, buffer); + buffer = render_sigset_t("SigCgt:\t", &caught, buffer); ++ buffer = render_sigset_t("SigSvd:\t", &saved, buffer); + + return buffer; + } +@@ -293,10 +322,27 @@ static inline char *task_cap(struct task + cap_t(p->cap_effective)); + } + ++#ifdef CONFIG_USER_RESOURCE ++static inline void ub_dump_task_info(struct task_struct *tsk, ++ char *stsk, int ltsk, char *smm, int lmm) ++{ ++ print_ub_uid(tsk->task_bc.task_ub, stsk, ltsk); ++ task_lock(tsk); ++ if (tsk->mm) ++ print_ub_uid(tsk->mm->mm_ub, smm, lmm); ++ else ++ strncpy(smm, "N/A", lmm); ++ task_unlock(tsk); ++} ++#endif ++ + int proc_pid_status(struct task_struct *task, char * buffer) + { + char * orig = buffer; + struct mm_struct *mm = get_task_mm(task); ++#ifdef CONFIG_USER_RESOURCE ++ char tsk_ub_info[64], mm_ub_info[64]; ++#endif + + buffer = task_name(task, buffer); + buffer = task_state(task, buffer); +@@ -311,6 +357,14 @@ int proc_pid_status(struct task_struct * + #if defined(CONFIG_S390) + buffer = task_show_regs(task, buffer); + #endif ++#ifdef CONFIG_USER_RESOURCE ++ ub_dump_task_info(task, ++ tsk_ub_info, sizeof(tsk_ub_info), ++ mm_ub_info, sizeof(mm_ub_info)); ++ ++ buffer += sprintf(buffer, "TaskUB:\t%s\n", tsk_ub_info); ++ buffer += sprintf(buffer, "MMUB:\t%s\n", mm_ub_info); ++#endif + return buffer - orig; + } + +@@ -333,6 +387,10 @@ static int do_task_stat(struct task_stru + DEFINE_KTIME(it_real_value); + struct task_struct *t; + char tcomm[sizeof(task->comm)]; ++#ifdef CONFIG_USER_RESOURCE ++ char ub_task_info[64]; ++ char ub_mm_info[64]; ++#endif + + state = *get_task_state(task); + vsize = 
eip = esp = 0; +@@ -370,11 +428,12 @@ static int do_task_stat(struct task_stru + } + if (task->signal) { + if (task->signal->tty) { +- tty_pgrp = task->signal->tty->pgrp; ++ tty_pgrp = pid_type_to_vpid(PIDTYPE_PGID, ++ task->signal->tty->pgrp); + tty_nr = new_encode_dev(tty_devnum(task->signal->tty)); + } +- pgid = process_group(task); +- sid = task->signal->session; ++ pgid = get_task_pgid(task); ++ sid = get_task_sid(task); + cmin_flt = task->signal->cmin_flt; + cmaj_flt = task->signal->cmaj_flt; + cutime = task->signal->cutime; +@@ -388,7 +447,7 @@ static int do_task_stat(struct task_stru + } + it_real_value = task->signal->real_timer.expires; + } +- ppid = pid_alive(task) ? task->group_leader->real_parent->tgid : 0; ++ ppid = get_task_ppid(task); + read_unlock(&tasklist_lock); + + if (!whole || num_threads<2) +@@ -407,14 +466,34 @@ static int do_task_stat(struct task_stru + + /* Temporary variable needed for gcc-2.96 */ + /* convert timespec -> nsec*/ ++#ifndef CONFIG_VE + start_time = (unsigned long long)task->start_time.tv_sec * NSEC_PER_SEC + + task->start_time.tv_nsec; ++#else ++ start_time = (unsigned long long)(task->start_time.tv_sec - ++ get_exec_env()->init_entry->start_time.tv_sec) * ++ NSEC_PER_SEC + task->start_time.tv_nsec - ++ get_exec_env()->init_entry->start_time.tv_nsec; ++#endif + /* convert nsec -> ticks */ + start_time = nsec_to_clock_t(start_time); + ++#ifdef CONFIG_USER_RESOURCE ++ ub_dump_task_info(task, ++ ub_task_info, sizeof(ub_task_info), ++ ub_mm_info, sizeof(ub_mm_info)); ++#endif ++ + res = sprintf(buffer,"%d (%s) %c %d %d %d %d %d %lu %lu \ + %lu %lu %lu %lu %lu %ld %ld %ld %ld %d %ld %llu %lu %ld %lu %lu %lu %lu %lu \ +-%lu %lu %lu %lu %lu %lu %lu %lu %d %d %lu %lu\n", ++%lu %lu %lu %lu %lu %lu %lu %lu %d %d %lu %lu" ++#ifdef CONFIG_VE ++"0 0 0 0 0 0 0 0 %d %u" ++#endif ++#ifdef CONFIG_USER_RESOURCE ++ " %s %s" ++#endif ++ "\n", + task->pid, + tcomm, + state, +@@ -459,7 +538,16 @@ static int do_task_stat(struct task_stru + 
task->exit_signal, + task_cpu(task), + task->rt_priority, +- task->policy); ++ task->policy ++#ifdef CONFIG_VE ++ , virt_pid(task), ++ VEID(VE_TASK_INFO(task)->owner_env) ++#endif ++#ifdef CONFIG_USER_RESOURCE ++ , ub_task_info, ++ ub_mm_info ++#endif ++ ); + if(mm) + mmput(mm); + return res; +diff -upr linux-2.6.16.orig/fs/proc/base.c linux-2.6.16-026test015/fs/proc/base.c +--- linux-2.6.16.orig/fs/proc/base.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/proc/base.c 2006-07-04 14:41:38.000000000 +0400 +@@ -291,22 +291,29 @@ static int proc_fd_link(struct inode *in + struct files_struct *files; + struct file *file; + int fd = proc_type(inode) - PROC_TID_FD_DIR; ++ int err = -ENOENT; + + files = get_files_struct(task); + if (files) { +- rcu_read_lock(); ++ /* ++ * We are not taking a ref to the file structure, so we must ++ * hold ->file_lock. ++ */ ++ spin_lock(&files->file_lock); + file = fcheck_files(files, fd); + if (file) { +- *mnt = mntget(file->f_vfsmnt); +- *dentry = dget(file->f_dentry); +- rcu_read_unlock(); +- put_files_struct(files); +- return 0; ++ if (d_root_check(file->f_dentry, file->f_vfsmnt)) { ++ err = -EACCES; ++ } else { ++ *mnt = mntget(file->f_vfsmnt); ++ *dentry = dget(file->f_dentry); ++ err = 0; ++ } + } +- rcu_read_unlock(); ++ spin_unlock(&files->file_lock); + put_files_struct(files); + } +- return -ENOENT; ++ return err; + } + + static struct fs_struct *get_fs_struct(struct task_struct *task) +@@ -326,10 +333,12 @@ static int proc_cwd_link(struct inode *i + int result = -ENOENT; + if (fs) { + read_lock(&fs->lock); +- *mnt = mntget(fs->pwdmnt); +- *dentry = dget(fs->pwd); ++ result = d_root_check(fs->pwd, fs->pwdmnt); ++ if (!result) { ++ *mnt = mntget(fs->pwdmnt); ++ *dentry = dget(fs->pwd); ++ } + read_unlock(&fs->lock); +- result = 0; + put_fs_struct(fs); + } + return result; +@@ -579,19 +588,21 @@ static int proc_check_root(struct inode + return proc_check_chroot(root, vfsmnt); + } + +-static int 
proc_permission(struct inode *inode, int mask, struct nameidata *nd) ++static int proc_permission(struct inode *inode, int mask, struct nameidata *nd, ++ struct exec_perm *perm) + { +- if (generic_permission(inode, mask, NULL) != 0) ++ if (generic_permission(inode, mask, NULL, perm) != 0) + return -EACCES; + return proc_check_root(inode); + } + +-static int proc_task_permission(struct inode *inode, int mask, struct nameidata *nd) ++static int proc_task_permission(struct inode *inode, int mask, struct nameidata *nd, ++ struct exec_perm *perm) + { + struct dentry *root; + struct vfsmount *vfsmnt; + +- if (generic_permission(inode, mask, NULL) != 0) ++ if (generic_permission(inode, mask, NULL, perm) != 0) + return -EACCES; + + if (proc_task_root_link(inode, &root, &vfsmnt)) +@@ -1303,6 +1314,10 @@ static struct inode *proc_pid_make_inode + struct inode * inode; + struct proc_inode *ei; + ++ if (!ve_accessible(VE_TASK_INFO(task)->owner_env, ++ VE_OWNER_FSTYPE(sb->s_type))) ++ return NULL; ++ + /* We need a new inode */ + + inode = new_inode(sb); +@@ -1406,6 +1421,10 @@ static void pid_base_iput(struct dentry + spin_lock(&task->proc_lock); + if (task->proc_dentry == dentry) + task->proc_dentry = NULL; ++#ifdef CONFIG_VE ++ if (VE_TASK_INFO(task)->glob_proc_dentry == dentry) ++ VE_TASK_INFO(task)->glob_proc_dentry = NULL; ++#endif + spin_unlock(&task->proc_lock); + iput(inode); + } +@@ -1485,7 +1504,12 @@ static struct dentry *proc_lookupfd(stru + if (!files) + goto out_unlock; + inode->i_mode = S_IFLNK; +- rcu_read_lock(); ++ ++ /* ++ * We are not taking a ref to the file structure, so we must ++ * hold ->file_lock. 
++ */ ++ spin_lock(&files->file_lock); + file = fcheck_files(files, fd); + if (!file) + goto out_unlock2; +@@ -1493,7 +1517,7 @@ static struct dentry *proc_lookupfd(stru + inode->i_mode |= S_IRUSR | S_IXUSR; + if (file->f_mode & 2) + inode->i_mode |= S_IWUSR | S_IXUSR; +- rcu_read_unlock(); ++ spin_unlock(&files->file_lock); + put_files_struct(files); + inode->i_op = &proc_pid_link_inode_operations; + inode->i_size = 64; +@@ -1503,7 +1527,7 @@ static struct dentry *proc_lookupfd(stru + return NULL; + + out_unlock2: +- rcu_read_unlock(); ++ spin_unlock(&files->file_lock); + put_files_struct(files); + out_unlock: + iput(inode); +@@ -1879,14 +1903,14 @@ static int proc_self_readlink(struct den + int buflen) + { + char tmp[30]; +- sprintf(tmp, "%d", current->tgid); ++ sprintf(tmp, "%d", get_task_tgid(current)); + return vfs_readlink(dentry,buffer,buflen,tmp); + } + + static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) + { + char tmp[30]; +- sprintf(tmp, "%d", current->tgid); ++ sprintf(tmp, "%d", get_task_tgid(current)); + return ERR_PTR(vfs_follow_link(nd,tmp)); + } + +@@ -1911,11 +1935,8 @@ static struct inode_operations proc_self + * of PIDTYPE_PID. + */ + +-struct dentry *proc_pid_unhash(struct task_struct *p) ++struct dentry *__proc_pid_unhash(struct task_struct *p, struct dentry *proc_dentry) + { +- struct dentry *proc_dentry; +- +- proc_dentry = p->proc_dentry; + if (proc_dentry != NULL) { + + spin_lock(&dcache_lock); +@@ -1933,6 +1954,14 @@ struct dentry *proc_pid_unhash(struct ta + return proc_dentry; + } + ++void proc_pid_unhash(struct task_struct *p, struct dentry *pd[2]) ++{ ++ pd[0] = __proc_pid_unhash(p, p->proc_dentry); ++#ifdef CONFIG_VE ++ pd[1] = __proc_pid_unhash(p, VE_TASK_INFO(p)->glob_proc_dentry); ++#endif ++} ++ + /** + * proc_pid_flush - recover memory used by stale /proc/@pid/x entries + * @proc_dentry: directoy to prune. 
+@@ -1940,7 +1969,7 @@ struct dentry *proc_pid_unhash(struct ta + * Shrink the /proc directory that was used by the just killed thread. + */ + +-void proc_pid_flush(struct dentry *proc_dentry) ++void __proc_pid_flush(struct dentry *proc_dentry) + { + might_sleep(); + if(proc_dentry != NULL) { +@@ -1949,12 +1978,21 @@ void proc_pid_flush(struct dentry *proc_ + } + } + ++void proc_pid_flush(struct dentry *proc_dentry[2]) ++{ ++ __proc_pid_flush(proc_dentry[0]); ++#ifdef CONFIG_VE ++ __proc_pid_flush(proc_dentry[1]); ++#endif ++} ++ + /* SMP-safe */ + struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) + { + struct task_struct *task; + struct inode *inode; + struct proc_inode *ei; ++ struct dentry *pd[2]; + unsigned tgid; + int died; + +@@ -1978,7 +2016,19 @@ struct dentry *proc_pid_lookup(struct in + goto out; + + read_lock(&tasklist_lock); +- task = find_task_by_pid(tgid); ++ task = find_task_by_pid_ve(tgid); ++ /* In theory we are allowed to lookup both /proc/VIRT_PID and ++ * /proc/GLOBAL_PID inside VE. However, current /proc implementation ++ * cannot maintain two references to one task, so that we have ++ * to prohibit /proc/GLOBAL_PID. ++ */ ++ if (task && !ve_is_super(get_exec_env()) && !is_virtual_pid(tgid)) { ++ /* However, VE_ENTERed tasks are exception, they use global ++ * pids. 
++ */ ++ if (virt_pid(task) != tgid) ++ task = NULL; ++ } + if (task) + get_task_struct(task); + read_unlock(&tasklist_lock); +@@ -2007,16 +2057,23 @@ struct dentry *proc_pid_lookup(struct in + died = 0; + d_add(dentry, inode); + spin_lock(&task->proc_lock); ++#ifdef CONFIG_VE ++ if (ve_is_super(VE_OWNER_FSTYPE(inode->i_sb->s_type))) ++ VE_TASK_INFO(task)->glob_proc_dentry = dentry; ++ else ++ task->proc_dentry = dentry; ++#else + task->proc_dentry = dentry; ++#endif + if (!pid_alive(task)) { +- dentry = proc_pid_unhash(task); ++ proc_pid_unhash(task, pd); + died = 1; + } + spin_unlock(&task->proc_lock); + + put_task_struct(task); + if (died) { +- proc_pid_flush(dentry); ++ proc_pid_flush(pd); + goto out; + } + return NULL; +@@ -2037,7 +2094,12 @@ static struct dentry *proc_task_lookup(s + goto out; + + read_lock(&tasklist_lock); +- task = find_task_by_pid(tid); ++ task = find_task_by_pid_ve(tid); ++ /* See comment above in similar place. */ ++ if (task && !ve_is_super(get_exec_env()) && !is_virtual_pid(tid)) { ++ if (virt_pid(task) != tid) ++ task = NULL; ++ } + if (task) + get_task_struct(task); + read_unlock(&tasklist_lock); +@@ -2081,16 +2143,23 @@ out: + * tasklist lock while doing this, and we must release it before + * we actually do the filldir itself, so we use a temp buffer.. 
+ */ +-static int get_tgid_list(int index, unsigned long version, unsigned int *tgids) ++static int get_tgid_list(int index, unsigned long version, unsigned int *tgids, ++ struct ve_struct *ve) + { + struct task_struct *p; + int nr_tgids = 0; + + index--; + read_lock(&tasklist_lock); ++ if (list_empty(&ve->vetask_lh)) ++ goto out; + p = NULL; + if (version) { +- p = find_task_by_pid(version); ++ struct ve_struct *oldve; ++ ++ oldve = set_exec_env(ve); ++ p = find_task_by_pid_ve(version); ++ (void)set_exec_env(oldve); + if (p && !thread_group_leader(p)) + p = NULL; + } +@@ -2098,10 +2167,10 @@ static int get_tgid_list(int index, unsi + if (p) + index = 0; + else +- p = next_task(&init_task); ++ p = __first_task_ve(ve); + +- for ( ; p != &init_task; p = next_task(p)) { +- int tgid = p->pid; ++ for ( ; p != NULL; p = __next_task_ve(ve, p)) { ++ int tgid = get_task_pid_ve(p, ve); + if (!pid_alive(p)) + continue; + if (--index >= 0) +@@ -2111,6 +2180,7 @@ static int get_tgid_list(int index, unsi + if (nr_tgids >= PROC_MAXPIDS) + break; + } ++out: + read_unlock(&tasklist_lock); + return nr_tgids; + } +@@ -2134,7 +2204,7 @@ static int get_tid_list(int index, unsig + * via next_thread(). + */ + if (pid_alive(task)) do { +- int tid = task->pid; ++ int tid = get_task_pid(task); + + if (--index >= 0) + continue; +@@ -2171,7 +2241,8 @@ int proc_pid_readdir(struct file * filp, + next_tgid = filp->f_version; + filp->f_version = 0; + for (;;) { +- nr_tgids = get_tgid_list(nr, next_tgid, tgid_array); ++ nr_tgids = get_tgid_list(nr, next_tgid, tgid_array, ++ filp->f_dentry->d_sb->s_type->owner_env); + if (!nr_tgids) { + /* no more entries ! 
*/ + break; +diff -upr linux-2.6.16.orig/fs/proc/generic.c linux-2.6.16-026test015/fs/proc/generic.c +--- linux-2.6.16.orig/fs/proc/generic.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/proc/generic.c 2006-07-04 14:41:38.000000000 +0400 +@@ -10,7 +10,9 @@ + + #include <linux/errno.h> + #include <linux/time.h> ++#include <linux/fs.h> + #include <linux/proc_fs.h> ++#include <linux/ve_owner.h> + #include <linux/stat.h> + #include <linux/module.h> + #include <linux/mount.h> +@@ -29,6 +31,8 @@ static ssize_t proc_file_write(struct fi + size_t count, loff_t *ppos); + static loff_t proc_file_lseek(struct file *, loff_t, int); + ++static DEFINE_RWLOCK(proc_tree_lock); ++ + int proc_match(int len, const char *name, struct proc_dir_entry *de) + { + if (de->namelen != len) +@@ -229,6 +233,7 @@ proc_file_lseek(struct file *file, loff_ + return retval; + } + ++#ifndef CONFIG_VE + static int proc_notify_change(struct dentry *dentry, struct iattr *iattr) + { + struct inode *inode = dentry->d_inode; +@@ -261,9 +266,12 @@ static int proc_getattr(struct vfsmount + generic_fillattr(inode, stat); + return 0; + } ++#endif + + static struct inode_operations proc_file_inode_operations = { ++#ifndef CONFIG_VE + .setattr = proc_notify_change, ++#endif + }; + + /* +@@ -271,14 +279,20 @@ static struct inode_operations proc_file + * returns the struct proc_dir_entry for "/proc/tty/driver", and + * returns "serial" in residual. 
+ */ +-static int xlate_proc_name(const char *name, ++static int __xlate_proc_name(struct proc_dir_entry *root, const char *name, + struct proc_dir_entry **ret, const char **residual) + { + const char *cp = name, *next; + struct proc_dir_entry *de; + int len; + +- de = &proc_root; ++ if (*ret) { ++ de_get(*ret); ++ return 0; ++ } ++ ++ read_lock(&proc_tree_lock); ++ de = root; + while (1) { + next = strchr(cp, '/'); + if (!next) +@@ -289,15 +303,35 @@ static int xlate_proc_name(const char *n + if (proc_match(len, cp, de)) + break; + } +- if (!de) ++ if (!de) { ++ read_unlock(&proc_tree_lock); + return -ENOENT; ++ } + cp += len + 1; + } + *residual = cp; +- *ret = de; ++ *ret = de_get(de); ++ read_unlock(&proc_tree_lock); + return 0; + } + ++#ifndef CONFIG_VE ++#define xlate_proc_loc_name xlate_proc_name ++#else ++static int xlate_proc_loc_name(const char *name, ++ struct proc_dir_entry **ret, const char **residual) ++{ ++ return __xlate_proc_name(get_exec_env()->proc_root, ++ name, ret, residual); ++} ++#endif ++ ++static int xlate_proc_name(const char *name, ++ struct proc_dir_entry **ret, const char **residual) ++{ ++ return __xlate_proc_name(&proc_root, name, ret, residual); ++} ++ + static DEFINE_IDR(proc_inum_idr); + static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */ + +@@ -369,6 +403,20 @@ static struct dentry_operations proc_den + .d_delete = proc_delete_dentry, + }; + ++static struct proc_dir_entry *__proc_lookup(struct proc_dir_entry *dir, ++ struct dentry *d) ++{ ++ struct proc_dir_entry *de; ++ ++ for (de = dir->subdir; de; de = de->next) { ++ if (de->namelen != d->d_name.len) ++ continue; ++ if (!memcmp(d->d_name.name, de->name, de->namelen)) ++ break; ++ } ++ return de_get(de); ++} ++ + /* + * Don't create negative dentries here, return -ENOENT by hand + * instead. 
+@@ -376,34 +424,147 @@ static struct dentry_operations proc_den + struct dentry *proc_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) + { + struct inode *inode = NULL; +- struct proc_dir_entry * de; ++ struct proc_dir_entry *lde, *gde; + int error = -ENOENT; + + lock_kernel(); +- de = PDE(dir); +- if (de) { +- for (de = de->subdir; de ; de = de->next) { +- if (de->namelen != dentry->d_name.len) +- continue; +- if (!memcmp(dentry->d_name.name, de->name, de->namelen)) { +- unsigned int ino = de->low_ino; ++ lde = LPDE(dir); + +- error = -EINVAL; +- inode = proc_get_inode(dir->i_sb, ino, de); +- break; +- } +- } +- } ++ if (!lde) ++ goto out; ++ ++ read_lock(&proc_tree_lock); ++ lde = __proc_lookup(lde, dentry); ++#ifdef CONFIG_VE ++ gde = GPDE(dir); ++ if (gde) ++ gde = __proc_lookup(gde, dentry); ++#else ++ gde = NULL; ++#endif ++ read_unlock(&proc_tree_lock); ++ ++ /* ++ * There are following possible cases after lookup: ++ * ++ * lde gde ++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ * NULL NULL ENOENT ++ * loc NULL found in local tree ++ * loc glob found in both trees ++ * NULL glob found in global tree ++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ * ++ * We initialized inode as follows after lookup: ++ * ++ * inode->lde inode->gde ++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ * loc NULL in local tree ++ * loc glob both trees ++ * glob glob global tree ++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ * i.e. inode->lde is always initialized ++ */ ++ ++ if (lde == NULL && gde == NULL) ++ goto out; ++ ++ if (lde != NULL) ++ inode = proc_get_inode(dir->i_sb, lde->low_ino, lde); ++ else ++ inode = proc_get_inode(dir->i_sb, gde->low_ino, gde); ++ ++ /* ++ * We can sleep in proc_get_inode(), but since we have i_sem ++ * being taken, no one can setup GPDE/LPDE on this inode. 
++ */ ++ if (!inode) ++ goto out_put; ++ ++#ifdef CONFIG_VE ++ GPDE(inode) = de_get(gde); ++ if (gde) ++ __module_get(gde->owner); ++ ++ /* if dentry is found in both trees and it is a directory ++ * then inode's nlink count must be altered, because local ++ * and global subtrees may differ. ++ * on the other hand, they may intersect, so actual nlink ++ * value is difficult to calculate - upper estimate is used ++ * instead of it. ++ * dentry found in global tree only must not be writable ++ * in non-super ve. ++ */ ++ if (lde && gde && lde != gde && gde->nlink > 1) ++ inode->i_nlink += gde->nlink - 2; ++ if (lde == NULL && !ve_is_super( ++ VE_OWNER_FSTYPE(dir->i_sb->s_type))) ++ inode->i_mode &= ~S_IWUGO; ++#endif + unlock_kernel(); ++ dentry->d_op = &proc_dentry_operations; ++ d_add(dentry, inode); ++ de_put(lde); ++ de_put(gde); ++ return NULL; + +- if (inode) { +- dentry->d_op = &proc_dentry_operations; +- d_add(dentry, inode); +- return NULL; +- } ++out_put: ++ de_put(lde); ++ de_put(gde); ++out: ++ unlock_kernel(); + return ERR_PTR(error); + } + ++struct proc_dir_reader { ++ struct list_head list; ++ struct proc_dir_entry *next; ++}; ++ ++static LIST_HEAD(proc_dir_readers); ++static DEFINE_SPINLOCK(proc_dir_readers_lock); ++ ++static inline void add_reader(struct proc_dir_reader *r, ++ struct proc_dir_entry *cur) ++{ ++ r->next = cur->next; ++ spin_lock(&proc_dir_readers_lock); ++ list_add(&r->list, &proc_dir_readers); ++ spin_unlock(&proc_dir_readers_lock); ++} ++ ++static inline struct proc_dir_entry *del_reader(struct proc_dir_reader *r) ++{ ++ spin_lock(&proc_dir_readers_lock); ++ list_del(&r->list); ++ spin_unlock(&proc_dir_readers_lock); ++ return r->next; ++} ++ ++static void notify_readers(struct proc_dir_entry *de) ++{ ++ struct proc_dir_reader *r; ++ ++ /* lockless since proc_tree_lock is taken for writing */ ++ list_for_each_entry(r, &proc_dir_readers, list) ++ if (r->next == de) ++ r->next = de->next; ++} ++ ++static inline int in_tree(struct 
proc_dir_entry *de, struct proc_dir_entry *dir) ++{ ++ struct proc_dir_entry *gde; ++ ++ for (gde = dir->subdir; gde; gde = gde->next) { ++ if (de->namelen != gde->namelen) ++ continue; ++ if (memcmp(de->name, gde->name, gde->namelen)) ++ continue; ++ return 1; ++ } ++ return 0; ++} ++ + /* + * This returns non-zero if at EOF, so that the /proc + * root directory can use this and check if it should +@@ -421,6 +582,7 @@ int proc_readdir(struct file * filp, + int i; + struct inode *inode = filp->f_dentry->d_inode; + int ret = 0; ++ struct proc_dir_reader this; + + lock_kernel(); + +@@ -447,13 +609,12 @@ int proc_readdir(struct file * filp, + filp->f_pos++; + /* fall through */ + default: ++ read_lock(&proc_tree_lock); + de = de->subdir; + i -= 2; + for (;;) { +- if (!de) { +- ret = 1; +- goto out; +- } ++ if (!de) ++ goto chk_global; + if (!i) + break; + de = de->next; +@@ -461,12 +622,60 @@ int proc_readdir(struct file * filp, + } + + do { +- if (filldir(dirent, de->name, de->namelen, filp->f_pos, +- de->low_ino, de->mode >> 12) < 0) ++ de_get(de); ++ add_reader(&this, de); ++ read_unlock(&proc_tree_lock); ++ ret = filldir(dirent, de->name, de->namelen, ++ filp->f_pos, de->low_ino, ++ de->mode >> 12); ++ read_lock(&proc_tree_lock); ++ de_put(de); ++ de = del_reader(&this); ++ if (ret < 0) { ++ read_unlock(&proc_tree_lock); ++ ret = 0; + goto out; ++ } + filp->f_pos++; +- de = de->next; + } while (de); ++chk_global: ++#ifdef CONFIG_VE ++ de = GPDE(inode); ++ if (de == NULL) ++ goto done; ++ ++ de = de->subdir; ++ while (de) { ++ if (in_tree(de, LPDE(inode))) { ++ de = de->next; ++ continue; ++ } ++ ++ if (i > 0) { ++ i--; ++ de = de->next; ++ continue; ++ } ++ ++ de_get(de); ++ add_reader(&this, de); ++ read_unlock(&proc_tree_lock); ++ ret = filldir(dirent, de->name, de->namelen, ++ filp->f_pos, de->low_ino, ++ de->mode >> 12); ++ read_lock(&proc_tree_lock); ++ de_put(de); ++ de = del_reader(&this); ++ if (ret < 0) { ++ read_unlock(&proc_tree_lock); ++ ret = 0; ++ 
goto out; ++ } ++ filp->f_pos++; ++ } ++done: ++#endif ++ read_unlock(&proc_tree_lock); + } + ret = 1; + out: unlock_kernel(); +@@ -488,8 +697,10 @@ static struct file_operations proc_dir_o + */ + static struct inode_operations proc_dir_inode_operations = { + .lookup = proc_lookup, ++#ifndef CONFIG_VE + .getattr = proc_getattr, + .setattr = proc_notify_change, ++#endif + }; + + static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp) +@@ -499,10 +710,20 @@ static int proc_register(struct proc_dir + i = get_inode_number(); + if (i == 0) + return -EAGAIN; ++ ++ write_lock(&proc_tree_lock); ++ if (dir->deleted) { ++ write_unlock(&proc_tree_lock); ++ release_inode_number(i); ++ return -ENOENT; ++ } ++ + dp->low_ino = i; + dp->next = dir->subdir; +- dp->parent = dir; ++ dp->parent = de_get(dir); + dir->subdir = dp; ++ write_unlock(&proc_tree_lock); ++ + if (S_ISDIR(dp->mode)) { + if (dp->proc_iops == NULL) { + dp->proc_fops = &proc_dir_operations; +@@ -556,24 +777,26 @@ static struct proc_dir_entry *proc_creat + mode_t mode, + nlink_t nlink) + { +- struct proc_dir_entry *ent = NULL; ++ struct proc_dir_entry *ent; + const char *fn = name; + int len; + + /* make sure name is valid */ +- if (!name || !strlen(name)) goto out; ++ if (!name || !strlen(name)) ++ goto out; + +- if (!(*parent) && xlate_proc_name(name, parent, &fn) != 0) ++ if (xlate_proc_loc_name(name, parent, &fn) != 0) + goto out; + + /* At this point there must not be any '/' characters beyond *fn */ + if (strchr(fn, '/')) +- goto out; ++ goto out_put; + + len = strlen(fn); + + ent = kmalloc(sizeof(struct proc_dir_entry) + len + 1, GFP_KERNEL); +- if (!ent) goto out; ++ if (!ent) ++ goto out_put; + + memset(ent, 0, sizeof(struct proc_dir_entry)); + memcpy(((char *) ent) + sizeof(struct proc_dir_entry), fn, len + 1); +@@ -581,8 +804,13 @@ static struct proc_dir_entry *proc_creat + ent->namelen = len; + ent->mode = mode; + ent->nlink = nlink; +- out: ++ atomic_set(&ent->count, 1); + 
return ent; ++ ++out_put: ++ de_put(*parent); ++out: ++ return NULL; + } + + struct proc_dir_entry *proc_symlink(const char *name, +@@ -606,6 +834,7 @@ struct proc_dir_entry *proc_symlink(cons + kfree(ent); + ent = NULL; + } ++ de_put(parent); + } + return ent; + } +@@ -624,6 +853,7 @@ struct proc_dir_entry *proc_mkdir_mode(c + kfree(ent); + ent = NULL; + } ++ de_put(parent); + } + return ent; + } +@@ -662,9 +892,28 @@ struct proc_dir_entry *create_proc_entry + kfree(ent); + ent = NULL; + } ++ de_put(parent); + } + return ent; + } ++EXPORT_SYMBOL(remove_proc_glob_entry); ++ ++struct proc_dir_entry *create_proc_glob_entry(const char *name, mode_t mode, ++ struct proc_dir_entry *parent) ++{ ++ const char *path; ++ struct proc_dir_entry *ent; ++ ++ path = name; ++ if (xlate_proc_name(path, &parent, &name) != 0) ++ return NULL; ++ ++ ent = create_proc_entry(name, mode, parent); ++ de_put(parent); ++ return ent; ++} ++ ++EXPORT_SYMBOL(create_proc_glob_entry); + + void free_proc_entry(struct proc_dir_entry *de) + { +@@ -684,20 +933,21 @@ void free_proc_entry(struct proc_dir_ent + * Remove a /proc entry and free it if it's not currently in use. + * If it is in use, we set the 'deleted' flag. 
+ */ +-void remove_proc_entry(const char *name, struct proc_dir_entry *parent) ++static void __remove_proc_entry(const char *name, struct proc_dir_entry *parent) + { + struct proc_dir_entry **p; + struct proc_dir_entry *de; + const char *fn = name; + int len; + +- if (!parent && xlate_proc_name(name, &parent, &fn) != 0) +- goto out; + len = strlen(fn); ++ write_lock(&proc_tree_lock); + for (p = &parent->subdir; *p; p=&(*p)->next ) { + if (!proc_match(len, fn, *p)) + continue; ++ + de = *p; ++ notify_readers(de); + *p = de->next; + de->next = NULL; + if (S_ISDIR(de->mode)) +@@ -705,15 +955,43 @@ void remove_proc_entry(const char *name, + proc_kill_inodes(de); + de->nlink = 0; + WARN_ON(de->subdir); +- if (!atomic_read(&de->count)) +- free_proc_entry(de); +- else { +- de->deleted = 1; +- printk("remove_proc_entry: %s/%s busy, count=%d\n", +- parent->name, de->name, atomic_read(&de->count)); +- } ++ de->deleted = 1; ++ de_put(de); ++ de_put(parent); + break; + } +-out: +- return; ++ write_unlock(&proc_tree_lock); ++} ++ ++void remove_proc_loc_entry(const char *name, struct proc_dir_entry *parent) ++{ ++ const char *path; ++ ++ path = name; ++ if (xlate_proc_loc_name(path, &parent, &name) != 0) ++ return; ++ ++ __remove_proc_entry(name, parent); ++ de_put(parent); ++} ++ ++void remove_proc_glob_entry(const char *name, struct proc_dir_entry *parent) ++{ ++ const char *path; ++ ++ path = name; ++ if (xlate_proc_name(path, &parent, &name) != 0) ++ return; ++ ++ __remove_proc_entry(name, parent); ++ de_put(parent); ++} ++ ++void remove_proc_entry(const char *name, struct proc_dir_entry *parent) ++{ ++ remove_proc_loc_entry(name, parent); ++#ifdef CONFIG_VE ++ if (ve_is_super(get_exec_env())) ++ remove_proc_glob_entry(name, parent); ++#endif + } +diff -upr linux-2.6.16.orig/fs/proc/inode.c linux-2.6.16-026test015/fs/proc/inode.c +--- linux-2.6.16.orig/fs/proc/inode.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/proc/inode.c 2006-07-04 
14:41:38.000000000 +0400 +@@ -8,6 +8,7 @@ + #include <linux/proc_fs.h> + #include <linux/kernel.h> + #include <linux/mm.h> ++#include <linux/ve_owner.h> + #include <linux/string.h> + #include <linux/stat.h> + #include <linux/file.h> +@@ -21,34 +22,25 @@ + + #include "internal.h" + +-static inline struct proc_dir_entry * de_get(struct proc_dir_entry *de) +-{ +- if (de) +- atomic_inc(&de->count); +- return de; +-} +- + /* + * Decrements the use count and checks for deferred deletion. + */ +-static void de_put(struct proc_dir_entry *de) ++void de_put(struct proc_dir_entry *de) + { + if (de) { +- lock_kernel(); + if (!atomic_read(&de->count)) { + printk("de_put: entry %s already free!\n", de->name); +- unlock_kernel(); + return; + } + + if (atomic_dec_and_test(&de->count)) { +- if (de->deleted) { +- printk("de_put: deferred delete of %s\n", ++ if (unlikely(!de->deleted)) { ++ printk("de_put: early delete of %s\n", + de->name); +- free_proc_entry(de); ++ return; + } ++ free_proc_entry(de); + } +- unlock_kernel(); + } + } + +@@ -68,12 +60,19 @@ static void proc_delete_inode(struct ino + put_task_struct(tsk); + + /* Let go of any associated proc directory entry */ +- de = PROC_I(inode)->pde; ++ de = LPDE(inode); + if (de) { + if (de->owner) + module_put(de->owner); + de_put(de); + } ++#ifdef CONFIG_VE ++ de = GPDE(inode); ++ if (de) { ++ module_put(de->owner); ++ de_put(de); ++ } ++#endif + clear_inode(inode); + } + +@@ -100,6 +99,9 @@ static struct inode *proc_alloc_inode(st + ei->pde = NULL; + inode = &ei->vfs_inode; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; ++#ifdef CONFIG_VE ++ GPDE(inode) = NULL; ++#endif + return inode; + } + +@@ -209,6 +211,12 @@ int proc_fill_super(struct super_block * + s->s_root = d_alloc_root(root_inode); + if (!s->s_root) + goto out_no_root; ++#ifdef CONFIG_VE ++ LPDE(root_inode) = de_get(get_exec_env()->proc_root); ++ GPDE(root_inode) = &proc_root; ++#else ++ LPDE(root_inode) = &proc_root; ++#endif + return 0; + + 
out_no_root: +diff -upr linux-2.6.16.orig/fs/proc/kmsg.c linux-2.6.16-026test015/fs/proc/kmsg.c +--- linux-2.6.16.orig/fs/proc/kmsg.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/proc/kmsg.c 2006-07-04 14:41:38.000000000 +0400 +@@ -11,6 +11,7 @@ + #include <linux/kernel.h> + #include <linux/poll.h> + #include <linux/fs.h> ++#include <linux/veprintk.h> + + #include <asm/uaccess.h> + #include <asm/io.h> +@@ -40,7 +41,7 @@ static ssize_t kmsg_read(struct file *fi + + static unsigned int kmsg_poll(struct file *file, poll_table *wait) + { +- poll_wait(file, &log_wait, wait); ++ poll_wait(file, &ve_log_wait, wait); + if (do_syslog(9, NULL, 0)) + return POLLIN | POLLRDNORM; + return 0; +diff -upr linux-2.6.16.orig/fs/proc/proc_misc.c linux-2.6.16-026test015/fs/proc/proc_misc.c +--- linux-2.6.16.orig/fs/proc/proc_misc.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/proc/proc_misc.c 2006-07-04 14:41:39.000000000 +0400 +@@ -32,6 +32,7 @@ + #include <linux/pagemap.h> + #include <linux/swap.h> + #include <linux/slab.h> ++#include <linux/virtinfo.h> + #include <linux/smp.h> + #include <linux/signal.h> + #include <linux/module.h> +@@ -45,6 +46,8 @@ + #include <linux/jiffies.h> + #include <linux/sysrq.h> + #include <linux/vmalloc.h> ++#include <linux/version.h> ++#include <linux/compile.h> + #include <linux/crash_dump.h> + #include <asm/uaccess.h> + #include <asm/pgtable.h> +@@ -53,8 +56,10 @@ + #include <asm/div64.h> + #include "internal.h" + +-#define LOAD_INT(x) ((x) >> FSHIFT) +-#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) ++#ifdef CONFIG_FAIRSCHED ++#include <linux/fairsched.h> ++#endif ++ + /* + * Warning: stuff below (imported functions) assumes that its output will fit + * into one page. For some of those functions it may be wrong. 
Moreover, we +@@ -84,15 +89,33 @@ static int loadavg_read_proc(char *page, + { + int a, b, c; + int len; +- +- a = avenrun[0] + (FIXED_1/200); +- b = avenrun[1] + (FIXED_1/200); +- c = avenrun[2] + (FIXED_1/200); ++ unsigned long __nr_running; ++ int __nr_threads; ++ unsigned long *__avenrun; ++ struct ve_struct *ve; ++ ++ ve = get_exec_env(); ++ ++ if (ve_is_super(ve)) { ++ __avenrun = &avenrun[0]; ++ __nr_running = nr_running(); ++ __nr_threads = nr_threads; ++ } ++#ifdef CONFIG_VE ++ else { ++ __avenrun = &ve->avenrun[0]; ++ __nr_running = nr_running_ve(ve); ++ __nr_threads = atomic_read(&ve->pcounter); ++ } ++#endif ++ a = __avenrun[0] + (FIXED_1/200); ++ b = __avenrun[1] + (FIXED_1/200); ++ c = __avenrun[2] + (FIXED_1/200); + len = sprintf(page,"%d.%02d %d.%02d %d.%02d %ld/%d %d\n", + LOAD_INT(a), LOAD_FRAC(a), + LOAD_INT(b), LOAD_FRAC(b), + LOAD_INT(c), LOAD_FRAC(c), +- nr_running(), nr_threads, last_pid); ++ __nr_running, __nr_threads, last_pid); + return proc_calc_metrics(page, start, off, count, eof, len); + } + +@@ -105,6 +128,13 @@ static int uptime_read_proc(char *page, + cputime_t idletime = cputime_add(init_task.utime, init_task.stime); + + do_posix_clock_monotonic_gettime(&uptime); ++#ifdef CONFIG_VE ++ if (!ve_is_super(get_exec_env())) { ++ set_normalized_timespec(&uptime, ++ uptime.tv_sec - get_exec_env()->start_timespec.tv_sec, ++ uptime.tv_nsec - get_exec_env()->start_timespec.tv_nsec); ++ } ++#endif + cputime_to_timespec(idletime, &idle); + len = sprintf(page,"%lu.%02lu %lu.%02lu\n", + (unsigned long) uptime.tv_sec, +@@ -118,35 +148,37 @@ static int uptime_read_proc(char *page, + static int meminfo_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) + { +- struct sysinfo i; ++ struct meminfo mi; + int len; +- struct page_state ps; +- unsigned long inactive; +- unsigned long active; +- unsigned long free; +- unsigned long committed; +- unsigned long allowed; ++ unsigned long dummy; + struct vmalloc_info vmi; +- long 
cached; + +- get_page_state(&ps); +- get_zone_counts(&active, &inactive, &free); ++ get_page_state(&mi.ps); ++ get_zone_counts(&mi.active, &mi.inactive, &dummy); + + /* + * display in kilobytes. + */ + #define K(x) ((x) << (PAGE_SHIFT - 10)) +- si_meminfo(&i); +- si_swapinfo(&i); +- committed = atomic_read(&vm_committed_space); +- allowed = ((totalram_pages - hugetlb_total_pages()) +- * sysctl_overcommit_ratio / 100) + total_swap_pages; ++ si_meminfo(&mi.si); ++ si_swapinfo(&mi.si); ++ mi.committed_space = atomic_read(&vm_committed_space); ++ mi.swapcache = total_swapcache_pages; ++ mi.cache = get_page_cache_size() - mi.swapcache - mi.si.bufferram; ++ if (mi.cache < 0) ++ mi.cache = 0; + +- cached = get_page_cache_size() - total_swapcache_pages - i.bufferram; +- if (cached < 0) +- cached = 0; ++ mi.vmalloc_total = (VMALLOC_END - VMALLOC_START) >> PAGE_SHIFT; ++ mi.allowed = ((totalram_pages - hugetlb_total_pages()) ++ * sysctl_overcommit_ratio / 100) + total_swap_pages; + + get_vmalloc_info(&vmi); ++ mi.vmalloc_used = vmi.used >> PAGE_SHIFT; ++ mi.vmalloc_largest = vmi.largest_chunk >> PAGE_SHIFT; ++ ++ if (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_MEMINFO, &mi) ++ & NOTIFY_FAIL) ++ return -ENOMSG; + + /* + * Tagged format, for easy grepping and expansion. 
+@@ -175,29 +207,29 @@ static int meminfo_read_proc(char *page, + "VmallocTotal: %8lu kB\n" + "VmallocUsed: %8lu kB\n" + "VmallocChunk: %8lu kB\n", +- K(i.totalram), +- K(i.freeram), +- K(i.bufferram), +- K(cached), +- K(total_swapcache_pages), +- K(active), +- K(inactive), +- K(i.totalhigh), +- K(i.freehigh), +- K(i.totalram-i.totalhigh), +- K(i.freeram-i.freehigh), +- K(i.totalswap), +- K(i.freeswap), +- K(ps.nr_dirty), +- K(ps.nr_writeback), +- K(ps.nr_mapped), +- K(ps.nr_slab), +- K(allowed), +- K(committed), +- K(ps.nr_page_table_pages), +- (unsigned long)VMALLOC_TOTAL >> 10, +- vmi.used >> 10, +- vmi.largest_chunk >> 10 ++ K(mi.si.totalram), ++ K(mi.si.freeram), ++ K(mi.si.bufferram), ++ K(mi.cache), ++ K(mi.swapcache), ++ K(mi.active), ++ K(mi.inactive), ++ K(mi.si.totalhigh), ++ K(mi.si.freehigh), ++ K(mi.si.totalram-mi.si.totalhigh), ++ K(mi.si.freeram-mi.si.freehigh), ++ K(mi.si.totalswap), ++ K(mi.si.freeswap), ++ K(mi.ps.nr_dirty), ++ K(mi.ps.nr_writeback), ++ K(mi.ps.nr_mapped), ++ K(mi.ps.nr_slab), ++ K(mi.allowed), ++ K(mi.committed_space), ++ K(mi.ps.nr_page_table_pages), ++ K(mi.vmalloc_total), ++ K(mi.vmalloc_used), ++ K(mi.vmalloc_largest) + ); + + len += hugetlb_report_meminfo(page + len); +@@ -237,8 +269,15 @@ static int version_read_proc(char *page, + int count, int *eof, void *data) + { + int len; ++ struct new_utsname *utsname = &ve_utsname; + +- strcpy(page, linux_banner); ++ if (ve_is_super(get_exec_env())) ++ strcpy(page, linux_banner); ++ else ++ sprintf(page, "Linux version %s (" ++ LINUX_COMPILE_BY "@" LINUX_COMPILE_HOST ") (" ++ LINUX_COMPILER ") %s\n", ++ utsname->release, utsname->version); + len = strlen(page); + return proc_calc_metrics(page, start, off, count, eof, len); + } +@@ -249,144 +288,60 @@ static int cpuinfo_open(struct inode *in + return seq_open(file, &cpuinfo_op); + } + +-enum devinfo_states { +- CHR_HDR, +- CHR_LIST, +- BLK_HDR, +- BLK_LIST, +- DEVINFO_DONE +-}; +- +-struct devinfo_state { +- void *chrdev; +- void 
*blkdev; +- unsigned int num_records; +- unsigned int cur_record; +- enum devinfo_states state; ++static struct file_operations proc_cpuinfo_operations = { ++ .open = cpuinfo_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release, + }; + +-static void *devinfo_start(struct seq_file *f, loff_t *pos) ++static int devinfo_show(struct seq_file *f, void *v) + { +- struct devinfo_state *info = f->private; ++ int i = *(loff_t *) v; + +- if (*pos) { +- if ((info) && (*pos <= info->num_records)) +- return info; +- return NULL; ++ if (i < CHRDEV_MAJOR_HASH_SIZE) { ++ if (i == 0) ++ seq_printf(f, "Character devices:\n"); ++ chrdev_show(f, i); ++ } else { ++ i -= CHRDEV_MAJOR_HASH_SIZE; ++ if (i == 0) ++ seq_printf(f, "\nBlock devices:\n"); ++ blkdev_show(f, i); + } +- info = kmalloc(sizeof(*info), GFP_KERNEL); +- f->private = info; +- info->chrdev = acquire_chrdev_list(); +- info->blkdev = acquire_blkdev_list(); +- info->state = CHR_HDR; +- info->num_records = count_chrdev_list(); +- info->num_records += count_blkdev_list(); +- info->num_records += 2; /* Character and Block headers */ +- *pos = 1; +- info->cur_record = *pos; +- return info; ++ return 0; + } + +-static void *devinfo_next(struct seq_file *f, void *v, loff_t *pos) ++static void *devinfo_start(struct seq_file *f, loff_t *pos) + { +- int idummy; +- char *ndummy; +- struct devinfo_state *info = f->private; +- +- switch (info->state) { +- case CHR_HDR: +- info->state = CHR_LIST; +- (*pos)++; +- /*fallthrough*/ +- case CHR_LIST: +- if (get_chrdev_info(info->chrdev,&idummy,&ndummy)) { +- /* +- * The character dev list is complete +- */ +- info->state = BLK_HDR; +- } else { +- info->chrdev = get_next_chrdev(info->chrdev); +- } +- (*pos)++; +- break; +- case BLK_HDR: +- info->state = BLK_LIST; +- (*pos)++; +- break; +- case BLK_LIST: +- if (get_blkdev_info(info->blkdev,&idummy,&ndummy)) { +- /* +- * The block dev list is complete +- */ +- info->state = DEVINFO_DONE; +- } else { +- info->blkdev = 
get_next_blkdev(info->blkdev); +- } +- (*pos)++; +- break; +- case DEVINFO_DONE: +- (*pos)++; +- info->cur_record = *pos; +- info = NULL; +- break; +- default: +- break; +- } +- if (info) +- info->cur_record = *pos; +- return info; ++ if (*pos < (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_HASH_SIZE)) ++ return pos; ++ return NULL; + } + +-static void devinfo_stop(struct seq_file *f, void *v) ++static void *devinfo_next(struct seq_file *f, void *v, loff_t *pos) + { +- struct devinfo_state *info = f->private; +- +- if (info) { +- release_chrdev_list(info->chrdev); +- release_blkdev_list(info->blkdev); +- f->private = NULL; +- kfree(info); +- } ++ (*pos)++; ++ if (*pos >= (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_HASH_SIZE)) ++ return NULL; ++ return pos; + } + +-static int devinfo_show(struct seq_file *f, void *arg) +-{ +- int major; +- char *name; +- struct devinfo_state *info = f->private; +- +- switch(info->state) { +- case CHR_HDR: +- seq_printf(f,"Character devices:\n"); +- /* fallthrough */ +- case CHR_LIST: +- if (!get_chrdev_info(info->chrdev,&major,&name)) +- seq_printf(f,"%3d %s\n",major,name); +- break; +- case BLK_HDR: +- seq_printf(f,"\nBlock devices:\n"); +- /* fallthrough */ +- case BLK_LIST: +- if (!get_blkdev_info(info->blkdev,&major,&name)) +- seq_printf(f,"%3d %s\n",major,name); +- break; +- default: +- break; +- } +- +- return 0; ++static void devinfo_stop(struct seq_file *f, void *v) ++{ ++ /* Nothing to do */ + } + +-static struct seq_operations devinfo_op = { +- .start = devinfo_start, +- .next = devinfo_next, +- .stop = devinfo_stop, +- .show = devinfo_show, ++static struct seq_operations devinfo_ops = { ++ .start = devinfo_start, ++ .next = devinfo_next, ++ .stop = devinfo_stop, ++ .show = devinfo_show + }; + +-static int devinfo_open(struct inode *inode, struct file *file) ++static int devinfo_open(struct inode *inode, struct file *filp) + { +- return seq_open(file, &devinfo_op); ++ return seq_open(filp, &devinfo_ops); + } + + static struct 
file_operations proc_devinfo_operations = { +@@ -396,13 +351,6 @@ static struct file_operations proc_devin + .release = seq_release, + }; + +-static struct file_operations proc_cpuinfo_operations = { +- .open = cpuinfo_open, +- .read = seq_read, +- .llseek = seq_lseek, +- .release = seq_release, +-}; +- + extern struct seq_operations vmstat_op; + static int vmstat_open(struct inode *inode, struct file *file) + { +@@ -487,18 +435,15 @@ static struct file_operations proc_slabi + }; + #endif + +-static int show_stat(struct seq_file *p, void *v) ++static void show_stat_ve0(struct seq_file *p) + { + int i; +- unsigned long jif; ++ struct page_state page_state; + cputime64_t user, nice, system, idle, iowait, irq, softirq, steal; + u64 sum = 0; + + user = nice = system = idle = iowait = + irq = softirq = steal = cputime64_zero; +- jif = - wall_to_monotonic.tv_sec; +- if (wall_to_monotonic.tv_nsec) +- --jif; + + for_each_cpu(i) { + int j; +@@ -552,9 +497,84 @@ static int show_stat(struct seq_file *p, + for (i = 0; i < NR_IRQS; i++) + seq_printf(p, " %u", kstat_irqs(i)); + #endif ++ get_full_page_state(&page_state); ++ seq_printf(p, "\nswap %lu %lu\n", page_state.pswpin, page_state.pswpout); ++} ++ ++#ifdef CONFIG_VE ++static void show_stat_ve(struct seq_file *p, struct ve_struct *env) ++{ ++ int i; ++ u64 user, nice, system; ++ cycles_t idle, iowait; ++ cpumask_t ve_cpus; ++ ++ ve_cpu_online_map(env, &ve_cpus); ++ ++ user = nice = system = idle = iowait = 0; ++ for_each_cpu_mask(i, ve_cpus) { ++ user += VE_CPU_STATS(env, i)->user; ++ nice += VE_CPU_STATS(env, i)->nice; ++ system += VE_CPU_STATS(env, i)->system; ++ idle += ve_sched_get_idle_time(env, i); ++ iowait += ve_sched_get_iowait_time(env, i); ++ } ++ ++ seq_printf(p, "cpu %llu %llu %llu %llu %llu 0 0 0\n", ++ (unsigned long long)cputime64_to_clock_t(user), ++ (unsigned long long)cputime64_to_clock_t(nice), ++ (unsigned long long)cputime64_to_clock_t(system), ++ (unsigned long long)cycles_to_clocks(idle), ++ 
(unsigned long long)cycles_to_clocks(iowait)); ++ ++ for_each_cpu_mask(i, ve_cpus) { ++ user = VE_CPU_STATS(env, i)->user; ++ nice = VE_CPU_STATS(env, i)->nice; ++ system = VE_CPU_STATS(env, i)->system; ++ idle = ve_sched_get_idle_time(env, i); ++ iowait = ve_sched_get_iowait_time(env, i); ++ seq_printf(p, "cpu%d %llu %llu %llu %llu %llu 0 0 0\n", ++ i, ++ (unsigned long long)cputime64_to_clock_t(user), ++ (unsigned long long)cputime64_to_clock_t(nice), ++ (unsigned long long)cputime64_to_clock_t(system), ++ (unsigned long long)cycles_to_clocks(idle), ++ (unsigned long long)cycles_to_clocks(iowait)); ++ } ++ seq_printf(p, "intr 0\nswap 0 0\n"); ++} ++#endif ++ ++int show_stat(struct seq_file *p, void *v) ++{ ++ extern unsigned long total_forks; ++ unsigned long seq, jif; ++ struct ve_struct *env; ++ unsigned long __nr_running, __nr_iowait; ++ ++ do { ++ seq = read_seqbegin(&xtime_lock); ++ jif = - wall_to_monotonic.tv_sec; ++ if (wall_to_monotonic.tv_nsec) ++ --jif; ++ } while (read_seqretry(&xtime_lock, seq)); ++ ++ env = get_exec_env(); ++ if (ve_is_super(env)) { ++ show_stat_ve0(p); ++ __nr_running = nr_running(); ++ __nr_iowait = nr_iowait(); ++ } ++#ifdef CONFIG_VE ++ else { ++ show_stat_ve(p, env); ++ __nr_running = nr_running_ve(env); ++ __nr_iowait = nr_iowait_ve(env); ++ } ++#endif + + seq_printf(p, +- "\nctxt %llu\n" ++ "ctxt %llu\n" + "btime %lu\n" + "processes %lu\n" + "procs_running %lu\n" +@@ -562,8 +582,8 @@ static int show_stat(struct seq_file *p, + nr_context_switches(), + (unsigned long)jif, + total_forks, +- nr_running(), +- nr_iowait()); ++ __nr_running, ++ __nr_iowait); + + return 0; + } +@@ -652,7 +672,8 @@ static int cmdline_read_proc(char *page, + { + int len; + +- len = sprintf(page, "%s\n", saved_command_line); ++ len = sprintf(page, "%s\n", ++ ve_is_super(get_exec_env()) ? 
saved_command_line : ""); + return proc_calc_metrics(page, start, off, count, eof, len); + } + +diff -upr linux-2.6.16.orig/fs/proc/proc_tty.c linux-2.6.16-026test015/fs/proc/proc_tty.c +--- linux-2.6.16.orig/fs/proc/proc_tty.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/proc/proc_tty.c 2006-07-04 14:41:38.000000000 +0400 +@@ -6,6 +6,7 @@ + + #include <asm/uaccess.h> + ++#include <linux/ve_owner.h> + #include <linux/init.h> + #include <linux/errno.h> + #include <linux/time.h> +@@ -106,24 +107,35 @@ static int show_tty_driver(struct seq_fi + /* iterator */ + static void *t_start(struct seq_file *m, loff_t *pos) + { +- struct list_head *p; ++ struct tty_driver *drv; ++ + loff_t l = *pos; +- list_for_each(p, &tty_drivers) ++ read_lock(&tty_driver_guard); ++ list_for_each_entry(drv, &tty_drivers, tty_drivers) { ++ if (!ve_accessible_strict(VE_OWNER_TTYDRV(drv), get_exec_env())) ++ continue; + if (!l--) +- return list_entry(p, struct tty_driver, tty_drivers); ++ return drv; ++ } + return NULL; + } + + static void *t_next(struct seq_file *m, void *v, loff_t *pos) + { +- struct list_head *p = ((struct tty_driver *)v)->tty_drivers.next; ++ struct tty_driver *drv; ++ + (*pos)++; +- return p==&tty_drivers ? 
NULL : +- list_entry(p, struct tty_driver, tty_drivers); ++ drv = (struct tty_driver *)v; ++ list_for_each_entry_continue(drv, &tty_drivers, tty_drivers) { ++ if (ve_accessible_strict(VE_OWNER_TTYDRV(drv), get_exec_env())) ++ return drv; ++ } ++ return NULL; + } + + static void t_stop(struct seq_file *m, void *v) + { ++ read_unlock(&tty_driver_guard); + } + + static struct seq_operations tty_drivers_op = { +diff -upr linux-2.6.16.orig/fs/proc/root.c linux-2.6.16-026test015/fs/proc/root.c +--- linux-2.6.16.orig/fs/proc/root.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/proc/root.c 2006-07-04 14:41:38.000000000 +0400 +@@ -20,7 +20,10 @@ + + #include "internal.h" + +-struct proc_dir_entry *proc_net, *proc_net_stat, *proc_bus, *proc_root_fs, *proc_root_driver; ++#ifndef CONFIG_VE ++struct proc_dir_entry *proc_net, *proc_net_stat; ++#endif ++struct proc_dir_entry *proc_bus, *proc_root_fs, *proc_root_driver; + + #ifdef CONFIG_SYSCTL + struct proc_dir_entry *proc_sys_root; +@@ -32,12 +35,14 @@ static struct super_block *proc_get_sb(s + return get_sb_single(fs_type, flags, data, proc_fill_super); + } + +-static struct file_system_type proc_fs_type = { ++struct file_system_type proc_fs_type = { + .name = "proc", + .get_sb = proc_get_sb, + .kill_sb = kill_anon_super, + }; + ++EXPORT_SYMBOL(proc_fs_type); ++ + void __init proc_root_init(void) + { + int err = proc_init_inodecache(); +@@ -157,7 +162,9 @@ EXPORT_SYMBOL(create_proc_entry); + EXPORT_SYMBOL(remove_proc_entry); + EXPORT_SYMBOL(proc_root); + EXPORT_SYMBOL(proc_root_fs); ++#ifndef CONFIG_VE + EXPORT_SYMBOL(proc_net); + EXPORT_SYMBOL(proc_net_stat); ++#endif + EXPORT_SYMBOL(proc_bus); + EXPORT_SYMBOL(proc_root_driver); +diff -upr linux-2.6.16.orig/fs/proc/task_mmu.c linux-2.6.16-026test015/fs/proc/task_mmu.c +--- linux-2.6.16.orig/fs/proc/task_mmu.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/proc/task_mmu.c 2006-07-04 14:41:38.000000000 +0400 +@@ -90,9 +90,12 @@ int 
proc_exe_link(struct inode *inode, s + } + + if (vma) { +- *mnt = mntget(vma->vm_file->f_vfsmnt); +- *dentry = dget(vma->vm_file->f_dentry); +- result = 0; ++ result = d_root_check(vma->vm_file->f_dentry, ++ vma->vm_file->f_vfsmnt); ++ if (!result) { ++ *mnt = mntget(vma->vm_file->f_vfsmnt); ++ *dentry = dget(vma->vm_file->f_dentry); ++ } + } + + up_read(&mm->mmap_sem); +diff -upr linux-2.6.16.orig/fs/proc/task_nommu.c linux-2.6.16-026test015/fs/proc/task_nommu.c +--- linux-2.6.16.orig/fs/proc/task_nommu.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/proc/task_nommu.c 2006-07-04 14:41:38.000000000 +0400 +@@ -126,9 +126,12 @@ int proc_exe_link(struct inode *inode, s + } + + if (vma) { +- *mnt = mntget(vma->vm_file->f_vfsmnt); +- *dentry = dget(vma->vm_file->f_dentry); +- result = 0; ++ result = d_root_check(vma->vm_file->f_dentry, ++ vma->vm_file->f_vfsmnt); ++ if (!result) { ++ *mnt = mntget(vma->vm_file->f_vfsmnt); ++ *dentry = dget(vma->vm_file->f_dentry); ++ } + } + + up_read(&mm->mmap_sem); +diff -upr linux-2.6.16.orig/fs/proc/vmcore.c linux-2.6.16-026test015/fs/proc/vmcore.c +--- linux-2.6.16.orig/fs/proc/vmcore.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/proc/vmcore.c 2006-07-04 14:41:36.000000000 +0400 +@@ -103,8 +103,8 @@ static ssize_t read_vmcore(struct file * + size_t buflen, loff_t *fpos) + { + ssize_t acc = 0, tmp; +- size_t tsz, nr_bytes; +- u64 start; ++ size_t tsz; ++ u64 start, nr_bytes; + struct vmcore *curr_m = NULL; + + if (buflen == 0 || *fpos >= vmcore_size) +diff -upr linux-2.6.16.orig/fs/quota.c linux-2.6.16-026test015/fs/quota.c +--- linux-2.6.16.orig/fs/quota.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/quota.c 2006-07-04 14:41:39.000000000 +0400 +@@ -81,11 +81,11 @@ static int generic_quotactl_valid(struct + if (cmd == Q_GETQUOTA) { + if (((type == USRQUOTA && current->euid != id) || + (type == GRPQUOTA && !in_egroup_p(id))) && +- !capable(CAP_SYS_ADMIN)) ++ 
!capable(CAP_VE_SYS_ADMIN)) + return -EPERM; + } + else if (cmd != Q_GETFMT && cmd != Q_SYNC && cmd != Q_GETINFO) +- if (!capable(CAP_SYS_ADMIN)) ++ if (!capable(CAP_VE_SYS_ADMIN)) + return -EPERM; + + return 0; +@@ -132,10 +132,10 @@ static int xqm_quotactl_valid(struct sup + if (cmd == Q_XGETQUOTA) { + if (((type == XQM_USRQUOTA && current->euid != id) || + (type == XQM_GRPQUOTA && !in_egroup_p(id))) && +- !capable(CAP_SYS_ADMIN)) ++ !capable(CAP_VE_SYS_ADMIN)) + return -EPERM; + } else if (cmd != Q_XGETQSTAT && cmd != Q_XQUOTASYNC) { +- if (!capable(CAP_SYS_ADMIN)) ++ if (!capable(CAP_VE_SYS_ADMIN)) + return -EPERM; + } + +@@ -216,7 +216,7 @@ restart: + sb->s_count++; + spin_unlock(&sb_lock); + down_read(&sb->s_umount); +- if (sb->s_root && sb->s_qcop->quota_sync) ++ if (sb->s_root && sb->s_qcop && sb->s_qcop->quota_sync) + quota_sync_sb(sb, type); + up_read(&sb->s_umount); + spin_lock(&sb_lock); +@@ -337,6 +337,235 @@ static int do_quotactl(struct super_bloc + return 0; + } + ++static struct super_block *quota_get_sb(const char __user *special) ++{ ++ struct super_block *sb; ++ struct block_device *bdev; ++ char *tmp; ++ ++ tmp = getname(special); ++ if (IS_ERR(tmp)) ++ return (struct super_block *)tmp; ++ bdev = lookup_bdev(tmp, FMODE_QUOTACTL); ++ putname(tmp); ++ if (IS_ERR(bdev)) ++ return (struct super_block *)bdev; ++ sb = get_super(bdev); ++ bdput(bdev); ++ if (!sb) ++ return ERR_PTR(-ENODEV); ++ return sb; ++} ++ ++#ifdef CONFIG_QUOTA_COMPAT ++ ++#define QC_QUOTAON 0x0100 /* enable quotas */ ++#define QC_QUOTAOFF 0x0200 /* disable quotas */ ++/* GETQUOTA, SETQUOTA and SETUSE which were at 0x0300-0x0500 has now other parameteres */ ++#define QC_SYNC 0x0600 /* sync disk copy of a filesystems quotas */ ++#define QC_SETQLIM 0x0700 /* set limits */ ++/* GETSTATS at 0x0800 is now longer... */ ++#define QC_GETINFO 0x0900 /* get info about quotas - graces, flags... 
*/ ++#define QC_SETINFO 0x0A00 /* set info about quotas */ ++#define QC_SETGRACE 0x0B00 /* set inode and block grace */ ++#define QC_SETFLAGS 0x0C00 /* set flags for quota */ ++#define QC_GETQUOTA 0x0D00 /* get limits and usage */ ++#define QC_SETQUOTA 0x0E00 /* set limits and usage */ ++#define QC_SETUSE 0x0F00 /* set usage */ ++/* 0x1000 used by old RSQUASH */ ++#define QC_GETSTATS 0x1100 /* get collected stats */ ++ ++struct compat_dqblk { ++ unsigned int dqb_ihardlimit; ++ unsigned int dqb_isoftlimit; ++ unsigned int dqb_curinodes; ++ unsigned int dqb_bhardlimit; ++ unsigned int dqb_bsoftlimit; ++ qsize_t dqb_curspace; ++ __kernel_time_t dqb_btime; ++ __kernel_time_t dqb_itime; ++}; ++ ++struct compat_dqinfo { ++ unsigned int dqi_bgrace; ++ unsigned int dqi_igrace; ++ unsigned int dqi_flags; ++ unsigned int dqi_blocks; ++ unsigned int dqi_free_blk; ++ unsigned int dqi_free_entry; ++}; ++ ++struct compat_dqstats { ++ __u32 lookups; ++ __u32 drops; ++ __u32 reads; ++ __u32 writes; ++ __u32 cache_hits; ++ __u32 allocated_dquots; ++ __u32 free_dquots; ++ __u32 syncs; ++ __u32 version; ++}; ++ ++asmlinkage long sys_quotactl(unsigned int cmd, const char __user *special, qid_t id, void __user *addr); ++static long compat_quotactl(unsigned int cmds, unsigned int type, ++ const char __user *special, qid_t id, ++ void __user *addr) ++{ ++ struct super_block *sb; ++ long ret; ++ ++ sb = NULL; ++ switch (cmds) { ++ case QC_QUOTAON: ++ return sys_quotactl(QCMD(Q_QUOTAON, type), ++ special, id, addr); ++ ++ case QC_QUOTAOFF: ++ return sys_quotactl(QCMD(Q_QUOTAOFF, type), ++ special, id, addr); ++ ++ case QC_SYNC: ++ return sys_quotactl(QCMD(Q_SYNC, type), ++ special, id, addr); ++ ++ case QC_GETQUOTA: { ++ struct if_dqblk idq; ++ struct compat_dqblk cdq; ++ ++ sb = quota_get_sb(special); ++ ret = PTR_ERR(sb); ++ if (IS_ERR(sb)) ++ break; ++ ret = check_quotactl_valid(sb, type, Q_GETQUOTA, id); ++ if (ret) ++ break; ++ ret = sb->s_qcop->get_dqblk(sb, type, id, &idq); ++ if 
(ret) ++ break; ++ cdq.dqb_ihardlimit = idq.dqb_ihardlimit; ++ cdq.dqb_isoftlimit = idq.dqb_isoftlimit; ++ cdq.dqb_curinodes = idq.dqb_curinodes; ++ cdq.dqb_bhardlimit = idq.dqb_bhardlimit; ++ cdq.dqb_bsoftlimit = idq.dqb_bsoftlimit; ++ cdq.dqb_curspace = idq.dqb_curspace; ++ cdq.dqb_btime = idq.dqb_btime; ++ cdq.dqb_itime = idq.dqb_itime; ++ ret = 0; ++ if (copy_to_user(addr, &cdq, sizeof(cdq))) ++ ret = -EFAULT; ++ break; ++ } ++ ++ case QC_SETQUOTA: ++ case QC_SETUSE: ++ case QC_SETQLIM: { ++ struct if_dqblk idq; ++ struct compat_dqblk cdq; ++ ++ sb = quota_get_sb(special); ++ ret = PTR_ERR(sb); ++ if (IS_ERR(sb)) ++ break; ++ ret = check_quotactl_valid(sb, type, Q_SETQUOTA, id); ++ if (ret) ++ break; ++ ret = -EFAULT; ++ if (copy_from_user(&cdq, addr, sizeof(cdq))) ++ break; ++ idq.dqb_ihardlimit = cdq.dqb_ihardlimit; ++ idq.dqb_isoftlimit = cdq.dqb_isoftlimit; ++ idq.dqb_curinodes = cdq.dqb_curinodes; ++ idq.dqb_bhardlimit = cdq.dqb_bhardlimit; ++ idq.dqb_bsoftlimit = cdq.dqb_bsoftlimit; ++ idq.dqb_curspace = cdq.dqb_curspace; ++ idq.dqb_valid = 0; ++ if (cmds == QC_SETQUOTA || cmds == QC_SETQLIM) ++ idq.dqb_valid |= QIF_LIMITS; ++ if (cmds == QC_SETQUOTA || cmds == QC_SETUSE) ++ idq.dqb_valid |= QIF_USAGE; ++ ret = sb->s_qcop->set_dqblk(sb, type, id, &idq); ++ break; ++ } ++ ++ case QC_GETINFO: { ++ struct if_dqinfo iinf; ++ struct compat_dqinfo cinf; ++ ++ sb = quota_get_sb(special); ++ ret = PTR_ERR(sb); ++ if (IS_ERR(sb)) ++ break; ++ ret = check_quotactl_valid(sb, type, Q_GETQUOTA, id); ++ if (ret) ++ break; ++ ret = sb->s_qcop->get_info(sb, type, &iinf); ++ if (ret) ++ break; ++ cinf.dqi_bgrace = iinf.dqi_bgrace; ++ cinf.dqi_igrace = iinf.dqi_igrace; ++ cinf.dqi_flags = 0; ++ if (iinf.dqi_flags & DQF_INFO_DIRTY) ++ cinf.dqi_flags |= 0x0010; ++ cinf.dqi_blocks = 0; ++ cinf.dqi_free_blk = 0; ++ cinf.dqi_free_entry = 0; ++ ret = 0; ++ if (copy_to_user(addr, &cinf, sizeof(cinf))) ++ ret = -EFAULT; ++ break; ++ } ++ ++ case QC_SETINFO: ++ case QC_SETGRACE: ++ 
case QC_SETFLAGS: { ++ struct if_dqinfo iinf; ++ struct compat_dqinfo cinf; ++ ++ sb = quota_get_sb(special); ++ ret = PTR_ERR(sb); ++ if (IS_ERR(sb)) ++ break; ++ ret = check_quotactl_valid(sb, type, Q_SETINFO, id); ++ if (ret) ++ break; ++ ret = -EFAULT; ++ if (copy_from_user(&cinf, addr, sizeof(cinf))) ++ break; ++ iinf.dqi_bgrace = cinf.dqi_bgrace; ++ iinf.dqi_igrace = cinf.dqi_igrace; ++ iinf.dqi_flags = cinf.dqi_flags; ++ iinf.dqi_valid = 0; ++ if (cmds == QC_SETINFO || cmds == QC_SETGRACE) ++ iinf.dqi_valid |= IIF_BGRACE | IIF_IGRACE; ++ if (cmds == QC_SETINFO || cmds == QC_SETFLAGS) ++ iinf.dqi_valid |= IIF_FLAGS; ++ ret = sb->s_qcop->set_info(sb, type, &iinf); ++ break; ++ } ++ ++ case QC_GETSTATS: { ++ struct compat_dqstats stat; ++ ++ memset(&stat, 0, sizeof(stat)); ++ stat.version = 6*10000+5*100+0; ++ ret = 0; ++ if (copy_to_user(addr, &stat, sizeof(stat))) ++ ret = -EFAULT; ++ break; ++ } ++ ++ default: ++ ret = -ENOSYS; ++ break; ++ } ++ if (sb && !IS_ERR(sb)) ++ drop_super(sb); ++ return ret; ++} ++ ++#endif ++ + /* + * This is the system call interface. This communicates with + * the user-level programs. 
Currently this only supports diskquota +@@ -347,25 +576,20 @@ asmlinkage long sys_quotactl(unsigned in + { + uint cmds, type; + struct super_block *sb = NULL; +- struct block_device *bdev; +- char *tmp; + int ret; + + cmds = cmd >> SUBCMDSHIFT; + type = cmd & SUBCMDMASK; + ++#ifdef CONFIG_QUOTA_COMPAT ++ if (cmds >= 0x0100 && cmds < 0x3000) ++ return compat_quotactl(cmds, type, special, id, addr); ++#endif ++ + if (cmds != Q_SYNC || special) { +- tmp = getname(special); +- if (IS_ERR(tmp)) +- return PTR_ERR(tmp); +- bdev = lookup_bdev(tmp); +- putname(tmp); +- if (IS_ERR(bdev)) +- return PTR_ERR(bdev); +- sb = get_super(bdev); +- bdput(bdev); +- if (!sb) +- return -ENODEV; ++ sb = quota_get_sb(special); ++ if (IS_ERR(sb)) ++ return PTR_ERR(sb); + } + + ret = check_quotactl_valid(sb, type, cmds, id); +diff -upr linux-2.6.16.orig/fs/reiserfs/namei.c linux-2.6.16-026test015/fs/reiserfs/namei.c +--- linux-2.6.16.orig/fs/reiserfs/namei.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/reiserfs/namei.c 2006-07-04 14:41:39.000000000 +0400 +@@ -864,6 +864,9 @@ static int reiserfs_rmdir(struct inode * + INITIALIZE_PATH(path); + struct reiserfs_dir_entry de; + ++ inode = dentry->d_inode; ++ DQUOT_INIT(inode); ++ + /* we will be doing 2 balancings and update 2 stat data, we change quotas + * of the owner of the directory and of the owner of the parent directory. 
+ * The quota structure is possibly deleted only on last iput => outside +@@ -888,8 +891,6 @@ static int reiserfs_rmdir(struct inode * + goto end_rmdir; + } + +- inode = dentry->d_inode; +- + reiserfs_update_inode_transaction(inode); + reiserfs_update_inode_transaction(dir); + +@@ -952,6 +953,7 @@ static int reiserfs_unlink(struct inode + unsigned long savelink; + + inode = dentry->d_inode; ++ DQUOT_INIT(inode); + + /* in this transaction we can be doing at max two balancings and update + * two stat datas, we change quotas of the owner of the directory and of +@@ -1259,6 +1261,8 @@ static int reiserfs_rename(struct inode + + old_inode = old_dentry->d_inode; + new_dentry_inode = new_dentry->d_inode; ++ if (new_dentry_inode) ++ DQUOT_INIT(new_dentry_inode); + + // make sure, that oldname still exists and points to an object we + // are going to rename +diff -upr linux-2.6.16.orig/fs/reiserfs/xattr.c linux-2.6.16-026test015/fs/reiserfs/xattr.c +--- linux-2.6.16.orig/fs/reiserfs/xattr.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/reiserfs/xattr.c 2006-07-04 14:41:37.000000000 +0400 +@@ -1343,7 +1343,8 @@ static int reiserfs_check_acl(struct ino + return error; + } + +-int reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd) ++int reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd, ++ struct exec_perm *perm) + { + /* + * We don't do permission checks on the internal objects. +@@ -1356,7 +1357,7 @@ int reiserfs_permission(struct inode *in + * Stat data v1 doesn't support ACLs. 
+ */ + if (get_inode_sd_version(inode) == STAT_DATA_V1) +- return generic_permission(inode, mask, NULL); ++ return generic_permission(inode, mask, NULL, perm); + else +- return generic_permission(inode, mask, reiserfs_check_acl); ++ return generic_permission(inode, mask, reiserfs_check_acl, perm); + } +diff -upr linux-2.6.16.orig/fs/reiserfs/xattr_acl.c linux-2.6.16-026test015/fs/reiserfs/xattr_acl.c +--- linux-2.6.16.orig/fs/reiserfs/xattr_acl.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/reiserfs/xattr_acl.c 2006-07-04 14:41:36.000000000 +0400 +@@ -408,8 +408,9 @@ int reiserfs_cache_default_acl(struct in + acl = reiserfs_get_acl(inode, ACL_TYPE_DEFAULT); + reiserfs_read_unlock_xattrs(inode->i_sb); + reiserfs_read_unlock_xattr_i(inode); +- ret = acl ? 1 : 0; +- posix_acl_release(acl); ++ ret = (acl && !IS_ERR(acl)); ++ if (ret) ++ posix_acl_release(acl); + } + + return ret; +diff -upr linux-2.6.16.orig/fs/select.c linux-2.6.16-026test015/fs/select.c +--- linux-2.6.16.orig/fs/select.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/select.c 2006-07-04 14:41:37.000000000 +0400 +@@ -24,6 +24,8 @@ + #include <linux/fs.h> + #include <linux/rcupdate.h> + ++#include <ub/ub_mem.h> ++ + #include <asm/uaccess.h> + + #define ROUND_UP(x,y) (((x)+(y)-1)/(y)) +@@ -286,7 +288,7 @@ int do_select(int n, fd_set_bits *fds, s + + static void *select_bits_alloc(int size) + { +- return kmalloc(6 * size, GFP_KERNEL); ++ return ub_kmalloc(6 * size, GFP_KERNEL); + } + + static void select_bits_free(void *bits, int size) +@@ -645,7 +647,7 @@ int do_sys_poll(struct pollfd __user *uf + err = -ENOMEM; + while(i!=0) { + struct poll_list *pp; +- pp = kmalloc(sizeof(struct poll_list)+ ++ pp = ub_kmalloc(sizeof(struct poll_list)+ + sizeof(struct pollfd)* + (i>POLLFD_PER_PAGE?POLLFD_PER_PAGE:i), + GFP_KERNEL); +diff -upr linux-2.6.16.orig/fs/seq_file.c linux-2.6.16-026test015/fs/seq_file.c +--- linux-2.6.16.orig/fs/seq_file.c 2006-03-20 
08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/seq_file.c 2006-07-04 14:41:38.000000000 +0400 +@@ -345,6 +345,8 @@ int seq_path(struct seq_file *m, + if (m->count < m->size) { + char *s = m->buf + m->count; + char *p = d_path(dentry, mnt, s, m->size - m->count); ++ if (IS_ERR(p) && PTR_ERR(p) != -ENAMETOOLONG) ++ return 0; + if (!IS_ERR(p)) { + while (s <= p) { + char c = *p++; +diff -upr linux-2.6.16.orig/fs/simfs.c linux-2.6.16-026test015/fs/simfs.c +--- linux-2.6.16.orig/fs/simfs.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/fs/simfs.c 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,290 @@ ++/* ++ * fs/simfs.c ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include <linux/config.h> ++#include <linux/fs.h> ++#include <linux/file.h> ++#include <linux/init.h> ++#include <linux/namei.h> ++#include <linux/err.h> ++#include <linux/module.h> ++#include <linux/mount.h> ++#include <linux/vzquota.h> ++#include <linux/statfs.h> ++#include <linux/virtinfo.h> ++#include <linux/faudit.h> ++#include <linux/genhd.h> ++ ++#include <asm/unistd.h> ++#include <asm/uaccess.h> ++ ++#define SIMFS_GET_LOWER_FS_SB(sb) sb->s_root->d_sb ++ ++static struct super_operations sim_super_ops; ++ ++static int sim_getattr(struct vfsmount *mnt, struct dentry *dentry, ++ struct kstat *stat) ++{ ++ struct super_block *sb; ++ struct inode *inode; ++ ++ inode = dentry->d_inode; ++ if (!inode->i_op->getattr) { ++ generic_fillattr(inode, stat); ++ if (!stat->blksize) { ++ unsigned blocks; ++ ++ sb = inode->i_sb; ++ blocks = (stat->size + sb->s_blocksize-1) >> ++ sb->s_blocksize_bits; ++ stat->blocks = (sb->s_blocksize / 512) * blocks; ++ stat->blksize = sb->s_blocksize; ++ } ++ } else { ++ int err; ++ ++ err = inode->i_op->getattr(mnt, dentry, stat); ++ if (err) ++ return err; ++ } ++ ++ sb = mnt->mnt_sb; ++ if (sb->s_op == &sim_super_ops) ++ stat->dev = sb->s_dev; ++ return 0; ++} 
++ ++static void quota_get_stat(struct super_block *sb, struct kstatfs *buf) ++{ ++ int err; ++ struct dq_stat qstat; ++ struct virt_info_quota q; ++ long free_file, adj_file; ++ s64 blk, free_blk, adj_blk; ++ int bsize_bits; ++ ++ q.super = sb; ++ q.qstat = &qstat; ++ err = virtinfo_notifier_call(VITYPE_QUOTA, VIRTINFO_QUOTA_GETSTAT, &q); ++ if (err != NOTIFY_OK) ++ return; ++ ++ bsize_bits = ffs(buf->f_bsize) - 1; ++ free_blk = (s64)(qstat.bsoftlimit - qstat.bcurrent) >> bsize_bits; ++ if (free_blk < 0) ++ free_blk = 0; ++ /* ++ * In the regular case, we always set buf->f_bfree and buf->f_blocks to ++ * the values reported by quota. In case of real disk space shortage, ++ * we adjust the values. We want this adjustment to look as if the ++ * total disk space were reduced, not as if the usage were increased. ++ * -- SAW ++ */ ++ adj_blk = 0; ++ if (buf->f_bfree < free_blk) ++ adj_blk = free_blk - buf->f_bfree; ++ buf->f_bfree = (long)(free_blk - adj_blk); ++ ++ if (free_blk < buf->f_bavail) ++ buf->f_bavail = (long)free_blk; /* min(f_bavail, free_blk) */ ++ ++ blk = (qstat.bsoftlimit >> bsize_bits) - adj_blk; ++ buf->f_blocks = blk > LONG_MAX ? LONG_MAX : blk; ++ ++ free_file = qstat.isoftlimit - qstat.icurrent; ++ if (free_file < 0) ++ free_file = 0; ++ if (buf->f_ffree == -1) ++ /* ++ * One filesystem uses -1 to represent the fact that it doesn't ++ * have a detached limit for inode number. ++ * May be, because -1 is a good pretendent for the maximum value ++ * of signed long type, may be, because it's just nice to have ++ * an exceptional case... 
Guess what that filesystem is :-) ++ * -- SAW ++ */ ++ buf->f_ffree = free_file; ++ adj_file = 0; ++ if (buf->f_ffree < free_file) ++ adj_file = free_file - buf->f_ffree; ++ buf->f_ffree = free_file - adj_file; ++ buf->f_files = qstat.isoftlimit - adj_file; ++} ++ ++static int sim_statfs(struct super_block *sb, struct kstatfs *buf) ++{ ++ int err; ++ struct super_block *lsb; ++ struct kstatfs statbuf; ++ ++ err = 0; ++ if (sb->s_op != &sim_super_ops) ++ return 0; ++ ++ lsb = SIMFS_GET_LOWER_FS_SB(sb); ++ ++ err = -ENOSYS; ++ if (lsb && lsb->s_op && lsb->s_op->statfs) ++ err = lsb->s_op->statfs(lsb, &statbuf); ++ if (err) ++ return err; ++ ++ quota_get_stat(sb, &statbuf); ++ ++ buf->f_files = statbuf.f_files; ++ buf->f_ffree = statbuf.f_ffree; ++ buf->f_blocks = statbuf.f_blocks; ++ buf->f_bfree = statbuf.f_bfree; ++ buf->f_bavail = statbuf.f_bavail; ++ return 0; ++} ++ ++static int sim_systemcall(struct vnotifier_block *me, unsigned long n, ++ void *d, int old_ret) ++{ ++ int err; ++ ++ switch (n) { ++ case VIRTINFO_FAUDIT_STAT: { ++ struct faudit_stat_arg *arg; ++ ++ arg = (struct faudit_stat_arg *)d; ++ err = sim_getattr(arg->mnt, arg->dentry, arg->stat); ++ arg->err = err; ++ } ++ break; ++ case VIRTINFO_FAUDIT_STATFS: { ++ struct faudit_statfs_arg *arg; ++ ++ arg = (struct faudit_statfs_arg *)d; ++ err = sim_statfs(arg->sb, arg->stat); ++ arg->err = err; ++ } ++ break; ++ default: ++ return old_ret; ++ } ++ return (err ? 
NOTIFY_BAD : NOTIFY_OK); ++} ++ ++static struct inode *sim_quota_root(struct super_block *sb) ++{ ++ return sb->s_root->d_inode; ++} ++ ++void sim_put_super(struct super_block *sb) ++{ ++ struct virt_info_quota viq; ++ ++ viq.super = sb; ++ virtinfo_notifier_call(VITYPE_QUOTA, VIRTINFO_QUOTA_OFF, &viq); ++ bdput(sb->s_bdev); ++} ++ ++static struct super_operations sim_super_ops = { ++ .get_quota_root = sim_quota_root, ++ .put_super = sim_put_super, ++}; ++ ++static int sim_fill_super(struct super_block *s, void *data) ++{ ++ int err; ++ struct nameidata *nd; ++ ++ err = set_anon_super(s, NULL); ++ if (err) ++ goto out; ++ ++ err = 0; ++ nd = (struct nameidata *)data; ++ s->s_root = dget(nd->dentry); ++ s->s_op = &sim_super_ops; ++out: ++ return err; ++} ++ ++struct super_block *sim_get_sb(struct file_system_type *type, ++ int flags, const char *dev_name, void *opt) ++{ ++ int err; ++ struct nameidata nd; ++ struct super_block *sb; ++ struct block_device *bd; ++ struct virt_info_quota viq; ++ static struct hd_struct fake_hds; ++ ++ sb = ERR_PTR(-EINVAL); ++ if (opt == NULL) ++ goto out; ++ ++ err = path_lookup(opt, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &nd); ++ sb = ERR_PTR(err); ++ if (err) ++ goto out; ++ ++ sb = sget(type, NULL, sim_fill_super, &nd); ++ if (IS_ERR(sb)) ++ goto out_path; ++ ++ bd = bdget(sb->s_dev); ++ if (!bd) ++ goto out_killsb; ++ ++ sb->s_bdev = bd; ++ bd->bd_part = &fake_hds; ++ viq.super = sb; ++ virtinfo_notifier_call(VITYPE_QUOTA, VIRTINFO_QUOTA_ON, &viq); ++out_path: ++ path_release(&nd); ++out: ++ return sb; ++ ++out_killsb: ++ up_write(&sb->s_umount); ++ deactivate_super(sb); ++ sb = ERR_PTR(-ENODEV); ++ goto out_path; ++} ++ ++static struct file_system_type sim_fs_type = { ++ .owner = THIS_MODULE, ++ .name = "simfs", ++ .get_sb = sim_get_sb, ++ .kill_sb = kill_anon_super, ++}; ++ ++static struct vnotifier_block sim_syscalls = { ++ .notifier_call = sim_systemcall, ++}; ++ ++static int __init init_simfs(void) ++{ ++ int err; ++ ++ err = 
register_filesystem(&sim_fs_type); ++ if (err) ++ return err; ++ ++ virtinfo_notifier_register(VITYPE_FAUDIT, &sim_syscalls); ++ return 0; ++} ++ ++static void __exit exit_simfs(void) ++{ ++ virtinfo_notifier_unregister(VITYPE_FAUDIT, &sim_syscalls); ++ unregister_filesystem(&sim_fs_type); ++} ++ ++MODULE_AUTHOR("SWsoft <info@sw-soft.com>"); ++MODULE_DESCRIPTION("Open Virtuozzo Simulation of File System"); ++MODULE_LICENSE("GPL v2"); ++ ++module_init(init_simfs); ++module_exit(exit_simfs); +diff -upr linux-2.6.16.orig/fs/smbfs/dir.c linux-2.6.16-026test015/fs/smbfs/dir.c +--- linux-2.6.16.orig/fs/smbfs/dir.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/smbfs/dir.c 2006-07-04 14:41:36.000000000 +0400 +@@ -434,6 +434,11 @@ smb_lookup(struct inode *dir, struct den + if (dentry->d_name.len > SMB_MAXNAMELEN) + goto out; + ++ /* Do not allow lookup of names with backslashes in */ ++ error = -EINVAL; ++ if (memchr(dentry->d_name.name, '\\', dentry->d_name.len)) ++ goto out; ++ + lock_kernel(); + error = smb_proc_getattr(dentry, &finfo); + #ifdef SMBFS_PARANOIA +diff -upr linux-2.6.16.orig/fs/smbfs/file.c linux-2.6.16-026test015/fs/smbfs/file.c +--- linux-2.6.16.orig/fs/smbfs/file.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/smbfs/file.c 2006-07-04 14:41:37.000000000 +0400 +@@ -387,7 +387,8 @@ smb_file_release(struct inode *inode, st + * privileges, so we need our own check for this. 
+ */ + static int +-smb_file_permission(struct inode *inode, int mask, struct nameidata *nd) ++smb_file_permission(struct inode *inode, int mask, struct nameidata *nd, ++ struct exec_perm *perm) + { + int mode = inode->i_mode; + int error = 0; +diff -upr linux-2.6.16.orig/fs/smbfs/inode.c linux-2.6.16-026test015/fs/smbfs/inode.c +--- linux-2.6.16.orig/fs/smbfs/inode.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/smbfs/inode.c 2006-07-04 14:41:37.000000000 +0400 +@@ -233,7 +233,7 @@ smb_invalidate_inodes(struct smb_sb_info + { + VERBOSE("\n"); + shrink_dcache_sb(SB_of(server)); +- invalidate_inodes(SB_of(server)); ++ invalidate_inodes(SB_of(server), 0); + } + + /* +diff -upr linux-2.6.16.orig/fs/smbfs/request.c linux-2.6.16-026test015/fs/smbfs/request.c +--- linux-2.6.16.orig/fs/smbfs/request.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/smbfs/request.c 2006-07-04 14:41:36.000000000 +0400 +@@ -339,9 +339,11 @@ int smb_add_request(struct smb_request * + /* + * On timeout or on interrupt we want to try and remove the + * request from the recvq/xmitq. ++ * First check if the request is still part of a queue. 
(May ++ * have been removed by some error condition) + */ + smb_lock_server(server); +- if (!(req->rq_flags & SMB_REQ_RECEIVED)) { ++ if (!list_empty(&req->rq_queue)) { + list_del_init(&req->rq_queue); + smb_rput(req); + } +diff -upr linux-2.6.16.orig/fs/stat.c linux-2.6.16-026test015/fs/stat.c +--- linux-2.6.16.orig/fs/stat.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/stat.c 2006-07-04 14:41:39.000000000 +0400 +@@ -15,6 +15,7 @@ + #include <linux/namei.h> + #include <linux/security.h> + #include <linux/syscalls.h> ++#include <linux/faudit.h> + + #include <asm/uaccess.h> + #include <asm/unistd.h> +@@ -42,11 +43,19 @@ int vfs_getattr(struct vfsmount *mnt, st + { + struct inode *inode = dentry->d_inode; + int retval; ++ struct faudit_stat_arg arg; + + retval = security_inode_getattr(mnt, dentry); + if (retval) + return retval; + ++ arg.mnt = mnt; ++ arg.dentry = dentry; ++ arg.stat = stat; ++ if (virtinfo_notifier_call(VITYPE_FAUDIT, VIRTINFO_FAUDIT_STAT, &arg) ++ != NOTIFY_DONE) ++ return arg.err; ++ + if (inode->i_op->getattr) + return inode->i_op->getattr(mnt, dentry, stat); + +diff -upr linux-2.6.16.orig/fs/super.c linux-2.6.16-026test015/fs/super.c +--- linux-2.6.16.orig/fs/super.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/super.c 2006-07-04 14:41:38.000000000 +0400 +@@ -23,6 +23,7 @@ + #include <linux/config.h> + #include <linux/module.h> + #include <linux/slab.h> ++#include <linux/ve_owner.h> + #include <linux/init.h> + #include <linux/smp_lock.h> + #include <linux/acct.h> +@@ -231,13 +232,13 @@ void generic_shutdown_super(struct super + if (root) { + sb->s_root = NULL; + shrink_dcache_parent(root); +- shrink_dcache_anon(&sb->s_anon); ++ shrink_dcache_anon(sb); + dput(root); + fsync_super(sb); + lock_super(sb); + sb->s_flags &= ~MS_ACTIVE; + /* bad name - it should be evict_inodes() */ +- invalidate_inodes(sb); ++ invalidate_inodes(sb, 0); + lock_kernel(); + + if (sop->write_super && sb->s_dirt) +@@ -246,7 
+247,7 @@ void generic_shutdown_super(struct super + sop->put_super(sb); + + /* Forget any remaining inodes */ +- if (invalidate_inodes(sb)) { ++ if (invalidate_inodes(sb, 1)) { + printk("VFS: Busy inodes after unmount of %s. " + "Self-destruct in 5 seconds. Have a nice day...\n", + sb->s_id); +@@ -481,11 +482,20 @@ asmlinkage long sys_ustat(unsigned dev, + struct super_block *s; + struct ustat tmp; + struct kstatfs sbuf; +- int err = -EINVAL; ++ dev_t kdev; ++ int err; ++ ++ kdev = new_decode_dev(dev); ++#ifdef CONFIG_VE ++ err = get_device_perms_ve(S_IFBLK, kdev, FMODE_READ); ++ if (err) ++ goto out; ++#endif + +- s = user_get_super(new_decode_dev(dev)); +- if (s == NULL) +- goto out; ++ err = -EINVAL; ++ s = user_get_super(kdev); ++ if (s == NULL) ++ goto out; + err = vfs_statfs(s, &sbuf); + drop_super(s); + if (err) +@@ -599,6 +609,13 @@ void emergency_remount(void) + static struct idr unnamed_dev_idr; + static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects the above */ + ++/* for compatibility with coreutils still unaware of new minor sizes */ ++int unnamed_dev_majors[] = { ++ 0, 144, 145, 146, 242, 243, 244, 245, ++ 246, 247, 248, 249, 250, 251, 252, 253 ++}; ++EXPORT_SYMBOL(unnamed_dev_majors); ++ + int set_anon_super(struct super_block *s, void *data) + { + int dev; +@@ -616,13 +633,13 @@ int set_anon_super(struct super_block *s + else if (error) + return -EAGAIN; + +- if ((dev & MAX_ID_MASK) == (1 << MINORBITS)) { ++ if ((dev & MAX_ID_MASK) >= (1 << MINORBITS)) { + spin_lock(&unnamed_dev_lock); + idr_remove(&unnamed_dev_idr, dev); + spin_unlock(&unnamed_dev_lock); + return -EMFILE; + } +- s->s_dev = MKDEV(0, dev & MINORMASK); ++ s->s_dev = make_unnamed_dev(dev); + return 0; + } + +@@ -630,8 +647,9 @@ EXPORT_SYMBOL(set_anon_super); + + void kill_anon_super(struct super_block *sb) + { +- int slot = MINOR(sb->s_dev); ++ int slot; + ++ slot = unnamed_dev_idx(sb->s_dev); + generic_shutdown_super(sb); + spin_lock(&unnamed_dev_lock); + idr_remove(&unnamed_dev_idr, 
slot); +diff -upr linux-2.6.16.orig/fs/sysfs/bin.c linux-2.6.16-026test015/fs/sysfs/bin.c +--- linux-2.6.16.orig/fs/sysfs/bin.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/sysfs/bin.c 2006-07-04 14:41:37.000000000 +0400 +@@ -120,6 +120,9 @@ static int open(struct inode * inode, st + struct bin_attribute * attr = to_bin_attr(file->f_dentry); + int error = -EINVAL; + ++ if (!ve_sysfs_alowed()) ++ return 0; ++ + if (!kobj || !attr) + goto Done; + +@@ -196,6 +199,9 @@ int sysfs_create_bin_file(struct kobject + + int sysfs_remove_bin_file(struct kobject * kobj, struct bin_attribute * attr) + { ++ if (!ve_sysfs_alowed()) ++ return 0; ++ + sysfs_hash_and_remove(kobj->dentry,attr->attr.name); + return 0; + } +diff -upr linux-2.6.16.orig/fs/sysfs/dir.c linux-2.6.16-026test015/fs/sysfs/dir.c +--- linux-2.6.16.orig/fs/sysfs/dir.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/sysfs/dir.c 2006-07-04 14:41:37.000000000 +0400 +@@ -144,6 +144,9 @@ int sysfs_create_dir(struct kobject * ko + struct dentry * parent; + int error = 0; + ++ if (!ve_sysfs_alowed()) ++ return 0; ++ + BUG_ON(!kobj); + + if (kobj->parent) +@@ -278,10 +281,14 @@ void sysfs_remove_subdir(struct dentry * + + void sysfs_remove_dir(struct kobject * kobj) + { +- struct dentry * dentry = dget(kobj->dentry); ++ struct dentry * dentry; + struct sysfs_dirent * parent_sd; + struct sysfs_dirent * sd, * tmp; + ++ if (!ve_sysfs_alowed()) ++ return; ++ ++ dentry = dget(kobj->dentry); + if (!dentry) + return; + +@@ -302,6 +309,7 @@ void sysfs_remove_dir(struct kobject * k + * Drop reference from dget() on entrance. 
+ */ + dput(dentry); ++ kobj->dentry = NULL; + } + + int sysfs_rename_dir(struct kobject * kobj, const char *new_name) +@@ -309,6 +317,9 @@ int sysfs_rename_dir(struct kobject * ko + int error = 0; + struct dentry * new_dentry, * parent; + ++ if (!ve_sysfs_alowed()) ++ return 0; ++ + if (!strcmp(kobject_name(kobj), new_name)) + return -EINVAL; + +diff -upr linux-2.6.16.orig/fs/sysfs/file.c linux-2.6.16-026test015/fs/sysfs/file.c +--- linux-2.6.16.orig/fs/sysfs/file.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/sysfs/file.c 2006-07-04 14:41:37.000000000 +0400 +@@ -183,7 +183,7 @@ fill_write_buffer(struct sysfs_buffer * + return -ENOMEM; + + if (count >= PAGE_SIZE) +- count = PAGE_SIZE; ++ count = PAGE_SIZE - 1; + error = copy_from_user(buffer->page,buf,count); + buffer->needs_read_fill = 1; + return error ? -EFAULT : count; +@@ -380,6 +380,9 @@ int sysfs_add_file(struct dentry * dir, + + int sysfs_create_file(struct kobject * kobj, const struct attribute * attr) + { ++ if (!ve_sysfs_alowed()) ++ return 0; ++ + BUG_ON(!kobj || !kobj->dentry || !attr); + + return sysfs_add_file(kobj->dentry, attr, SYSFS_KOBJ_ATTR); +@@ -398,6 +401,9 @@ int sysfs_update_file(struct kobject * k + struct dentry * victim; + int res = -ENOENT; + ++ if (!ve_sysfs_alowed()) ++ return 0; ++ + mutex_lock(&dir->d_inode->i_mutex); + victim = lookup_one_len(attr->name, dir, strlen(attr->name)); + if (!IS_ERR(victim)) { +@@ -473,6 +479,9 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file); + + void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr) + { ++ if (!ve_sysfs_alowed()) ++ return; ++ + sysfs_hash_and_remove(kobj->dentry,attr->name); + } + +diff -upr linux-2.6.16.orig/fs/sysfs/group.c linux-2.6.16-026test015/fs/sysfs/group.c +--- linux-2.6.16.orig/fs/sysfs/group.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/sysfs/group.c 2006-07-04 14:41:37.000000000 +0400 +@@ -46,6 +46,9 @@ int sysfs_create_group(struct kobject * + struct dentry * 
dir; + int error; + ++ if (!ve_sysfs_alowed()) ++ return 0; ++ + BUG_ON(!kobj || !kobj->dentry); + + if (grp->name) { +@@ -68,6 +71,9 @@ void sysfs_remove_group(struct kobject * + { + struct dentry * dir; + ++ if (!ve_sysfs_alowed()) ++ return; ++ + if (grp->name) + dir = lookup_one_len(grp->name, kobj->dentry, + strlen(grp->name)); +diff -upr linux-2.6.16.orig/fs/sysfs/inode.c linux-2.6.16-026test015/fs/sysfs/inode.c +--- linux-2.6.16.orig/fs/sysfs/inode.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/sysfs/inode.c 2006-07-04 14:41:37.000000000 +0400 +@@ -8,14 +8,13 @@ + + #undef DEBUG + ++#include <linux/config.h> + #include <linux/pagemap.h> + #include <linux/namei.h> + #include <linux/backing-dev.h> + #include <linux/capability.h> + #include "sysfs.h" + +-extern struct super_block * sysfs_sb; +- + static struct address_space_operations sysfs_aops = { + .readpage = simple_readpage, + .prepare_write = simple_prepare_write, +@@ -227,12 +226,16 @@ void sysfs_drop_dentry(struct sysfs_dire + void sysfs_hash_and_remove(struct dentry * dir, const char * name) + { + struct sysfs_dirent * sd; +- struct sysfs_dirent * parent_sd = dir->d_fsdata; ++ struct sysfs_dirent * parent_sd; ++ ++ if (!dir) ++ return; + + if (dir->d_inode == NULL) + /* no inode means this hasn't been made visible yet */ + return; + ++ parent_sd = dir->d_fsdata; + mutex_lock(&dir->d_inode->i_mutex); + list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { + if (!sd->s_element) +diff -upr linux-2.6.16.orig/fs/sysfs/mount.c linux-2.6.16-026test015/fs/sysfs/mount.c +--- linux-2.6.16.orig/fs/sysfs/mount.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/sysfs/mount.c 2006-07-04 14:41:38.000000000 +0400 +@@ -7,6 +7,7 @@ + #include <linux/fs.h> + #include <linux/mount.h> + #include <linux/pagemap.h> ++#include <linux/module.h> + #include <linux/init.h> + + #include "sysfs.h" +@@ -14,8 +15,11 @@ + /* Random magic number */ + #define SYSFS_MAGIC 0x62656572 + 
++#ifndef CONFIG_VE + struct vfsmount *sysfs_mount; + struct super_block * sysfs_sb = NULL; ++#endif ++ + kmem_cache_t *sysfs_dir_cachep; + + static struct super_operations sysfs_ops = { +@@ -31,6 +35,15 @@ static struct sysfs_dirent sysfs_root = + .s_iattr = NULL, + }; + ++#ifdef CONFIG_VE ++static void init_ve0_sysfs_root(void) ++{ ++ get_ve0()->sysfs_root = &sysfs_root; ++} ++ ++#define sysfs_root (*(get_exec_env()->sysfs_root)) ++#endif ++ + static int sysfs_fill_super(struct super_block *sb, void *data, int silent) + { + struct inode *inode; +@@ -72,16 +85,21 @@ static struct super_block *sysfs_get_sb( + return get_sb_single(fs_type, flags, data, sysfs_fill_super); + } + +-static struct file_system_type sysfs_fs_type = { ++struct file_system_type sysfs_fs_type = { + .name = "sysfs", + .get_sb = sysfs_get_sb, + .kill_sb = kill_litter_super, + }; + ++EXPORT_SYMBOL(sysfs_fs_type); ++ + int __init sysfs_init(void) + { + int err = -ENOMEM; + ++#ifdef CONFIG_VE ++ init_ve0_sysfs_root(); ++#endif + sysfs_dir_cachep = kmem_cache_create("sysfs_dir_cache", + sizeof(struct sysfs_dirent), + 0, 0, NULL, NULL); +diff -upr linux-2.6.16.orig/fs/sysfs/symlink.c linux-2.6.16-026test015/fs/sysfs/symlink.c +--- linux-2.6.16.orig/fs/sysfs/symlink.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/sysfs/symlink.c 2006-07-04 14:41:37.000000000 +0400 +@@ -66,6 +66,7 @@ static int sysfs_add_link(struct dentry + if (!error) + return 0; + ++ kobject_put(target); + kfree(sl->link_name); + exit2: + kfree(sl); +@@ -86,6 +87,9 @@ int sysfs_create_link(struct kobject * k + + BUG_ON(!kobj || !kobj->dentry || !name); + ++ if (!ve_sysfs_alowed()) ++ return 0; ++ + mutex_lock(&dentry->d_inode->i_mutex); + error = sysfs_add_link(dentry, name, target); + mutex_unlock(&dentry->d_inode->i_mutex); +@@ -101,6 +105,9 @@ int sysfs_create_link(struct kobject * k + + void sysfs_remove_link(struct kobject * kobj, const char * name) + { ++ if(!ve_sysfs_alowed()) ++ return; ++ + 
sysfs_hash_and_remove(kobj->dentry,name); + } + +diff -upr linux-2.6.16.orig/fs/sysfs/sysfs.h linux-2.6.16-026test015/fs/sysfs/sysfs.h +--- linux-2.6.16.orig/fs/sysfs/sysfs.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/sysfs/sysfs.h 2006-07-04 14:41:38.000000000 +0400 +@@ -1,5 +1,14 @@ + +-extern struct vfsmount * sysfs_mount; ++#ifndef CONFIG_VE ++extern struct vfsmount *sysfs_mount; ++extern struct super_block *sysfs_sb; ++#define ve_sysfs_alowed() (1) ++#else ++#define sysfs_mount (get_exec_env()->sysfs_mnt) ++#define sysfs_sb (get_exec_env()->sysfs_sb) ++#define ve_sysfs_alowed() (sysfs_sb != NULL) ++#endif ++ + extern kmem_cache_t *sysfs_dir_cachep; + + extern struct inode * sysfs_new_inode(mode_t mode, struct sysfs_dirent *); +@@ -19,7 +28,6 @@ extern void sysfs_drop_dentry(struct sys + extern int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); + + extern struct rw_semaphore sysfs_rename_sem; +-extern struct super_block * sysfs_sb; + extern struct file_operations sysfs_dir_operations; + extern struct file_operations sysfs_file_operations; + extern struct file_operations bin_fops; +diff -upr linux-2.6.16.orig/fs/vzdq_file.c linux-2.6.16-026test015/fs/vzdq_file.c +--- linux-2.6.16.orig/fs/vzdq_file.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/fs/vzdq_file.c 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,851 @@ ++/* ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ * This file contains Virtuozzo quota files as proc entry implementation. ++ * It is required for std quota tools to work correctly as they are expecting ++ * aquota.user and aquota.group files. 
++ */ ++ ++#include <linux/ctype.h> ++#include <linux/slab.h> ++#include <linux/list.h> ++#include <linux/module.h> ++#include <linux/proc_fs.h> ++#include <linux/sysctl.h> ++#include <linux/mount.h> ++#include <linux/namespace.h> ++#include <linux/quotaio_v2.h> ++#include <asm/uaccess.h> ++ ++#include <linux/ve.h> ++#include <linux/ve_proto.h> ++#include <linux/vzdq_tree.h> ++#include <linux/vzquota.h> ++ ++/* ---------------------------------------------------------------------- ++ * ++ * File read operation ++ * ++ * FIXME: functions in this section (as well as many functions in vzdq_ugid.c, ++ * perhaps) abuse vz_quota_sem. ++ * Taking a global semaphore for lengthy and user-controlled operations inside ++ * VPSs is not a good idea in general. ++ * In this case, the reasons for taking this semaphore are completely unclear, ++ * especially taking into account that the only function that has comments ++ * about the necessity to be called under this semaphore ++ * (create_proc_quotafile) is actually called OUTSIDE it. 
++ * ++ * --------------------------------------------------------------------- */ ++ ++#define DQBLOCK_SIZE 1024 ++#define DQUOTBLKNUM 21U ++#define DQTREE_DEPTH 4 ++#define TREENUM_2_BLKNUM(num) (((num) + 1) << 1) ++#define ISINDBLOCK(num) ((num)%2 != 0) ++#define FIRST_DATABLK 2 /* first even number */ ++#define LAST_IND_LEVEL (DQTREE_DEPTH - 1) ++#define CONVERT_LEVEL(level) ((level) * (QUOTAID_EBITS/QUOTAID_BBITS)) ++#define GETLEVINDX(ind, lev) (((ind) >> QUOTAID_BBITS*(lev)) \ ++ & QUOTATREE_BMASK) ++ ++#if (QUOTAID_EBITS / QUOTAID_BBITS) != (QUOTATREE_DEPTH / DQTREE_DEPTH) ++#error xBITS and DQTREE_DEPTH does not correspond ++#endif ++ ++#define BLOCK_NOT_FOUND 1 ++ ++/* data for quota file -- one per proc entry */ ++struct quotatree_data { ++ struct list_head list; ++ struct vz_quota_master *qmblk; ++ int type; /* type of the tree */ ++}; ++ ++/* serialized by vz_quota_sem */ ++static LIST_HEAD(qf_data_head); ++ ++static const u_int32_t vzquota_magics[] = V2_INITQMAGICS; ++static const u_int32_t vzquota_versions[] = V2_INITQVERSIONS; ++ ++static inline loff_t get_depoff(int depth) ++{ ++ loff_t res = 1; ++ while (depth) { ++ res += (1 << ((depth - 1)*QUOTAID_EBITS + 1)); ++ depth--; ++ } ++ return res; ++} ++ ++static inline loff_t get_blknum(loff_t num, int depth) ++{ ++ loff_t res; ++ res = (num << 1) + get_depoff(depth); ++ return res; ++} ++ ++static int get_depth(loff_t num) ++{ ++ int i; ++ for (i = 0; i < DQTREE_DEPTH; i++) { ++ if (num >= get_depoff(i) && (i == DQTREE_DEPTH - 1 ++ || num < get_depoff(i + 1))) ++ return i; ++ } ++ return -1; ++} ++ ++static inline loff_t get_offset(loff_t num) ++{ ++ loff_t res, tmp; ++ ++ tmp = get_depth(num); ++ if (tmp < 0) ++ return -1; ++ num -= get_depoff(tmp); ++ BUG_ON(num < 0); ++ res = num >> 1; ++ ++ return res; ++} ++ ++static inline loff_t get_quot_blk_num(struct quotatree_tree *tree, int level) ++{ ++ /* return maximum available block num */ ++ return tree->levels[level].freenum; ++} ++ ++static inline 
loff_t get_block_num(struct quotatree_tree *tree) ++{ ++ loff_t ind_blk_num, quot_blk_num, max_ind, max_quot; ++ ++ quot_blk_num = get_quot_blk_num(tree, CONVERT_LEVEL(DQTREE_DEPTH) - 1); ++ max_quot = TREENUM_2_BLKNUM(quot_blk_num); ++ ind_blk_num = get_quot_blk_num(tree, CONVERT_LEVEL(DQTREE_DEPTH - 1)); ++ max_ind = (quot_blk_num) ? get_blknum(ind_blk_num, LAST_IND_LEVEL) ++ : get_blknum(ind_blk_num, 0); ++ ++ return (max_ind > max_quot) ? max_ind + 1 : max_quot + 1; ++} ++ ++/* Write quota file header */ ++static int read_header(void *buf, struct quotatree_tree *tree, ++ struct dq_info *dq_ugid_info, int type) ++{ ++ struct v2_disk_dqheader *dqh; ++ struct v2_disk_dqinfo *dq_disk_info; ++ ++ dqh = buf; ++ dq_disk_info = buf + sizeof(struct v2_disk_dqheader); ++ ++ dqh->dqh_magic = vzquota_magics[type]; ++ dqh->dqh_version = vzquota_versions[type]; ++ ++ dq_disk_info->dqi_bgrace = dq_ugid_info[type].bexpire; ++ dq_disk_info->dqi_igrace = dq_ugid_info[type].iexpire; ++ dq_disk_info->dqi_flags = 0; /* no flags */ ++ dq_disk_info->dqi_blocks = get_block_num(tree); ++ dq_disk_info->dqi_free_blk = 0; /* first block in the file */ ++ dq_disk_info->dqi_free_entry = FIRST_DATABLK; ++ ++ return 0; ++} ++ ++static int get_block_child(int depth, struct quotatree_node *p, u_int32_t *buf) ++{ ++ int i, j, lev_num; ++ ++ lev_num = QUOTATREE_DEPTH/DQTREE_DEPTH - 1; ++ for (i = 0; i < BLOCK_SIZE/sizeof(u_int32_t); i++) { ++ struct quotatree_node *next, *parent; ++ ++ parent = p; ++ next = p; ++ for (j = lev_num; j >= 0; j--) { ++ if (!next->blocks[GETLEVINDX(i,j)]) { ++ buf[i] = 0; ++ goto bad_branch; ++ } ++ parent = next; ++ next = next->blocks[GETLEVINDX(i,j)]; ++ } ++ buf[i] = (depth == DQTREE_DEPTH - 1) ? 
++ TREENUM_2_BLKNUM(parent->num) ++ : get_blknum(next->num, depth + 1); ++ ++ bad_branch: ++ ; ++ } ++ ++ return 0; ++} ++ ++/* ++ * Write index block to disk (or buffer) ++ * @buf has length 256*sizeof(u_int32_t) bytes ++ */ ++static int read_index_block(int num, u_int32_t *buf, ++ struct quotatree_tree *tree) ++{ ++ struct quotatree_node *p; ++ u_int32_t index; ++ loff_t off; ++ int depth, res; ++ ++ res = BLOCK_NOT_FOUND; ++ index = 0; ++ depth = get_depth(num); ++ off = get_offset(num); ++ if (depth < 0 || off < 0) ++ return -EINVAL; ++ ++ list_for_each_entry(p, &tree->levels[CONVERT_LEVEL(depth)].usedlh, ++ list) { ++ if (p->num >= off) ++ res = 0; ++ if (p->num != off) ++ continue; ++ get_block_child(depth, p, buf); ++ break; ++ } ++ ++ return res; ++} ++ ++static inline void convert_quot_format(struct v2_disk_dqblk *dq, ++ struct vz_quota_ugid *vzq) ++{ ++ dq->dqb_id = vzq->qugid_id; ++ dq->dqb_ihardlimit = vzq->qugid_stat.ihardlimit; ++ dq->dqb_isoftlimit = vzq->qugid_stat.isoftlimit; ++ dq->dqb_curinodes = vzq->qugid_stat.icurrent; ++ dq->dqb_bhardlimit = vzq->qugid_stat.bhardlimit / QUOTABLOCK_SIZE; ++ dq->dqb_bsoftlimit = vzq->qugid_stat.bsoftlimit / QUOTABLOCK_SIZE; ++ dq->dqb_curspace = vzq->qugid_stat.bcurrent; ++ dq->dqb_btime = vzq->qugid_stat.btime; ++ dq->dqb_itime = vzq->qugid_stat.itime; ++} ++ ++static int read_dquot(loff_t num, void *buf, struct quotatree_tree *tree) ++{ ++ int res, i, entries = 0; ++ struct v2_disk_dqdbheader *dq_header; ++ struct quotatree_node *p; ++ struct v2_disk_dqblk *blk = buf + sizeof(struct v2_disk_dqdbheader); ++ ++ res = BLOCK_NOT_FOUND; ++ dq_header = buf; ++ memset(dq_header, 0, sizeof(*dq_header)); ++ ++ list_for_each_entry(p, &(tree->levels[QUOTATREE_DEPTH - 1].usedlh), ++ list) { ++ if (TREENUM_2_BLKNUM(p->num) >= num) ++ res = 0; ++ if (TREENUM_2_BLKNUM(p->num) != num) ++ continue; ++ ++ for (i = 0; i < QUOTATREE_BSIZE; i++) { ++ if (!p->blocks[i]) ++ continue; ++ convert_quot_format(blk + entries, ++ (struct 
vz_quota_ugid *)p->blocks[i]); ++ entries++; ++ res = 0; ++ } ++ break; ++ } ++ dq_header->dqdh_entries = entries; ++ ++ return res; ++} ++ ++static int read_block(int num, void *buf, struct quotatree_tree *tree, ++ struct dq_info *dq_ugid_info, int magic) ++{ ++ int res; ++ ++ memset(buf, 0, DQBLOCK_SIZE); ++ if (!num) ++ res = read_header(buf, tree, dq_ugid_info, magic); ++ else if (ISINDBLOCK(num)) ++ res = read_index_block(num, (u_int32_t*)buf, tree); ++ else ++ res = read_dquot(num, buf, tree); ++ ++ return res; ++} ++ ++/* ++ * FIXME: this function can handle quota files up to 2GB only. ++ */ ++static int read_proc_quotafile(char *page, char **start, off_t off, int count, ++ int *eof, void *data) ++{ ++ off_t blk_num, blk_off, buf_off; ++ char *tmp; ++ size_t buf_size; ++ struct quotatree_data *qtd; ++ struct quotatree_tree *tree; ++ struct dq_info *dqi; ++ int res; ++ ++ tmp = kmalloc(DQBLOCK_SIZE, GFP_KERNEL); ++ if (!tmp) ++ return -ENOMEM; ++ ++ qtd = data; ++ down(&vz_quota_sem); ++ down(&qtd->qmblk->dq_sem); ++ ++ res = 0; ++ tree = QUGID_TREE(qtd->qmblk, qtd->type); ++ if (!tree) { ++ *eof = 1; ++ goto out_dq; ++ } ++ ++ dqi = &qtd->qmblk->dq_ugid_info[qtd->type]; ++ ++ buf_off = 0; ++ buf_size = count; ++ blk_num = off / DQBLOCK_SIZE; ++ blk_off = off % DQBLOCK_SIZE; ++ ++ while (buf_size > 0) { ++ off_t len; ++ ++ len = min((size_t)(DQBLOCK_SIZE-blk_off), buf_size); ++ res = read_block(blk_num, tmp, tree, dqi, qtd->type); ++ if (res < 0) ++ goto out_err; ++ if (res == BLOCK_NOT_FOUND) { ++ *eof = 1; ++ break; ++ } ++ memcpy(page + buf_off, tmp + blk_off, len); ++ ++ blk_num++; ++ buf_size -= len; ++ blk_off = 0; ++ buf_off += len; ++ } ++ res = buf_off; ++ ++out_err: ++ *start = NULL + count; ++out_dq: ++ up(&qtd->qmblk->dq_sem); ++ up(&vz_quota_sem); ++ kfree(tmp); ++ ++ return res; ++} ++ ++ ++/* ---------------------------------------------------------------------- ++ * ++ * /proc/vz/vzaquota/QID/aquota.* files ++ * ++ * FIXME: this code lacks 
serialization of read/readdir/lseek. ++ * However, this problem should be fixed after the mainstream issue of what ++ * appears to be non-atomic read and update of file position in sys_read. ++ * ++ * --------------------------------------------------------------------- */ ++ ++static inline unsigned long vzdq_aquot_getino(dev_t dev) ++{ ++ return 0xec000000UL + dev; ++} ++ ++static inline dev_t vzdq_aquot_getidev(struct inode *inode) ++{ ++ return (dev_t)(unsigned long)PROC_I(inode)->op.proc_get_link; ++} ++ ++static inline void vzdq_aquot_setidev(struct inode *inode, dev_t dev) ++{ ++ PROC_I(inode)->op.proc_get_link = (void *)(unsigned long)dev; ++} ++ ++static ssize_t vzdq_aquotf_read(struct file *file, ++ char __user *buf, size_t size, loff_t *ppos) ++{ ++ char *page; ++ size_t bufsize; ++ ssize_t l, l2, copied; ++ char *start; ++ struct inode *inode; ++ struct block_device *bdev; ++ struct super_block *sb; ++ struct quotatree_data data; ++ int eof, err; ++ ++ err = -ENOMEM; ++ page = (char *)__get_free_page(GFP_KERNEL); ++ if (page == NULL) ++ goto out_err; ++ ++ err = -ENODEV; ++ inode = file->f_dentry->d_inode; ++ bdev = bdget(vzdq_aquot_getidev(inode)); ++ if (bdev == NULL) ++ goto out_err; ++ sb = get_super(bdev); ++ bdput(bdev); ++ if (sb == NULL) ++ goto out_err; ++ data.qmblk = vzquota_find_qmblk(sb); ++ data.type = PROC_I(inode)->type - 1; ++ drop_super(sb); ++ if (data.qmblk == NULL || data.qmblk == VZ_QUOTA_BAD) ++ goto out_err; ++ ++ copied = 0; ++ l = l2 = 0; ++ while (1) { ++ bufsize = min(size, (size_t)PAGE_SIZE); ++ if (bufsize <= 0) ++ break; ++ ++ l = read_proc_quotafile(page, &start, *ppos, bufsize, ++ &eof, &data); ++ if (l <= 0) ++ break; ++ ++ l2 = copy_to_user(buf, page, l); ++ copied += l - l2; ++ if (l2) ++ break; ++ ++ buf += l; ++ size -= l; ++ *ppos += (unsigned long)start; ++ l = l2 = 0; ++ } ++ ++ qmblk_put(data.qmblk); ++ free_page((unsigned long)page); ++ if (copied) ++ return copied; ++ else if (l2) /* last copy_to_user failed 
*/ ++ return -EFAULT; ++ else /* read error or EOF */ ++ return l; ++ ++out_err: ++ if (page != NULL) ++ free_page((unsigned long)page); ++ return err; ++} ++ ++static struct file_operations vzdq_aquotf_file_operations = { ++ .read = &vzdq_aquotf_read, ++}; ++ ++static struct inode_operations vzdq_aquotf_inode_operations = { ++}; ++ ++ ++/* ---------------------------------------------------------------------- ++ * ++ * /proc/vz/vzaquota/QID directory ++ * ++ * --------------------------------------------------------------------- */ ++ ++static int vzdq_aquotq_readdir(struct file *file, void *data, filldir_t filler) ++{ ++ loff_t n; ++ int err; ++ ++ n = file->f_pos; ++ for (err = 0; !err; n++) { ++ switch (n) { ++ case 0: ++ err = (*filler)(data, ".", 1, n, ++ file->f_dentry->d_inode->i_ino, ++ DT_DIR); ++ break; ++ case 1: ++ err = (*filler)(data, "..", 2, n, ++ parent_ino(file->f_dentry), DT_DIR); ++ break; ++ case 2: ++ err = (*filler)(data, "aquota.user", 11, n, ++ file->f_dentry->d_inode->i_ino ++ + USRQUOTA + 1, ++ DT_REG); ++ break; ++ case 3: ++ err = (*filler)(data, "aquota.group", 12, n, ++ file->f_dentry->d_inode->i_ino ++ + GRPQUOTA + 1, ++ DT_REG); ++ break; ++ default: ++ goto out; ++ } ++ } ++out: ++ file->f_pos = n; ++ return err; ++} ++ ++struct vzdq_aquotq_lookdata { ++ dev_t dev; ++ int type; ++}; ++ ++static int vzdq_aquotq_looktest(struct inode *inode, void *data) ++{ ++ struct vzdq_aquotq_lookdata *d; ++ ++ d = data; ++ return inode->i_op == &vzdq_aquotf_inode_operations && ++ vzdq_aquot_getidev(inode) == d->dev && ++ PROC_I(inode)->type == d->type + 1; ++} ++ ++static int vzdq_aquotq_lookset(struct inode *inode, void *data) ++{ ++ struct vzdq_aquotq_lookdata *d; ++ ++ d = data; ++ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; ++ inode->i_ino = vzdq_aquot_getino(d->dev) + d->type + 1; ++ inode->i_mode = S_IFREG | S_IRUSR; ++ inode->i_uid = 0; ++ inode->i_gid = 0; ++ inode->i_nlink = 1; ++ inode->i_op = 
&vzdq_aquotf_inode_operations; ++ inode->i_fop = &vzdq_aquotf_file_operations; ++ PROC_I(inode)->type = d->type + 1; ++ vzdq_aquot_setidev(inode, d->dev); ++ return 0; ++} ++ ++static struct dentry *vzdq_aquotq_lookup(struct inode *dir, ++ struct dentry *dentry, ++ struct nameidata *nd) ++{ ++ struct inode *inode; ++ struct vzdq_aquotq_lookdata d; ++ int k; ++ ++ if (dentry->d_name.len == 11) { ++ if (memcmp(dentry->d_name.name, "aquota.user", 11)) ++ goto out; ++ k = USRQUOTA; ++ } else if (dentry->d_name.len == 12) { ++ if (memcmp(dentry->d_name.name, "aquota.group", 11)) ++ goto out; ++ k = GRPQUOTA; ++ } else ++ goto out; ++ d.dev = vzdq_aquot_getidev(dir); ++ d.type = k; ++ inode = iget5_locked(dir->i_sb, dir->i_ino + k + 1, ++ vzdq_aquotq_looktest, vzdq_aquotq_lookset, &d); ++ if (inode == NULL) ++ goto out; ++ unlock_new_inode(inode); ++ d_add(dentry, inode); ++ return NULL; ++ ++out: ++ return ERR_PTR(-ENOENT); ++} ++ ++static struct file_operations vzdq_aquotq_file_operations = { ++ .read = &generic_read_dir, ++ .readdir = &vzdq_aquotq_readdir, ++}; ++ ++static struct inode_operations vzdq_aquotq_inode_operations = { ++ .lookup = &vzdq_aquotq_lookup, ++}; ++ ++ ++/* ---------------------------------------------------------------------- ++ * ++ * /proc/vz/vzaquota directory ++ * ++ * --------------------------------------------------------------------- */ ++ ++struct vzdq_aquot_de { ++ struct list_head list; ++ struct vfsmount *mnt; ++}; ++ ++static int vzdq_aquot_buildmntlist(struct ve_struct *ve, ++ struct list_head *head) ++{ ++ struct vfsmount *rmnt, *mnt; ++ struct vzdq_aquot_de *p; ++ int err; ++ ++#ifdef CONFIG_VE ++ rmnt = mntget(ve->fs_rootmnt); ++#else ++ read_lock(¤t->fs->lock); ++ rmnt = mntget(current->fs->rootmnt); ++ read_unlock(¤t->fs->lock); ++#endif ++ mnt = rmnt; ++ spin_lock(&vfsmount_lock); ++ while (1) { ++ list_for_each_entry(p, head, list) { ++ if (p->mnt->mnt_sb == mnt->mnt_sb) ++ goto skip; ++ } ++ ++ err = -ENOMEM; ++ p = 
kmalloc(sizeof(*p), GFP_KERNEL); ++ if (p == NULL) ++ goto out; ++ p->mnt = mntget(mnt); ++ list_add_tail(&p->list, head); ++ ++skip: ++ err = 0; ++ if (list_empty(&mnt->mnt_mounts)) { ++ while (1) { ++ if (mnt == rmnt) ++ goto out; ++ if (mnt->mnt_child.next != ++ &mnt->mnt_parent->mnt_mounts) ++ break; ++ mnt = mnt->mnt_parent; ++ } ++ mnt = list_entry(mnt->mnt_child.next, ++ struct vfsmount, mnt_child); ++ } else ++ mnt = list_entry(mnt->mnt_mounts.next, ++ struct vfsmount, mnt_child); ++ } ++out: ++ spin_unlock(&vfsmount_lock); ++ mntput(rmnt); ++ return err; ++} ++ ++static void vzdq_aquot_releasemntlist(struct ve_struct *ve, ++ struct list_head *head) ++{ ++ struct vzdq_aquot_de *p; ++ ++ while (!list_empty(head)) { ++ p = list_entry(head->next, typeof(*p), list); ++ mntput(p->mnt); ++ list_del(&p->list); ++ kfree(p); ++ } ++} ++ ++static int vzdq_aquotd_readdir(struct file *file, void *data, filldir_t filler) ++{ ++ struct ve_struct *ve, *old_ve; ++ struct list_head mntlist; ++ struct vzdq_aquot_de *de; ++ struct super_block *sb; ++ struct vz_quota_master *qmblk; ++ loff_t i, n; ++ char buf[24]; ++ int l, err; ++ ++ i = 0; ++ n = file->f_pos; ++ ve = VE_OWNER_FSTYPE(file->f_dentry->d_sb->s_type); ++ old_ve = set_exec_env(ve); ++ ++ INIT_LIST_HEAD(&mntlist); ++#ifdef CONFIG_VE ++ /* ++ * The only reason of disabling readdir for the host system is that ++ * this readdir can be slow and CPU consuming with large number of VPSs ++ * (or just mount points). 
++ */ ++ err = ve_is_super(ve); ++#else ++ err = 0; ++#endif ++ if (!err) { ++ err = vzdq_aquot_buildmntlist(ve, &mntlist); ++ if (err) ++ goto out_err; ++ } ++ ++ if (i >= n) { ++ if ((*filler)(data, ".", 1, i, ++ file->f_dentry->d_inode->i_ino, DT_DIR)) ++ goto out_fill; ++ } ++ i++; ++ ++ if (i >= n) { ++ if ((*filler)(data, "..", 2, i, ++ parent_ino(file->f_dentry), DT_DIR)) ++ goto out_fill; ++ } ++ i++; ++ ++ list_for_each_entry (de, &mntlist, list) { ++ sb = de->mnt->mnt_sb; ++#ifdef CONFIG_VE ++ if (get_device_perms_ve(S_IFBLK, sb->s_dev, FMODE_QUOTACTL)) ++ continue; ++#endif ++ qmblk = vzquota_find_qmblk(sb); ++ if (qmblk == NULL || qmblk == VZ_QUOTA_BAD) ++ continue; ++ ++ qmblk_put(qmblk); ++ i++; ++ if (i <= n) ++ continue; ++ ++ l = sprintf(buf, "%08x", new_encode_dev(sb->s_dev)); ++ if ((*filler)(data, buf, l, i - 1, ++ vzdq_aquot_getino(sb->s_dev), DT_DIR)) ++ break; ++ } ++ ++out_fill: ++ err = 0; ++ file->f_pos = i; ++out_err: ++ vzdq_aquot_releasemntlist(ve, &mntlist); ++ (void)set_exec_env(old_ve); ++ return err; ++} ++ ++static int vzdq_aquotd_looktest(struct inode *inode, void *data) ++{ ++ return inode->i_op == &vzdq_aquotq_inode_operations && ++ vzdq_aquot_getidev(inode) == (dev_t)(unsigned long)data; ++} ++ ++static int vzdq_aquotd_lookset(struct inode *inode, void *data) ++{ ++ dev_t dev; ++ ++ dev = (dev_t)(unsigned long)data; ++ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; ++ inode->i_ino = vzdq_aquot_getino(dev); ++ inode->i_mode = S_IFDIR | S_IRUSR | S_IXUSR; ++ inode->i_uid = 0; ++ inode->i_gid = 0; ++ inode->i_nlink = 2; ++ inode->i_op = &vzdq_aquotq_inode_operations; ++ inode->i_fop = &vzdq_aquotq_file_operations; ++ vzdq_aquot_setidev(inode, dev); ++ return 0; ++} ++ ++static struct dentry *vzdq_aquotd_lookup(struct inode *dir, ++ struct dentry *dentry, ++ struct nameidata *nd) ++{ ++ struct ve_struct *ve, *old_ve; ++ const unsigned char *s; ++ int l; ++ dev_t dev; ++ struct inode *inode; ++ ++ ve = 
VE_OWNER_FSTYPE(dir->i_sb->s_type); ++ old_ve = set_exec_env(ve); ++#ifdef CONFIG_VE ++ /* ++ * Lookup is much lighter than readdir, so it can be allowed for the ++ * host system. But it would be strange to be able to do lookup only ++ * without readdir... ++ */ ++ if (ve_is_super(ve)) ++ goto out; ++#endif ++ ++ dev = 0; ++ l = dentry->d_name.len; ++ if (l <= 0) ++ goto out; ++ for (s = dentry->d_name.name; l > 0; s++, l--) { ++ if (!isxdigit(*s)) ++ goto out; ++ if (dev & ~(~0UL >> 4)) ++ goto out; ++ dev <<= 4; ++ if (isdigit(*s)) ++ dev += *s - '0'; ++ else if (islower(*s)) ++ dev += *s - 'a' + 10; ++ else ++ dev += *s - 'A' + 10; ++ } ++ dev = new_decode_dev(dev); ++ ++#ifdef CONFIG_VE ++ if (get_device_perms_ve(S_IFBLK, dev, FMODE_QUOTACTL)) ++ goto out; ++#endif ++ ++ inode = iget5_locked(dir->i_sb, vzdq_aquot_getino(dev), ++ vzdq_aquotd_looktest, vzdq_aquotd_lookset, ++ (void *)(unsigned long)dev); ++ if (inode == NULL) ++ goto out; ++ unlock_new_inode(inode); ++ ++ d_add(dentry, inode); ++ (void)set_exec_env(old_ve); ++ return NULL; ++ ++out: ++ (void)set_exec_env(old_ve); ++ return ERR_PTR(-ENOENT); ++} ++ ++static struct file_operations vzdq_aquotd_file_operations = { ++ .read = &generic_read_dir, ++ .readdir = &vzdq_aquotd_readdir, ++}; ++ ++static struct inode_operations vzdq_aquotd_inode_operations = { ++ .lookup = &vzdq_aquotd_lookup, ++}; ++ ++ ++/* ---------------------------------------------------------------------- ++ * ++ * Initialization and deinitialization ++ * ++ * --------------------------------------------------------------------- */ ++ ++/* ++ * FIXME: creation of proc entries here is unsafe with respect to module ++ * unloading. 
++ */ ++void vzaquota_init(void) ++{ ++ struct proc_dir_entry *de; ++ ++ de = create_proc_glob_entry("vz/vzaquota", ++ S_IFDIR | S_IRUSR | S_IXUSR, NULL); ++ if (de != NULL) { ++ de->proc_iops = &vzdq_aquotd_inode_operations; ++ de->proc_fops = &vzdq_aquotd_file_operations; ++ } else ++ printk("VZDQ: vz/vzaquota creation failed\n"); ++#if defined(CONFIG_SYSCTL) ++ de = create_proc_glob_entry("sys/fs/quota", ++ S_IFDIR | S_IRUSR | S_IXUSR, NULL); ++ if (de == NULL) ++ printk("VZDQ: sys/fs/quota creation failed\n"); ++#endif ++} ++ ++void vzaquota_fini(void) ++{ ++} +diff -upr linux-2.6.16.orig/fs/vzdq_mgmt.c linux-2.6.16-026test015/fs/vzdq_mgmt.c +--- linux-2.6.16.orig/fs/vzdq_mgmt.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/fs/vzdq_mgmt.c 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,735 @@ ++/* ++ * Copyright (C) 2001, 2002, 2004, 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ */ ++ ++#include <linux/config.h> ++#include <linux/kernel.h> ++#include <linux/string.h> ++#include <linux/list.h> ++#include <asm/semaphore.h> ++#include <linux/sched.h> ++#include <linux/fs.h> ++#include <linux/dcache.h> ++#include <linux/mount.h> ++#include <linux/namei.h> ++#include <linux/writeback.h> ++#include <linux/gfp.h> ++#include <asm/uaccess.h> ++#include <linux/proc_fs.h> ++#include <linux/quota.h> ++#include <linux/vzctl_quota.h> ++#include <linux/vzquota.h> ++ ++ ++/* ---------------------------------------------------------------------- ++ * Switching quota on. 
++ * --------------------------------------------------------------------- */ ++ ++/* ++ * check limits copied from user ++ */ ++int vzquota_check_sane_limits(struct dq_stat *qstat) ++{ ++ int err; ++ ++ err = -EINVAL; ++ ++ /* softlimit must be less then hardlimit */ ++ if (qstat->bsoftlimit > qstat->bhardlimit) ++ goto out; ++ ++ if (qstat->isoftlimit > qstat->ihardlimit) ++ goto out; ++ ++ err = 0; ++out: ++ return err; ++} ++ ++/* ++ * check usage values copied from user ++ */ ++int vzquota_check_sane_values(struct dq_stat *qstat) ++{ ++ int err; ++ ++ err = -EINVAL; ++ ++ /* expiration time must not be set if softlimit was not exceeded */ ++ if (qstat->bcurrent < qstat->bsoftlimit && qstat->btime != (time_t)0) ++ goto out; ++ ++ if (qstat->icurrent < qstat->isoftlimit && qstat->itime != (time_t)0) ++ goto out; ++ ++ err = vzquota_check_sane_limits(qstat); ++out: ++ return err; ++} ++ ++/* ++ * create new quota master block ++ * this function should: ++ * - copy limits and usage parameters from user buffer; ++ * - allock, initialize quota block and insert it to hash; ++ */ ++static int vzquota_create(unsigned int quota_id, struct vz_quota_stat *u_qstat) ++{ ++ int err; ++ struct vz_quota_stat qstat; ++ struct vz_quota_master *qmblk; ++ ++ down(&vz_quota_sem); ++ ++ err = -EFAULT; ++ if (copy_from_user(&qstat, u_qstat, sizeof(qstat))) ++ goto out; ++ ++ err = -EINVAL; ++ if (quota_id == 0) ++ goto out; ++ ++ if (vzquota_check_sane_values(&qstat.dq_stat)) ++ goto out; ++ err = 0; ++ qmblk = vzquota_alloc_master(quota_id, &qstat); ++ ++ if (IS_ERR(qmblk)) /* ENOMEM or EEXIST */ ++ err = PTR_ERR(qmblk); ++out: ++ up(&vz_quota_sem); ++ ++ return err; ++} ++ ++/** ++ * vzquota_on - turn quota on ++ * ++ * This function should: ++ * - find and get refcnt of directory entry for quota root and corresponding ++ * mountpoint; ++ * - find corresponding quota block and mark it with given path; ++ * - check quota tree; ++ * - initialize quota for the tree root. 
++ */ ++static int vzquota_on(unsigned int quota_id, const char *quota_root) ++{ ++ int err; ++ struct nameidata nd; ++ struct vz_quota_master *qmblk; ++ struct super_block *dqsb; ++ ++ dqsb = NULL; ++ down(&vz_quota_sem); ++ ++ err = -ENOENT; ++ qmblk = vzquota_find_master(quota_id); ++ if (qmblk == NULL) ++ goto out; ++ ++ err = -EBUSY; ++ if (qmblk->dq_state != VZDQ_STARTING) ++ goto out; ++ ++ err = user_path_walk(quota_root, &nd); ++ if (err) ++ goto out; ++ /* init path must be a directory */ ++ err = -ENOTDIR; ++ if (!S_ISDIR(nd.dentry->d_inode->i_mode)) ++ goto out_path; ++ ++ qmblk->dq_root_dentry = nd.dentry; ++ qmblk->dq_root_mnt = nd.mnt; ++ qmblk->dq_sb = nd.dentry->d_inode->i_sb; ++ err = vzquota_get_super(qmblk->dq_sb); ++ if (err) ++ goto out_super; ++ ++ /* ++ * Serialization with quota initialization and operations is performed ++ * through generation check: generation is memorized before qmblk is ++ * found and compared under inode_qmblk_lock with assignment. ++ * ++ * Note that the dentry tree is shrunk only for high-level logical ++ * serialization, purely as a courtesy to the user: to have consistent ++ * quota statistics, files should be closed etc. on quota on. ++ */ ++ err = vzquota_on_qmblk(qmblk->dq_sb, qmblk->dq_root_dentry->d_inode, ++ qmblk); ++ if (err) ++ goto out_init; ++ qmblk->dq_state = VZDQ_WORKING; ++ ++ up(&vz_quota_sem); ++ return 0; ++ ++out_init: ++ dqsb = qmblk->dq_sb; ++out_super: ++ /* clear for qmblk_put/quota_free_master */ ++ qmblk->dq_sb = NULL; ++ qmblk->dq_root_dentry = NULL; ++ qmblk->dq_root_mnt = NULL; ++out_path: ++ path_release(&nd); ++out: ++ if (dqsb) ++ vzquota_put_super(dqsb); ++ up(&vz_quota_sem); ++ return err; ++} ++ ++ ++/* ---------------------------------------------------------------------- ++ * Switching quota off. 
++ * --------------------------------------------------------------------- */ ++ ++/* ++ * destroy quota block by ID ++ */ ++static int vzquota_destroy(unsigned int quota_id) ++{ ++ int err; ++ struct vz_quota_master *qmblk; ++ struct dentry *dentry; ++ struct vfsmount *mnt; ++ ++ down(&vz_quota_sem); ++ ++ err = -ENOENT; ++ qmblk = vzquota_find_master(quota_id); ++ if (qmblk == NULL) ++ goto out; ++ ++ err = -EBUSY; ++ if (qmblk->dq_state == VZDQ_WORKING) ++ goto out; /* quota_off first */ ++ ++ list_del_init(&qmblk->dq_hash); ++ dentry = qmblk->dq_root_dentry; ++ qmblk->dq_root_dentry = NULL; ++ mnt = qmblk->dq_root_mnt; ++ qmblk->dq_root_mnt = NULL; ++ ++ if (qmblk->dq_sb) ++ vzquota_put_super(qmblk->dq_sb); ++ up(&vz_quota_sem); ++ ++ qmblk_put(qmblk); ++ dput(dentry); ++ mntput(mnt); ++ return 0; ++ ++out: ++ up(&vz_quota_sem); ++ return err; ++} ++ ++/** ++ * vzquota_off - turn quota off ++ */ ++ ++static int __vzquota_sync_list(struct list_head *lh, ++ struct vz_quota_master *qmblk, ++ enum writeback_sync_modes sync_mode) ++{ ++ struct writeback_control wbc; ++ LIST_HEAD(list); ++ struct vz_quota_ilink *qlnk; ++ struct inode *inode; ++ int err; ++ ++ memset(&wbc, 0, sizeof(wbc)); ++ wbc.sync_mode = sync_mode; ++ ++ err = 0; ++ while (!list_empty(lh) && !err) { ++ if (need_resched()) { ++ inode_qmblk_unlock(qmblk->dq_sb); ++ schedule(); ++ inode_qmblk_lock(qmblk->dq_sb); ++ } ++ ++ qlnk = list_first_entry(lh, struct vz_quota_ilink, list); ++ list_move(&qlnk->list, &list); ++ ++ inode = igrab(QLNK_INODE(qlnk)); ++ if (!inode) ++ continue; ++ ++ inode_qmblk_unlock(qmblk->dq_sb); ++ ++ wbc.nr_to_write = LONG_MAX; ++ err = sync_inode(inode, &wbc); ++ iput(inode); ++ ++ inode_qmblk_lock(qmblk->dq_sb); ++ } ++ ++ list_splice(&list, lh); ++ return err; ++} ++ ++static int vzquota_sync_list(struct list_head *lh, ++ struct vz_quota_master *qmblk) ++{ ++ int err; ++ ++ err = __vzquota_sync_list(lh, qmblk, WB_SYNC_NONE); ++ if (err) ++ return err; ++ ++ err = 
__vzquota_sync_list(lh, qmblk, WB_SYNC_ALL); ++ if (err) ++ return err; ++ ++ return 0; ++} ++ ++static int vzquota_sync_inodes(struct vz_quota_master *qmblk) ++{ ++ int err; ++ LIST_HEAD(qlnk_list); ++ ++ list_splice_init(&qmblk->dq_ilink_list, &qlnk_list); ++ err = vzquota_sync_list(&qlnk_list, qmblk); ++ if (!err && !list_empty(&qmblk->dq_ilink_list)) ++ err = -EBUSY; ++ list_splice(&qlnk_list, &qmblk->dq_ilink_list); ++ ++ return err; ++} ++ ++static int vzquota_off(unsigned int quota_id) ++{ ++ int err; ++ struct vz_quota_master *qmblk; ++ ++ down(&vz_quota_sem); ++ ++ err = -ENOENT; ++ qmblk = vzquota_find_master(quota_id); ++ if (qmblk == NULL) ++ goto out; ++ ++ err = -EALREADY; ++ if (qmblk->dq_state != VZDQ_WORKING) ++ goto out; ++ ++ inode_qmblk_lock(qmblk->dq_sb); /* protects dq_ilink_list also */ ++ err = vzquota_sync_inodes(qmblk); ++ if (err) ++ goto out_unlock; ++ inode_qmblk_unlock(qmblk->dq_sb); ++ ++ err = vzquota_off_qmblk(qmblk->dq_sb, qmblk); ++ if (err) ++ goto out; ++ ++ /* vzquota_destroy will free resources */ ++ qmblk->dq_state = VZDQ_STOPING; ++out: ++ up(&vz_quota_sem); ++ ++ return err; ++ ++out_unlock: ++ inode_qmblk_unlock(qmblk->dq_sb); ++ goto out; ++} ++ ++ ++/* ---------------------------------------------------------------------- ++ * Other VZQUOTA ioctl's. 
++ * --------------------------------------------------------------------- */ ++ ++/* ++ * this function should: ++ * - set new limits/buffer under quota master block lock ++ * - if new softlimit less then usage, then set expiration time ++ * - no need to alloc ugid hash table - we'll do that on demand ++ */ ++int vzquota_update_limit(struct dq_stat *_qstat, ++ struct dq_stat *qstat) ++{ ++ int err; ++ ++ err = -EINVAL; ++ if (vzquota_check_sane_limits(qstat)) ++ goto out; ++ ++ err = 0; ++ ++ /* limits */ ++ _qstat->bsoftlimit = qstat->bsoftlimit; ++ _qstat->bhardlimit = qstat->bhardlimit; ++ /* ++ * If the soft limit is exceeded, administrator can override the moment ++ * when the grace period for limit exceeding ends. ++ * Specifying the moment may be useful if the soft limit is set to be ++ * lower than the current usage. In the latter case, if the grace ++ * period end isn't specified, the grace period will start from the ++ * moment of the first write operation. ++ * There is a race with the user level. Soft limit may be already ++ * exceeded before the limit change, and grace period end calculated by ++ * the kernel will be overriden. User level may check if the limit is ++ * already exceeded, but check and set calls are not atomic. ++ * This race isn't dangerous. Under normal cicrumstances, the ++ * difference between the grace period end calculated by the kernel and ++ * the user level should be not greater than as the difference between ++ * the moments of check and set calls, i.e. not bigger than the quota ++ * timer resolution - 1 sec. ++ */ ++ if (qstat->btime != (time_t)0 && ++ _qstat->bcurrent >= _qstat->bsoftlimit) ++ _qstat->btime = qstat->btime; ++ ++ _qstat->isoftlimit = qstat->isoftlimit; ++ _qstat->ihardlimit = qstat->ihardlimit; ++ if (qstat->itime != (time_t)0 && ++ _qstat->icurrent >= _qstat->isoftlimit) ++ _qstat->itime = qstat->itime; ++ ++out: ++ return err; ++} ++ ++/* ++ * set new quota limits. 
++ * this function should: ++ * copy new limits from user level ++ * - find quota block ++ * - set new limits and flags. ++ */ ++static int vzquota_setlimit(unsigned int quota_id, ++ struct vz_quota_stat *u_qstat) ++{ ++ int err; ++ struct vz_quota_stat qstat; ++ struct vz_quota_master *qmblk; ++ ++ down(&vz_quota_sem); /* for hash list protection */ ++ ++ err = -ENOENT; ++ qmblk = vzquota_find_master(quota_id); ++ if (qmblk == NULL) ++ goto out; ++ ++ err = -EFAULT; ++ if (copy_from_user(&qstat, u_qstat, sizeof(qstat))) ++ goto out; ++ ++ qmblk_data_write_lock(qmblk); ++ err = vzquota_update_limit(&qmblk->dq_stat, &qstat.dq_stat); ++ if (err == 0) ++ qmblk->dq_info = qstat.dq_info; ++ qmblk_data_write_unlock(qmblk); ++ ++out: ++ up(&vz_quota_sem); ++ return err; ++} ++ ++/* ++ * get quota limits. ++ * very simple - just return stat buffer to user ++ */ ++static int vzquota_getstat(unsigned int quota_id, ++ struct vz_quota_stat *u_qstat) ++{ ++ int err; ++ struct vz_quota_stat qstat; ++ struct vz_quota_master *qmblk; ++ ++ down(&vz_quota_sem); ++ ++ err = -ENOENT; ++ qmblk = vzquota_find_master(quota_id); ++ if (qmblk == NULL) ++ goto out; ++ ++ qmblk_data_read_lock(qmblk); ++ /* copy whole buffer under lock */ ++ memcpy(&qstat.dq_stat, &qmblk->dq_stat, sizeof(qstat.dq_stat)); ++ memcpy(&qstat.dq_info, &qmblk->dq_info, sizeof(qstat.dq_info)); ++ qmblk_data_read_unlock(qmblk); ++ ++ err = copy_to_user(u_qstat, &qstat, sizeof(qstat)); ++ if (err) ++ err = -EFAULT; ++ ++out: ++ up(&vz_quota_sem); ++ return err; ++} ++ ++/* ++ * This is a system call to turn per-VE disk quota on. 
++ * Note this call is allowed to run ONLY from VE0 ++ */ ++long do_vzquotactl(int cmd, unsigned int quota_id, ++ struct vz_quota_stat *qstat, const char *ve_root) ++{ ++ int ret; ++ ++ ret = -EPERM; ++ /* access allowed only from root of VE0 */ ++ if (!capable(CAP_SYS_RESOURCE) || ++ !capable(CAP_SYS_ADMIN)) ++ goto out; ++ ++ switch (cmd) { ++ case VZ_DQ_CREATE: ++ ret = vzquota_create(quota_id, qstat); ++ break; ++ case VZ_DQ_DESTROY: ++ ret = vzquota_destroy(quota_id); ++ break; ++ case VZ_DQ_ON: ++ ret = vzquota_on(quota_id, ve_root); ++ break; ++ case VZ_DQ_OFF: ++ ret = vzquota_off(quota_id); ++ break; ++ case VZ_DQ_SETLIMIT: ++ ret = vzquota_setlimit(quota_id, qstat); ++ break; ++ case VZ_DQ_GETSTAT: ++ ret = vzquota_getstat(quota_id, qstat); ++ break; ++ ++ default: ++ ret = -EINVAL; ++ goto out; ++ } ++ ++out: ++ return ret; ++} ++ ++ ++/* ---------------------------------------------------------------------- ++ * Proc filesystem routines ++ * ---------------------------------------------------------------------*/ ++ ++#if defined(CONFIG_PROC_FS) ++ ++#define QUOTA_UINT_LEN 15 ++#define QUOTA_TIME_LEN_FMT_UINT "%11u" ++#define QUOTA_NUM_LEN_FMT_UINT "%15u" ++#define QUOTA_NUM_LEN_FMT_ULL "%15Lu" ++#define QUOTA_TIME_LEN_FMT_STR "%11s" ++#define QUOTA_NUM_LEN_FMT_STR "%15s" ++#define QUOTA_PROC_MAX_LINE_LEN 2048 ++ ++/* ++ * prints /proc/ve_dq header line ++ */ ++static int print_proc_header(char * buffer) ++{ ++ return sprintf(buffer, ++ "%-11s" ++ QUOTA_NUM_LEN_FMT_STR ++ QUOTA_NUM_LEN_FMT_STR ++ QUOTA_NUM_LEN_FMT_STR ++ QUOTA_TIME_LEN_FMT_STR ++ QUOTA_TIME_LEN_FMT_STR ++ "\n", ++ "qid: path", ++ "usage", "softlimit", "hardlimit", "time", "expire"); ++} ++ ++/* ++ * prints proc master record id, dentry path ++ */ ++static int print_proc_master_id(char * buffer, char * path_buf, ++ struct vz_quota_master * qp) ++{ ++ char *path; ++ int over; ++ ++ path = NULL; ++ switch (qp->dq_state) { ++ case VZDQ_WORKING: ++ if (!path_buf) { ++ path = ""; ++ break; ++ 
} ++ path = d_path(qp->dq_root_dentry, ++ qp->dq_root_mnt, path_buf, PAGE_SIZE); ++ if (IS_ERR(path)) { ++ path = ""; ++ break; ++ } ++ /* do not print large path, truncate it */ ++ over = strlen(path) - ++ (QUOTA_PROC_MAX_LINE_LEN - 3 - 3 - ++ QUOTA_UINT_LEN); ++ if (over > 0) { ++ path += over - 3; ++ path[0] = path[1] = path[3] = '.'; ++ } ++ break; ++ case VZDQ_STARTING: ++ path = "-- started --"; ++ break; ++ case VZDQ_STOPING: ++ path = "-- stopped --"; ++ break; ++ } ++ ++ return sprintf(buffer, "%u: %s\n", qp->dq_id, path); ++} ++ ++/* ++ * prints struct vz_quota_stat data ++ */ ++static int print_proc_stat(char * buffer, struct dq_stat *qs, ++ struct dq_info *qi) ++{ ++ return sprintf(buffer, ++ "%11s" ++ QUOTA_NUM_LEN_FMT_ULL ++ QUOTA_NUM_LEN_FMT_ULL ++ QUOTA_NUM_LEN_FMT_ULL ++ QUOTA_TIME_LEN_FMT_UINT ++ QUOTA_TIME_LEN_FMT_UINT ++ "\n" ++ "%11s" ++ QUOTA_NUM_LEN_FMT_UINT ++ QUOTA_NUM_LEN_FMT_UINT ++ QUOTA_NUM_LEN_FMT_UINT ++ QUOTA_TIME_LEN_FMT_UINT ++ QUOTA_TIME_LEN_FMT_UINT ++ "\n", ++ "1k-blocks", ++ qs->bcurrent >> 10, ++ qs->bsoftlimit >> 10, ++ qs->bhardlimit >> 10, ++ (unsigned int)qs->btime, ++ (unsigned int)qi->bexpire, ++ "inodes", ++ qs->icurrent, ++ qs->isoftlimit, ++ qs->ihardlimit, ++ (unsigned int)qs->itime, ++ (unsigned int)qi->iexpire); ++} ++ ++ ++/* ++ * for /proc filesystem output ++ */ ++static int vzquota_read_proc(char *page, char **start, off_t off, int count, ++ int *eof, void *data) ++{ ++ int len, i; ++ off_t printed = 0; ++ char *p = page; ++ struct vz_quota_master *qp; ++ struct vz_quota_ilink *ql2; ++ struct list_head *listp; ++ char *path_buf; ++ ++ path_buf = (char*)__get_free_page(GFP_KERNEL); ++ if (path_buf == NULL) ++ return -ENOMEM; ++ ++ len = print_proc_header(p); ++ printed += len; ++ if (off < printed) /* keep header in output */ { ++ *start = p + off; ++ p += len; ++ } ++ ++ down(&vz_quota_sem); ++ ++ /* traverse master hash table for all records */ ++ for (i = 0; i < vzquota_hash_size; i++) { ++ 
list_for_each(listp, &vzquota_hash_table[i]) { ++ qp = list_entry(listp, ++ struct vz_quota_master, dq_hash); ++ ++ /* Skip other VE's information if not root of VE0 */ ++ if ((!capable(CAP_SYS_ADMIN) || ++ !capable(CAP_SYS_RESOURCE))) { ++ ql2 = INODE_QLNK(current->fs->root->d_inode); ++ if (ql2 == NULL || qp != ql2->qmblk) ++ continue; ++ } ++ /* ++ * Now print the next record ++ */ ++ len = 0; ++ /* we print quotaid and path only in VE0 */ ++ if (capable(CAP_SYS_ADMIN)) ++ len += print_proc_master_id(p+len,path_buf, qp); ++ len += print_proc_stat(p+len, &qp->dq_stat, ++ &qp->dq_info); ++ printed += len; ++ /* skip unnecessary lines */ ++ if (printed <= off) ++ continue; ++ p += len; ++ /* provide start offset */ ++ if (*start == NULL) ++ *start = p + (off - printed); ++ /* have we printed all requested size? */ ++ if (PAGE_SIZE - (p - page) < QUOTA_PROC_MAX_LINE_LEN || ++ (p - *start) >= count) ++ goto out; ++ } ++ } ++ ++ *eof = 1; /* checked all hash */ ++out: ++ up(&vz_quota_sem); ++ ++ len = 0; ++ if (*start != NULL) { ++ len = (p - *start); ++ if (len > count) ++ len = count; ++ } ++ ++ if (path_buf) ++ free_page((unsigned long) path_buf); ++ ++ return len; ++} ++ ++/* ++ * Register procfs read callback ++ */ ++int vzquota_proc_init(void) ++{ ++ struct proc_dir_entry *de; ++ ++ de = create_proc_entry("vz/vzquota", S_IFREG|S_IRUSR, NULL); ++ if (de == NULL) { ++ /* create "vz" subdirectory, if not exist */ ++ de = create_proc_entry("vz", S_IFDIR|S_IRUGO|S_IXUGO, NULL); ++ if (de == NULL) ++ goto out_err; ++ de = create_proc_entry("vzquota", S_IFREG|S_IRUSR, de); ++ if (de == NULL) ++ goto out_err; ++ } ++ de->read_proc = vzquota_read_proc; ++ de->data = NULL; ++ return 0; ++out_err: ++ return -EBUSY; ++} ++ ++void vzquota_proc_release(void) ++{ ++ /* Unregister procfs read callback */ ++ remove_proc_entry("vz/vzquota", NULL); ++} ++ ++#endif +diff -upr linux-2.6.16.orig/fs/vzdq_ops.c linux-2.6.16-026test015/fs/vzdq_ops.c +--- linux-2.6.16.orig/fs/vzdq_ops.c 
2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/fs/vzdq_ops.c 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,565 @@ ++/* ++ * Copyright (C) 2001, 2002, 2004, 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ */ ++ ++#include <linux/config.h> ++#include <linux/kernel.h> ++#include <linux/types.h> ++#include <asm/semaphore.h> ++#include <linux/sched.h> ++#include <linux/fs.h> ++#include <linux/quota.h> ++#include <linux/vzquota.h> ++ ++ ++/* ---------------------------------------------------------------------- ++ * Quota superblock operations - helper functions. ++ * --------------------------------------------------------------------- */ ++ ++static inline void vzquota_incr_inodes(struct dq_stat *dqstat, ++ unsigned long number) ++{ ++ dqstat->icurrent += number; ++} ++ ++static inline void vzquota_incr_space(struct dq_stat *dqstat, ++ __u64 number) ++{ ++ dqstat->bcurrent += number; ++} ++ ++static inline void vzquota_decr_inodes(struct dq_stat *dqstat, ++ unsigned long number) ++{ ++ if (dqstat->icurrent > number) ++ dqstat->icurrent -= number; ++ else ++ dqstat->icurrent = 0; ++ if (dqstat->icurrent < dqstat->isoftlimit) ++ dqstat->itime = (time_t) 0; ++} ++ ++static inline void vzquota_decr_space(struct dq_stat *dqstat, ++ __u64 number) ++{ ++ if (dqstat->bcurrent > number) ++ dqstat->bcurrent -= number; ++ else ++ dqstat->bcurrent = 0; ++ if (dqstat->bcurrent < dqstat->bsoftlimit) ++ dqstat->btime = (time_t) 0; ++} ++ ++/* ++ * better printk() message or use /proc/vzquotamsg interface ++ * similar to /proc/kmsg ++ */ ++static inline void vzquota_warn(struct dq_info *dq_info, int dq_id, int flag, ++ const char *fmt) ++{ ++ if (dq_info->flags & flag) /* warning already printed for this ++ masterblock */ ++ return; ++ printk(fmt, dq_id); ++ dq_info->flags |= flag; ++} ++ ++/* ++ * ignore_hardlimit - ++ * ++ * Intended to allow superuser of VE0 to overwrite hardlimits. 
++ * ++ * ignore_hardlimit() has a very bad feature: ++ * ++ * writepage() operation for writable mapping of a file with holes ++ * may trigger get_block() with wrong current and as a consequence, ++ * opens a possibility to overcommit hardlimits ++ */ ++/* for the reason above, it is disabled now */ ++static inline int ignore_hardlimit(struct dq_info *dqstat) ++{ ++#if 0 ++ return ve_is_super(get_exec_env()) && ++ capable(CAP_SYS_RESOURCE) && ++ (dqstat->options & VZ_QUOTA_OPT_RSQUASH); ++#else ++ return 0; ++#endif ++} ++ ++static int vzquota_check_inodes(struct dq_info *dq_info, ++ struct dq_stat *dqstat, ++ unsigned long number, int dq_id) ++{ ++ if (number == 0) ++ return QUOTA_OK; ++ ++ if (dqstat->icurrent + number > dqstat->ihardlimit && ++ !ignore_hardlimit(dq_info)) { ++ vzquota_warn(dq_info, dq_id, VZ_QUOTA_INODES, ++ "VZ QUOTA: file hardlimit reached for id=%d\n"); ++ return NO_QUOTA; ++ } ++ ++ if (dqstat->icurrent + number > dqstat->isoftlimit) { ++ if (dqstat->itime == (time_t)0) { ++ vzquota_warn(dq_info, dq_id, 0, ++ "VZ QUOTA: file softlimit exceeded " ++ "for id=%d\n"); ++ dqstat->itime = CURRENT_TIME_SECONDS + ++ dq_info->iexpire; ++ } else if (CURRENT_TIME_SECONDS >= dqstat->itime && ++ !ignore_hardlimit(dq_info)) { ++ vzquota_warn(dq_info, dq_id, VZ_QUOTA_INODES, ++ "VZ QUOTA: file softlimit expired " ++ "for id=%d\n"); ++ return NO_QUOTA; ++ } ++ } ++ ++ return QUOTA_OK; ++} ++ ++static int vzquota_check_space(struct dq_info *dq_info, ++ struct dq_stat *dqstat, ++ __u64 number, int dq_id, char prealloc) ++{ ++ if (number == 0) ++ return QUOTA_OK; ++ ++ if (dqstat->bcurrent + number > dqstat->bhardlimit && ++ !ignore_hardlimit(dq_info)) { ++ if (!prealloc) ++ vzquota_warn(dq_info, dq_id, VZ_QUOTA_SPACE, ++ "VZ QUOTA: disk hardlimit reached " ++ "for id=%d\n"); ++ return NO_QUOTA; ++ } ++ ++ if (dqstat->bcurrent + number > dqstat->bsoftlimit) { ++ if (dqstat->btime == (time_t)0) { ++ if (!prealloc) { ++ vzquota_warn(dq_info, dq_id, 0, ++ "VZ 
QUOTA: disk softlimit exceeded " ++ "for id=%d\n"); ++ dqstat->btime = CURRENT_TIME_SECONDS ++ + dq_info->bexpire; ++ } else { ++ /* ++ * Original Linux quota doesn't allow ++ * preallocation to exceed softlimit so ++ * exceeding will be always printed ++ */ ++ return NO_QUOTA; ++ } ++ } else if (CURRENT_TIME_SECONDS >= dqstat->btime && ++ !ignore_hardlimit(dq_info)) { ++ if (!prealloc) ++ vzquota_warn(dq_info, dq_id, VZ_QUOTA_SPACE, ++ "VZ QUOTA: disk quota " ++ "softlimit expired " ++ "for id=%d\n"); ++ return NO_QUOTA; ++ } ++ } ++ ++ return QUOTA_OK; ++} ++ ++static int vzquota_check_ugid_inodes(struct vz_quota_master *qmblk, ++ struct vz_quota_ugid *qugid[], ++ int type, unsigned long number) ++{ ++ struct dq_info *dqinfo; ++ struct dq_stat *dqstat; ++ ++ if (qugid[type] == NULL) ++ return QUOTA_OK; ++ if (qugid[type] == VZ_QUOTA_UGBAD) ++ return NO_QUOTA; ++ ++ if (type == USRQUOTA && !(qmblk->dq_flags & VZDQ_USRQUOTA)) ++ return QUOTA_OK; ++ if (type == GRPQUOTA && !(qmblk->dq_flags & VZDQ_GRPQUOTA)) ++ return QUOTA_OK; ++ if (number == 0) ++ return QUOTA_OK; ++ ++ dqinfo = &qmblk->dq_ugid_info[type]; ++ dqstat = &qugid[type]->qugid_stat; ++ ++ if (dqstat->ihardlimit != 0 && ++ dqstat->icurrent + number > dqstat->ihardlimit) ++ return NO_QUOTA; ++ ++ if (dqstat->isoftlimit != 0 && ++ dqstat->icurrent + number > dqstat->isoftlimit) { ++ if (dqstat->itime == (time_t)0) ++ dqstat->itime = CURRENT_TIME_SECONDS + ++ dqinfo->iexpire; ++ else if (CURRENT_TIME_SECONDS >= dqstat->itime) ++ return NO_QUOTA; ++ } ++ ++ return QUOTA_OK; ++} ++ ++static int vzquota_check_ugid_space(struct vz_quota_master *qmblk, ++ struct vz_quota_ugid *qugid[], ++ int type, __u64 number, char prealloc) ++{ ++ struct dq_info *dqinfo; ++ struct dq_stat *dqstat; ++ ++ if (qugid[type] == NULL) ++ return QUOTA_OK; ++ if (qugid[type] == VZ_QUOTA_UGBAD) ++ return NO_QUOTA; ++ ++ if (type == USRQUOTA && !(qmblk->dq_flags & VZDQ_USRQUOTA)) ++ return QUOTA_OK; ++ if (type == GRPQUOTA && 
!(qmblk->dq_flags & VZDQ_GRPQUOTA)) ++ return QUOTA_OK; ++ if (number == 0) ++ return QUOTA_OK; ++ ++ dqinfo = &qmblk->dq_ugid_info[type]; ++ dqstat = &qugid[type]->qugid_stat; ++ ++ if (dqstat->bhardlimit != 0 && ++ dqstat->bcurrent + number > dqstat->bhardlimit) ++ return NO_QUOTA; ++ ++ if (dqstat->bsoftlimit != 0 && ++ dqstat->bcurrent + number > dqstat->bsoftlimit) { ++ if (dqstat->btime == (time_t)0) { ++ if (!prealloc) ++ dqstat->btime = CURRENT_TIME_SECONDS ++ + dqinfo->bexpire; ++ else ++ /* ++ * Original Linux quota doesn't allow ++ * preallocation to exceed softlimit so ++ * exceeding will be always printed ++ */ ++ return NO_QUOTA; ++ } else if (CURRENT_TIME_SECONDS >= dqstat->btime) ++ return NO_QUOTA; ++ } ++ ++ return QUOTA_OK; ++} ++ ++/* ---------------------------------------------------------------------- ++ * Quota superblock operations ++ * --------------------------------------------------------------------- */ ++ ++/* ++ * S_NOQUOTA note. ++ * In the current kernel (2.6.8.1), S_NOQUOTA flag is set only for ++ * - quota file (absent in our case) ++ * - after explicit DQUOT_DROP (earlier than clear_inode) in functions like ++ * filesystem-specific new_inode, before the inode gets outside links. ++ * For the latter case, the only quota operation where care about S_NOQUOTA ++ * might be required is vzquota_drop, but there S_NOQUOTA has already been ++ * checked in DQUOT_DROP(). ++ * So, S_NOQUOTA may be ignored for now in the VZDQ code. ++ * ++ * The above note is not entirely correct. ++ * Both for ext2 and ext3 filesystems, DQUOT_FREE_INODE is called from ++ * delete_inode if new_inode fails (for example, because of inode quota ++ * limits), so S_NOQUOTA check is needed in free_inode. ++ * This seems to be the dark corner of the current quota API. ++ */ ++ ++/* ++ * Initialize quota operations for the specified inode. 
++ */ ++static int vzquota_initialize(struct inode *inode, int type) ++{ ++ vzquota_inode_init_call(inode); ++ return 0; /* ignored by caller */ ++} ++ ++/* ++ * Release quota for the specified inode. ++ */ ++static int vzquota_drop(struct inode *inode) ++{ ++ vzquota_inode_drop_call(inode); ++ return 0; /* ignored by caller */ ++} ++ ++/* ++ * Allocate block callback. ++ * ++ * If (prealloc) disk quota exceeding warning is not printed. ++ * See Linux quota to know why. ++ * ++ * Return: ++ * QUOTA_OK == 0 on SUCCESS ++ * NO_QUOTA == 1 if allocation should fail ++ */ ++static int vzquota_alloc_space(struct inode *inode, ++ qsize_t number, int prealloc) ++{ ++ struct vz_quota_master *qmblk; ++ struct vz_quota_datast data; ++ int ret = QUOTA_OK; ++ ++ qmblk = vzquota_inode_data(inode, &data); ++ if (qmblk == VZ_QUOTA_BAD) ++ return NO_QUOTA; ++ if (qmblk != NULL) { ++#ifdef CONFIG_VZ_QUOTA_UGID ++ int cnt; ++ struct vz_quota_ugid * qugid[MAXQUOTAS]; ++#endif ++ ++ /* checking first */ ++ ret = vzquota_check_space(&qmblk->dq_info, &qmblk->dq_stat, ++ number, qmblk->dq_id, prealloc); ++ if (ret == NO_QUOTA) ++ goto no_quota; ++#ifdef CONFIG_VZ_QUOTA_UGID ++ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { ++ qugid[cnt] = INODE_QLNK(inode)->qugid[cnt]; ++ ret = vzquota_check_ugid_space(qmblk, qugid, ++ cnt, number, prealloc); ++ if (ret == NO_QUOTA) ++ goto no_quota; ++ } ++ /* check ok, may increment */ ++ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { ++ if (qugid[cnt] == NULL) ++ continue; ++ vzquota_incr_space(&qugid[cnt]->qugid_stat, number); ++ } ++#endif ++ vzquota_incr_space(&qmblk->dq_stat, number); ++ vzquota_data_unlock(inode, &data); ++ } ++ ++ inode_add_bytes(inode, number); ++ might_sleep(); ++ return QUOTA_OK; ++ ++no_quota: ++ vzquota_data_unlock(inode, &data); ++ return NO_QUOTA; ++} ++ ++/* ++ * Allocate inodes callback. 
++ * ++ * Return: ++ * QUOTA_OK == 0 on SUCCESS ++ * NO_QUOTA == 1 if allocation should fail ++ */ ++static int vzquota_alloc_inode(const struct inode *inode, unsigned long number) ++{ ++ struct vz_quota_master *qmblk; ++ struct vz_quota_datast data; ++ int ret = QUOTA_OK; ++ ++ qmblk = vzquota_inode_data((struct inode *)inode, &data); ++ if (qmblk == VZ_QUOTA_BAD) ++ return NO_QUOTA; ++ if (qmblk != NULL) { ++#ifdef CONFIG_VZ_QUOTA_UGID ++ int cnt; ++ struct vz_quota_ugid *qugid[MAXQUOTAS]; ++#endif ++ ++ /* checking first */ ++ ret = vzquota_check_inodes(&qmblk->dq_info, &qmblk->dq_stat, ++ number, qmblk->dq_id); ++ if (ret == NO_QUOTA) ++ goto no_quota; ++#ifdef CONFIG_VZ_QUOTA_UGID ++ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { ++ qugid[cnt] = INODE_QLNK(inode)->qugid[cnt]; ++ ret = vzquota_check_ugid_inodes(qmblk, qugid, ++ cnt, number); ++ if (ret == NO_QUOTA) ++ goto no_quota; ++ } ++ /* check ok, may increment */ ++ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { ++ if (qugid[cnt] == NULL) ++ continue; ++ vzquota_incr_inodes(&qugid[cnt]->qugid_stat, number); ++ } ++#endif ++ vzquota_incr_inodes(&qmblk->dq_stat, number); ++ vzquota_data_unlock((struct inode *)inode, &data); ++ } ++ ++ might_sleep(); ++ return QUOTA_OK; ++ ++no_quota: ++ vzquota_data_unlock((struct inode *)inode, &data); ++ return NO_QUOTA; ++} ++ ++/* ++ * Free space callback. 
++ */ ++static int vzquota_free_space(struct inode *inode, qsize_t number) ++{ ++ struct vz_quota_master *qmblk; ++ struct vz_quota_datast data; ++ ++ qmblk = vzquota_inode_data(inode, &data); ++ if (qmblk == VZ_QUOTA_BAD) ++ return NO_QUOTA; /* isn't checked by the caller */ ++ if (qmblk != NULL) { ++#ifdef CONFIG_VZ_QUOTA_UGID ++ int cnt; ++ struct vz_quota_ugid * qugid; ++#endif ++ ++ vzquota_decr_space(&qmblk->dq_stat, number); ++#ifdef CONFIG_VZ_QUOTA_UGID ++ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { ++ qugid = INODE_QLNK(inode)->qugid[cnt]; ++ if (qugid == NULL || qugid == VZ_QUOTA_UGBAD) ++ continue; ++ vzquota_decr_space(&qugid->qugid_stat, number); ++ } ++#endif ++ vzquota_data_unlock(inode, &data); ++ } ++ inode_sub_bytes(inode, number); ++ might_sleep(); ++ return QUOTA_OK; ++} ++ ++/* ++ * Free inodes callback. ++ */ ++static int vzquota_free_inode(const struct inode *inode, unsigned long number) ++{ ++ struct vz_quota_master *qmblk; ++ struct vz_quota_datast data; ++ ++ if (IS_NOQUOTA(inode)) ++ return QUOTA_OK; ++ ++ qmblk = vzquota_inode_data((struct inode *)inode, &data); ++ if (qmblk == VZ_QUOTA_BAD) ++ return NO_QUOTA; ++ if (qmblk != NULL) { ++#ifdef CONFIG_VZ_QUOTA_UGID ++ int cnt; ++ struct vz_quota_ugid * qugid; ++#endif ++ ++ vzquota_decr_inodes(&qmblk->dq_stat, number); ++#ifdef CONFIG_VZ_QUOTA_UGID ++ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { ++ qugid = INODE_QLNK(inode)->qugid[cnt]; ++ if (qugid == NULL || qugid == VZ_QUOTA_UGBAD) ++ continue; ++ vzquota_decr_inodes(&qugid->qugid_stat, number); ++ } ++#endif ++ vzquota_data_unlock((struct inode *)inode, &data); ++ } ++ might_sleep(); ++ return QUOTA_OK; ++} ++ ++#if defined(CONFIG_VZ_QUOTA_UGID) ++ ++/* ++ * helper function for quota_transfer ++ * check that we can add inode to this quota_id ++ */ ++static int vzquota_transfer_check(struct vz_quota_master *qmblk, ++ struct vz_quota_ugid *qugid[], ++ unsigned int type, __u64 size) ++{ ++ if (vzquota_check_ugid_space(qmblk, qugid, type, size, 0) 
!= QUOTA_OK || ++ vzquota_check_ugid_inodes(qmblk, qugid, type, 1) != QUOTA_OK) ++ return -1; ++ return 0; ++} ++ ++int vzquota_transfer_usage(struct inode *inode, ++ int mask, ++ struct vz_quota_ilink *qlnk) ++{ ++ struct vz_quota_ugid *qugid_old; ++ __u64 space; ++ int i; ++ ++ space = inode_get_bytes(inode); ++ for (i = 0; i < MAXQUOTAS; i++) { ++ if (!(mask & (1 << i))) ++ continue; ++ if (vzquota_transfer_check(qlnk->qmblk, qlnk->qugid, i, space)) ++ return -1; ++ } ++ ++ for (i = 0; i < MAXQUOTAS; i++) { ++ if (!(mask & (1 << i))) ++ continue; ++ qugid_old = INODE_QLNK(inode)->qugid[i]; ++ vzquota_decr_space(&qugid_old->qugid_stat, space); ++ vzquota_decr_inodes(&qugid_old->qugid_stat, 1); ++ vzquota_incr_space(&qlnk->qugid[i]->qugid_stat, space); ++ vzquota_incr_inodes(&qlnk->qugid[i]->qugid_stat, 1); ++ } ++ return 0; ++} ++ ++/* ++ * Transfer the inode between diffent user/group quotas. ++ */ ++static int vzquota_transfer(struct inode *inode, struct iattr *iattr) ++{ ++ return vzquota_inode_transfer_call(inode, iattr) ? ++ NO_QUOTA : QUOTA_OK; ++} ++ ++#else /* CONFIG_VZ_QUOTA_UGID */ ++ ++static int vzquota_transfer(struct inode *inode, struct iattr *iattr) ++{ ++ return QUOTA_OK; ++} ++ ++#endif ++ ++/* ++ * Called under following semaphores: ++ * old_d->d_inode->i_sb->s_vfs_rename_sem ++ * old_d->d_inode->i_sem ++ * new_d->d_inode->i_sem ++ * [not verified --SAW] ++ */ ++static int vzquota_rename(struct inode *inode, ++ struct inode *old_dir, struct inode *new_dir) ++{ ++ return vzquota_rename_check(inode, old_dir, new_dir) ? ++ NO_QUOTA : QUOTA_OK; ++} ++ ++/* ++ * Structure of superblock diskquota operations. 
++ */ ++struct dquot_operations vz_quota_operations = { ++ initialize: vzquota_initialize, ++ drop: vzquota_drop, ++ alloc_space: vzquota_alloc_space, ++ alloc_inode: vzquota_alloc_inode, ++ free_space: vzquota_free_space, ++ free_inode: vzquota_free_inode, ++ transfer: vzquota_transfer, ++ rename: vzquota_rename ++}; +diff -upr linux-2.6.16.orig/fs/vzdq_tree.c linux-2.6.16-026test015/fs/vzdq_tree.c +--- linux-2.6.16.orig/fs/vzdq_tree.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/fs/vzdq_tree.c 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,286 @@ ++/* ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ * This file contains Virtuozzo quota tree implementation ++ */ ++ ++#include <linux/errno.h> ++#include <linux/slab.h> ++#include <linux/vzdq_tree.h> ++ ++struct quotatree_tree *quotatree_alloc(void) ++{ ++ int l; ++ struct quotatree_tree *tree; ++ ++ tree = kmalloc(sizeof(struct quotatree_tree), GFP_KERNEL); ++ if (tree == NULL) ++ goto out; ++ ++ for (l = 0; l < QUOTATREE_DEPTH; l++) { ++ INIT_LIST_HEAD(&tree->levels[l].usedlh); ++ INIT_LIST_HEAD(&tree->levels[l].freelh); ++ tree->levels[l].freenum = 0; ++ } ++ tree->root = NULL; ++ tree->leaf_num = 0; ++out: ++ return tree; ++} ++ ++static struct quotatree_node * ++quotatree_follow(struct quotatree_tree *tree, quotaid_t id, int level, ++ struct quotatree_find_state *st) ++{ ++ void **block; ++ struct quotatree_node *parent; ++ int l, index; ++ ++ parent = NULL; ++ block = (void **)&tree->root; ++ l = 0; ++ while (l < level && *block != NULL) { ++ index = (id >> QUOTATREE_BSHIFT(l)) & QUOTATREE_BMASK; ++ parent = *block; ++ block = parent->blocks + index; ++ l++; ++ } ++ if (st != NULL) { ++ st->block = block; ++ st->level = l; ++ } ++ ++ return parent; ++} ++ ++void *quotatree_find(struct quotatree_tree *tree, quotaid_t id, ++ struct quotatree_find_state *st) ++{ ++ quotatree_follow(tree, id, QUOTATREE_DEPTH, st); 
++ if (st->level == QUOTATREE_DEPTH) ++ return *st->block; ++ else ++ return NULL; ++} ++ ++void *quotatree_leaf_byindex(struct quotatree_tree *tree, unsigned int index) ++{ ++ int i, count; ++ struct quotatree_node *p; ++ void *leaf; ++ ++ if (QTREE_LEAFNUM(tree) <= index) ++ return NULL; ++ ++ count = 0; ++ list_for_each_entry(p, &QTREE_LEAFLVL(tree)->usedlh, list) { ++ for (i = 0; i < QUOTATREE_BSIZE; i++) { ++ leaf = p->blocks[i]; ++ if (leaf == NULL) ++ continue; ++ if (count == index) ++ return leaf; ++ count++; ++ } ++ } ++ return NULL; ++} ++ ++/* returns data leaf (vz_quota_ugid) after _existent_ ugid (@id) ++ * in the tree... */ ++void *quotatree_get_next(struct quotatree_tree *tree, quotaid_t id) ++{ ++ int off; ++ struct quotatree_node *parent, *p; ++ struct list_head *lh; ++ ++ /* get parent refering correct quota tree node of the last level */ ++ parent = quotatree_follow(tree, id, QUOTATREE_DEPTH, NULL); ++ if (!parent) ++ return NULL; ++ ++ off = (id & QUOTATREE_BMASK) + 1; /* next ugid */ ++ lh = &parent->list; ++ do { ++ p = list_entry(lh, struct quotatree_node, list); ++ for ( ; off < QUOTATREE_BSIZE; off++) ++ if (p->blocks[off]) ++ return p->blocks[off]; ++ off = 0; ++ lh = lh->next; ++ } while (lh != &QTREE_LEAFLVL(tree)->usedlh); ++ ++ return NULL; ++} ++ ++int quotatree_insert(struct quotatree_tree *tree, quotaid_t id, ++ struct quotatree_find_state *st, void *data) ++{ ++ struct quotatree_node *p; ++ int l, index; ++ ++ while (st->level < QUOTATREE_DEPTH) { ++ l = st->level; ++ if (!list_empty(&tree->levels[l].freelh)) { ++ p = list_entry(tree->levels[l].freelh.next, ++ struct quotatree_node, list); ++ list_del(&p->list); ++ } else { ++ p = kmalloc(sizeof(struct quotatree_node), GFP_NOFS | __GFP_NOFAIL); ++ if (p == NULL) ++ return -ENOMEM; ++ /* save block number in the l-level ++ * it uses for quota file generation */ ++ p->num = tree->levels[l].freenum++; ++ } ++ list_add(&p->list, &tree->levels[l].usedlh); ++ memset(p->blocks, 0, 
sizeof(p->blocks)); ++ *st->block = p; ++ ++ index = (id >> QUOTATREE_BSHIFT(l)) & QUOTATREE_BMASK; ++ st->block = p->blocks + index; ++ st->level++; ++ } ++ tree->leaf_num++; ++ *st->block = data; ++ ++ return 0; ++} ++ ++static struct quotatree_node * ++quotatree_remove_ptr(struct quotatree_tree *tree, quotaid_t id, ++ int level) ++{ ++ struct quotatree_node *parent; ++ struct quotatree_find_state st; ++ ++ parent = quotatree_follow(tree, id, level, &st); ++ if (st.level == QUOTATREE_DEPTH) ++ tree->leaf_num--; ++ *st.block = NULL; ++ return parent; ++} ++ ++void quotatree_remove(struct quotatree_tree *tree, quotaid_t id) ++{ ++ struct quotatree_node *p; ++ int level, i; ++ ++ p = quotatree_remove_ptr(tree, id, QUOTATREE_DEPTH); ++ for (level = QUOTATREE_DEPTH - 1; level >= QUOTATREE_CDEPTH; level--) { ++ for (i = 0; i < QUOTATREE_BSIZE; i++) ++ if (p->blocks[i] != NULL) ++ return; ++ list_move(&p->list, &tree->levels[level].freelh); ++ p = quotatree_remove_ptr(tree, id, level); ++ } ++} ++ ++#if 0 ++static void quotatree_walk(struct quotatree_tree *tree, ++ struct quotatree_node *node_start, ++ quotaid_t id_start, ++ int level_start, int level_end, ++ int (*callback)(struct quotatree_tree *, ++ quotaid_t id, ++ int level, ++ void *ptr, ++ void *data), ++ void *data) ++{ ++ struct quotatree_node *p; ++ int l, shift, index; ++ quotaid_t id; ++ struct quotatree_find_state st; ++ ++ p = node_start; ++ l = level_start; ++ shift = (QUOTATREE_DEPTH - l) * QUOTAID_BBITS; ++ id = id_start; ++ index = 0; ++ ++ /* ++ * Invariants: ++ * shift == (QUOTATREE_DEPTH - l) * QUOTAID_BBITS; ++ * id & ((1 << shift) - 1) == 0 ++ * p is l-level node corresponding to id ++ */ ++ do { ++ if (!p) ++ break; ++ ++ if (l < level_end) { ++ for (; index < QUOTATREE_BSIZE; index++) ++ if (p->blocks[index] != NULL) ++ break; ++ if (index < QUOTATREE_BSIZE) { ++ /* descend */ ++ p = p->blocks[index]; ++ l++; ++ shift -= QUOTAID_BBITS; ++ id += (quotaid_t)index << shift; ++ index = 0; ++ 
continue; ++ } ++ } ++ ++ if ((*callback)(tree, id, l, p, data)) ++ break; ++ ++ /* ascend and to the next node */ ++ p = quotatree_follow(tree, id, l, &st); ++ ++ index = ((id >> shift) & QUOTATREE_BMASK) + 1; ++ l--; ++ shift += QUOTAID_BBITS; ++ id &= ~(((quotaid_t)1 << shift) - 1); ++ } while (l >= level_start); ++} ++#endif ++ ++static void free_list(struct list_head *node_list) ++{ ++ struct quotatree_node *p, *tmp; ++ ++ list_for_each_entry_safe(p, tmp, node_list, list) { ++ list_del(&p->list); ++ kfree(p); ++ } ++} ++ ++static inline void quotatree_free_nodes(struct quotatree_tree *tree) ++{ ++ int i; ++ ++ for (i = 0; i < QUOTATREE_DEPTH; i++) { ++ free_list(&tree->levels[i].usedlh); ++ free_list(&tree->levels[i].freelh); ++ } ++} ++ ++static void quotatree_free_leafs(struct quotatree_tree *tree, ++ void (*dtor)(void *)) ++{ ++ int i; ++ struct quotatree_node *p; ++ ++ list_for_each_entry(p, &QTREE_LEAFLVL(tree)->usedlh, list) { ++ for (i = 0; i < QUOTATREE_BSIZE; i++) { ++ if (p->blocks[i] == NULL) ++ continue; ++ ++ dtor(p->blocks[i]); ++ } ++ } ++} ++ ++void quotatree_free(struct quotatree_tree *tree, void (*dtor)(void *)) ++{ ++ quotatree_free_leafs(tree, dtor); ++ quotatree_free_nodes(tree); ++ kfree(tree); ++} +diff -upr linux-2.6.16.orig/fs/vzdq_ugid.c linux-2.6.16-026test015/fs/vzdq_ugid.c +--- linux-2.6.16.orig/fs/vzdq_ugid.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/fs/vzdq_ugid.c 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,1130 @@ ++/* ++ * Copyright (C) 2002 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ * This file contains Virtuozzo UID/GID disk quota implementation ++ */ ++ ++#include <linux/config.h> ++#include <linux/string.h> ++#include <linux/slab.h> ++#include <linux/list.h> ++#include <linux/smp_lock.h> ++#include <linux/rcupdate.h> ++#include <asm/uaccess.h> ++#include <linux/proc_fs.h> ++#include <linux/init.h> ++#include <linux/module.h> ++#include <linux/quota.h> ++#include <linux/quotaio_v2.h> ++#include <linux/virtinfo.h> ++ ++#include <linux/vzctl.h> ++#include <linux/vzctl_quota.h> ++#include <linux/vzquota.h> ++ ++/* ++ * XXX ++ * may be something is needed for sb->s_dquot->info[]? ++ */ ++ ++#define USRQUOTA_MASK (1 << USRQUOTA) ++#define GRPQUOTA_MASK (1 << GRPQUOTA) ++#define QTYPE2MASK(type) (1 << (type)) ++ ++static kmem_cache_t *vz_quota_ugid_cachep; ++ ++/* guard to protect vz_quota_master from destroy in quota_on/off. Also protects ++ * list on the hash table */ ++extern struct semaphore vz_quota_sem; ++ ++inline struct vz_quota_ugid *vzquota_get_ugid(struct vz_quota_ugid *qugid) ++{ ++ if (qugid != VZ_QUOTA_UGBAD) ++ atomic_inc(&qugid->qugid_count); ++ return qugid; ++} ++ ++/* we don't limit users with zero limits */ ++static inline int vzquota_fake_stat(struct dq_stat *stat) ++{ ++ return stat->bhardlimit == 0 && stat->bsoftlimit == 0 && ++ stat->ihardlimit == 0 && stat->isoftlimit == 0; ++} ++ ++/* callback function for quotatree_free() */ ++static inline void vzquota_free_qugid(void *ptr) ++{ ++ kmem_cache_free(vz_quota_ugid_cachep, ptr); ++} ++ ++/* ++ * destroy ugid, if it have zero refcount, limits and usage ++ * must be called under qmblk->dq_sem ++ */ ++void vzquota_put_ugid(struct vz_quota_master *qmblk, ++ struct vz_quota_ugid *qugid) ++{ ++ if (qugid == VZ_QUOTA_UGBAD) ++ return; ++ qmblk_data_read_lock(qmblk); ++ if (atomic_dec_and_test(&qugid->qugid_count) && ++ (qmblk->dq_flags & VZDQUG_FIXED_SET) == 0 && ++ vzquota_fake_stat(&qugid->qugid_stat) && ++ qugid->qugid_stat.bcurrent == 0 && ++ qugid->qugid_stat.icurrent 
== 0) { ++ quotatree_remove(QUGID_TREE(qmblk, qugid->qugid_type), ++ qugid->qugid_id); ++ qmblk->dq_ugid_count--; ++ vzquota_free_qugid(qugid); ++ } ++ qmblk_data_read_unlock(qmblk); ++} ++ ++/* ++ * Get ugid block by its index, like it would present in array. ++ * In reality, this is not array - this is leafs chain of the tree. ++ * NULL if index is out of range. ++ * qmblk semaphore is required to protect the tree. ++ */ ++static inline struct vz_quota_ugid * ++vzquota_get_byindex(struct vz_quota_master *qmblk, unsigned int index, int type) ++{ ++ return quotatree_leaf_byindex(QUGID_TREE(qmblk, type), index); ++} ++ ++/* ++ * get next element from ugid "virtual array" ++ * ugid must be in current array and this array may not be changed between ++ * two accesses (quaranteed by "stopped" quota state and quota semaphore) ++ * qmblk semaphore is required to protect the tree ++ */ ++static inline struct vz_quota_ugid * ++vzquota_get_next(struct vz_quota_master *qmblk, struct vz_quota_ugid *qugid) ++{ ++ return quotatree_get_next(QUGID_TREE(qmblk, qugid->qugid_type), ++ qugid->qugid_id); ++} ++ ++/* ++ * requires dq_sem ++ */ ++struct vz_quota_ugid *__vzquota_find_ugid(struct vz_quota_master *qmblk, ++ unsigned int quota_id, int type, int flags) ++{ ++ struct vz_quota_ugid *qugid; ++ struct quotatree_tree *tree; ++ struct quotatree_find_state st; ++ ++ tree = QUGID_TREE(qmblk, type); ++ qugid = quotatree_find(tree, quota_id, &st); ++ if (qugid) ++ goto success; ++ ++ /* caller does not want alloc */ ++ if (flags & VZDQUG_FIND_DONT_ALLOC) ++ goto fail; ++ ++ if (flags & VZDQUG_FIND_FAKE) ++ goto doit; ++ ++ /* check limit */ ++ if (qmblk->dq_ugid_count >= qmblk->dq_ugid_max) ++ goto fail; ++ ++ /* see comment at VZDQUG_FIXED_SET define */ ++ if (qmblk->dq_flags & VZDQUG_FIXED_SET) ++ goto fail; ++ ++doit: ++ /* alloc new structure */ ++ qugid = kmem_cache_alloc(vz_quota_ugid_cachep, ++ SLAB_NOFS | __GFP_NOFAIL); ++ if (qugid == NULL) ++ goto fail; ++ ++ /* initialize 
new structure */ ++ qugid->qugid_id = quota_id; ++ memset(&qugid->qugid_stat, 0, sizeof(qugid->qugid_stat)); ++ qugid->qugid_type = type; ++ atomic_set(&qugid->qugid_count, 0); ++ ++ /* insert in tree */ ++ if (quotatree_insert(tree, quota_id, &st, qugid) < 0) ++ goto fail_insert; ++ qmblk->dq_ugid_count++; ++ ++success: ++ vzquota_get_ugid(qugid); ++ return qugid; ++ ++fail_insert: ++ vzquota_free_qugid(qugid); ++fail: ++ return VZ_QUOTA_UGBAD; ++} ++ ++/* ++ * takes dq_sem, may schedule ++ */ ++struct vz_quota_ugid *vzquota_find_ugid(struct vz_quota_master *qmblk, ++ unsigned int quota_id, int type, int flags) ++{ ++ struct vz_quota_ugid *qugid; ++ ++ down(&qmblk->dq_sem); ++ qugid = __vzquota_find_ugid(qmblk, quota_id, type, flags); ++ up(&qmblk->dq_sem); ++ ++ return qugid; ++} ++ ++/* ++ * destroy all ugid records on given quota master ++ */ ++void vzquota_kill_ugid(struct vz_quota_master *qmblk) ++{ ++ BUG_ON((qmblk->dq_gid_tree == NULL && qmblk->dq_uid_tree != NULL) || ++ (qmblk->dq_uid_tree == NULL && qmblk->dq_gid_tree != NULL)); ++ ++ if (qmblk->dq_uid_tree != NULL) { ++ quotatree_free(qmblk->dq_uid_tree, vzquota_free_qugid); ++ quotatree_free(qmblk->dq_gid_tree, vzquota_free_qugid); ++ } ++} ++ ++ ++/* ---------------------------------------------------------------------- ++ * Management interface to ugid quota for (super)users. ++ * --------------------------------------------------------------------- */ ++ ++/** ++ * vzquota_find_qmblk - helper to emulate quota on virtual filesystems ++ * ++ * This function finds a quota master block corresponding to the root of ++ * a virtual filesystem. ++ * Returns a quota master block with reference taken, or %NULL if not under ++ * quota, or %VZ_QUOTA_BAD if quota inconsistency is found (and all allocation ++ * operations will fail). ++ * ++ * Note: this function uses vzquota_inode_qmblk(). 
++ * The latter is a rather confusing function: it returns qmblk that used to be ++ * on the inode some time ago (without guarantee that it still has any ++ * relations to the inode). So, vzquota_find_qmblk() leaves it up to the ++ * caller to think whether the inode could have changed its qmblk and what to ++ * do in that case. ++ * Currently, the callers appear to not care :( ++ */ ++struct vz_quota_master *vzquota_find_qmblk(struct super_block *sb) ++{ ++ struct inode *qrinode; ++ struct vz_quota_master *qmblk; ++ ++ qmblk = NULL; ++ qrinode = NULL; ++ if (sb->s_op->get_quota_root != NULL) ++ qrinode = sb->s_op->get_quota_root(sb); ++ if (qrinode != NULL) ++ qmblk = vzquota_inode_qmblk(qrinode); ++ return qmblk; ++} ++ ++static int vzquota_initialize2(struct inode *inode, int type) ++{ ++ return QUOTA_OK; ++} ++ ++static int vzquota_drop2(struct inode *inode) ++{ ++ return QUOTA_OK; ++} ++ ++static int vzquota_alloc_space2(struct inode *inode, ++ qsize_t number, int prealloc) ++{ ++ inode_add_bytes(inode, number); ++ return QUOTA_OK; ++} ++ ++static int vzquota_alloc_inode2(const struct inode *inode, unsigned long number) ++{ ++ return QUOTA_OK; ++} ++ ++static int vzquota_free_space2(struct inode *inode, qsize_t number) ++{ ++ inode_sub_bytes(inode, number); ++ return QUOTA_OK; ++} ++ ++static int vzquota_free_inode2(const struct inode *inode, unsigned long number) ++{ ++ return QUOTA_OK; ++} ++ ++static int vzquota_transfer2(struct inode *inode, struct iattr *iattr) ++{ ++ return QUOTA_OK; ++} ++ ++struct dquot_operations vz_quota_operations2 = { ++ initialize: vzquota_initialize2, ++ drop: vzquota_drop2, ++ alloc_space: vzquota_alloc_space2, ++ alloc_inode: vzquota_alloc_inode2, ++ free_space: vzquota_free_space2, ++ free_inode: vzquota_free_inode2, ++ transfer: vzquota_transfer2 ++}; ++ ++static int vz_quota_on(struct super_block *sb, int type, ++ int format_id, char *path) ++{ ++ struct vz_quota_master *qmblk; ++ int mask, mask2; ++ int err; ++ ++ qmblk = 
vzquota_find_qmblk(sb); ++ down(&vz_quota_sem); ++ err = -ESRCH; ++ if (qmblk == NULL) ++ goto out; ++ err = -EIO; ++ if (qmblk == VZ_QUOTA_BAD) ++ goto out; ++ ++ mask = 0; ++ mask2 = 0; ++ sb->dq_op = &vz_quota_operations2; ++ sb->s_qcop = &vz_quotactl_operations; ++ if (type == USRQUOTA) { ++ mask = DQUOT_USR_ENABLED; ++ mask2 = VZDQ_USRQUOTA; ++ } ++ if (type == GRPQUOTA) { ++ mask = DQUOT_GRP_ENABLED; ++ mask2 = VZDQ_GRPQUOTA; ++ } ++ err = -EBUSY; ++ if (qmblk->dq_flags & mask2) ++ goto out; ++ ++ err = 0; ++ qmblk->dq_flags |= mask2; ++ sb->s_dquot.flags |= mask; ++ ++out: ++ up(&vz_quota_sem); ++ if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) ++ qmblk_put(qmblk); ++ return err; ++} ++ ++static int vz_quota_off(struct super_block *sb, int type) ++{ ++ struct vz_quota_master *qmblk; ++ int mask2; ++ int err; ++ ++ qmblk = vzquota_find_qmblk(sb); ++ down(&vz_quota_sem); ++ err = -ESRCH; ++ if (qmblk == NULL) ++ goto out; ++ err = -EIO; ++ if (qmblk == VZ_QUOTA_BAD) ++ goto out; ++ ++ mask2 = 0; ++ if (type == USRQUOTA) ++ mask2 = VZDQ_USRQUOTA; ++ if (type == GRPQUOTA) ++ mask2 = VZDQ_GRPQUOTA; ++ err = -EINVAL; ++ if (!(qmblk->dq_flags & mask2)) ++ goto out; ++ ++ qmblk->dq_flags &= ~mask2; ++ err = 0; ++ ++out: ++ up(&vz_quota_sem); ++ if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) ++ qmblk_put(qmblk); ++ return err; ++} ++ ++static int vz_quota_sync(struct super_block *sb, int type) ++{ ++ return 0; /* vz quota is always uptodate */ ++} ++ ++static int vz_get_dqblk(struct super_block *sb, int type, ++ qid_t id, struct if_dqblk *di) ++{ ++ struct vz_quota_master *qmblk; ++ struct vz_quota_ugid *ugid; ++ int err; ++ ++ qmblk = vzquota_find_qmblk(sb); ++ down(&vz_quota_sem); ++ err = -ESRCH; ++ if (qmblk == NULL) ++ goto out; ++ err = -EIO; ++ if (qmblk == VZ_QUOTA_BAD) ++ goto out; ++ ++ err = 0; ++ ugid = vzquota_find_ugid(qmblk, id, type, VZDQUG_FIND_DONT_ALLOC); ++ if (ugid != VZ_QUOTA_UGBAD) { ++ qmblk_data_read_lock(qmblk); ++ di->dqb_bhardlimit = 
ugid->qugid_stat.bhardlimit >> 10; ++ di->dqb_bsoftlimit = ugid->qugid_stat.bsoftlimit >> 10; ++ di->dqb_curspace = ugid->qugid_stat.bcurrent; ++ di->dqb_ihardlimit = ugid->qugid_stat.ihardlimit; ++ di->dqb_isoftlimit = ugid->qugid_stat.isoftlimit; ++ di->dqb_curinodes = ugid->qugid_stat.icurrent; ++ di->dqb_btime = ugid->qugid_stat.btime; ++ di->dqb_itime = ugid->qugid_stat.itime; ++ qmblk_data_read_unlock(qmblk); ++ di->dqb_valid = QIF_ALL; ++ vzquota_put_ugid(qmblk, ugid); ++ } else { ++ memset(di, 0, sizeof(*di)); ++ di->dqb_valid = QIF_ALL; ++ } ++ ++out: ++ up(&vz_quota_sem); ++ if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) ++ qmblk_put(qmblk); ++ return err; ++} ++ ++/* must be called under vz_quota_sem */ ++static int __vz_set_dqblk(struct vz_quota_master *qmblk, ++ int type, qid_t id, struct if_dqblk *di) ++{ ++ struct vz_quota_ugid *ugid; ++ ++ ugid = vzquota_find_ugid(qmblk, id, type, 0); ++ if (ugid == VZ_QUOTA_UGBAD) ++ return -ESRCH; ++ ++ qmblk_data_write_lock(qmblk); ++ /* ++ * Subtle compatibility breakage. ++ * ++ * Some old non-vz kernel quota didn't start grace period ++ * if the new soft limit happens to be below the usage. ++ * Non-vz kernel quota in 2.4.20 starts the grace period ++ * (if it hasn't been started). ++ * Current non-vz kernel performs even more complicated ++ * manipulations... ++ * ++ * Also, current non-vz kernels have inconsistency related to ++ * the grace time start. In regular operations the grace period ++ * is started if the usage is greater than the soft limit (and, ++ * strangely, is cancelled if the usage is less). ++ * However, set_dqblk starts the grace period if the usage is greater ++ * or equal to the soft limit. ++ * ++ * Here we try to mimic the behavior of the current non-vz kernel. 
++ */ ++ if (di->dqb_valid & QIF_BLIMITS) { ++ ugid->qugid_stat.bhardlimit = ++ (__u64)di->dqb_bhardlimit << 10; ++ ugid->qugid_stat.bsoftlimit = ++ (__u64)di->dqb_bsoftlimit << 10; ++ if (di->dqb_bsoftlimit == 0 || ++ ugid->qugid_stat.bcurrent < ugid->qugid_stat.bsoftlimit) ++ ugid->qugid_stat.btime = 0; ++ else if (!(di->dqb_valid & QIF_BTIME)) ++ ugid->qugid_stat.btime = CURRENT_TIME_SECONDS ++ + qmblk->dq_ugid_info[type].bexpire; ++ else ++ ugid->qugid_stat.btime = di->dqb_btime; ++ } ++ if (di->dqb_valid & QIF_ILIMITS) { ++ ugid->qugid_stat.ihardlimit = di->dqb_ihardlimit; ++ ugid->qugid_stat.isoftlimit = di->dqb_isoftlimit; ++ if (di->dqb_isoftlimit == 0 || ++ ugid->qugid_stat.icurrent < ugid->qugid_stat.isoftlimit) ++ ugid->qugid_stat.itime = 0; ++ else if (!(di->dqb_valid & QIF_ITIME)) ++ ugid->qugid_stat.itime = CURRENT_TIME_SECONDS ++ + qmblk->dq_ugid_info[type].iexpire; ++ else ++ ugid->qugid_stat.itime = di->dqb_itime; ++ } ++ qmblk_data_write_unlock(qmblk); ++ vzquota_put_ugid(qmblk, ugid); ++ ++ return 0; ++} ++ ++static int vz_set_dqblk(struct super_block *sb, int type, ++ qid_t id, struct if_dqblk *di) ++{ ++ struct vz_quota_master *qmblk; ++ int err; ++ ++ qmblk = vzquota_find_qmblk(sb); ++ down(&vz_quota_sem); ++ err = -ESRCH; ++ if (qmblk == NULL) ++ goto out; ++ err = -EIO; ++ if (qmblk == VZ_QUOTA_BAD) ++ goto out; ++ err = __vz_set_dqblk(qmblk, type, id, di); ++out: ++ up(&vz_quota_sem); ++ if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) ++ qmblk_put(qmblk); ++ return err; ++} ++ ++static int vz_get_dqinfo(struct super_block *sb, int type, ++ struct if_dqinfo *ii) ++{ ++ struct vz_quota_master *qmblk; ++ int err; ++ ++ qmblk = vzquota_find_qmblk(sb); ++ down(&vz_quota_sem); ++ err = -ESRCH; ++ if (qmblk == NULL) ++ goto out; ++ err = -EIO; ++ if (qmblk == VZ_QUOTA_BAD) ++ goto out; ++ ++ err = 0; ++ ii->dqi_bgrace = qmblk->dq_ugid_info[type].bexpire; ++ ii->dqi_igrace = qmblk->dq_ugid_info[type].iexpire; ++ ii->dqi_flags = 0; ++ ii->dqi_valid = 
IIF_ALL; ++ ++out: ++ up(&vz_quota_sem); ++ if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) ++ qmblk_put(qmblk); ++ return err; ++} ++ ++/* must be called under vz_quota_sem */ ++static int __vz_set_dqinfo(struct vz_quota_master *qmblk, ++ int type, struct if_dqinfo *ii) ++{ ++ if (ii->dqi_valid & IIF_FLAGS) ++ if (ii->dqi_flags & DQF_MASK) ++ return -EINVAL; ++ ++ if (ii->dqi_valid & IIF_BGRACE) ++ qmblk->dq_ugid_info[type].bexpire = ii->dqi_bgrace; ++ if (ii->dqi_valid & IIF_IGRACE) ++ qmblk->dq_ugid_info[type].iexpire = ii->dqi_igrace; ++ return 0; ++} ++ ++static int vz_set_dqinfo(struct super_block *sb, int type, ++ struct if_dqinfo *ii) ++{ ++ struct vz_quota_master *qmblk; ++ int err; ++ ++ qmblk = vzquota_find_qmblk(sb); ++ down(&vz_quota_sem); ++ err = -ESRCH; ++ if (qmblk == NULL) ++ goto out; ++ err = -EIO; ++ if (qmblk == VZ_QUOTA_BAD) ++ goto out; ++ err = __vz_set_dqinfo(qmblk, type, ii); ++out: ++ up(&vz_quota_sem); ++ if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) ++ qmblk_put(qmblk); ++ return err; ++} ++ ++#ifdef CONFIG_QUOTA_COMPAT ++ ++#define Q_GETQUOTI_SIZE 1024 ++ ++#define UGID2DQBLK(dst, src) \ ++ do { \ ++ (dst)->dqb_ihardlimit = (src)->qugid_stat.ihardlimit; \ ++ (dst)->dqb_isoftlimit = (src)->qugid_stat.isoftlimit; \ ++ (dst)->dqb_curinodes = (src)->qugid_stat.icurrent; \ ++ /* in 1K blocks */ \ ++ (dst)->dqb_bhardlimit = (src)->qugid_stat.bhardlimit >> 10; \ ++ /* in 1K blocks */ \ ++ (dst)->dqb_bsoftlimit = (src)->qugid_stat.bsoftlimit >> 10; \ ++ /* in bytes, 64 bit */ \ ++ (dst)->dqb_curspace = (src)->qugid_stat.bcurrent; \ ++ (dst)->dqb_btime = (src)->qugid_stat.btime; \ ++ (dst)->dqb_itime = (src)->qugid_stat.itime; \ ++ } while (0) ++ ++static int vz_get_quoti(struct super_block *sb, int type, qid_t idx, ++ struct v2_disk_dqblk *dqblk) ++{ ++ struct vz_quota_master *qmblk; ++ struct v2_disk_dqblk *data, *kbuf; ++ struct vz_quota_ugid *ugid; ++ int count; ++ int err; ++ ++ qmblk = vzquota_find_qmblk(sb); ++ err = -ESRCH; ++ if (qmblk == 
NULL) ++ goto out; ++ err = -EIO; ++ if (qmblk == VZ_QUOTA_BAD) ++ goto out; ++ ++ err = -ENOMEM; ++ kbuf = vmalloc(Q_GETQUOTI_SIZE * sizeof(*kbuf)); ++ if (!kbuf) ++ goto out; ++ ++ down(&vz_quota_sem); ++ down(&qmblk->dq_sem); ++ for (ugid = vzquota_get_byindex(qmblk, idx, type), count = 0; ++ ugid != NULL && count < Q_GETQUOTI_SIZE; ++ count++) ++ { ++ data = kbuf + count; ++ qmblk_data_read_lock(qmblk); ++ UGID2DQBLK(data, ugid); ++ qmblk_data_read_unlock(qmblk); ++ data->dqb_id = ugid->qugid_id; ++ ++ /* Find next entry */ ++ ugid = vzquota_get_next(qmblk, ugid); ++ BUG_ON(ugid != NULL && ugid->qugid_type != type); ++ } ++ up(&qmblk->dq_sem); ++ up(&vz_quota_sem); ++ ++ err = count; ++ if (copy_to_user(dqblk, kbuf, count * sizeof(*kbuf))) ++ err = -EFAULT; ++ ++ vfree(kbuf); ++out: ++ if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) ++ qmblk_put(qmblk); ++ ++ return err; ++} ++ ++#endif ++ ++struct quotactl_ops vz_quotactl_operations = { ++ quota_on: vz_quota_on, ++ quota_off: vz_quota_off, ++ quota_sync: vz_quota_sync, ++ get_info: vz_get_dqinfo, ++ set_info: vz_set_dqinfo, ++ get_dqblk: vz_get_dqblk, ++ set_dqblk: vz_set_dqblk, ++#ifdef CONFIG_QUOTA_COMPAT ++ get_quoti: vz_get_quoti ++#endif ++}; ++ ++ ++/* ---------------------------------------------------------------------- ++ * Management interface for host system admins. 
++ * --------------------------------------------------------------------- */ ++ ++static int quota_ugid_addstat(unsigned int quota_id, unsigned int ugid_size, ++ struct vz_quota_iface *u_ugid_buf) ++{ ++ struct vz_quota_master *qmblk; ++ int ret; ++ ++ down(&vz_quota_sem); ++ ++ ret = -ENOENT; ++ qmblk = vzquota_find_master(quota_id); ++ if (qmblk == NULL) ++ goto out; ++ ++ ret = -EBUSY; ++ if (qmblk->dq_state != VZDQ_STARTING) ++ goto out; /* working quota doesn't accept new ugids */ ++ ++ ret = 0; ++ /* start to add ugids */ ++ for (ret = 0; ret < ugid_size; ret++) { ++ struct vz_quota_iface ugid_buf; ++ struct vz_quota_ugid *ugid; ++ ++ if (copy_from_user(&ugid_buf, u_ugid_buf, sizeof(ugid_buf))) ++ break; ++ ++ if (ugid_buf.qi_type >= MAXQUOTAS) ++ break; /* bad quota type - this is the only check */ ++ ++ ugid = vzquota_find_ugid(qmblk, ++ ugid_buf.qi_id, ugid_buf.qi_type, 0); ++ if (ugid == VZ_QUOTA_UGBAD) { ++ qmblk->dq_flags |= VZDQUG_FIXED_SET; ++ break; /* limit reached */ ++ } ++ ++ /* update usage/limits ++ * we can copy the data without the lock, because the data ++ * cannot be modified in VZDQ_STARTING state */ ++ ugid->qugid_stat = ugid_buf.qi_stat; ++ ++ vzquota_put_ugid(qmblk, ugid); ++ ++ u_ugid_buf++; /* next user buffer */ ++ } ++out: ++ up(&vz_quota_sem); ++ ++ return ret; ++} ++ ++static int quota_ugid_setgrace(unsigned int quota_id, ++ struct dq_info u_dq_info[]) ++{ ++ struct vz_quota_master *qmblk; ++ struct dq_info dq_info[MAXQUOTAS]; ++ struct dq_info *target; ++ int err, type; ++ ++ down(&vz_quota_sem); ++ ++ err = -ENOENT; ++ qmblk = vzquota_find_master(quota_id); ++ if (qmblk == NULL) ++ goto out; ++ ++ err = -EBUSY; ++ if (qmblk->dq_state != VZDQ_STARTING) ++ goto out; /* working quota doesn't accept changing options */ ++ ++ err = -EFAULT; ++ if (copy_from_user(dq_info, u_dq_info, sizeof(dq_info))) ++ goto out; ++ ++ err = 0; ++ ++ /* update in qmblk */ ++ for (type = 0; type < MAXQUOTAS; type ++) { ++ target = 
&qmblk->dq_ugid_info[type]; ++ target->bexpire = dq_info[type].bexpire; ++ target->iexpire = dq_info[type].iexpire; ++ } ++out: ++ up(&vz_quota_sem); ++ ++ return err; ++} ++ ++static int do_quota_ugid_getstat(struct vz_quota_master *qmblk, int index, int size, ++ struct vz_quota_iface *u_ugid_buf) ++{ ++ int type, count; ++ struct vz_quota_ugid *ugid; ++ ++ if (QTREE_LEAFNUM(qmblk->dq_uid_tree) + ++ QTREE_LEAFNUM(qmblk->dq_gid_tree) ++ <= index) ++ return 0; ++ ++ count = 0; ++ ++ type = index < QTREE_LEAFNUM(qmblk->dq_uid_tree) ? USRQUOTA : GRPQUOTA; ++ if (type == GRPQUOTA) ++ index -= QTREE_LEAFNUM(qmblk->dq_uid_tree); ++ ++ /* loop through ugid and then qgid quota */ ++repeat: ++ for (ugid = vzquota_get_byindex(qmblk, index, type); ++ ugid != NULL && count < size; ++ ugid = vzquota_get_next(qmblk, ugid), count++) ++ { ++ struct vz_quota_iface ugid_buf; ++ ++ /* form interface buffer and send in to user-level */ ++ qmblk_data_read_lock(qmblk); ++ memcpy(&ugid_buf.qi_stat, &ugid->qugid_stat, ++ sizeof(ugid_buf.qi_stat)); ++ qmblk_data_read_unlock(qmblk); ++ ugid_buf.qi_id = ugid->qugid_id; ++ ugid_buf.qi_type = ugid->qugid_type; ++ ++ memcpy(u_ugid_buf, &ugid_buf, sizeof(ugid_buf)); ++ u_ugid_buf++; /* next portion of user buffer */ ++ } ++ ++ if (type == USRQUOTA && count < size) { ++ type = GRPQUOTA; ++ index = 0; ++ goto repeat; ++ } ++ ++ return count; ++} ++ ++static int quota_ugid_getstat(unsigned int quota_id, ++ int index, int size, struct vz_quota_iface *u_ugid_buf) ++{ ++ struct vz_quota_master *qmblk; ++ struct vz_quota_iface *k_ugid_buf; ++ int err; ++ ++ if (index < 0 || size < 0) ++ return -EINVAL; ++ ++ if (size > INT_MAX / sizeof(struct vz_quota_iface)) ++ return -EINVAL; ++ ++ k_ugid_buf = vmalloc(size * sizeof(struct vz_quota_iface)); ++ if (k_ugid_buf == NULL) ++ return -ENOMEM; ++ ++ down(&vz_quota_sem); ++ ++ err = -ENOENT; ++ qmblk = vzquota_find_master(quota_id); ++ if (qmblk == NULL) ++ goto out; ++ ++ down(&qmblk->dq_sem); ++ err = 
do_quota_ugid_getstat(qmblk, index, size, k_ugid_buf); ++ up(&qmblk->dq_sem); ++ if (err < 0) ++ goto out; ++ ++ if (copy_to_user(u_ugid_buf, k_ugid_buf, ++ size * sizeof(struct vz_quota_iface))) ++ err = -EFAULT; ++ ++out: ++ up(&vz_quota_sem); ++ vfree(k_ugid_buf); ++ return err; ++} ++ ++static int quota_ugid_getgrace(unsigned int quota_id, ++ struct dq_info u_dq_info[]) ++{ ++ struct vz_quota_master *qmblk; ++ struct dq_info dq_info[MAXQUOTAS]; ++ struct dq_info *target; ++ int err, type; ++ ++ down(&vz_quota_sem); ++ ++ err = -ENOENT; ++ qmblk = vzquota_find_master(quota_id); ++ if (qmblk == NULL) ++ goto out; ++ ++ err = 0; ++ /* update from qmblk */ ++ for (type = 0; type < MAXQUOTAS; type ++) { ++ target = &qmblk->dq_ugid_info[type]; ++ dq_info[type].bexpire = target->bexpire; ++ dq_info[type].iexpire = target->iexpire; ++ dq_info[type].flags = target->flags; ++ } ++ ++ if (copy_to_user(u_dq_info, dq_info, sizeof(dq_info))) ++ err = -EFAULT; ++out: ++ up(&vz_quota_sem); ++ ++ return err; ++} ++ ++static int quota_ugid_getconfig(unsigned int quota_id, ++ struct vz_quota_ugid_stat *info) ++{ ++ struct vz_quota_master *qmblk; ++ struct vz_quota_ugid_stat kinfo; ++ int err; ++ ++ down(&vz_quota_sem); ++ ++ err = -ENOENT; ++ qmblk = vzquota_find_master(quota_id); ++ if (qmblk == NULL) ++ goto out; ++ ++ err = 0; ++ kinfo.limit = qmblk->dq_ugid_max; ++ kinfo.count = qmblk->dq_ugid_count; ++ kinfo.flags = qmblk->dq_flags; ++ ++ if (copy_to_user(info, &kinfo, sizeof(kinfo))) ++ err = -EFAULT; ++out: ++ up(&vz_quota_sem); ++ ++ return err; ++} ++ ++static int quota_ugid_setconfig(unsigned int quota_id, ++ struct vz_quota_ugid_stat *info) ++{ ++ struct vz_quota_master *qmblk; ++ struct vz_quota_ugid_stat kinfo; ++ int err; ++ ++ down(&vz_quota_sem); ++ ++ err = -ENOENT; ++ qmblk = vzquota_find_master(quota_id); ++ if (qmblk == NULL) ++ goto out; ++ ++ err = -EFAULT; ++ if (copy_from_user(&kinfo, info, sizeof(kinfo))) ++ goto out; ++ ++ err = 0; ++ qmblk->dq_ugid_max 
= kinfo.limit; ++ if (qmblk->dq_state == VZDQ_STARTING) { ++ qmblk->dq_flags = kinfo.flags; ++ if (qmblk->dq_flags & VZDQUG_ON) ++ qmblk->dq_flags |= VZDQ_USRQUOTA | VZDQ_GRPQUOTA; ++ } ++ ++out: ++ up(&vz_quota_sem); ++ ++ return err; ++} ++ ++static int quota_ugid_setlimit(unsigned int quota_id, ++ struct vz_quota_ugid_setlimit *u_lim) ++{ ++ struct vz_quota_master *qmblk; ++ struct vz_quota_ugid_setlimit lim; ++ int err; ++ ++ down(&vz_quota_sem); ++ ++ err = -ESRCH; ++ qmblk = vzquota_find_master(quota_id); ++ if (qmblk == NULL) ++ goto out; ++ ++ err = -EFAULT; ++ if (copy_from_user(&lim, u_lim, sizeof(lim))) ++ goto out; ++ ++ err = __vz_set_dqblk(qmblk, lim.type, lim.id, &lim.dqb); ++ ++out: ++ up(&vz_quota_sem); ++ ++ return err; ++} ++ ++static int quota_ugid_setinfo(unsigned int quota_id, ++ struct vz_quota_ugid_setinfo *u_info) ++{ ++ struct vz_quota_master *qmblk; ++ struct vz_quota_ugid_setinfo info; ++ int err; ++ ++ down(&vz_quota_sem); ++ ++ err = -ESRCH; ++ qmblk = vzquota_find_master(quota_id); ++ if (qmblk == NULL) ++ goto out; ++ ++ err = -EFAULT; ++ if (copy_from_user(&info, u_info, sizeof(info))) ++ goto out; ++ ++ err = __vz_set_dqinfo(qmblk, info.type, &info.dqi); ++ ++out: ++ up(&vz_quota_sem); ++ ++ return err; ++} ++ ++/* ++ * This is a system call to maintain UGID quotas ++ * Note this call is allowed to run ONLY from VE0 ++ */ ++long do_vzquotaugidctl(struct vzctl_quotaugidctl *qub) ++{ ++ int ret; ++ ++ ret = -EPERM; ++ /* access allowed only from root of VE0 */ ++ if (!capable(CAP_SYS_RESOURCE) || ++ !capable(CAP_SYS_ADMIN)) ++ goto out; ++ ++ switch (qub->cmd) { ++ case VZ_DQ_UGID_GETSTAT: ++ ret = quota_ugid_getstat(qub->quota_id, ++ qub->ugid_index, qub->ugid_size, ++ (struct vz_quota_iface *)qub->addr); ++ break; ++ case VZ_DQ_UGID_ADDSTAT: ++ ret = quota_ugid_addstat(qub->quota_id, qub->ugid_size, ++ (struct vz_quota_iface *)qub->addr); ++ break; ++ case VZ_DQ_UGID_GETGRACE: ++ ret = quota_ugid_getgrace(qub->quota_id, ++ (struct 
dq_info *)qub->addr); ++ break; ++ case VZ_DQ_UGID_SETGRACE: ++ ret = quota_ugid_setgrace(qub->quota_id, ++ (struct dq_info *)qub->addr); ++ break; ++ case VZ_DQ_UGID_GETCONFIG: ++ ret = quota_ugid_getconfig(qub->quota_id, ++ (struct vz_quota_ugid_stat *)qub->addr); ++ break; ++ case VZ_DQ_UGID_SETCONFIG: ++ ret = quota_ugid_setconfig(qub->quota_id, ++ (struct vz_quota_ugid_stat *)qub->addr); ++ break; ++ case VZ_DQ_UGID_SETLIMIT: ++ ret = quota_ugid_setlimit(qub->quota_id, ++ (struct vz_quota_ugid_setlimit *) ++ qub->addr); ++ break; ++ case VZ_DQ_UGID_SETINFO: ++ ret = quota_ugid_setinfo(qub->quota_id, ++ (struct vz_quota_ugid_setinfo *) ++ qub->addr); ++ break; ++ default: ++ ret = -EINVAL; ++ goto out; ++ } ++out: ++ return ret; ++} ++ ++static void ugid_quota_on_sb(struct super_block *sb) ++{ ++ struct super_block *real_sb; ++ struct vz_quota_master *qmblk; ++ ++ if (!sb->s_op->get_quota_root) ++ return; ++ ++ real_sb = sb->s_op->get_quota_root(sb)->i_sb; ++ if (real_sb->dq_op != &vz_quota_operations) ++ return; ++ ++ sb->dq_op = &vz_quota_operations2; ++ sb->s_qcop = &vz_quotactl_operations; ++ INIT_LIST_HEAD(&sb->s_dquot.info[USRQUOTA].dqi_dirty_list); ++ INIT_LIST_HEAD(&sb->s_dquot.info[GRPQUOTA].dqi_dirty_list); ++ sb->s_dquot.info[USRQUOTA].dqi_format = &vz_quota_empty_v2_format; ++ sb->s_dquot.info[GRPQUOTA].dqi_format = &vz_quota_empty_v2_format; ++ ++ qmblk = vzquota_find_qmblk(sb); ++ if ((qmblk == NULL) || (qmblk == VZ_QUOTA_BAD)) ++ return; ++ down(&vz_quota_sem); ++ if (qmblk->dq_flags & VZDQ_USRQUOTA) ++ sb->s_dquot.flags |= DQUOT_USR_ENABLED; ++ if (qmblk->dq_flags & VZDQ_GRPQUOTA) ++ sb->s_dquot.flags |= DQUOT_GRP_ENABLED; ++ up(&vz_quota_sem); ++ qmblk_put(qmblk); ++} ++ ++static void ugid_quota_off_sb(struct super_block *sb) ++{ ++ /* can't make quota off on mounted super block */ ++ BUG_ON(sb->s_root != NULL); ++} ++ ++static int ugid_notifier_call(struct vnotifier_block *self, ++ unsigned long n, void *data, int old_ret) ++{ ++ struct 
virt_info_quota *viq; ++ ++ viq = (struct virt_info_quota *)data; ++ ++ switch (n) { ++ case VIRTINFO_QUOTA_ON: ++ ugid_quota_on_sb(viq->super); ++ break; ++ case VIRTINFO_QUOTA_OFF: ++ ugid_quota_off_sb(viq->super); ++ break; ++ case VIRTINFO_QUOTA_GETSTAT: ++ break; ++ default: ++ return old_ret; ++ } ++ return NOTIFY_OK; ++} ++ ++static struct vnotifier_block ugid_notifier_block = { ++ .notifier_call = ugid_notifier_call, ++}; ++ ++/* ---------------------------------------------------------------------- ++ * Init/exit. ++ * --------------------------------------------------------------------- */ ++ ++struct quota_format_type vz_quota_empty_v2_format = { ++ qf_fmt_id: QFMT_VFS_V0, ++ qf_ops: NULL, ++ qf_owner: THIS_MODULE ++}; ++ ++int vzquota_ugid_init() ++{ ++ int err; ++ ++ vz_quota_ugid_cachep = kmem_cache_create("vz_quota_ugid", ++ sizeof(struct vz_quota_ugid), ++ 0, SLAB_HWCACHE_ALIGN, ++ NULL, NULL); ++ if (vz_quota_ugid_cachep == NULL) ++ goto err_slab; ++ ++ err = register_quota_format(&vz_quota_empty_v2_format); ++ if (err) ++ goto err_reg; ++ ++ virtinfo_notifier_register(VITYPE_QUOTA, &ugid_notifier_block); ++ return 0; ++ ++err_reg: ++ kmem_cache_destroy(vz_quota_ugid_cachep); ++ return err; ++ ++err_slab: ++ printk(KERN_ERR "Cannot create VZ_QUOTA SLAB cache\n"); ++ return -ENOMEM; ++} ++ ++void vzquota_ugid_release() ++{ ++ virtinfo_notifier_unregister(VITYPE_QUOTA, &ugid_notifier_block); ++ unregister_quota_format(&vz_quota_empty_v2_format); ++ ++ if (kmem_cache_destroy(vz_quota_ugid_cachep)) ++ printk(KERN_ERR "VZQUOTA: kmem_cache_destroy failed\n"); ++} +diff -upr linux-2.6.16.orig/fs/vzdquot.c linux-2.6.16-026test015/fs/vzdquot.c +--- linux-2.6.16.orig/fs/vzdquot.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/fs/vzdquot.c 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,1705 @@ ++/* ++ * Copyright (C) 2001, 2002, 2004, 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ * This file contains the core of Virtuozzo disk quota implementation: ++ * maintenance of VZDQ information in inodes, ++ * external interfaces, ++ * module entry. ++ */ ++ ++#include <linux/config.h> ++#include <linux/kernel.h> ++#include <linux/string.h> ++#include <linux/list.h> ++#include <asm/atomic.h> ++#include <linux/spinlock.h> ++#include <asm/semaphore.h> ++#include <linux/slab.h> ++#include <linux/fs.h> ++#include <linux/dcache.h> ++#include <linux/quota.h> ++#include <linux/rcupdate.h> ++#include <linux/module.h> ++#include <asm/uaccess.h> ++#include <linux/vzctl.h> ++#include <linux/vzctl_quota.h> ++#include <linux/vzquota.h> ++#include <linux/virtinfo.h> ++#include <linux/vzdq_tree.h> ++ ++/* ---------------------------------------------------------------------- ++ * ++ * Locking ++ * ++ * ---------------------------------------------------------------------- */ ++ ++/* ++ * Serializes on/off and all other do_vzquotactl operations. ++ * Protects qmblk hash. ++ */ ++struct semaphore vz_quota_sem; ++ ++/* ++ * Data access locks ++ * inode_qmblk ++ * protects qmblk pointers in all inodes and qlnk content in general ++ * (but not qmblk content); ++ * also protects related qmblk invalidation procedures; ++ * can't be per-inode because of vzquota_dtree_qmblk complications ++ * and problems with serialization with quota_on, ++ * but can be per-superblock; ++ * qmblk_data ++ * protects qmblk fields (such as current usage) ++ * quota_data ++ * protects charge/uncharge operations, thus, implies ++ * qmblk_data lock and, if CONFIG_VZ_QUOTA_UGID, inode_qmblk lock ++ * (to protect ugid pointers). 
++ * ++ * Lock order: ++ * inode_qmblk_lock -> dcache_lock ++ * inode_qmblk_lock -> qmblk_data ++ */ ++static spinlock_t vzdq_qmblk_lock = SPIN_LOCK_UNLOCKED; ++ ++inline void inode_qmblk_lock(struct super_block *sb) ++{ ++ spin_lock(&vzdq_qmblk_lock); ++} ++ ++inline void inode_qmblk_unlock(struct super_block *sb) ++{ ++ spin_unlock(&vzdq_qmblk_lock); ++} ++ ++inline void qmblk_data_read_lock(struct vz_quota_master *qmblk) ++{ ++ spin_lock(&qmblk->dq_data_lock); ++} ++ ++inline void qmblk_data_read_unlock(struct vz_quota_master *qmblk) ++{ ++ spin_unlock(&qmblk->dq_data_lock); ++} ++ ++inline void qmblk_data_write_lock(struct vz_quota_master *qmblk) ++{ ++ spin_lock(&qmblk->dq_data_lock); ++} ++ ++inline void qmblk_data_write_unlock(struct vz_quota_master *qmblk) ++{ ++ spin_unlock(&qmblk->dq_data_lock); ++} ++ ++ ++/* ---------------------------------------------------------------------- ++ * ++ * Master hash table handling. ++ * ++ * SMP not safe, serialied by vz_quota_sem within quota syscalls ++ * ++ * --------------------------------------------------------------------- */ ++ ++static kmem_cache_t *vzquota_cachep; ++ ++/* ++ * Hash function. 
++ */ ++#define QHASH_BITS 6 ++#define VZ_QUOTA_HASH_SIZE (1 << QHASH_BITS) ++#define QHASH_MASK (VZ_QUOTA_HASH_SIZE - 1) ++ ++struct list_head vzquota_hash_table[VZ_QUOTA_HASH_SIZE]; ++int vzquota_hash_size = VZ_QUOTA_HASH_SIZE; ++ ++static inline int vzquota_hash_func(unsigned int qid) ++{ ++ return (((qid >> QHASH_BITS) ^ qid) & QHASH_MASK); ++} ++ ++/** ++ * vzquota_alloc_master - alloc and instantiate master quota record ++ * ++ * Returns: ++ * pointer to newly created record if SUCCESS ++ * -ENOMEM if out of memory ++ * -EEXIST if record with given quota_id already exist ++ */ ++struct vz_quota_master *vzquota_alloc_master(unsigned int quota_id, ++ struct vz_quota_stat *qstat) ++{ ++ int err; ++ struct vz_quota_master *qmblk; ++ ++ err = -EEXIST; ++ if (vzquota_find_master(quota_id) != NULL) ++ goto out; ++ ++ err = -ENOMEM; ++ qmblk = kmem_cache_alloc(vzquota_cachep, SLAB_KERNEL); ++ if (qmblk == NULL) ++ goto out; ++#ifdef CONFIG_VZ_QUOTA_UGID ++ qmblk->dq_uid_tree = quotatree_alloc(); ++ if (!qmblk->dq_uid_tree) ++ goto out_free; ++ ++ qmblk->dq_gid_tree = quotatree_alloc(); ++ if (!qmblk->dq_gid_tree) ++ goto out_free_tree; ++#endif ++ ++ qmblk->dq_state = VZDQ_STARTING; ++ init_MUTEX(&qmblk->dq_sem); ++ spin_lock_init(&qmblk->dq_data_lock); ++ ++ qmblk->dq_id = quota_id; ++ qmblk->dq_stat = qstat->dq_stat; ++ qmblk->dq_info = qstat->dq_info; ++ qmblk->dq_root_dentry = NULL; ++ qmblk->dq_root_mnt = NULL; ++ qmblk->dq_sb = NULL; ++ qmblk->dq_ugid_count = 0; ++ qmblk->dq_ugid_max = 0; ++ qmblk->dq_flags = 0; ++ memset(qmblk->dq_ugid_info, 0, sizeof(qmblk->dq_ugid_info)); ++ INIT_LIST_HEAD(&qmblk->dq_ilink_list); ++ ++ atomic_set(&qmblk->dq_count, 1); ++ ++ /* insert in hash chain */ ++ list_add(&qmblk->dq_hash, ++ &vzquota_hash_table[vzquota_hash_func(quota_id)]); ++ ++ /* success */ ++ return qmblk; ++ ++out_free_tree: ++ quotatree_free(qmblk->dq_uid_tree, NULL); ++out_free: ++ kmem_cache_free(vzquota_cachep, qmblk); ++out: ++ return ERR_PTR(err); ++} ++ 
++static struct vz_quota_master *vzquota_alloc_fake(void) ++{ ++ struct vz_quota_master *qmblk; ++ ++ qmblk = kmem_cache_alloc(vzquota_cachep, SLAB_KERNEL); ++ if (qmblk == NULL) ++ return NULL; ++ memset(qmblk, 0, sizeof(*qmblk)); ++ qmblk->dq_state = VZDQ_STOPING; ++ qmblk->dq_flags = VZDQ_NOQUOT; ++ spin_lock_init(&qmblk->dq_data_lock); ++ INIT_LIST_HEAD(&qmblk->dq_ilink_list); ++ atomic_set(&qmblk->dq_count, 1); ++ return qmblk; ++} ++ ++/** ++ * vzquota_find_master - find master record with given id ++ * ++ * Returns qmblk without touching its refcounter. ++ * Called under vz_quota_sem. ++ */ ++struct vz_quota_master *vzquota_find_master(unsigned int quota_id) ++{ ++ int i; ++ struct vz_quota_master *qp; ++ ++ i = vzquota_hash_func(quota_id); ++ list_for_each_entry(qp, &vzquota_hash_table[i], dq_hash) { ++ if (qp->dq_id == quota_id) ++ return qp; ++ } ++ return NULL; ++} ++ ++/** ++ * vzquota_free_master - release resources taken by qmblk, freeing memory ++ * ++ * qmblk is assumed to be already taken out from the hash. ++ * Should be called outside vz_quota_sem. ++ */ ++void vzquota_free_master(struct vz_quota_master *qmblk) ++{ ++#ifdef CONFIG_VZ_QUOTA_UGID ++ vzquota_kill_ugid(qmblk); ++#endif ++ BUG_ON(!list_empty(&qmblk->dq_ilink_list)); ++ kmem_cache_free(vzquota_cachep, qmblk); ++} ++ ++ ++/* ---------------------------------------------------------------------- ++ * ++ * Passing quota information through current ++ * ++ * Used in inode -> qmblk lookup at inode creation stage (since at that ++ * time there are no links between the inode being created and its parent ++ * directory). 
++ * ++ * --------------------------------------------------------------------- */ ++ ++#define VZDQ_CUR_MAGIC 0x57d0fee2 ++ ++static inline int vzquota_cur_qmblk_check(void) ++{ ++ return current->magic == VZDQ_CUR_MAGIC; ++} ++ ++static inline struct inode *vzquota_cur_qmblk_fetch(void) ++{ ++ return current->ino; ++} ++ ++static inline void vzquota_cur_qmblk_set(struct inode *data) ++{ ++ struct task_struct *tsk; ++ ++ tsk = current; ++ tsk->magic = VZDQ_CUR_MAGIC; ++ tsk->ino = data; ++} ++ ++#if 0 ++static inline void vzquota_cur_qmblk_reset(void) ++{ ++ current->magic = 0; ++} ++#endif ++ ++ ++/* ---------------------------------------------------------------------- ++ * ++ * Superblock quota operations ++ * ++ * --------------------------------------------------------------------- */ ++ ++/* ++ * Kernel structure abuse. ++ * We use files[0] pointer as an int variable: ++ * reference counter of how many quota blocks uses this superblock. ++ * files[1] is used for generations structure which helps us to track ++ * when traversing of dentries is really required. ++ */ ++#define __VZ_QUOTA_NOQUOTA(sb) sb->s_dquot.vzdq_master ++#define __VZ_QUOTA_TSTAMP(sb) ((struct timeval *)\ ++ &sb->s_dquot.dqio_sem) ++ ++#if defined(VZ_QUOTA_UNLOAD) ++ ++#define __VZ_QUOTA_SBREF(sb) sb->s_dquot.vzdq_count ++ ++struct dquot_operations *orig_dq_op; ++struct quotactl_ops *orig_dq_cop; ++ ++/** ++ * quota_get_super - account for new a quoted tree under the superblock ++ * ++ * One superblock can have multiple directory subtrees with different VZ ++ * quotas. We keep a counter of such subtrees and set VZ quota operations or ++ * reset the default ones. ++ * ++ * Called under vz_quota_sem (from quota_on). 
++ */ ++int vzquota_get_super(struct super_block *sb) ++{ ++ if (sb->dq_op != &vz_quota_operations) { ++ down(&sb->s_dquot.dqonoff_sem); ++ if (sb->s_dquot.flags & (DQUOT_USR_ENABLED|DQUOT_GRP_ENABLED)) { ++ up(&sb->s_dquot.dqonoff_sem); ++ return -EEXIST; ++ } ++ if (orig_dq_op == NULL && sb->dq_op != NULL) ++ orig_dq_op = sb->dq_op; ++ sb->dq_op = &vz_quota_operations; ++ if (orig_dq_cop == NULL && sb->s_qcop != NULL) ++ orig_dq_cop = sb->s_qcop; ++ /* XXX this may race with sys_quotactl */ ++#ifdef CONFIG_VZ_QUOTA_UGID ++ sb->s_qcop = &vz_quotactl_operations; ++#else ++ sb->s_qcop = NULL; ++#endif ++ do_gettimeofday(__VZ_QUOTA_TSTAMP(sb)); ++ memset(&sb->s_dquot.info, 0, sizeof(sb->s_dquot.info)); ++ ++ INIT_LIST_HEAD(&sb->s_dquot.info[USRQUOTA].dqi_dirty_list); ++ INIT_LIST_HEAD(&sb->s_dquot.info[GRPQUOTA].dqi_dirty_list); ++ sb->s_dquot.info[USRQUOTA].dqi_format = &vz_quota_empty_v2_format; ++ sb->s_dquot.info[GRPQUOTA].dqi_format = &vz_quota_empty_v2_format; ++ /* ++ * To get quotaops.h call us we need to mark superblock ++ * as having quota. These flags mark the moment when ++ * our dq_op start to be called. ++ * ++ * The ordering of dq_op and s_dquot.flags assignment ++ * needs to be enforced, but other CPUs do not do rmb() ++ * between s_dquot.flags and dq_op accesses. ++ */ ++ wmb(); synchronize_sched(); ++ sb->s_dquot.flags = DQUOT_USR_ENABLED|DQUOT_GRP_ENABLED; ++ __module_get(THIS_MODULE); ++ up(&sb->s_dquot.dqonoff_sem); ++ } ++ /* protected by vz_quota_sem */ ++ __VZ_QUOTA_SBREF(sb)++; ++ return 0; ++} ++ ++/** ++ * quota_put_super - release superblock when one quota tree goes away ++ * ++ * Called under vz_quota_sem. 
++ */ ++void vzquota_put_super(struct super_block *sb) ++{ ++ int count; ++ ++ count = --__VZ_QUOTA_SBREF(sb); ++ if (count == 0) { ++ down(&sb->s_dquot.dqonoff_sem); ++ sb->s_dquot.flags = 0; ++ wmb(); synchronize_sched(); ++ sema_init(&sb->s_dquot.dqio_sem, 1); ++ sb->s_qcop = orig_dq_cop; ++ sb->dq_op = orig_dq_op; ++ inode_qmblk_lock(sb); ++ quota_gen_put(SB_QGEN(sb)); ++ SB_QGEN(sb) = NULL; ++ /* release qlnk's without qmblk */ ++ remove_inode_quota_links_list(&non_vzquota_inodes_lh, ++ sb, NULL); ++ /* ++ * Races with quota initialization: ++ * after this inode_qmblk_unlock all inode's generations are ++ * invalidated, quota_inode_qmblk checks superblock operations. ++ */ ++ inode_qmblk_unlock(sb); ++ /* ++ * Module refcounting: in theory, this is the best place ++ * to call module_put(THIS_MODULE). ++ * In reality, it can't be done because we can't be sure that ++ * other CPUs do not enter our code segment through dq_op ++ * cached long time ago. Quotaops interface isn't supposed to ++ * go into modules currently (that is, into unloadable ++ * modules). By omitting module_put, our module isn't ++ * unloadable. ++ */ ++ up(&sb->s_dquot.dqonoff_sem); ++ } ++} ++ ++#else ++ ++struct vzquota_new_sop { ++ struct super_operations new_op; ++ struct super_operations *old_op; ++}; ++ ++/** ++ * vzquota_shutdown_super - callback on umount ++ */ ++void vzquota_shutdown_super(struct super_block *sb) ++{ ++ struct vz_quota_master *qmblk; ++ struct vzquota_new_sop *sop; ++ ++ qmblk = __VZ_QUOTA_NOQUOTA(sb); ++ __VZ_QUOTA_NOQUOTA(sb) = NULL; ++ if (qmblk != NULL) ++ qmblk_put(qmblk); ++ sop = container_of(sb->s_op, struct vzquota_new_sop, new_op); ++ sb->s_op = sop->old_op; ++ kfree(sop); ++ (*sb->s_op->put_super)(sb); ++} ++ ++/** ++ * vzquota_get_super - account for new a quoted tree under the superblock ++ * ++ * One superblock can have multiple directory subtrees with different VZ ++ * quotas. ++ * ++ * Called under vz_quota_sem (from vzquota_on). 
++ */ ++int vzquota_get_super(struct super_block *sb) ++{ ++ struct vz_quota_master *qnew; ++ struct vzquota_new_sop *sop; ++ int err; ++ ++ down(&sb->s_dquot.dqonoff_sem); ++ err = -EEXIST; ++ if ((sb->s_dquot.flags & (DQUOT_USR_ENABLED|DQUOT_GRP_ENABLED)) && ++ sb->dq_op != &vz_quota_operations) ++ goto out_up; ++ ++ /* ++ * This allocation code should be under sb->dq_op check below, but ++ * it doesn't really matter... ++ */ ++ if (__VZ_QUOTA_NOQUOTA(sb) == NULL) { ++ qnew = vzquota_alloc_fake(); ++ if (qnew == NULL) ++ goto out_up; ++ __VZ_QUOTA_NOQUOTA(sb) = qnew; ++ } ++ ++ if (sb->dq_op != &vz_quota_operations) { ++ sop = kmalloc(sizeof(*sop), GFP_KERNEL); ++ if (sop == NULL) { ++ vzquota_free_master(__VZ_QUOTA_NOQUOTA(sb)); ++ __VZ_QUOTA_NOQUOTA(sb) = NULL; ++ goto out_up; ++ } ++ memcpy(&sop->new_op, sb->s_op, sizeof(sop->new_op)); ++ sop->new_op.put_super = &vzquota_shutdown_super; ++ sop->old_op = sb->s_op; ++ sb->s_op = &sop->new_op; ++ ++ sb->dq_op = &vz_quota_operations; ++#ifdef CONFIG_VZ_QUOTA_UGID ++ sb->s_qcop = &vz_quotactl_operations; ++#else ++ sb->s_qcop = NULL; ++#endif ++ do_gettimeofday(__VZ_QUOTA_TSTAMP(sb)); ++ ++ memset(&sb->s_dquot.info, 0, sizeof(sb->s_dquot.info)); ++ /* these 2 list heads are checked in sync_dquots() */ ++ INIT_LIST_HEAD(&sb->s_dquot.info[USRQUOTA].dqi_dirty_list); ++ INIT_LIST_HEAD(&sb->s_dquot.info[GRPQUOTA].dqi_dirty_list); ++ sb->s_dquot.info[USRQUOTA].dqi_format = ++ &vz_quota_empty_v2_format; ++ sb->s_dquot.info[GRPQUOTA].dqi_format = ++ &vz_quota_empty_v2_format; ++ ++ /* ++ * To get quotaops.h to call us we need to mark superblock ++ * as having quota. These flags mark the moment when ++ * our dq_op start to be called. ++ * ++ * The ordering of dq_op and s_dquot.flags assignment ++ * needs to be enforced, but other CPUs do not do rmb() ++ * between s_dquot.flags and dq_op accesses. 
++ */ ++ wmb(); synchronize_sched(); ++ sb->s_dquot.flags = DQUOT_USR_ENABLED|DQUOT_GRP_ENABLED; ++ } ++ err = 0; ++ ++out_up: ++ up(&sb->s_dquot.dqonoff_sem); ++ return err; ++} ++ ++/** ++ * vzquota_put_super - one quota tree less on this superblock ++ * ++ * Called under vz_quota_sem. ++ */ ++void vzquota_put_super(struct super_block *sb) ++{ ++ /* ++ * Even if this put is the last one, ++ * sb->s_dquot.flags can't be cleared, because otherwise vzquota_drop ++ * won't be called and the remaining qmblk references won't be put. ++ */ ++} ++ ++#endif ++ ++ ++/* ---------------------------------------------------------------------- ++ * ++ * Helpers for inode -> qmblk link maintenance ++ * ++ * --------------------------------------------------------------------- */ ++ ++#define __VZ_QUOTA_EMPTY ((void *)0xbdbdbdbd) ++#define VZ_QUOTA_IS_NOQUOTA(qm, sb) ((qm)->dq_flags & VZDQ_NOQUOT) ++#define VZ_QUOTA_EMPTY_IOPS (&vfs_empty_iops) ++extern struct inode_operations vfs_empty_iops; ++ ++static int VZ_QUOTA_IS_ACTUAL(struct inode *inode) ++{ ++ struct vz_quota_master *qmblk; ++ ++ qmblk = INODE_QLNK(inode)->qmblk; ++ if (qmblk == VZ_QUOTA_BAD) ++ return 1; ++ if (qmblk == __VZ_QUOTA_EMPTY) ++ return 0; ++ if (qmblk->dq_flags & VZDQ_NOACT) ++ /* not actual (invalidated) qmblk */ ++ return 0; ++ return 1; ++} ++ ++static inline int vzquota_qlnk_is_empty(struct vz_quota_ilink *qlnk) ++{ ++ return qlnk->qmblk == __VZ_QUOTA_EMPTY; ++} ++ ++static inline void vzquota_qlnk_set_empty(struct vz_quota_ilink *qlnk) ++{ ++ qlnk->qmblk = __VZ_QUOTA_EMPTY; ++ qlnk->origin = VZ_QUOTAO_SETE; ++} ++ ++void vzquota_qlnk_init(struct vz_quota_ilink *qlnk) ++{ ++ memset(qlnk, 0, sizeof(*qlnk)); ++ INIT_LIST_HEAD(&qlnk->list); ++ vzquota_qlnk_set_empty(qlnk); ++ qlnk->origin = VZ_QUOTAO_INIT; ++} ++ ++void vzquota_qlnk_destroy(struct vz_quota_ilink *qlnk) ++{ ++ might_sleep(); ++ if (vzquota_qlnk_is_empty(qlnk)) ++ return; ++#if defined(CONFIG_VZ_QUOTA_UGID) ++ if (qlnk->qmblk != NULL && 
qlnk->qmblk != VZ_QUOTA_BAD) { ++ struct vz_quota_master *qmblk; ++ struct vz_quota_ugid *quid, *qgid; ++ qmblk = qlnk->qmblk; ++ quid = qlnk->qugid[USRQUOTA]; ++ qgid = qlnk->qugid[GRPQUOTA]; ++ if (quid != NULL || qgid != NULL) { ++ down(&qmblk->dq_sem); ++ if (qgid != NULL) ++ vzquota_put_ugid(qmblk, qgid); ++ if (quid != NULL) ++ vzquota_put_ugid(qmblk, quid); ++ up(&qmblk->dq_sem); ++ } ++ } ++#endif ++ if (qlnk->qmblk != NULL && qlnk->qmblk != VZ_QUOTA_BAD) ++ qmblk_put(qlnk->qmblk); ++ qlnk->origin = VZ_QUOTAO_DESTR; ++} ++ ++/** ++ * vzquota_qlnk_swap - swap inode's and temporary vz_quota_ilink contents ++ * @qlt: temporary ++ * @qli: inode's ++ * ++ * Locking is provided by the caller (depending on the context). ++ * After swap, @qli is inserted into the corresponding dq_ilink_list, ++ * @qlt list is reinitialized. ++ */ ++static void vzquota_qlnk_swap(struct vz_quota_ilink *qlt, ++ struct vz_quota_ilink *qli) ++{ ++ struct vz_quota_master *qb; ++ struct vz_quota_ugid *qu; ++ int i; ++ ++ qb = qlt->qmblk; ++ qlt->qmblk = qli->qmblk; ++ qli->qmblk = qb; ++ list_del_init(&qli->list); ++ if (qb != __VZ_QUOTA_EMPTY && qb != VZ_QUOTA_BAD) ++ list_add(&qli->list, &qb->dq_ilink_list); ++ INIT_LIST_HEAD(&qlt->list); ++ qli->origin = VZ_QUOTAO_SWAP; ++ ++ for (i = 0; i < MAXQUOTAS; i++) { ++ qu = qlt->qugid[i]; ++ qlt->qugid[i] = qli->qugid[i]; ++ qli->qugid[i] = qu; ++ } ++} ++ ++/** ++ * vzquota_qlnk_reinit_locked - destroy qlnk content, called under locks ++ * ++ * Called under dcache_lock and inode_qmblk locks. ++ * Returns 1 if locks were dropped inside, 0 if atomic. 
++ */ ++static int vzquota_qlnk_reinit_locked(struct vz_quota_ilink *qlnk, ++ struct inode *inode) ++{ ++ if (vzquota_qlnk_is_empty(qlnk)) ++ return 0; ++ if (qlnk->qmblk == VZ_QUOTA_BAD) { ++ vzquota_qlnk_set_empty(qlnk); ++ return 0; ++ } ++ spin_unlock(&dcache_lock); ++ inode_qmblk_unlock(inode->i_sb); ++ vzquota_qlnk_destroy(qlnk); ++ vzquota_qlnk_init(qlnk); ++ inode_qmblk_lock(inode->i_sb); ++ spin_lock(&dcache_lock); ++ return 1; ++} ++ ++#if defined(CONFIG_VZ_QUOTA_UGID) ++/** ++ * vzquota_qlnk_reinit_attr - destroy and reinit qlnk content ++ * ++ * Similar to vzquota_qlnk_reinit_locked, called under different locks. ++ */ ++static int vzquota_qlnk_reinit_attr(struct vz_quota_ilink *qlnk, ++ struct inode *inode, ++ struct vz_quota_master *qmblk) ++{ ++ if (vzquota_qlnk_is_empty(qlnk)) ++ return 0; ++ /* may be optimized if qlnk->qugid all NULLs */ ++ qmblk_data_write_unlock(qmblk); ++ inode_qmblk_unlock(inode->i_sb); ++ vzquota_qlnk_destroy(qlnk); ++ vzquota_qlnk_init(qlnk); ++ inode_qmblk_lock(inode->i_sb); ++ qmblk_data_write_lock(qmblk); ++ return 1; ++} ++#endif ++ ++/** ++ * vzquota_qlnk_fill - fill vz_quota_ilink content ++ * @qlnk: vz_quota_ilink to fill ++ * @inode: inode for which @qlnk is filled (i_sb, i_uid, i_gid) ++ * @qmblk: qmblk to which this @qlnk will belong ++ * ++ * Called under dcache_lock and inode_qmblk locks. ++ * Returns 1 if locks were dropped inside, 0 if atomic. ++ * @qlnk is expected to be empty. 
++ */ ++static int vzquota_qlnk_fill(struct vz_quota_ilink *qlnk, ++ struct inode *inode, ++ struct vz_quota_master *qmblk) ++{ ++ if (qmblk != VZ_QUOTA_BAD) ++ qmblk_get(qmblk); ++ qlnk->qmblk = qmblk; ++ ++#if defined(CONFIG_VZ_QUOTA_UGID) ++ if (qmblk != VZ_QUOTA_BAD && ++ !VZ_QUOTA_IS_NOQUOTA(qmblk, inode->i_sb) && ++ (qmblk->dq_flags & VZDQUG_ON)) { ++ struct vz_quota_ugid *quid, *qgid; ++ ++ spin_unlock(&dcache_lock); ++ inode_qmblk_unlock(inode->i_sb); ++ ++ down(&qmblk->dq_sem); ++ quid = __vzquota_find_ugid(qmblk, inode->i_uid, USRQUOTA, 0); ++ qgid = __vzquota_find_ugid(qmblk, inode->i_gid, GRPQUOTA, 0); ++ up(&qmblk->dq_sem); ++ ++ inode_qmblk_lock(inode->i_sb); ++ spin_lock(&dcache_lock); ++ qlnk->qugid[USRQUOTA] = quid; ++ qlnk->qugid[GRPQUOTA] = qgid; ++ return 1; ++ } ++#endif ++ ++ return 0; ++} ++ ++#if defined(CONFIG_VZ_QUOTA_UGID) ++/** ++ * vzquota_qlnk_fill_attr - fill vz_quota_ilink content for uid, gid ++ * ++ * This function is a helper for vzquota_transfer, and differs from ++ * vzquota_qlnk_fill only by locking. 
++ */ ++static int vzquota_qlnk_fill_attr(struct vz_quota_ilink *qlnk, ++ struct inode *inode, ++ struct iattr *iattr, ++ int mask, ++ struct vz_quota_master *qmblk) ++{ ++ qmblk_get(qmblk); ++ qlnk->qmblk = qmblk; ++ ++ if (mask) { ++ struct vz_quota_ugid *quid, *qgid; ++ ++ quid = qgid = NULL; /* to make gcc happy */ ++ if (!(mask & (1 << USRQUOTA))) ++ quid = vzquota_get_ugid(INODE_QLNK(inode)-> ++ qugid[USRQUOTA]); ++ if (!(mask & (1 << GRPQUOTA))) ++ qgid = vzquota_get_ugid(INODE_QLNK(inode)-> ++ qugid[GRPQUOTA]); ++ ++ qmblk_data_write_unlock(qmblk); ++ inode_qmblk_unlock(inode->i_sb); ++ ++ down(&qmblk->dq_sem); ++ if (mask & (1 << USRQUOTA)) ++ quid = __vzquota_find_ugid(qmblk, iattr->ia_uid, ++ USRQUOTA, 0); ++ if (mask & (1 << GRPQUOTA)) ++ qgid = __vzquota_find_ugid(qmblk, iattr->ia_gid, ++ GRPQUOTA, 0); ++ up(&qmblk->dq_sem); ++ ++ inode_qmblk_lock(inode->i_sb); ++ qmblk_data_write_lock(qmblk); ++ qlnk->qugid[USRQUOTA] = quid; ++ qlnk->qugid[GRPQUOTA] = qgid; ++ return 1; ++ } ++ ++ return 0; ++} ++#endif ++ ++/** ++ * __vzquota_inode_init - make sure inode's qlnk is initialized ++ * ++ * May be called if qlnk is already initialized, detects this situation itself. ++ * Called under inode_qmblk_lock. ++ */ ++static void __vzquota_inode_init(struct inode *inode, unsigned char origin) ++{ ++ if (inode->i_dquot[USRQUOTA] == NODQUOT) { ++ vzquota_qlnk_init(INODE_QLNK(inode)); ++ inode->i_dquot[USRQUOTA] = (void *)~(unsigned long)NODQUOT; ++ } ++ INODE_QLNK(inode)->origin = origin; ++} ++ ++/** ++ * vzquota_inode_drop - destroy VZ quota information in the inode ++ * ++ * Inode must not be externally accessible or dirty. 
++ */ ++static void vzquota_inode_drop(struct inode *inode) ++{ ++ struct vz_quota_ilink qlnk; ++ ++ vzquota_qlnk_init(&qlnk); ++ inode_qmblk_lock(inode->i_sb); ++ vzquota_qlnk_swap(&qlnk, INODE_QLNK(inode)); ++ INODE_QLNK(inode)->origin = VZ_QUOTAO_DRCAL; ++ inode->i_dquot[USRQUOTA] = NODQUOT; ++ inode_qmblk_unlock(inode->i_sb); ++ vzquota_qlnk_destroy(&qlnk); ++} ++ ++/** ++ * vzquota_inode_qmblk_set - initialize inode's qlnk ++ * @inode: inode to be initialized ++ * @qmblk: quota master block to which this inode should belong (may be BAD) ++ * @qlnk: placeholder to store data to resolve locking issues ++ * ++ * Returns 1 if locks were dropped and rechecks possibly needed, 0 otherwise. ++ * Called under dcache_lock and inode_qmblk locks. ++ * @qlnk will be destroyed in the caller chain. ++ * ++ * It is not mandatory to restart parent checks since quota on/off currently ++ * shrinks dentry tree and checks that there are not outside references. ++ * But if at some time that shink is removed, restarts will be required. ++ * Additionally, the restarts prevent inconsistencies if the dentry tree ++ * changes (inode is moved). This is not a big deal, but anyway... 
++ */ ++static int vzquota_inode_qmblk_set(struct inode *inode, ++ struct vz_quota_master *qmblk, ++ struct vz_quota_ilink *qlnk) ++{ ++ if (qmblk == NULL) { ++ printk(KERN_ERR "VZDQ: NULL in set, " ++ "orig %u, dev %s, inode %lu, fs %s\n", ++ INODE_QLNK(inode)->origin, ++ inode->i_sb->s_id, inode->i_ino, ++ inode->i_sb->s_type->name); ++ printk(KERN_ERR "current %d (%s), VE %d\n", ++ current->pid, current->comm, ++ VEID(get_exec_env())); ++ dump_stack(); ++ qmblk = VZ_QUOTA_BAD; ++ } ++ while (1) { ++ if (vzquota_qlnk_is_empty(qlnk) && ++ vzquota_qlnk_fill(qlnk, inode, qmblk)) ++ return 1; ++ if (qlnk->qmblk == qmblk) ++ break; ++ if (vzquota_qlnk_reinit_locked(qlnk, inode)) ++ return 1; ++ } ++ vzquota_qlnk_swap(qlnk, INODE_QLNK(inode)); ++ INODE_QLNK(inode)->origin = VZ_QUOTAO_QSET; ++ return 0; ++} ++ ++ ++/* ---------------------------------------------------------------------- ++ * ++ * vzquota_inode_qmblk (inode -> qmblk lookup) parts ++ * ++ * --------------------------------------------------------------------- */ ++ ++static int vzquota_dparents_check_attach(struct inode *inode) ++{ ++ if (!list_empty(&inode->i_dentry)) ++ return 0; ++ printk(KERN_ERR "VZDQ: no parent for " ++ "dev %s, inode %lu, fs %s\n", ++ inode->i_sb->s_id, ++ inode->i_ino, ++ inode->i_sb->s_type->name); ++ return -1; ++} ++ ++static struct inode *vzquota_dparents_check_actual(struct inode *inode) ++{ ++ struct dentry *de; ++ ++ list_for_each_entry(de, &inode->i_dentry, d_alias) { ++ if (de->d_parent == de) /* detached dentry, perhaps */ ++ continue; ++ /* first access to parent, make sure its qlnk initialized */ ++ __vzquota_inode_init(de->d_parent->d_inode, VZ_QUOTAO_ACT); ++ if (!VZ_QUOTA_IS_ACTUAL(de->d_parent->d_inode)) ++ return de->d_parent->d_inode; ++ } ++ return NULL; ++} ++ ++static struct vz_quota_master *vzquota_dparents_check_same(struct inode *inode) ++{ ++ struct dentry *de; ++ struct vz_quota_master *qmblk; ++ ++ qmblk = NULL; ++ list_for_each_entry(de, 
&inode->i_dentry, d_alias) { ++ if (de->d_parent == de) /* detached dentry, perhaps */ ++ continue; ++ if (qmblk == NULL) { ++ qmblk = INODE_QLNK(de->d_parent->d_inode)->qmblk; ++ continue; ++ } ++ if (INODE_QLNK(de->d_parent->d_inode)->qmblk != qmblk) { ++ printk(KERN_WARNING "VZDQ: multiple quotas for " ++ "dev %s, inode %lu, fs %s\n", ++ inode->i_sb->s_id, ++ inode->i_ino, ++ inode->i_sb->s_type->name); ++ qmblk = VZ_QUOTA_BAD; ++ break; ++ } ++ } ++ if (qmblk == NULL) { ++ printk(KERN_WARNING "VZDQ: not attached to tree, " ++ "dev %s, inode %lu, fs %s\n", ++ inode->i_sb->s_id, ++ inode->i_ino, ++ inode->i_sb->s_type->name); ++ qmblk = VZ_QUOTA_BAD; ++ } ++ return qmblk; ++} ++ ++static void vzquota_dbranch_actualize(struct inode *inode, ++ struct inode *refinode) ++{ ++ struct inode *pinode; ++ struct vz_quota_master *qmblk; ++ struct vz_quota_ilink qlnk; ++ ++ vzquota_qlnk_init(&qlnk); ++ ++start: ++ if (inode == inode->i_sb->s_root->d_inode) { ++ /* filesystem root */ ++ atomic_inc(&inode->i_count); ++ do { ++ qmblk = __VZ_QUOTA_NOQUOTA(inode->i_sb); ++ } while (vzquota_inode_qmblk_set(inode, qmblk, &qlnk)); ++ goto out; ++ } ++ ++ if (!vzquota_dparents_check_attach(inode)) { ++ pinode = vzquota_dparents_check_actual(inode); ++ if (pinode != NULL) { ++ inode = pinode; ++ goto start; ++ } ++ } ++ ++ atomic_inc(&inode->i_count); ++ while (1) { ++ if (VZ_QUOTA_IS_ACTUAL(inode)) /* actualized without us */ ++ break; ++ /* ++ * Need to check parents again if we have slept inside ++ * vzquota_inode_qmblk_set() in the loop. ++ * If the state of parents is different, just return and repeat ++ * the actualizing process again from the inode passed to ++ * vzquota_inode_qmblk_recalc(). 
++ */ ++ if (!vzquota_dparents_check_attach(inode)) { ++ if (vzquota_dparents_check_actual(inode) != NULL) ++ break; ++ qmblk = vzquota_dparents_check_same(inode); ++ } else ++ qmblk = VZ_QUOTA_BAD; ++ if (!vzquota_inode_qmblk_set(inode, qmblk, &qlnk)){/* success */ ++ INODE_QLNK(inode)->origin = VZ_QUOTAO_ACT; ++ break; ++ } ++ } ++ ++out: ++ spin_unlock(&dcache_lock); ++ inode_qmblk_unlock(refinode->i_sb); ++ vzquota_qlnk_destroy(&qlnk); ++ iput(inode); ++ inode_qmblk_lock(refinode->i_sb); ++ spin_lock(&dcache_lock); ++} ++ ++static void vzquota_dtree_qmblk_recalc(struct inode *inode, ++ struct vz_quota_ilink *qlnk) ++{ ++ struct inode *pinode; ++ struct vz_quota_master *qmblk; ++ ++ if (inode == inode->i_sb->s_root->d_inode) { ++ /* filesystem root */ ++ do { ++ qmblk = __VZ_QUOTA_NOQUOTA(inode->i_sb); ++ } while (vzquota_inode_qmblk_set(inode, qmblk, qlnk)); ++ return; ++ } ++ ++start: ++ if (VZ_QUOTA_IS_ACTUAL(inode)) ++ return; ++ /* ++ * Here qmblk is (re-)initialized for all ancestors. ++ * This is not a very efficient procedure, but it guarantees that ++ * the quota tree is consistent (that is, the inode doesn't have two ++ * ancestors with different qmblk). ++ */ ++ if (!vzquota_dparents_check_attach(inode)) { ++ pinode = vzquota_dparents_check_actual(inode); ++ if (pinode != NULL) { ++ vzquota_dbranch_actualize(pinode, inode); ++ goto start; ++ } ++ qmblk = vzquota_dparents_check_same(inode); ++ } else ++ qmblk = VZ_QUOTA_BAD; ++ ++ if (vzquota_inode_qmblk_set(inode, qmblk, qlnk)) ++ goto start; ++ INODE_QLNK(inode)->origin = VZ_QUOTAO_DTREE; ++} ++ ++static void vzquota_det_qmblk_recalc(struct inode *inode, ++ struct vz_quota_ilink *qlnk) ++{ ++ struct inode *parent; ++ struct vz_quota_master *qmblk; ++ char *msg; ++ int cnt; ++ time_t timeout; ++ ++ cnt = 0; ++ parent = NULL; ++start: ++ /* ++ * qmblk of detached inodes shouldn't be considered as not actual. ++ * They are not in any dentry tree, so quota on/off shouldn't affect ++ * them. 
++ */ ++ if (!vzquota_qlnk_is_empty(INODE_QLNK(inode))) ++ return; ++ ++ timeout = 3; ++ qmblk = __VZ_QUOTA_NOQUOTA(inode->i_sb); ++ msg = "detached inode not in creation"; ++ if (inode->i_op != VZ_QUOTA_EMPTY_IOPS) ++ goto fail; ++ qmblk = VZ_QUOTA_BAD; ++ msg = "unexpected creation context"; ++ if (!vzquota_cur_qmblk_check()) ++ goto fail; ++ timeout = 0; ++ parent = vzquota_cur_qmblk_fetch(); ++ msg = "uninitialized parent"; ++ if (vzquota_qlnk_is_empty(INODE_QLNK(parent))) ++ goto fail; ++ msg = "parent not in tree"; ++ if (list_empty(&parent->i_dentry)) ++ goto fail; ++ msg = "parent has 0 refcount"; ++ if (!atomic_read(&parent->i_count)) ++ goto fail; ++ msg = "parent has different sb"; ++ if (parent->i_sb != inode->i_sb) ++ goto fail; ++ if (!VZ_QUOTA_IS_ACTUAL(parent)) { ++ vzquota_dbranch_actualize(parent, inode); ++ goto start; ++ } ++ ++ qmblk = INODE_QLNK(parent)->qmblk; ++set: ++ if (vzquota_inode_qmblk_set(inode, qmblk, qlnk)) ++ goto start; ++ INODE_QLNK(inode)->origin = VZ_QUOTAO_DET; ++ return; ++ ++fail: ++ { ++ struct timeval tv, tvo; ++ do_gettimeofday(&tv); ++ memcpy(&tvo, __VZ_QUOTA_TSTAMP(inode->i_sb), sizeof(tvo)); ++ tv.tv_sec -= tvo.tv_sec; ++ if (tv.tv_usec < tvo.tv_usec) { ++ tv.tv_sec--; ++ tv.tv_usec += USEC_PER_SEC - tvo.tv_usec; ++ } else ++ tv.tv_usec -= tvo.tv_usec; ++ if (tv.tv_sec < timeout) ++ goto set; ++ printk(KERN_ERR "VZDQ: %s, orig %u," ++ " dev %s, inode %lu, fs %s\n", ++ msg, INODE_QLNK(inode)->origin, ++ inode->i_sb->s_id, inode->i_ino, ++ inode->i_sb->s_type->name); ++ if (!cnt++) { ++ printk(KERN_ERR "current %d (%s), VE %d," ++ " time %ld.%06ld\n", ++ current->pid, current->comm, ++ VEID(get_exec_env()), ++ tv.tv_sec, tv.tv_usec); ++ dump_stack(); ++ } ++ if (parent != NULL) ++ printk(KERN_ERR "VZDQ: parent of %lu is %lu\n", ++ inode->i_ino, parent->i_ino); ++ } ++ goto set; ++} ++ ++static void vzquota_inode_qmblk_recalc(struct inode *inode, ++ struct vz_quota_ilink *qlnk) ++{ ++ spin_lock(&dcache_lock); ++ if 
(!list_empty(&inode->i_dentry)) ++ vzquota_dtree_qmblk_recalc(inode, qlnk); ++ else ++ vzquota_det_qmblk_recalc(inode, qlnk); ++ spin_unlock(&dcache_lock); ++} ++ ++/** ++ * vzquota_inode_qmblk - obtain inode's qmblk ++ * ++ * Returns qmblk with refcounter taken, %NULL if not under ++ * VZ quota or %VZ_QUOTA_BAD. ++ * ++ * FIXME: This function should be removed when vzquota_find_qmblk / ++ * get_quota_root / vzquota_dstat code is cleaned up. ++ */ ++struct vz_quota_master *vzquota_inode_qmblk(struct inode *inode) ++{ ++ struct vz_quota_master *qmblk; ++ struct vz_quota_ilink qlnk; ++ ++ might_sleep(); ++ ++ if (inode->i_sb->dq_op != &vz_quota_operations) ++ return NULL; ++#if defined(VZ_QUOTA_UNLOAD) ++#error Make sure qmblk does not disappear ++#endif ++ ++ vzquota_qlnk_init(&qlnk); ++ inode_qmblk_lock(inode->i_sb); ++ __vzquota_inode_init(inode, VZ_QUOTAO_INICAL); ++ ++ if (vzquota_qlnk_is_empty(INODE_QLNK(inode)) || ++ !VZ_QUOTA_IS_ACTUAL(inode)) ++ vzquota_inode_qmblk_recalc(inode, &qlnk); ++ ++ qmblk = INODE_QLNK(inode)->qmblk; ++ if (qmblk != VZ_QUOTA_BAD) { ++ if (!VZ_QUOTA_IS_NOQUOTA(qmblk, inode->i_sb)) ++ qmblk_get(qmblk); ++ else ++ qmblk = NULL; ++ } ++ ++ inode_qmblk_unlock(inode->i_sb); ++ vzquota_qlnk_destroy(&qlnk); ++ return qmblk; ++} ++ ++ ++/* ---------------------------------------------------------------------- ++ * ++ * Calls from quota operations ++ * ++ * --------------------------------------------------------------------- */ ++ ++/** ++ * vzquota_inode_init_call - call from DQUOT_INIT ++ */ ++void vzquota_inode_init_call(struct inode *inode) ++{ ++ struct vz_quota_master *qmblk; ++ struct vz_quota_datast data; ++ ++ /* initializes inode's quota inside */ ++ qmblk = vzquota_inode_data(inode, &data); ++ if (qmblk != NULL && qmblk != VZ_QUOTA_BAD) ++ vzquota_data_unlock(inode, &data); ++ ++ /* ++ * The check is needed for repeated new_inode() calls from a single ++ * ext3 call like create or mkdir in case of -ENOSPC. 
++ */ ++ spin_lock(&dcache_lock); ++ if (!list_empty(&inode->i_dentry)) ++ vzquota_cur_qmblk_set(inode); ++ spin_unlock(&dcache_lock); ++} ++ ++/** ++ * vzquota_inode_drop_call - call from DQUOT_DROP ++ */ ++void vzquota_inode_drop_call(struct inode *inode) ++{ ++ vzquota_inode_drop(inode); ++} ++ ++/** ++ * vzquota_inode_data - initialize (if nec.) and lock inode quota ptrs ++ * @inode: the inode ++ * @data: storage space ++ * ++ * Returns: qmblk is NULL or VZ_QUOTA_BAD or actualized qmblk. ++ * On return if qmblk is neither NULL nor VZ_QUOTA_BAD: ++ * qmblk in inode's qlnk is the same as returned, ++ * ugid pointers inside inode's qlnk are valid, ++ * some locks are taken (and should be released by vzquota_data_unlock). ++ * If qmblk is NULL or VZ_QUOTA_BAD, locks are NOT taken. ++ */ ++struct vz_quota_master *vzquota_inode_data(struct inode *inode, ++ struct vz_quota_datast *data) ++{ ++ struct vz_quota_master *qmblk; ++ ++ might_sleep(); ++ ++ vzquota_qlnk_init(&data->qlnk); ++ inode_qmblk_lock(inode->i_sb); ++ __vzquota_inode_init(inode, VZ_QUOTAO_INICAL); ++ ++ if (vzquota_qlnk_is_empty(INODE_QLNK(inode)) || ++ !VZ_QUOTA_IS_ACTUAL(inode)) ++ vzquota_inode_qmblk_recalc(inode, &data->qlnk); ++ ++ qmblk = INODE_QLNK(inode)->qmblk; ++ if (qmblk != VZ_QUOTA_BAD) { ++ if (!VZ_QUOTA_IS_NOQUOTA(qmblk, inode->i_sb)) { ++ /* ++ * Note that in the current implementation, ++ * inode_qmblk_lock can theoretically be dropped here. ++ * This place is serialized with quota_off because ++ * quota_off fails when there are extra dentry ++ * references and syncs inodes before removing quota ++ * information from them. ++ * However, quota usage information should stop being ++ * updated immediately after vzquota_off. 
++ */ ++ qmblk_data_write_lock(qmblk); ++ } else { ++ inode_qmblk_unlock(inode->i_sb); ++ qmblk = NULL; ++ } ++ } else { ++ inode_qmblk_unlock(inode->i_sb); ++ } ++ return qmblk; ++} ++ ++void vzquota_data_unlock(struct inode *inode, ++ struct vz_quota_datast *data) ++{ ++ qmblk_data_write_unlock(INODE_QLNK(inode)->qmblk); ++ inode_qmblk_unlock(inode->i_sb); ++ vzquota_qlnk_destroy(&data->qlnk); ++} ++ ++#if defined(CONFIG_VZ_QUOTA_UGID) ++/** ++ * vzquota_inode_transfer_call - call from vzquota_transfer ++ */ ++int vzquota_inode_transfer_call(struct inode *inode, struct iattr *iattr) ++{ ++ struct vz_quota_master *qmblk; ++ struct vz_quota_datast data; ++ struct vz_quota_ilink qlnew; ++ int mask; ++ int ret; ++ ++ might_sleep(); ++ vzquota_qlnk_init(&qlnew); ++start: ++ qmblk = vzquota_inode_data(inode, &data); ++ ret = NO_QUOTA; ++ if (qmblk == VZ_QUOTA_BAD) ++ goto out_destr; ++ ret = QUOTA_OK; ++ if (qmblk == NULL) ++ goto out_destr; ++ qmblk_get(qmblk); ++ ++ ret = QUOTA_OK; ++ if (!(qmblk->dq_flags & VZDQUG_ON)) ++ /* no ugid quotas */ ++ goto out_unlock; ++ ++ mask = 0; ++ if ((iattr->ia_valid & ATTR_UID) && iattr->ia_uid != inode->i_uid) ++ mask |= 1 << USRQUOTA; ++ if ((iattr->ia_valid & ATTR_GID) && iattr->ia_gid != inode->i_gid) ++ mask |= 1 << GRPQUOTA; ++ while (1) { ++ if (vzquota_qlnk_is_empty(&qlnew) && ++ vzquota_qlnk_fill_attr(&qlnew, inode, iattr, mask, qmblk)) ++ break; ++ if (qlnew.qmblk == INODE_QLNK(inode)->qmblk && ++ qlnew.qmblk == qmblk) ++ goto finish; ++ if (vzquota_qlnk_reinit_attr(&qlnew, inode, qmblk)) ++ break; ++ } ++ ++ /* prepare for restart */ ++ vzquota_data_unlock(inode, &data); ++ qmblk_put(qmblk); ++ goto start; ++ ++finish: ++ /* all references obtained successfully */ ++ ret = vzquota_transfer_usage(inode, mask, &qlnew); ++ if (!ret) { ++ vzquota_qlnk_swap(&qlnew, INODE_QLNK(inode)); ++ INODE_QLNK(inode)->origin = VZ_QUOTAO_TRANS; ++ } ++out_unlock: ++ vzquota_data_unlock(inode, &data); ++ qmblk_put(qmblk); ++out_destr: ++ 
vzquota_qlnk_destroy(&qlnew); ++ return ret; ++} ++#endif ++ ++int vzquota_rename_check(struct inode *inode, ++ struct inode *old_dir, struct inode *new_dir) ++{ ++ struct vz_quota_master *qmblk; ++ struct vz_quota_ilink qlnk1, qlnk2; ++ int c, ret; ++ ++ if (inode->i_sb != old_dir->i_sb || inode->i_sb != new_dir->i_sb) ++ return -1; ++ ++ might_sleep(); ++ ++ vzquota_qlnk_init(&qlnk1); ++ vzquota_qlnk_init(&qlnk2); ++ inode_qmblk_lock(inode->i_sb); ++ __vzquota_inode_init(inode, VZ_QUOTAO_INICAL); ++ __vzquota_inode_init(old_dir, VZ_QUOTAO_INICAL); ++ __vzquota_inode_init(new_dir, VZ_QUOTAO_INICAL); ++ ++ do { ++ c = 0; ++ if (vzquota_qlnk_is_empty(INODE_QLNK(inode)) || ++ !VZ_QUOTA_IS_ACTUAL(inode)) { ++ vzquota_inode_qmblk_recalc(inode, &qlnk1); ++ c++; ++ } ++ if (vzquota_qlnk_is_empty(INODE_QLNK(new_dir)) || ++ !VZ_QUOTA_IS_ACTUAL(new_dir)) { ++ vzquota_inode_qmblk_recalc(new_dir, &qlnk2); ++ c++; ++ } ++ } while (c); ++ ++ ret = 0; ++ qmblk = INODE_QLNK(inode)->qmblk; ++ if (qmblk != INODE_QLNK(new_dir)->qmblk) { ++ ret = -1; ++ if (qmblk != VZ_QUOTA_BAD && ++ !VZ_QUOTA_IS_NOQUOTA(qmblk, inode->i_sb) && ++ qmblk->dq_root_dentry->d_inode == inode && ++ VZ_QUOTA_IS_NOQUOTA(INODE_QLNK(new_dir)->qmblk, ++ inode->i_sb) && ++ VZ_QUOTA_IS_NOQUOTA(INODE_QLNK(old_dir)->qmblk, ++ inode->i_sb)) ++ /* quota root rename is allowed */ ++ ret = 0; ++ } ++ ++ inode_qmblk_unlock(inode->i_sb); ++ vzquota_qlnk_destroy(&qlnk2); ++ vzquota_qlnk_destroy(&qlnk1); ++ return ret; ++} ++ ++ ++/* ---------------------------------------------------------------------- ++ * ++ * qmblk-related parts of on/off operations ++ * ++ * --------------------------------------------------------------------- */ ++ ++/** ++ * vzquota_check_dtree - check dentry tree if quota on/off is allowed ++ * ++ * This function doesn't allow quota to be turned on/off if some dentries in ++ * the tree have external references. 
++ * In addition to technical reasons, it enforces user-space correctness: ++ * current usage (taken from or reported to the user space) can be meaningful ++ * and accurate only if the tree is not being modified. ++ * Side effect: additional vfsmount structures referencing the tree (bind ++ * mounts of tree nodes to some other places) are not allowed at on/off time. ++ */ ++int vzquota_check_dtree(struct vz_quota_master *qmblk, int off) ++{ ++ struct dentry *dentry; ++ int err, count; ++ ++ err = -EBUSY; ++ dentry = qmblk->dq_root_dentry; ++ ++ if (d_unhashed(dentry) && dentry != dentry->d_sb->s_root) ++ goto unhashed; ++ ++ /* attempt to shrink */ ++ if (!list_empty(&dentry->d_subdirs)) { ++ spin_unlock(&dcache_lock); ++ inode_qmblk_unlock(dentry->d_sb); ++ shrink_dcache_parent(dentry); ++ inode_qmblk_lock(dentry->d_sb); ++ spin_lock(&dcache_lock); ++ if (!list_empty(&dentry->d_subdirs)) ++ goto out; ++ ++ count = 1; ++ if (dentry == dentry->d_sb->s_root) ++ count += 2; /* sb and mnt refs */ ++ if (atomic_read(&dentry->d_count) < count) { ++ printk(KERN_ERR "%s: too small count %d vs %d.\n", ++ __FUNCTION__, ++ atomic_read(&dentry->d_count), count); ++ goto out; ++ } ++ if (atomic_read(&dentry->d_count) > count) ++ goto out; ++ } ++ ++ err = 0; ++out: ++ return err; ++ ++unhashed: ++ /* ++ * Quota root is removed. ++ * Allow to turn quota off, but not on. 
++ */ ++ if (off) ++ err = 0; ++ goto out; ++} ++ ++int vzquota_on_qmblk(struct super_block *sb, struct inode *inode, ++ struct vz_quota_master *qmblk) ++{ ++ struct vz_quota_ilink qlnk; ++ struct vz_quota_master *qold, *qnew; ++ int err; ++ ++ might_sleep(); ++ ++ qold = NULL; ++ qnew = vzquota_alloc_fake(); ++ if (qnew == NULL) ++ return -ENOMEM; ++ ++ vzquota_qlnk_init(&qlnk); ++ inode_qmblk_lock(sb); ++ __vzquota_inode_init(inode, VZ_QUOTAO_INICAL); ++ ++ spin_lock(&dcache_lock); ++ while (1) { ++ err = vzquota_check_dtree(qmblk, 0); ++ if (err) ++ break; ++ if (!vzquota_inode_qmblk_set(inode, qmblk, &qlnk)) ++ break; ++ } ++ INODE_QLNK(inode)->origin = VZ_QUOTAO_ON; ++ spin_unlock(&dcache_lock); ++ ++ if (!err) { ++ qold = __VZ_QUOTA_NOQUOTA(sb); ++ qold->dq_flags |= VZDQ_NOACT; ++ __VZ_QUOTA_NOQUOTA(sb) = qnew; ++ } ++ ++ inode_qmblk_unlock(sb); ++ vzquota_qlnk_destroy(&qlnk); ++ if (qold != NULL) ++ qmblk_put(qold); ++ ++ return err; ++} ++ ++int vzquota_off_qmblk(struct super_block *sb, struct vz_quota_master *qmblk) ++{ ++ int ret; ++ ++ ret = 0; ++ inode_qmblk_lock(sb); ++ ++ spin_lock(&dcache_lock); ++ if (vzquota_check_dtree(qmblk, 1)) ++ ret = -EBUSY; ++ spin_unlock(&dcache_lock); ++ ++ if (!ret) ++ qmblk->dq_flags |= VZDQ_NOACT | VZDQ_NOQUOT; ++ inode_qmblk_unlock(sb); ++ return ret; ++} ++ ++ ++/* ---------------------------------------------------------------------- ++ * ++ * External interfaces ++ * ++ * ---------------------------------------------------------------------*/ ++ ++static int vzquota_ioctl(struct inode *ino, struct file *file, ++ unsigned int cmd, unsigned long arg) ++{ ++ int err; ++ struct vzctl_quotactl qb; ++ struct vzctl_quotaugidctl qub; ++ ++ switch (cmd) { ++ case VZCTL_QUOTA_CTL: ++ err = -ENOTTY; ++ break; ++ case VZCTL_QUOTA_NEW_CTL: ++ err = -EFAULT; ++ if (copy_from_user(&qb, (void *)arg, sizeof(qb))) ++ break; ++ err = do_vzquotactl(qb.cmd, qb.quota_id, ++ qb.qstat, qb.ve_root); ++ break; ++#ifdef CONFIG_VZ_QUOTA_UGID 
++ case VZCTL_QUOTA_UGID_CTL: ++ err = -EFAULT; ++ if (copy_from_user(&qub, (void *)arg, sizeof(qub))) ++ break; ++ err = do_vzquotaugidctl(&qub); ++ break; ++#endif ++ default: ++ err = -ENOTTY; ++ } ++ might_sleep(); /* debug */ ++ return err; ++} ++ ++static struct vzioctlinfo vzdqcalls = { ++ .type = VZDQCTLTYPE, ++ .func = vzquota_ioctl, ++ .owner = THIS_MODULE, ++}; ++ ++/** ++ * vzquota_dstat - get quota usage info for virtual superblock ++ */ ++static int vzquota_dstat(struct super_block *super, struct dq_stat *qstat) ++{ ++ struct vz_quota_master *qmblk; ++ ++ qmblk = vzquota_find_qmblk(super); ++ if (qmblk == NULL) ++ return -ENOENT; ++ if (qmblk == VZ_QUOTA_BAD) { ++ memset(qstat, 0, sizeof(*qstat)); ++ return 0; ++ } ++ ++ qmblk_data_read_lock(qmblk); ++ memcpy(qstat, &qmblk->dq_stat, sizeof(*qstat)); ++ qmblk_data_read_unlock(qmblk); ++ qmblk_put(qmblk); ++ return 0; ++} ++ ++ ++/* ---------------------------------------------------------------------- ++ * ++ * Init/exit helpers ++ * ++ * ---------------------------------------------------------------------*/ ++ ++static int vzquota_cache_init(void) ++{ ++ int i; ++ ++ vzquota_cachep = kmem_cache_create("vz_quota_master", ++ sizeof(struct vz_quota_master), ++ 0, SLAB_HWCACHE_ALIGN, NULL, NULL); ++ if (vzquota_cachep == NULL) { ++ printk(KERN_ERR "Cannot create VZ_QUOTA SLAB cache\n"); ++ goto nomem2; ++ } ++ for (i = 0; i < VZ_QUOTA_HASH_SIZE; i++) ++ INIT_LIST_HEAD(&vzquota_hash_table[i]); ++ ++ return 0; ++ ++nomem2: ++ return -ENOMEM; ++} ++ ++static void vzquota_cache_release(void) ++{ ++ int i; ++ ++ /* sanity check */ ++ for (i = 0; i < VZ_QUOTA_HASH_SIZE; i++) ++ if (!list_empty(&vzquota_hash_table[i])) ++ BUG(); ++ ++ /* release caches */ ++ if (kmem_cache_destroy(vzquota_cachep)) ++ printk(KERN_ERR ++ "VZQUOTA: vz_quota_master kmem_cache_destroy failed\n"); ++ vzquota_cachep = NULL; ++} ++ ++static int quota_notifier_call(struct vnotifier_block *self, ++ unsigned long n, void *data, int err) 
++{ ++ struct virt_info_quota *viq; ++ struct super_block *sb; ++ ++ viq = (struct virt_info_quota *)data; ++ switch (n) { ++ case VIRTINFO_QUOTA_ON: ++ err = NOTIFY_BAD; ++ if (!try_module_get(THIS_MODULE)) ++ break; ++ sb = viq->super; ++ memset(&sb->s_dquot.info, 0, sizeof(sb->s_dquot.info)); ++ INIT_LIST_HEAD(&sb->s_dquot.info[USRQUOTA].dqi_dirty_list); ++ INIT_LIST_HEAD(&sb->s_dquot.info[GRPQUOTA].dqi_dirty_list); ++ err = NOTIFY_OK; ++ break; ++ case VIRTINFO_QUOTA_OFF: ++ module_put(THIS_MODULE); ++ err = NOTIFY_OK; ++ break; ++ case VIRTINFO_QUOTA_GETSTAT: ++ err = NOTIFY_BAD; ++ if (vzquota_dstat(viq->super, viq->qstat)) ++ break; ++ err = NOTIFY_OK; ++ break; ++ } ++ return err; ++} ++ ++struct vnotifier_block quota_notifier_block = { ++ .notifier_call = quota_notifier_call, ++ .priority = INT_MAX, ++}; ++ ++/* ---------------------------------------------------------------------- ++ * ++ * Init/exit procedures ++ * ++ * ---------------------------------------------------------------------*/ ++ ++static int __init vzquota_init(void) ++{ ++ int err; ++ ++ if ((err = vzquota_cache_init()) != 0) ++ goto out_cache; ++ ++ if ((err = vzquota_proc_init()) != 0) ++ goto out_proc; ++ ++#ifdef CONFIG_VZ_QUOTA_UGID ++ if ((err = vzquota_ugid_init()) != 0) ++ goto out_ugid; ++#endif ++ ++ init_MUTEX(&vz_quota_sem); ++ vzioctl_register(&vzdqcalls); ++ virtinfo_notifier_register(VITYPE_QUOTA, "a_notifier_block); ++#if defined(CONFIG_VZ_QUOTA_UGID) && defined(CONFIG_PROC_FS) ++ vzaquota_init(); ++#endif ++ ++ return 0; ++ ++#ifdef CONFIG_VZ_QUOTA_UGID ++out_ugid: ++ vzquota_proc_release(); ++#endif ++out_proc: ++ vzquota_cache_release(); ++out_cache: ++ return err; ++} ++ ++#if defined(VZ_QUOTA_UNLOAD) ++static void __exit vzquota_release(void) ++{ ++ virtinfo_notifier_unregister(VITYPE_QUOTA, "a_notifier_block); ++ vzioctl_unregister(&vzdqcalls); ++#ifdef CONFIG_VZ_QUOTA_UGID ++#ifdef CONFIG_PROC_FS ++ vzaquota_fini(); ++#endif ++ vzquota_ugid_release(); ++#endif ++ 
vzquota_proc_release(); ++ vzquota_cache_release(); ++} ++#endif ++ ++MODULE_AUTHOR("SWsoft <info@sw-soft.com>"); ++MODULE_DESCRIPTION("Virtuozzo Disk Quota"); ++MODULE_LICENSE("GPL v2"); ++ ++module_init(vzquota_init) ++#if defined(VZ_QUOTA_UNLOAD) ++module_exit(vzquota_release) ++#endif +diff -upr linux-2.6.16.orig/fs/xattr.c linux-2.6.16-026test015/fs/xattr.c +--- linux-2.6.16.orig/fs/xattr.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/xattr.c 2006-07-04 14:41:37.000000000 +0400 +@@ -58,7 +58,7 @@ xattr_permission(struct inode *inode, co + return -EPERM; + } + +- return permission(inode, mask, NULL); ++ return permission(inode, mask, NULL, NULL); + } + + int +diff -upr linux-2.6.16.orig/fs/xfs/linux-2.6/xfs_aops.c linux-2.6.16-026test015/fs/xfs/linux-2.6/xfs_aops.c +--- linux-2.6.16.orig/fs/xfs/linux-2.6/xfs_aops.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/xfs/linux-2.6/xfs_aops.c 2006-07-04 14:41:36.000000000 +0400 +@@ -616,7 +616,7 @@ xfs_is_delayed_page( + acceptable = (type == IOMAP_UNWRITTEN); + else if (buffer_delay(bh)) + acceptable = (type == IOMAP_DELAY); +- else if (buffer_mapped(bh)) ++ else if (buffer_dirty(bh) && buffer_mapped(bh)) + acceptable = (type == 0); + else + break; +diff -upr linux-2.6.16.orig/fs/xfs/linux-2.6/xfs_iops.c linux-2.6.16-026test015/fs/xfs/linux-2.6/xfs_iops.c +--- linux-2.6.16.orig/fs/xfs/linux-2.6/xfs_iops.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/fs/xfs/linux-2.6/xfs_iops.c 2006-07-04 14:41:37.000000000 +0400 +@@ -615,7 +615,8 @@ STATIC int + linvfs_permission( + struct inode *inode, + int mode, +- struct nameidata *nd) ++ struct nameidata *nd, ++ struct exec_perm *perm) + { + vnode_t *vp = LINVFS_GET_VP(inode); + int error; +@@ -673,8 +674,7 @@ linvfs_setattr( + if (ia_valid & ATTR_ATIME) { + vattr.va_mask |= XFS_AT_ATIME; + vattr.va_atime = attr->ia_atime; +- if (ia_valid & ATTR_ATIME_SET) +- inode->i_atime = attr->ia_atime; ++ inode->i_atime = 
attr->ia_atime; + } + if (ia_valid & ATTR_MTIME) { + vattr.va_mask |= XFS_AT_MTIME; +diff -upr linux-2.6.16.orig/include/asm-arm26/tlbflush.h linux-2.6.16-026test015/include/asm-arm26/tlbflush.h +--- linux-2.6.16.orig/include/asm-arm26/tlbflush.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/asm-arm26/tlbflush.h 2006-07-04 14:41:38.000000000 +0400 +@@ -25,7 +25,7 @@ static inline void memc_update_all(void) + { + struct task_struct *p; + cpu_memc_update_all(init_mm.pgd); +- for_each_process(p) { ++ for_each_process_all(p) { + if (!p->mm) + continue; + cpu_memc_update_all(p->mm->pgd); +diff -upr linux-2.6.16.orig/include/asm-generic/atomic.h linux-2.6.16-026test015/include/asm-generic/atomic.h +--- linux-2.6.16.orig/include/asm-generic/atomic.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/asm-generic/atomic.h 2006-07-04 14:41:37.000000000 +0400 +@@ -66,6 +66,13 @@ static inline void atomic_long_sub(long + atomic64_sub(i, v); + } + ++static inline int atomic_long_add_negative(long i, atomic_long_t *l) ++{ ++ atomic64_t *v = (atomic64_t *)l; ++ ++ return atomic64_add_negative(i, v); ++} ++ + #else + + typedef atomic_t atomic_long_t; +@@ -113,5 +120,12 @@ static inline void atomic_long_sub(long + atomic_sub(i, v); + } + ++static inline int atomic_long_add_negative(long i, atomic_long_t *l) ++{ ++ atomic_t *v = (atomic_t *)l; ++ ++ return atomic_add_negative(i, v); ++} ++ + #endif + #endif +diff -upr linux-2.6.16.orig/include/asm-generic/pgtable.h linux-2.6.16-026test015/include/asm-generic/pgtable.h +--- linux-2.6.16.orig/include/asm-generic/pgtable.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/asm-generic/pgtable.h 2006-07-04 14:41:36.000000000 +0400 +@@ -159,17 +159,8 @@ static inline void ptep_set_wrprotect(st + #define lazy_mmu_prot_update(pte) do { } while (0) + #endif + +-#ifndef __HAVE_ARCH_MULTIPLE_ZERO_PAGE ++#ifndef __HAVE_ARCH_MOVE_PTE + #define move_pte(pte, prot, old_addr, 
new_addr) (pte) +-#else +-#define move_pte(pte, prot, old_addr, new_addr) \ +-({ \ +- pte_t newpte = (pte); \ +- if (pte_present(pte) && pfn_valid(pte_pfn(pte)) && \ +- pte_page(pte) == ZERO_PAGE(old_addr)) \ +- newpte = mk_pte(ZERO_PAGE(new_addr), (prot)); \ +- newpte; \ +-}) + #endif + + /* +diff -upr linux-2.6.16.orig/include/asm-i386/bug.h linux-2.6.16-026test015/include/asm-i386/bug.h +--- linux-2.6.16.orig/include/asm-i386/bug.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/asm-i386/bug.h 2006-07-04 14:41:37.000000000 +0400 +@@ -14,7 +14,10 @@ + #ifdef CONFIG_DEBUG_BUGVERBOSE + #define BUG() \ + __asm__ __volatile__( "ud2\n" \ ++ "\t.byte 0x66\n"\ ++ "\t.byte 0xb8\n" /* mov $xxx, %ax */\ + "\t.word %c0\n" \ ++ "\t.byte 0xb8\n" /* mov $xxx, %eax */\ + "\t.long %c1\n" \ + : : "i" (__LINE__), "i" (__FILE__)) + #else +diff -upr linux-2.6.16.orig/include/asm-i386/cpufeature.h linux-2.6.16-026test015/include/asm-i386/cpufeature.h +--- linux-2.6.16.orig/include/asm-i386/cpufeature.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/asm-i386/cpufeature.h 2006-07-04 14:41:36.000000000 +0400 +@@ -70,6 +70,7 @@ + #define X86_FEATURE_P3 (3*32+ 6) /* P3 */ + #define X86_FEATURE_P4 (3*32+ 7) /* P4 */ + #define X86_FEATURE_CONSTANT_TSC (3*32+ 8) /* TSC ticks at a constant rate */ ++#define X86_FEATURE_FXSAVE_LEAK (3*32+10) /* FXSAVE leaks FOP/FIP/FOP */ + + /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ + #define X86_FEATURE_XMM3 (4*32+ 0) /* Streaming SIMD Extensions-3 */ +diff -upr linux-2.6.16.orig/include/asm-i386/elf.h linux-2.6.16-026test015/include/asm-i386/elf.h +--- linux-2.6.16.orig/include/asm-i386/elf.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/asm-i386/elf.h 2006-07-04 14:41:39.000000000 +0400 +@@ -108,7 +108,7 @@ typedef struct user_fxsr_struct elf_fpxr + For the moment, we have only optimizations for the Intel generations, + but that could change... 
*/ + +-#define ELF_PLATFORM (system_utsname.machine) ++#define ELF_PLATFORM (ve_utsname.machine) + + #ifdef __KERNEL__ + #define SET_PERSONALITY(ex, ibcs2) do { } while (0) +@@ -136,8 +136,10 @@ extern void __kernel_vsyscall; + + #define ARCH_DLINFO \ + do { \ ++ if (sysctl_at_vsyscall) { \ + NEW_AUX_ENT(AT_SYSINFO, VSYSCALL_ENTRY); \ + NEW_AUX_ENT(AT_SYSINFO_EHDR, VSYSCALL_BASE); \ ++ } \ + } while (0) + + /* +diff -upr linux-2.6.16.orig/include/asm-i386/i387.h linux-2.6.16-026test015/include/asm-i386/i387.h +--- linux-2.6.16.orig/include/asm-i386/i387.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/asm-i386/i387.h 2006-07-04 14:41:36.000000000 +0400 +@@ -13,6 +13,7 @@ + + #include <linux/sched.h> + #include <linux/init.h> ++#include <linux/kernel_stat.h> + #include <asm/processor.h> + #include <asm/sigcontext.h> + #include <asm/user.h> +@@ -38,17 +39,38 @@ extern void init_fpu(struct task_struct + extern void kernel_fpu_begin(void); + #define kernel_fpu_end() do { stts(); preempt_enable(); } while(0) + ++/* We need a safe address that is cheap to find and that is already ++ in L1 during context switch. 
The best choices are unfortunately ++ different for UP and SMP */ ++#ifdef CONFIG_SMP ++#define safe_address (__per_cpu_offset[0]) ++#else ++#define safe_address (kstat_cpu(0).cpustat.user) ++#endif ++ + /* + * These must be called with preempt disabled + */ + static inline void __save_init_fpu( struct task_struct *tsk ) + { ++ /* Use more nops than strictly needed in case the compiler ++ varies code */ + alternative_input( +- "fnsave %1 ; fwait ;" GENERIC_NOP2, +- "fxsave %1 ; fnclex", ++ "fnsave %[fx] ;fwait;" GENERIC_NOP8 GENERIC_NOP4, ++ "fxsave %[fx]\n" ++ "bt $7,%[fsw] ; jnc 1f ; fnclex\n1:", + X86_FEATURE_FXSR, +- "m" (tsk->thread.i387.fxsave) +- :"memory"); ++ [fx] "m" (tsk->thread.i387.fxsave), ++ [fsw] "m" (tsk->thread.i387.fxsave.swd) : "memory"); ++ /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception ++ is pending. Clear the x87 state here by setting it to fixed ++ values. safe_address is a random variable that should be in L1 */ ++ alternative_input( ++ GENERIC_NOP8 GENERIC_NOP2, ++ "emms\n\t" /* clear stack tags */ ++ "fildl %[addr]", /* set F?P to defined value */ ++ X86_FEATURE_FXSAVE_LEAK, ++ [addr] "m" (safe_address)); + task_thread_info(tsk)->status &= ~TS_USEDFPU; + } + +diff -upr linux-2.6.16.orig/include/asm-i386/mman.h linux-2.6.16-026test015/include/asm-i386/mman.h +--- linux-2.6.16.orig/include/asm-i386/mman.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/asm-i386/mman.h 2006-07-04 14:41:37.000000000 +0400 +@@ -10,6 +10,7 @@ + #define MAP_NORESERVE 0x4000 /* don't check for reservations */ + #define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ + #define MAP_NONBLOCK 0x10000 /* do not block on IO */ ++#define MAP_EXECPRIO 0x20000 /* do soft ubc charge */ + + #define MCL_CURRENT 1 /* lock all current mappings */ + #define MCL_FUTURE 2 /* lock all future mappings */ +diff -upr linux-2.6.16.orig/include/asm-i386/nmi.h linux-2.6.16-026test015/include/asm-i386/nmi.h +--- 
linux-2.6.16.orig/include/asm-i386/nmi.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/asm-i386/nmi.h 2006-07-04 14:41:37.000000000 +0400 +@@ -17,6 +17,7 @@ typedef int (*nmi_callback_t)(struct pt_ + * set. Return 1 if the NMI was handled. + */ + void set_nmi_callback(nmi_callback_t callback); ++void set_nmi_ipi_callback(nmi_callback_t callback); + + /** + * unset_nmi_callback +@@ -24,5 +25,6 @@ void set_nmi_callback(nmi_callback_t cal + * Remove the handler previously set. + */ + void unset_nmi_callback(void); ++void unset_nmi_ipi_callback(void); + + #endif /* ASM_NMI_H */ +diff -upr linux-2.6.16.orig/include/asm-i386/pgtable-2level.h linux-2.6.16-026test015/include/asm-i386/pgtable-2level.h +--- linux-2.6.16.orig/include/asm-i386/pgtable-2level.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/asm-i386/pgtable-2level.h 2006-07-04 14:41:36.000000000 +0400 +@@ -18,6 +18,9 @@ + #define set_pte_atomic(pteptr, pteval) set_pte(pteptr,pteval) + #define set_pmd(pmdptr, pmdval) (*(pmdptr) = (pmdval)) + ++#define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0) ++#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) ++ + #define ptep_get_and_clear(mm,addr,xp) __pte(xchg(&(xp)->pte_low, 0)) + #define pte_same(a, b) ((a).pte_low == (b).pte_low) + #define pte_page(x) pfn_to_page(pte_pfn(x)) +diff -upr linux-2.6.16.orig/include/asm-i386/pgtable-3level.h linux-2.6.16-026test015/include/asm-i386/pgtable-3level.h +--- linux-2.6.16.orig/include/asm-i386/pgtable-3level.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/asm-i386/pgtable-3level.h 2006-07-04 14:41:36.000000000 +0400 +@@ -85,6 +85,26 @@ static inline void pud_clear (pud_t * pu + #define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \ + pmd_index(address)) + ++/* ++ * For PTEs and PDEs, we must clear the P-bit first when clearing a page table ++ * entry, so clear the bottom half first and enforce 
ordering with a compiler ++ * barrier. ++ */ ++static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) ++{ ++ ptep->pte_low = 0; ++ smp_wmb(); ++ ptep->pte_high = 0; ++} ++ ++static inline void pmd_clear(pmd_t *pmd) ++{ ++ u32 *tmp = (u32 *)pmd; ++ *tmp = 0; ++ smp_wmb(); ++ *(tmp + 1) = 0; ++} ++ + static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) + { + pte_t res; +diff -upr linux-2.6.16.orig/include/asm-i386/pgtable.h linux-2.6.16-026test015/include/asm-i386/pgtable.h +--- linux-2.6.16.orig/include/asm-i386/pgtable.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/asm-i386/pgtable.h 2006-07-04 14:41:36.000000000 +0400 +@@ -204,12 +204,10 @@ extern unsigned long long __PAGE_KERNEL, + extern unsigned long pg0[]; + + #define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE)) +-#define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0) + + /* To avoid harmful races, pmd_none(x) should check only the lower when PAE */ + #define pmd_none(x) (!(unsigned long)pmd_val(x)) + #define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT) +-#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) + #define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) + + +@@ -269,7 +267,7 @@ static inline pte_t ptep_get_and_clear_f + pte_t pte; + if (full) { + pte = *ptep; +- *ptep = __pte(0); ++ pte_clear(mm, addr, ptep); + } else { + pte = ptep_get_and_clear(mm, addr, ptep); + } +diff -upr linux-2.6.16.orig/include/asm-i386/thread_info.h linux-2.6.16-026test015/include/asm-i386/thread_info.h +--- linux-2.6.16.orig/include/asm-i386/thread_info.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/asm-i386/thread_info.h 2006-07-04 14:41:39.000000000 +0400 +@@ -101,13 +101,13 @@ register unsigned long current_stack_poi + ({ \ + struct thread_info *ret; \ + \ +- ret = kmalloc(THREAD_SIZE, GFP_KERNEL); \ ++ ret = 
kmalloc(THREAD_SIZE, GFP_KERNEL_UBC); \ + if (ret) \ + memset(ret, 0, THREAD_SIZE); \ + ret; \ + }) + #else +-#define alloc_thread_info(tsk) kmalloc(THREAD_SIZE, GFP_KERNEL) ++#define alloc_thread_info(tsk) kmalloc(THREAD_SIZE, GFP_KERNEL_UBC) + #endif + + #define free_thread_info(info) kfree(info) +@@ -142,7 +142,8 @@ register unsigned long current_stack_poi + #define TIF_SECCOMP 8 /* secure computing */ + #define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal() */ + #define TIF_POLLING_NRFLAG 16 /* true if poll_idle() is polling TIF_NEED_RESCHED */ +-#define TIF_MEMDIE 17 ++#define TIF_FREEZE 17 /* Freeze request, atomic version of PF_FREEZE */ ++#define TIF_MEMDIE 18 + + #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE) + #define _TIF_NOTIFY_RESUME (1<<TIF_NOTIFY_RESUME) +diff -upr linux-2.6.16.orig/include/asm-i386/timex.h linux-2.6.16-026test015/include/asm-i386/timex.h +--- linux-2.6.16.orig/include/asm-i386/timex.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/asm-i386/timex.h 2006-07-04 14:41:38.000000000 +0400 +@@ -36,13 +36,17 @@ static inline cycles_t get_cycles (void) + { + unsigned long long ret=0; + +-#ifndef CONFIG_X86_TSC +- if (!cpu_has_tsc) +- return 0; +-#endif +- + #if defined(CONFIG_X86_GENERIC) || defined(CONFIG_X86_TSC) + rdtscll(ret); ++#elif defined(CONFIG_VE) ++ /* ++ * get_cycles is used in the following calculations: ++ * - VPS idle and iowait times in kernel/shced.h ++ * - task's sleep time to be shown with SyRq-t ++ * - kstat latencies in linux/vzstat.h ++ * - sched latency via wakeup_stamp in linux/ve_task.h ++ */ ++#warning "some of VPS statistics won't be correct without get_cycles() (kstat_lat, ve_idle, etc)" + #endif + return ret; + } +diff -upr linux-2.6.16.orig/include/asm-i386/unistd.h linux-2.6.16-026test015/include/asm-i386/unistd.h +--- linux-2.6.16.orig/include/asm-i386/unistd.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/asm-i386/unistd.h 2006-07-04 
14:41:39.000000000 +0400 +@@ -316,8 +316,16 @@ + #define __NR_pselect6 308 + #define __NR_ppoll 309 + #define __NR_unshare 310 +- +-#define NR_syscalls 311 ++#define __NR_fairsched_mknod 500 /* FairScheduler syscalls */ ++#define __NR_fairsched_rmnod 501 ++#define __NR_fairsched_chwt 502 ++#define __NR_fairsched_mvpr 503 ++#define __NR_fairsched_rate 504 ++#define __NR_getluid 510 ++#define __NR_setluid 511 ++#define __NR_setublimit 512 ++#define __NR_ubstat 513 ++#define NR_syscalls 513 + + /* + * user-visible error numbers are in the range -1 - -128: see +diff -upr linux-2.6.16.orig/include/asm-ia64/mman.h linux-2.6.16-026test015/include/asm-ia64/mman.h +--- linux-2.6.16.orig/include/asm-ia64/mman.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/asm-ia64/mman.h 2006-07-04 14:41:37.000000000 +0400 +@@ -18,6 +18,7 @@ + #define MAP_NORESERVE 0x04000 /* don't check for reservations */ + #define MAP_POPULATE 0x08000 /* populate (prefault) pagetables */ + #define MAP_NONBLOCK 0x10000 /* do not block on IO */ ++#define MAP_EXECPRIO 0x20000 /* soft ubc charge */ + + #define MCL_CURRENT 1 /* lock all current mappings */ + #define MCL_FUTURE 2 /* lock all future mappings */ +diff -upr linux-2.6.16.orig/include/asm-ia64/pgalloc.h linux-2.6.16-026test015/include/asm-ia64/pgalloc.h +--- linux-2.6.16.orig/include/asm-ia64/pgalloc.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/asm-ia64/pgalloc.h 2006-07-04 14:41:37.000000000 +0400 +@@ -20,6 +20,8 @@ + #include <linux/page-flags.h> + #include <linux/threads.h> + ++#include <ub/ub_mem.h> ++ + #include <asm/mmu_context.h> + + DECLARE_PER_CPU(unsigned long *, __pgtable_quicklist); +@@ -38,7 +40,7 @@ static inline long pgtable_quicklist_tot + return ql_size; + } + +-static inline void *pgtable_quicklist_alloc(void) ++static inline void *pgtable_quicklist_alloc(int charge) + { + unsigned long *ret = NULL; + +@@ -46,13 +48,19 @@ static inline void *pgtable_quicklist_al + + ret = 
pgtable_quicklist; + if (likely(ret != NULL)) { ++ if (ub_page_charge(virt_to_page(ret), 0, ++ charge ? __GFP_UBC|__GFP_SOFT_UBC : 0)) ++ goto out; ++ + pgtable_quicklist = (unsigned long *)(*ret); + ret[0] = 0; + --pgtable_quicklist_size; ++out: + preempt_enable(); + } else { + preempt_enable(); +- ret = (unsigned long *)__get_free_page(GFP_KERNEL | __GFP_ZERO); ++ ret = (unsigned long *)__get_free_page(GFP_KERNEL | __GFP_ZERO | ++ (charge ? __GFP_UBC | __GFP_SOFT_UBC : 0)); + } + + return ret; +@@ -70,6 +78,7 @@ static inline void pgtable_quicklist_fre + #endif + + preempt_disable(); ++ ub_page_uncharge(virt_to_page(pgtable_entry), 0); + *(unsigned long *)pgtable_entry = (unsigned long)pgtable_quicklist; + pgtable_quicklist = (unsigned long *)pgtable_entry; + ++pgtable_quicklist_size; +@@ -78,7 +87,7 @@ static inline void pgtable_quicklist_fre + + static inline pgd_t *pgd_alloc(struct mm_struct *mm) + { +- return pgtable_quicklist_alloc(); ++ return pgtable_quicklist_alloc(1); + } + + static inline void pgd_free(pgd_t * pgd) +@@ -95,7 +104,7 @@ pgd_populate(struct mm_struct *mm, pgd_t + + static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) + { +- return pgtable_quicklist_alloc(); ++ return pgtable_quicklist_alloc(1); + } + + static inline void pud_free(pud_t * pud) +@@ -113,7 +122,7 @@ pud_populate(struct mm_struct *mm, pud_t + + static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) + { +- return pgtable_quicklist_alloc(); ++ return pgtable_quicklist_alloc(1); + } + + static inline void pmd_free(pmd_t * pmd) +@@ -138,13 +147,13 @@ pmd_populate_kernel(struct mm_struct *mm + static inline struct page *pte_alloc_one(struct mm_struct *mm, + unsigned long addr) + { +- return virt_to_page(pgtable_quicklist_alloc()); ++ return virt_to_page(pgtable_quicklist_alloc(1)); + } + + static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, + unsigned long addr) + { +- return pgtable_quicklist_alloc(); ++ return 
pgtable_quicklist_alloc(0); + } + + static inline void pte_free(struct page *pte) +diff -upr linux-2.6.16.orig/include/asm-ia64/processor.h linux-2.6.16-026test015/include/asm-ia64/processor.h +--- linux-2.6.16.orig/include/asm-ia64/processor.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/asm-ia64/processor.h 2006-07-04 14:41:38.000000000 +0400 +@@ -306,7 +306,7 @@ struct thread_struct { + regs->loadrs = 0; \ + regs->r8 = current->mm->dumpable; /* set "don't zap registers" flag */ \ + regs->r12 = new_sp - 16; /* allocate 16 byte scratch area */ \ +- if (unlikely(!current->mm->dumpable)) { \ ++ if (unlikely(!current->mm->dumpable || !current->mm->vps_dumpable)) { \ + /* \ + * Zap scratch regs to avoid leaking bits between processes with different \ + * uid/privileges. \ +diff -upr linux-2.6.16.orig/include/asm-ia64/thread_info.h linux-2.6.16-026test015/include/asm-ia64/thread_info.h +--- linux-2.6.16.orig/include/asm-ia64/thread_info.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/asm-ia64/thread_info.h 2006-07-04 14:41:37.000000000 +0400 +@@ -94,6 +94,7 @@ struct thread_info { + #define TIF_MEMDIE 17 + #define TIF_MCA_INIT 18 /* this task is processing MCA or INIT */ + #define TIF_DB_DISABLED 19 /* debug trap disabled for fsyscall */ ++#define TIF_FREEZE 20 /* Freeze request, atomic version of PF_FREEZE */ + + #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) + #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) +diff -upr linux-2.6.16.orig/include/asm-ia64/unistd.h linux-2.6.16-026test015/include/asm-ia64/unistd.h +--- linux-2.6.16.orig/include/asm-ia64/unistd.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/asm-ia64/unistd.h 2006-07-04 14:41:39.000000000 +0400 +@@ -285,12 +285,22 @@ + #define __NR_faccessat 1293 + /* 1294, 1295 reserved for pselect/ppoll */ + #define __NR_unshare 1296 ++#define __NR_fairsched_mknod 1500 ++#define __NR_fairsched_rmnod 1501 ++#define 
__NR_fairsched_chwt 1502 ++#define __NR_fairsched_mvpr 1503 ++#define __NR_fairsched_rate 1504 ++#define __NR_getluid 1505 ++#define __NR_setluid 1506 ++#define __NR_setublimit 1507 ++#define __NR_ubstat 1508 + + #ifdef __KERNEL__ + + #include <linux/config.h> + +-#define NR_syscalls 273 /* length of syscall table */ ++/* length of syscall table */ ++#define NR_syscalls (__NR_ubstat - __NR_ni_syscall + 1) + + #define __ARCH_WANT_SYS_RT_SIGACTION + +diff -upr linux-2.6.16.orig/include/asm-m32r/smp.h linux-2.6.16-026test015/include/asm-m32r/smp.h +--- linux-2.6.16.orig/include/asm-m32r/smp.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/asm-m32r/smp.h 2006-07-04 14:41:36.000000000 +0400 +@@ -67,7 +67,8 @@ extern volatile int cpu_2_physid[NR_CPUS + #define raw_smp_processor_id() (current_thread_info()->cpu) + + extern cpumask_t cpu_callout_map; +-#define cpu_possible_map cpu_callout_map ++extern cpumask_t cpu_possible_map; ++extern cpumask_t cpu_present_map; + + static __inline__ int hard_smp_processor_id(void) + { +diff -upr linux-2.6.16.orig/include/asm-m32r/uaccess.h linux-2.6.16-026test015/include/asm-m32r/uaccess.h +--- linux-2.6.16.orig/include/asm-m32r/uaccess.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/asm-m32r/uaccess.h 2006-07-04 14:41:36.000000000 +0400 +@@ -5,17 +5,9 @@ + * linux/include/asm-m32r/uaccess.h + * + * M32R version. +- * Copyright (C) 2004 Hirokazu Takata <takata at linux-m32r.org> ++ * Copyright (C) 2004, 2006 Hirokazu Takata <takata at linux-m32r.org> + */ + +-#undef UACCESS_DEBUG +- +-#ifdef UACCESS_DEBUG +-#define UAPRINTK(args...) printk(args) +-#else +-#define UAPRINTK(args...) 
+-#endif /* UACCESS_DEBUG */ +- + /* + * User space memory access functions + */ +@@ -38,27 +30,29 @@ + #define MAKE_MM_SEG(s) ((mm_segment_t) { (s) }) + + #ifdef CONFIG_MMU ++ + #define KERNEL_DS MAKE_MM_SEG(0xFFFFFFFF) + #define USER_DS MAKE_MM_SEG(PAGE_OFFSET) +-#else +-#define KERNEL_DS MAKE_MM_SEG(0xFFFFFFFF) +-#define USER_DS MAKE_MM_SEG(0xFFFFFFFF) +-#endif /* CONFIG_MMU */ +- + #define get_ds() (KERNEL_DS) +-#ifdef CONFIG_MMU + #define get_fs() (current_thread_info()->addr_limit) + #define set_fs(x) (current_thread_info()->addr_limit = (x)) +-#else ++ ++#else /* not CONFIG_MMU */ ++ ++#define KERNEL_DS MAKE_MM_SEG(0xFFFFFFFF) ++#define USER_DS MAKE_MM_SEG(0xFFFFFFFF) ++#define get_ds() (KERNEL_DS) ++ + static inline mm_segment_t get_fs(void) + { +- return USER_DS; ++ return USER_DS; + } + + static inline void set_fs(mm_segment_t s) + { + } +-#endif /* CONFIG_MMU */ ++ ++#endif /* not CONFIG_MMU */ + + #define segment_eq(a,b) ((a).seg == (b).seg) + +@@ -83,9 +77,9 @@ static inline void set_fs(mm_segment_t s + " subx %0, %0\n" \ + " cmpu %4, %1\n" \ + " subx %0, %5\n" \ +- : "=&r"(flag), "=r"(sum) \ +- : "1"(addr), "r"((int)(size)), \ +- "r"(current_thread_info()->addr_limit.seg), "r"(0) \ ++ : "=&r" (flag), "=r" (sum) \ ++ : "1" (addr), "r" ((int)(size)), \ ++ "r" (current_thread_info()->addr_limit.seg), "r" (0) \ + : "cbit" ); \ + flag; }) + +@@ -113,10 +107,10 @@ static inline void set_fs(mm_segment_t s + #else + static inline int access_ok(int type, const void *addr, unsigned long size) + { +- extern unsigned long memory_start, memory_end; +- unsigned long val = (unsigned long)addr; ++ extern unsigned long memory_start, memory_end; ++ unsigned long val = (unsigned long)addr; + +- return ((val >= memory_start) && ((val + size) < memory_end)); ++ return ((val >= memory_start) && ((val + size) < memory_end)); + } + #endif /* CONFIG_MMU */ + +@@ -155,39 +149,6 @@ extern int fixup_exception(struct pt_reg + * accesses to the same area of user memory). 
+ */ + +-extern void __get_user_1(void); +-extern void __get_user_2(void); +-extern void __get_user_4(void); +- +-#ifndef MODULE +-#define __get_user_x(size,ret,x,ptr) \ +- __asm__ __volatile__( \ +- " mv r0, %0\n" \ +- " mv r1, %1\n" \ +- " bl __get_user_" #size "\n" \ +- " mv %0, r0\n" \ +- " mv %1, r1\n" \ +- : "=r"(ret), "=r"(x) \ +- : "0"(ptr) \ +- : "r0", "r1", "r14" ) +-#else /* MODULE */ +-/* +- * Use "jl" instead of "bl" for MODULE +- */ +-#define __get_user_x(size,ret,x,ptr) \ +- __asm__ __volatile__( \ +- " mv r0, %0\n" \ +- " mv r1, %1\n" \ +- " seth lr, #high(__get_user_" #size ")\n" \ +- " or3 lr, lr, #low(__get_user_" #size ")\n" \ +- " jl lr\n" \ +- " mv %0, r0\n" \ +- " mv %1, r1\n" \ +- : "=r"(ret), "=r"(x) \ +- : "0"(ptr) \ +- : "r0", "r1", "r14" ) +-#endif +- + /* Careful: we have to cast the result to the type of the pointer for sign + reasons */ + /** +@@ -208,20 +169,7 @@ extern void __get_user_4(void); + * On error, the variable @x is set to zero. + */ + #define get_user(x,ptr) \ +-({ int __ret_gu; \ +- unsigned long __val_gu; \ +- __chk_user_ptr(ptr); \ +- switch(sizeof (*(ptr))) { \ +- case 1: __get_user_x(1,__ret_gu,__val_gu,ptr); break; \ +- case 2: __get_user_x(2,__ret_gu,__val_gu,ptr); break; \ +- case 4: __get_user_x(4,__ret_gu,__val_gu,ptr); break; \ +- default: __get_user_x(X,__ret_gu,__val_gu,ptr); break; \ +- } \ +- (x) = (__typeof__(*(ptr)))__val_gu; \ +- __ret_gu; \ +-}) +- +-extern void __put_user_bad(void); ++ __get_user_check((x),(ptr),sizeof(*(ptr))) + + /** + * put_user: - Write a simple value into user space. +@@ -240,8 +188,7 @@ extern void __put_user_bad(void); + * Returns zero on success, or -EFAULT on error. + */ + #define put_user(x,ptr) \ +- __put_user_check((__typeof__(*(ptr)))(x),(ptr),sizeof(*(ptr))) +- ++ __put_user_check((__typeof__(*(ptr)))(x),(ptr),sizeof(*(ptr))) + + /** + * __get_user: - Get a simple variable from user space, with less checking. 
+@@ -264,8 +211,64 @@ extern void __put_user_bad(void); + * On error, the variable @x is set to zero. + */ + #define __get_user(x,ptr) \ +- __get_user_nocheck((x),(ptr),sizeof(*(ptr))) ++ __get_user_nocheck((x),(ptr),sizeof(*(ptr))) + ++#define __get_user_nocheck(x,ptr,size) \ ++({ \ ++ long __gu_err = 0; \ ++ unsigned long __gu_val; \ ++ might_sleep(); \ ++ __get_user_size(__gu_val,(ptr),(size),__gu_err); \ ++ (x) = (__typeof__(*(ptr)))__gu_val; \ ++ __gu_err; \ ++}) ++ ++#define __get_user_check(x,ptr,size) \ ++({ \ ++ long __gu_err = -EFAULT; \ ++ unsigned long __gu_val = 0; \ ++ const __typeof__(*(ptr)) __user *__gu_addr = (ptr); \ ++ might_sleep(); \ ++ if (access_ok(VERIFY_READ,__gu_addr,size)) \ ++ __get_user_size(__gu_val,__gu_addr,(size),__gu_err); \ ++ (x) = (__typeof__(*(ptr)))__gu_val; \ ++ __gu_err; \ ++}) ++ ++extern long __get_user_bad(void); ++ ++#define __get_user_size(x,ptr,size,retval) \ ++do { \ ++ retval = 0; \ ++ __chk_user_ptr(ptr); \ ++ switch (size) { \ ++ case 1: __get_user_asm(x,ptr,retval,"ub"); break; \ ++ case 2: __get_user_asm(x,ptr,retval,"uh"); break; \ ++ case 4: __get_user_asm(x,ptr,retval,""); break; \ ++ default: (x) = __get_user_bad(); \ ++ } \ ++} while (0) ++ ++#define __get_user_asm(x, addr, err, itype) \ ++ __asm__ __volatile__( \ ++ " .fillinsn\n" \ ++ "1: ld"itype" %1,@%2\n" \ ++ " .fillinsn\n" \ ++ "2:\n" \ ++ ".section .fixup,\"ax\"\n" \ ++ " .balign 4\n" \ ++ "3: ldi %0,%3\n" \ ++ " seth r14,#high(2b)\n" \ ++ " or3 r14,r14,#low(2b)\n" \ ++ " jmp r14\n" \ ++ ".previous\n" \ ++ ".section __ex_table,\"a\"\n" \ ++ " .balign 4\n" \ ++ " .long 1b,3b\n" \ ++ ".previous" \ ++ : "=&r" (err), "=&r" (x) \ ++ : "r" (addr), "i" (-EFAULT), "0" (err) \ ++ : "r14", "memory") + + /** + * __put_user: - Write a simple value into user space, with less checking. +@@ -287,11 +290,13 @@ extern void __put_user_bad(void); + * Returns zero on success, or -EFAULT on error. 
+ */ + #define __put_user(x,ptr) \ +- __put_user_nocheck((__typeof__(*(ptr)))(x),(ptr),sizeof(*(ptr))) ++ __put_user_nocheck((__typeof__(*(ptr)))(x),(ptr),sizeof(*(ptr))) ++ + + #define __put_user_nocheck(x,ptr,size) \ + ({ \ + long __pu_err; \ ++ might_sleep(); \ + __put_user_size((x),(ptr),(size),__pu_err); \ + __pu_err; \ + }) +@@ -308,28 +313,28 @@ extern void __put_user_bad(void); + }) + + #if defined(__LITTLE_ENDIAN__) +-#define __put_user_u64(x, addr, err) \ +- __asm__ __volatile__( \ +- " .fillinsn\n" \ +- "1: st %L1,@%2\n" \ +- " .fillinsn\n" \ +- "2: st %H1,@(4,%2)\n" \ +- " .fillinsn\n" \ +- "3:\n" \ +- ".section .fixup,\"ax\"\n" \ +- " .balign 4\n" \ +- "4: ldi %0,%3\n" \ +- " seth r14,#high(3b)\n" \ +- " or3 r14,r14,#low(3b)\n" \ +- " jmp r14\n" \ +- ".previous\n" \ +- ".section __ex_table,\"a\"\n" \ +- " .balign 4\n" \ +- " .long 1b,4b\n" \ +- " .long 2b,4b\n" \ +- ".previous" \ +- : "=&r"(err) \ +- : "r"(x), "r"(addr), "i"(-EFAULT), "0"(err) \ ++#define __put_user_u64(x, addr, err) \ ++ __asm__ __volatile__( \ ++ " .fillinsn\n" \ ++ "1: st %L1,@%2\n" \ ++ " .fillinsn\n" \ ++ "2: st %H1,@(4,%2)\n" \ ++ " .fillinsn\n" \ ++ "3:\n" \ ++ ".section .fixup,\"ax\"\n" \ ++ " .balign 4\n" \ ++ "4: ldi %0,%3\n" \ ++ " seth r14,#high(3b)\n" \ ++ " or3 r14,r14,#low(3b)\n" \ ++ " jmp r14\n" \ ++ ".previous\n" \ ++ ".section __ex_table,\"a\"\n" \ ++ " .balign 4\n" \ ++ " .long 1b,4b\n" \ ++ " .long 2b,4b\n" \ ++ ".previous" \ ++ : "=&r" (err) \ ++ : "r" (x), "r" (addr), "i" (-EFAULT), "0" (err) \ + : "r14", "memory") + + #elif defined(__BIG_ENDIAN__) +@@ -353,13 +358,15 @@ extern void __put_user_bad(void); + " .long 1b,4b\n" \ + " .long 2b,4b\n" \ + ".previous" \ +- : "=&r"(err) \ +- : "r"(x), "r"(addr), "i"(-EFAULT), "0"(err) \ ++ : "=&r" (err) \ ++ : "r" (x), "r" (addr), "i" (-EFAULT), "0" (err) \ + : "r14", "memory") + #else + #error no endian defined + #endif + ++extern void __put_user_bad(void); ++ + #define __put_user_size(x,ptr,size,retval) \ + do { \ + 
retval = 0; \ +@@ -398,52 +405,8 @@ struct __large_struct { unsigned long bu + " .balign 4\n" \ + " .long 1b,3b\n" \ + ".previous" \ +- : "=&r"(err) \ +- : "r"(x), "r"(addr), "i"(-EFAULT), "0"(err) \ +- : "r14", "memory") +- +-#define __get_user_nocheck(x,ptr,size) \ +-({ \ +- long __gu_err; \ +- unsigned long __gu_val; \ +- __get_user_size(__gu_val,(ptr),(size),__gu_err); \ +- (x) = (__typeof__(*(ptr)))__gu_val; \ +- __gu_err; \ +-}) +- +-extern long __get_user_bad(void); +- +-#define __get_user_size(x,ptr,size,retval) \ +-do { \ +- retval = 0; \ +- __chk_user_ptr(ptr); \ +- switch (size) { \ +- case 1: __get_user_asm(x,ptr,retval,"ub"); break; \ +- case 2: __get_user_asm(x,ptr,retval,"uh"); break; \ +- case 4: __get_user_asm(x,ptr,retval,""); break; \ +- default: (x) = __get_user_bad(); \ +- } \ +-} while (0) +- +-#define __get_user_asm(x, addr, err, itype) \ +- __asm__ __volatile__( \ +- " .fillinsn\n" \ +- "1: ld"itype" %1,@%2\n" \ +- " .fillinsn\n" \ +- "2:\n" \ +- ".section .fixup,\"ax\"\n" \ +- " .balign 4\n" \ +- "3: ldi %0,%3\n" \ +- " seth r14,#high(2b)\n" \ +- " or3 r14,r14,#low(2b)\n" \ +- " jmp r14\n" \ +- ".previous\n" \ +- ".section __ex_table,\"a\"\n" \ +- " .balign 4\n" \ +- " .long 1b,3b\n" \ +- ".previous" \ +- : "=&r"(err), "=&r"(x) \ +- : "r"(addr), "i"(-EFAULT), "0"(err) \ ++ : "=&r" (err) \ ++ : "r" (x), "r" (addr), "i" (-EFAULT), "0" (err) \ + : "r14", "memory") + + /* +@@ -453,7 +416,6 @@ do { \ + * anything, so this is accurate. 
+ */ + +- + /* + * Copy To/From Userspace + */ +@@ -511,8 +473,9 @@ do { \ + " .long 2b,9b\n" \ + " .long 3b,9b\n" \ + ".previous\n" \ +- : "=&r"(__dst), "=&r"(__src), "=&r"(size), "=&r"(__c) \ +- : "0"(to), "1"(from), "2"(size), "3"(size / 4) \ ++ : "=&r" (__dst), "=&r" (__src), "=&r" (size), \ ++ "=&r" (__c) \ ++ : "0" (to), "1" (from), "2" (size), "3" (size / 4) \ + : "r14", "memory"); \ + } while (0) + +@@ -573,8 +536,9 @@ do { \ + " .long 2b,7b\n" \ + " .long 3b,7b\n" \ + ".previous\n" \ +- : "=&r"(__dst), "=&r"(__src), "=&r"(size), "=&r"(__c) \ +- : "0"(to), "1"(from), "2"(size), "3"(size / 4) \ ++ : "=&r" (__dst), "=&r" (__src), "=&r" (size), \ ++ "=&r" (__c) \ ++ : "0" (to), "1" (from), "2" (size), "3" (size / 4) \ + : "r14", "memory"); \ + } while (0) + +@@ -676,7 +640,7 @@ unsigned long __generic_copy_from_user(v + #define copy_from_user(to,from,n) \ + ({ \ + might_sleep(); \ +-__generic_copy_from_user((to),(from),(n)); \ ++ __generic_copy_from_user((to),(from),(n)); \ + }) + + long __must_check strncpy_from_user(char *dst, const char __user *src, +diff -upr linux-2.6.16.orig/include/asm-mips/bitops.h linux-2.6.16-026test015/include/asm-mips/bitops.h +--- linux-2.6.16.orig/include/asm-mips/bitops.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/asm-mips/bitops.h 2006-07-04 14:41:36.000000000 +0400 +@@ -654,7 +654,12 @@ static inline unsigned long fls(unsigned + { + #ifdef CONFIG_32BIT + #ifdef CONFIG_CPU_MIPS32 +- __asm__ ("clz %0, %1" : "=r" (word) : "r" (word)); ++ __asm__ ( ++ " .set mips32 \n" ++ " clz %0, %1 \n" ++ " .set mips0 \n" ++ : "=r" (word) ++ : "r" (word)); + + return 32 - word; + #else +@@ -678,7 +683,12 @@ static inline unsigned long fls(unsigned + #ifdef CONFIG_64BIT + #ifdef CONFIG_CPU_MIPS64 + +- __asm__ ("dclz %0, %1" : "=r" (word) : "r" (word)); ++ __asm__ ( ++ " .set mips64 \n" ++ " dclz %0, %1 \n" ++ " .set mips0 \n" ++ : "=r" (word) ++ : "r" (word)); + + return 64 - word; + #else +diff -upr 
linux-2.6.16.orig/include/asm-mips/byteorder.h linux-2.6.16-026test015/include/asm-mips/byteorder.h +--- linux-2.6.16.orig/include/asm-mips/byteorder.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/asm-mips/byteorder.h 2006-07-04 14:41:36.000000000 +0400 +@@ -19,7 +19,9 @@ + static __inline__ __attribute_const__ __u16 ___arch__swab16(__u16 x) + { + __asm__( ++ " .set mips32r2 \n" + " wsbh %0, %1 \n" ++ " .set mips0 \n" + : "=r" (x) + : "r" (x)); + +@@ -30,8 +32,10 @@ static __inline__ __attribute_const__ __ + static __inline__ __attribute_const__ __u32 ___arch__swab32(__u32 x) + { + __asm__( ++ " .set mips32r2 \n" + " wsbh %0, %1 \n" + " rotr %0, %0, 16 \n" ++ " .set mips0 \n" + : "=r" (x) + : "r" (x)); + +diff -upr linux-2.6.16.orig/include/asm-mips/interrupt.h linux-2.6.16-026test015/include/asm-mips/interrupt.h +--- linux-2.6.16.orig/include/asm-mips/interrupt.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/asm-mips/interrupt.h 2006-07-04 14:41:36.000000000 +0400 +@@ -20,7 +20,9 @@ __asm__ ( + " .set reorder \n" + " .set noat \n" + #ifdef CONFIG_CPU_MIPSR2 ++ " .set mips32r2 \n" + " ei \n" ++ " .set mips0 \n" + #else + " mfc0 $1,$12 \n" + " ori $1,0x1f \n" +@@ -63,7 +65,9 @@ __asm__ ( + " .set push \n" + " .set noat \n" + #ifdef CONFIG_CPU_MIPSR2 ++ " .set mips32r2 \n" + " di \n" ++ " .set mips0 \n" + #else + " mfc0 $1,$12 \n" + " ori $1,0x1f \n" +@@ -103,8 +107,10 @@ __asm__ ( + " .set reorder \n" + " .set noat \n" + #ifdef CONFIG_CPU_MIPSR2 ++ " .set mips32r2 \n" + " di \\result \n" + " andi \\result, 1 \n" ++ " .set mips0 \n" + #else + " mfc0 \\result, $12 \n" + " ori $1, \\result, 0x1f \n" +@@ -133,9 +139,11 @@ __asm__ ( + * Slow, but doesn't suffer from a relativly unlikely race + * condition we're having since days 1. 
+ */ ++ " .set mips32r2 \n" + " beqz \\flags, 1f \n" + " di \n" + " ei \n" ++ " .set mips0 \n" + "1: \n" + #elif defined(CONFIG_CPU_MIPSR2) + /* +diff -upr linux-2.6.16.orig/include/asm-mips/pgtable.h linux-2.6.16-026test015/include/asm-mips/pgtable.h +--- linux-2.6.16.orig/include/asm-mips/pgtable.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/asm-mips/pgtable.h 2006-07-04 14:41:36.000000000 +0400 +@@ -70,7 +70,15 @@ extern unsigned long zero_page_mask; + #define ZERO_PAGE(vaddr) \ + (virt_to_page(empty_zero_page + (((unsigned long)(vaddr)) & zero_page_mask))) + +-#define __HAVE_ARCH_MULTIPLE_ZERO_PAGE ++#define __HAVE_ARCH_MOVE_PTE ++#define move_pte(pte, prot, old_addr, new_addr) \ ++({ \ ++ pte_t newpte = (pte); \ ++ if (pte_present(pte) && pfn_valid(pte_pfn(pte)) && \ ++ pte_page(pte) == ZERO_PAGE(old_addr)) \ ++ newpte = mk_pte(ZERO_PAGE(new_addr), (prot)); \ ++ newpte; \ ++}) + + extern void paging_init(void); + +diff -upr linux-2.6.16.orig/include/asm-mips/r4kcache.h linux-2.6.16-026test015/include/asm-mips/r4kcache.h +--- linux-2.6.16.orig/include/asm-mips/r4kcache.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/asm-mips/r4kcache.h 2006-07-04 14:41:36.000000000 +0400 +@@ -37,7 +37,7 @@ + " cache %0, %1 \n" \ + " .set pop \n" \ + : \ +- : "i" (op), "m" (*(unsigned char *)(addr))) ++ : "i" (op), "R" (*(unsigned char *)(addr))) + + static inline void flush_icache_line_indexed(unsigned long addr) + { +diff -upr linux-2.6.16.orig/include/asm-powerpc/floppy.h linux-2.6.16-026test015/include/asm-powerpc/floppy.h +--- linux-2.6.16.orig/include/asm-powerpc/floppy.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/asm-powerpc/floppy.h 2006-07-04 14:41:36.000000000 +0400 +@@ -35,6 +35,7 @@ + #ifdef CONFIG_PCI + + #include <linux/pci.h> ++#include <asm/ppc-pci.h> /* for ppc64_isabridge_dev */ + + #define fd_dma_setup(addr,size,mode,io) powerpc_fd_dma_setup(addr,size,mode,io) + +@@ -52,12 
+53,12 @@ static __inline__ int powerpc_fd_dma_set + if (bus_addr + && (addr != prev_addr || size != prev_size || dir != prev_dir)) { + /* different from last time -- unmap prev */ +- pci_unmap_single(NULL, bus_addr, prev_size, prev_dir); ++ pci_unmap_single(ppc64_isabridge_dev, bus_addr, prev_size, prev_dir); + bus_addr = 0; + } + + if (!bus_addr) /* need to map it */ +- bus_addr = pci_map_single(NULL, addr, size, dir); ++ bus_addr = pci_map_single(ppc64_isabridge_dev, addr, size, dir); + + /* remember this one as prev */ + prev_addr = addr; +diff -upr linux-2.6.16.orig/include/asm-powerpc/pgalloc.h linux-2.6.16-026test015/include/asm-powerpc/pgalloc.h +--- linux-2.6.16.orig/include/asm-powerpc/pgalloc.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/asm-powerpc/pgalloc.h 2006-07-04 14:41:37.000000000 +0400 +@@ -33,7 +33,8 @@ extern kmem_cache_t *pgtable_cache[]; + + static inline pgd_t *pgd_alloc(struct mm_struct *mm) + { +- return kmem_cache_alloc(pgtable_cache[PGD_CACHE_NUM], GFP_KERNEL); ++ return kmem_cache_alloc(pgtable_cache[PGD_CACHE_NUM], ++ GFP_KERNEL_UBC | __GFP_SOFT_UBC); + } + + static inline void pgd_free(pgd_t *pgd) +@@ -48,7 +49,7 @@ static inline void pgd_free(pgd_t *pgd) + static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) + { + return kmem_cache_alloc(pgtable_cache[PUD_CACHE_NUM], +- GFP_KERNEL|__GFP_REPEAT); ++ GFP_KERNEL_UBC|__GFP_SOFT_UBC|__GFP_REPEAT); + } + + static inline void pud_free(pud_t *pud) +@@ -84,7 +85,7 @@ static inline void pmd_populate_kernel(s + static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) + { + return kmem_cache_alloc(pgtable_cache[PMD_CACHE_NUM], +- GFP_KERNEL|__GFP_REPEAT); ++ GFP_KERNEL_UBC|__GFP_SOFT_UBC|__GFP_REPEAT); + } + + static inline void pmd_free(pmd_t *pmd) +@@ -92,17 +93,21 @@ static inline void pmd_free(pmd_t *pmd) + kmem_cache_free(pgtable_cache[PMD_CACHE_NUM], pmd); + } + ++static inline pte_t *__pte_alloc(gfp_t flags) ++{ ++ 
return kmem_cache_alloc(pgtable_cache[PTE_CACHE_NUM], flags); ++} ++ + static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, + unsigned long address) + { +- return kmem_cache_alloc(pgtable_cache[PTE_CACHE_NUM], +- GFP_KERNEL|__GFP_REPEAT); ++ return __pte_alloc(GFP_KERNEL | __GFP_REPEAT); + } + + static inline struct page *pte_alloc_one(struct mm_struct *mm, + unsigned long address) + { +- return virt_to_page(pte_alloc_one_kernel(mm, address)); ++ return virt_to_page(__pte_alloc(GFP_KERNEL_UBC | __GFP_SOFT_UBC)); + } + + static inline void pte_free_kernel(pte_t *pte) +diff -upr linux-2.6.16.orig/include/asm-powerpc/unistd.h linux-2.6.16-026test015/include/asm-powerpc/unistd.h +--- linux-2.6.16.orig/include/asm-powerpc/unistd.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/asm-powerpc/unistd.h 2006-07-04 14:41:37.000000000 +0400 +@@ -301,8 +301,12 @@ + #define __NR_pselect6 280 + #define __NR_ppoll 281 + #define __NR_unshare 282 +- +-#define __NR_syscalls 283 ++#define __NR_getluid 410 ++#define __NR_setluid 411 ++#define __NR_setublimit 412 ++#define __NR_ubstat 413 ++ ++#define NR_syscalls 414 + + #ifdef __KERNEL__ + #define __NR__exit __NR_exit +diff -upr linux-2.6.16.orig/include/asm-s390/pgalloc.h linux-2.6.16-026test015/include/asm-s390/pgalloc.h +--- linux-2.6.16.orig/include/asm-s390/pgalloc.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/asm-s390/pgalloc.h 2006-07-04 14:41:37.000000000 +0400 +@@ -34,12 +34,12 @@ static inline pgd_t *pgd_alloc(struct mm + int i; + + #ifndef __s390x__ +- pgd = (pgd_t *) __get_free_pages(GFP_KERNEL,1); ++ pgd = (pgd_t *) __get_free_pages(GFP_KERNEL_UBC | __GFP_SOFT_UBC, 1); + if (pgd != NULL) + for (i = 0; i < USER_PTRS_PER_PGD; i++) + pmd_clear(pmd_offset(pgd + i, i*PGDIR_SIZE)); + #else /* __s390x__ */ +- pgd = (pgd_t *) __get_free_pages(GFP_KERNEL,2); ++ pgd = (pgd_t *) __get_free_pages(GFP_KERNEL_UBC | __GFP_SOFT_UBC, 2); + if (pgd != NULL) + for (i = 0; i < 
PTRS_PER_PGD; i++) + pgd_clear(pgd + i); +@@ -72,7 +72,7 @@ static inline pmd_t * pmd_alloc_one(stru + pmd_t *pmd; + int i; + +- pmd = (pmd_t *) __get_free_pages(GFP_KERNEL, 2); ++ pmd = (pmd_t *) __get_free_pages(GFP_KERNEL_UBC | __GFP_SOFT_UBC, 2); + if (pmd != NULL) { + for (i=0; i < PTRS_PER_PMD; i++) + pmd_clear(pmd+i); +@@ -118,16 +118,13 @@ pmd_populate(struct mm_struct *mm, pmd_t + pmd_populate_kernel(mm, pmd, (pte_t *)((page-mem_map) << PAGE_SHIFT)); + } + +-/* +- * page table entry allocation/free routines. +- */ +-static inline pte_t * +-pte_alloc_one_kernel(struct mm_struct *mm, unsigned long vmaddr) ++static inline pte_t *pte_alloc(struct mm_struct *mm, unsigned long vmaddr, ++ gfp_t mask) + { + pte_t *pte; + int i; + +- pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); ++ pte = (pte_t *)__get_free_page(mask); + if (pte != NULL) { + for (i=0; i < PTRS_PER_PTE; i++) { + pte_clear(mm, vmaddr, pte+i); +@@ -137,10 +134,20 @@ pte_alloc_one_kernel(struct mm_struct *m + return pte; + } + ++/* ++ * page table entry allocation/free routines. 
++ */ ++static inline pte_t * ++pte_alloc_one_kernel(struct mm_struct *mm, unsigned long vmaddr) ++{ ++ return pte_alloc(mm, vmaddr, GFP_KERNEL | __GFP_REPEAT); ++} ++ + static inline struct page * + pte_alloc_one(struct mm_struct *mm, unsigned long vmaddr) + { +- pte_t *pte = pte_alloc_one_kernel(mm, vmaddr); ++ pte_t *pte = pte_alloc(mm, vmaddr, GFP_KERNEL_UBC | __GFP_SOFT_UBC | ++ __GFP_REPEAT); + if (pte) + return virt_to_page(pte); + return 0; +diff -upr linux-2.6.16.orig/include/asm-sh64/pgalloc.h linux-2.6.16-026test015/include/asm-sh64/pgalloc.h +--- linux-2.6.16.orig/include/asm-sh64/pgalloc.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/asm-sh64/pgalloc.h 2006-07-04 14:41:38.000000000 +0400 +@@ -173,7 +173,7 @@ static inline void set_pgdir(unsigned lo + pgd_t *pgd; + + read_lock(&tasklist_lock); +- for_each_process(p) { ++ for_each_process_all(p) { + if (!p->mm) + continue; + *pgd_offset(p->mm,address) = entry; +diff -upr linux-2.6.16.orig/include/asm-sparc64/dma-mapping.h linux-2.6.16-026test015/include/asm-sparc64/dma-mapping.h +--- linux-2.6.16.orig/include/asm-sparc64/dma-mapping.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/asm-sparc64/dma-mapping.h 2006-07-04 14:41:36.000000000 +0400 +@@ -4,7 +4,146 @@ + #include <linux/config.h> + + #ifdef CONFIG_PCI +-#include <asm-generic/dma-mapping.h> ++ ++/* we implement the API below in terms of the existing PCI one, ++ * so include it */ ++#include <linux/pci.h> ++/* need struct page definitions */ ++#include <linux/mm.h> ++ ++static inline int ++dma_supported(struct device *dev, u64 mask) ++{ ++ BUG_ON(dev->bus != &pci_bus_type); ++ ++ return pci_dma_supported(to_pci_dev(dev), mask); ++} ++ ++static inline int ++dma_set_mask(struct device *dev, u64 dma_mask) ++{ ++ BUG_ON(dev->bus != &pci_bus_type); ++ ++ return pci_set_dma_mask(to_pci_dev(dev), dma_mask); ++} ++ ++static inline void * ++dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t 
*dma_handle, ++ gfp_t flag) ++{ ++ BUG_ON(dev->bus != &pci_bus_type); ++ ++ return __pci_alloc_consistent(to_pci_dev(dev), size, dma_handle, flag); ++} ++ ++static inline void ++dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, ++ dma_addr_t dma_handle) ++{ ++ BUG_ON(dev->bus != &pci_bus_type); ++ ++ pci_free_consistent(to_pci_dev(dev), size, cpu_addr, dma_handle); ++} ++ ++static inline dma_addr_t ++dma_map_single(struct device *dev, void *cpu_addr, size_t size, ++ enum dma_data_direction direction) ++{ ++ BUG_ON(dev->bus != &pci_bus_type); ++ ++ return pci_map_single(to_pci_dev(dev), cpu_addr, size, (int)direction); ++} ++ ++static inline void ++dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, ++ enum dma_data_direction direction) ++{ ++ BUG_ON(dev->bus != &pci_bus_type); ++ ++ pci_unmap_single(to_pci_dev(dev), dma_addr, size, (int)direction); ++} ++ ++static inline dma_addr_t ++dma_map_page(struct device *dev, struct page *page, ++ unsigned long offset, size_t size, ++ enum dma_data_direction direction) ++{ ++ BUG_ON(dev->bus != &pci_bus_type); ++ ++ return pci_map_page(to_pci_dev(dev), page, offset, size, (int)direction); ++} ++ ++static inline void ++dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size, ++ enum dma_data_direction direction) ++{ ++ BUG_ON(dev->bus != &pci_bus_type); ++ ++ pci_unmap_page(to_pci_dev(dev), dma_address, size, (int)direction); ++} ++ ++static inline int ++dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, ++ enum dma_data_direction direction) ++{ ++ BUG_ON(dev->bus != &pci_bus_type); ++ ++ return pci_map_sg(to_pci_dev(dev), sg, nents, (int)direction); ++} ++ ++static inline void ++dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nhwentries, ++ enum dma_data_direction direction) ++{ ++ BUG_ON(dev->bus != &pci_bus_type); ++ ++ pci_unmap_sg(to_pci_dev(dev), sg, nhwentries, (int)direction); ++} ++ ++static inline void ++dma_sync_single_for_cpu(struct 
device *dev, dma_addr_t dma_handle, size_t size, ++ enum dma_data_direction direction) ++{ ++ BUG_ON(dev->bus != &pci_bus_type); ++ ++ pci_dma_sync_single_for_cpu(to_pci_dev(dev), dma_handle, ++ size, (int)direction); ++} ++ ++static inline void ++dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size, ++ enum dma_data_direction direction) ++{ ++ BUG_ON(dev->bus != &pci_bus_type); ++ ++ pci_dma_sync_single_for_device(to_pci_dev(dev), dma_handle, ++ size, (int)direction); ++} ++ ++static inline void ++dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, ++ enum dma_data_direction direction) ++{ ++ BUG_ON(dev->bus != &pci_bus_type); ++ ++ pci_dma_sync_sg_for_cpu(to_pci_dev(dev), sg, nelems, (int)direction); ++} ++ ++static inline void ++dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, ++ enum dma_data_direction direction) ++{ ++ BUG_ON(dev->bus != &pci_bus_type); ++ ++ pci_dma_sync_sg_for_device(to_pci_dev(dev), sg, nelems, (int)direction); ++} ++ ++static inline int ++dma_mapping_error(dma_addr_t dma_addr) ++{ ++ return pci_dma_mapping_error(dma_addr); ++} ++ + #else + + struct device; +diff -upr linux-2.6.16.orig/include/asm-sparc64/pci.h linux-2.6.16-026test015/include/asm-sparc64/pci.h +--- linux-2.6.16.orig/include/asm-sparc64/pci.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/asm-sparc64/pci.h 2006-07-04 14:41:36.000000000 +0400 +@@ -44,7 +44,9 @@ struct pci_dev; + /* Allocate and map kernel buffer using consistent mode DMA for a device. + * hwdev should be valid struct pci_dev pointer for PCI devices. + */ +-extern void *pci_alloc_consistent(struct pci_dev *hwdev, size_t size, dma_addr_t *dma_handle); ++extern void *__pci_alloc_consistent(struct pci_dev *hwdev, size_t size, dma_addr_t *dma_handle, gfp_t gfp); ++#define pci_alloc_consistent(DEV,SZ,HANDLE) \ ++ __pci_alloc_consistent(DEV,SZ,HANDLE,GFP_ATOMIC) + + /* Free and unmap a consistent DMA buffer. 
+ * cpu_addr is what was returned from pci_alloc_consistent, +diff -upr linux-2.6.16.orig/include/asm-sparc64/pgtable.h linux-2.6.16-026test015/include/asm-sparc64/pgtable.h +--- linux-2.6.16.orig/include/asm-sparc64/pgtable.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/asm-sparc64/pgtable.h 2006-07-04 14:41:36.000000000 +0400 +@@ -335,6 +335,23 @@ static inline void set_pte_at(struct mm_ + #define pte_clear(mm,addr,ptep) \ + set_pte_at((mm), (addr), (ptep), __pte(0UL)) + ++#ifdef DCACHE_ALIASING_POSSIBLE ++#define __HAVE_ARCH_MOVE_PTE ++#define move_pte(pte, prot, old_addr, new_addr) \ ++({ \ ++ pte_t newpte = (pte); \ ++ if (pte_present(pte)) { \ ++ unsigned long this_pfn = pte_pfn(pte); \ ++ \ ++ if (pfn_valid(this_pfn) && \ ++ (((old_addr) ^ (new_addr)) & (1 << 13))) \ ++ flush_dcache_page_all(current->mm, \ ++ pfn_to_page(this_pfn)); \ ++ } \ ++ newpte; \ ++}) ++#endif ++ + extern pgd_t swapper_pg_dir[2048]; + extern pmd_t swapper_low_pmd_dir[2048]; + +diff -upr linux-2.6.16.orig/include/asm-x86_64/cpufeature.h linux-2.6.16-026test015/include/asm-x86_64/cpufeature.h +--- linux-2.6.16.orig/include/asm-x86_64/cpufeature.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/asm-x86_64/cpufeature.h 2006-07-04 14:41:36.000000000 +0400 +@@ -64,6 +64,7 @@ + #define X86_FEATURE_REP_GOOD (3*32+ 4) /* rep microcode works well on this CPU */ + #define X86_FEATURE_CONSTANT_TSC (3*32+5) /* TSC runs at constant rate */ + #define X86_FEATURE_SYNC_RDTSC (3*32+6) /* RDTSC syncs CPU core */ ++#define X86_FEATURE_FXSAVE_LEAK (3*32+7) /* FIP/FOP/FDP leaks through FXSAVE */ + + /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ + #define X86_FEATURE_XMM3 (4*32+ 0) /* Streaming SIMD Extensions-3 */ +diff -upr linux-2.6.16.orig/include/asm-x86_64/i387.h linux-2.6.16-026test015/include/asm-x86_64/i387.h +--- linux-2.6.16.orig/include/asm-x86_64/i387.h 2006-03-20 08:53:29.000000000 +0300 ++++ 
linux-2.6.16-026test015/include/asm-x86_64/i387.h 2006-07-04 14:41:36.000000000 +0400 +@@ -72,6 +72,23 @@ extern int set_fpregs(struct task_struct + #define set_fpu_swd(t,val) ((t)->thread.i387.fxsave.swd = (val)) + #define set_fpu_fxsr_twd(t,val) ((t)->thread.i387.fxsave.twd = (val)) + ++#define X87_FSW_ES (1 << 7) /* Exception Summary */ ++ ++/* AMD CPUs don't save/restore FDP/FIP/FOP unless an exception ++ is pending. Clear the x87 state here by setting it to fixed ++ values. The kernel data segment can be sometimes 0 and sometimes ++ new user value. Both should be ok. ++ Use the PDA as safe address because it should be already in L1. */ ++static inline void clear_fpu_state(struct i387_fxsave_struct *fx) ++{ ++ if (unlikely(fx->swd & X87_FSW_ES)) ++ asm volatile("fnclex"); ++ alternative_input(ASM_NOP8 ASM_NOP2, ++ " emms\n" /* clear stack tags */ ++ " fildl %%gs:0", /* load to clear state */ ++ X86_FEATURE_FXSAVE_LEAK); ++} ++ + static inline int restore_fpu_checking(struct i387_fxsave_struct *fx) + { + int err; +@@ -119,6 +136,7 @@ static inline int save_i387_checking(str + #endif + if (unlikely(err)) + __clear_user(fx, sizeof(struct i387_fxsave_struct)); ++ /* No need to clear here because the caller clears USED_MATH */ + return err; + } + +@@ -149,7 +167,7 @@ static inline void __fxsave_clear(struct + "i" (offsetof(__typeof__(*tsk), + thread.i387.fxsave))); + #endif +- __asm__ __volatile__("fnclex"); ++ clear_fpu_state(&tsk->thread.i387.fxsave); + } + + static inline void kernel_fpu_begin(void) +diff -upr linux-2.6.16.orig/include/asm-x86_64/mman.h linux-2.6.16-026test015/include/asm-x86_64/mman.h +--- linux-2.6.16.orig/include/asm-x86_64/mman.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/asm-x86_64/mman.h 2006-07-04 14:41:37.000000000 +0400 +@@ -12,6 +12,7 @@ + #define MAP_NORESERVE 0x4000 /* don't check for reservations */ + #define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ + #define MAP_NONBLOCK 0x10000 /* do 
not block on IO */ ++#define MAP_EXECPRIO 0x20000 /* soft ubc charge */ + + #define MCL_CURRENT 1 /* lock all current mappings */ + #define MCL_FUTURE 2 /* lock all future mappings */ +diff -upr linux-2.6.16.orig/include/asm-x86_64/nmi.h linux-2.6.16-026test015/include/asm-x86_64/nmi.h +--- linux-2.6.16.orig/include/asm-x86_64/nmi.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/asm-x86_64/nmi.h 2006-07-04 14:41:37.000000000 +0400 +@@ -24,6 +24,9 @@ void set_nmi_callback(nmi_callback_t cal + * Remove the handler previously set. + */ + void unset_nmi_callback(void); ++ ++void set_nmi_ipi_callback(nmi_callback_t callback); ++void unset_nmi_ipi_callback(void); + + #ifdef CONFIG_PM + +diff -upr linux-2.6.16.orig/include/asm-x86_64/pgalloc.h linux-2.6.16-026test015/include/asm-x86_64/pgalloc.h +--- linux-2.6.16.orig/include/asm-x86_64/pgalloc.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/asm-x86_64/pgalloc.h 2006-07-04 14:41:37.000000000 +0400 +@@ -31,12 +31,14 @@ static inline void pmd_free(pmd_t *pmd) + + static inline pmd_t *pmd_alloc_one (struct mm_struct *mm, unsigned long addr) + { +- return (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); ++ return (pmd_t *)get_zeroed_page(GFP_KERNEL_UBC|__GFP_REPEAT| ++ __GFP_SOFT_UBC); + } + + static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) + { +- return (pud_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); ++ return (pud_t *)get_zeroed_page(GFP_KERNEL_UBC|__GFP_REPEAT| ++ __GFP_SOFT_UBC); + } + + static inline void pud_free (pud_t *pud) +@@ -48,7 +50,8 @@ static inline void pud_free (pud_t *pud) + static inline pgd_t *pgd_alloc(struct mm_struct *mm) + { + unsigned boundary; +- pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); ++ pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL_UBC|__GFP_REPEAT| ++ __GFP_SOFT_UBC); + if (!pgd) + return NULL; + /* +@@ -77,7 +80,8 @@ static inline pte_t *pte_alloc_one_kerne + + static inline struct 
page *pte_alloc_one(struct mm_struct *mm, unsigned long address) + { +- void *p = (void *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); ++ void *p = (void *)get_zeroed_page(GFP_KERNEL_UBC|__GFP_REPEAT| ++ __GFP_SOFT_UBC); + if (!p) + return NULL; + return virt_to_page(p); +diff -upr linux-2.6.16.orig/include/asm-x86_64/processor.h linux-2.6.16-026test015/include/asm-x86_64/processor.h +--- linux-2.6.16.orig/include/asm-x86_64/processor.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/asm-x86_64/processor.h 2006-07-04 14:41:39.000000000 +0400 +@@ -167,7 +167,7 @@ static inline void clear_in_cr4 (unsigne + /* This decides where the kernel will search for a free chunk of vm + * space during mmap's. + */ +-#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? 0xc0000000 : 0xFFFFe000) ++#define IA32_PAGE_OFFSET 0xc0000000 + + #define TASK_SIZE (test_thread_flag(TIF_IA32) ? IA32_PAGE_OFFSET : TASK_SIZE64) + #define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? IA32_PAGE_OFFSET : TASK_SIZE64) +diff -upr linux-2.6.16.orig/include/asm-x86_64/segment.h linux-2.6.16-026test015/include/asm-x86_64/segment.h +--- linux-2.6.16.orig/include/asm-x86_64/segment.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/asm-x86_64/segment.h 2006-07-04 14:41:39.000000000 +0400 +@@ -3,29 +3,28 @@ + + #include <asm/cache.h> + +-#define __KERNEL_CS 0x10 +-#define __KERNEL_DS 0x18 +- +-#define __KERNEL32_CS 0x38 +- ++#define GDT_ENTRY_BOOT_CS 2 ++#define __BOOT_CS (GDT_ENTRY_BOOT_CS * 8) ++#define GDT_ENTRY_BOOT_DS 3 ++#define __BOOT_DS (GDT_ENTRY_BOOT_DS * 8) ++#define GDT_ENTRY_TSS 4 /* needs two entries */ + /* + * we cannot use the same code segment descriptor for user and kernel + * -- not even in the long flat mode, because of different DPL /kkeil + * The segment offset needs to contain a RPL. Grr. 
-AK + * GDT layout to get 64bit syscall right (sysret hardcodes gdt offsets) + */ ++#define GDT_ENTRY_TLS_MIN 6 ++#define GDT_ENTRY_TLS_MAX 8 + +-#define __USER32_CS 0x23 /* 4*8+3 */ +-#define __USER_DS 0x2b /* 5*8+3 */ +-#define __USER_CS 0x33 /* 6*8+3 */ ++#define GDT_ENTRY_LDT 9 /* needs two entries */ ++#define __KERNEL32_CS 0x58 /* 11*8 */ ++#define __KERNEL_CS 0x60 /* 12*8 */ ++#define __KERNEL_DS 0x68 /* 13*8 */ ++#define __USER32_CS 0x73 /* 14*8+3 */ ++#define __USER_DS 0x7b /* 15*8+3 */ + #define __USER32_DS __USER_DS +- +-#define GDT_ENTRY_TLS 1 +-#define GDT_ENTRY_TSS 8 /* needs two entries */ +-#define GDT_ENTRY_LDT 10 /* needs two entries */ +-#define GDT_ENTRY_TLS_MIN 12 +-#define GDT_ENTRY_TLS_MAX 14 +-/* 15 free */ ++#define __USER_CS 0x83 /* 16*8+3 */ + + #define GDT_ENTRY_TLS_ENTRIES 3 + +@@ -37,7 +36,7 @@ + #define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3) + + #define IDT_ENTRIES 256 +-#define GDT_ENTRIES 16 ++#define GDT_ENTRIES 32 + #define GDT_SIZE (GDT_ENTRIES * 8) + #define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8) + +diff -upr linux-2.6.16.orig/include/asm-x86_64/signal.h linux-2.6.16-026test015/include/asm-x86_64/signal.h +--- linux-2.6.16.orig/include/asm-x86_64/signal.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/asm-x86_64/signal.h 2006-07-04 14:41:39.000000000 +0400 +@@ -23,11 +23,6 @@ typedef struct { + unsigned long sig[_NSIG_WORDS]; + } sigset_t; + +- +-struct pt_regs; +-asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset); +- +- + #else + /* Here we must cater to libcs that poke about in kernel headers. 
*/ + +diff -upr linux-2.6.16.orig/include/asm-x86_64/thread_info.h linux-2.6.16-026test015/include/asm-x86_64/thread_info.h +--- linux-2.6.16.orig/include/asm-x86_64/thread_info.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/asm-x86_64/thread_info.h 2006-07-04 14:41:39.000000000 +0400 +@@ -74,7 +74,7 @@ static inline struct thread_info *stack_ + + /* thread information allocation */ + #define alloc_thread_info(tsk) \ +- ((struct thread_info *) __get_free_pages(GFP_KERNEL,THREAD_ORDER)) ++ ((struct thread_info *) __get_free_pages(GFP_KERNEL_UBC,THREAD_ORDER)) + #define free_thread_info(ti) free_pages((unsigned long) (ti), THREAD_ORDER) + + #else /* !__ASSEMBLY__ */ +@@ -101,11 +101,13 @@ static inline struct thread_info *stack_ + #define TIF_IRET 5 /* force IRET */ + #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ + #define TIF_SECCOMP 8 /* secure computing */ ++#define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal() */ + #define TIF_POLLING_NRFLAG 16 /* true if poll_idle() is polling TIF_NEED_RESCHED */ + #define TIF_IA32 17 /* 32bit process */ + #define TIF_FORK 18 /* ret_from_fork */ + #define TIF_ABI_PENDING 19 +-#define TIF_MEMDIE 20 ++#define TIF_FREEZE 20 ++#define TIF_MEMDIE 21 + + #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE) + #define _TIF_NOTIFY_RESUME (1<<TIF_NOTIFY_RESUME) +@@ -115,6 +117,7 @@ static inline struct thread_info *stack_ + #define _TIF_IRET (1<<TIF_IRET) + #define _TIF_SYSCALL_AUDIT (1<<TIF_SYSCALL_AUDIT) + #define _TIF_SECCOMP (1<<TIF_SECCOMP) ++#define _TIF_RESTORE_SIGMASK (1<<TIF_RESTORE_SIGMASK) + #define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG) + #define _TIF_IA32 (1<<TIF_IA32) + #define _TIF_FORK (1<<TIF_FORK) +diff -upr linux-2.6.16.orig/include/asm-x86_64/unistd.h linux-2.6.16-026test015/include/asm-x86_64/unistd.h +--- linux-2.6.16.orig/include/asm-x86_64/unistd.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/asm-x86_64/unistd.h 2006-07-04 
14:41:39.000000000 +0400 +@@ -605,8 +605,26 @@ __SYSCALL(__NR_pselect6, sys_ni_syscall) + __SYSCALL(__NR_ppoll, sys_ni_syscall) /* for now */ + #define __NR_unshare 272 + __SYSCALL(__NR_unshare, sys_unshare) +- +-#define __NR_syscall_max __NR_unshare ++#define __NR_getluid 500 ++__SYSCALL(__NR_getluid, sys_getluid) ++#define __NR_setluid 501 ++__SYSCALL(__NR_setluid, sys_setluid) ++#define __NR_setublimit 502 ++__SYSCALL(__NR_setublimit, sys_setublimit) ++#define __NR_ubstat 503 ++__SYSCALL(__NR_ubstat, sys_ubstat) ++#define __NR_fairsched_mknod 504 /* FairScheduler syscalls */ ++__SYSCALL(__NR_fairsched_mknod, sys_fairsched_mknod) ++#define __NR_fairsched_rmnod 505 ++__SYSCALL(__NR_fairsched_rmnod, sys_fairsched_rmnod) ++#define __NR_fairsched_chwt 506 ++__SYSCALL(__NR_fairsched_chwt, sys_fairsched_chwt) ++#define __NR_fairsched_mvpr 507 ++__SYSCALL(__NR_fairsched_mvpr, sys_fairsched_mvpr) ++#define __NR_fairsched_rate 508 ++__SYSCALL(__NR_fairsched_rate, sys_fairsched_rate) ++ ++#define __NR_syscall_max __NR_fairsched_rate + + #ifndef __NO_STUBS + +@@ -645,6 +663,7 @@ do { \ + #define __ARCH_WANT_SYS_RT_SIGACTION + #define __ARCH_WANT_SYS_TIME + #define __ARCH_WANT_COMPAT_SYS_TIME ++#define __ARCH_WANT_SYS_RT_SIGSUSPEND + #endif + + #ifndef __KERNEL_SYSCALLS__ +diff -upr linux-2.6.16.orig/include/linux/aio.h linux-2.6.16-026test015/include/linux/aio.h +--- linux-2.6.16.orig/include/linux/aio.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/aio.h 2006-07-04 14:41:39.000000000 +0400 +@@ -247,4 +247,8 @@ static inline struct kiocb *list_kiocb(s + extern unsigned long aio_nr; + extern unsigned long aio_max_nr; + ++void wait_for_all_aios(struct kioctx *ctx); ++extern kmem_cache_t *kioctx_cachep; ++extern void aio_kick_handler(void *); ++ + #endif /* __LINUX__AIO_H */ +diff -upr linux-2.6.16.orig/include/linux/binfmts.h linux-2.6.16-026test015/include/linux/binfmts.h +--- linux-2.6.16.orig/include/linux/binfmts.h 2006-03-20 
08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/binfmts.h 2006-07-04 14:41:37.000000000 +0400 +@@ -2,6 +2,7 @@ + #define _LINUX_BINFMTS_H + + #include <linux/capability.h> ++#include <linux/fs.h> + + struct pt_regs; + +@@ -28,6 +29,7 @@ struct linux_binprm{ + int sh_bang; + struct file * file; + int e_uid, e_gid; ++ struct exec_perm perm; + kernel_cap_t cap_inheritable, cap_permitted, cap_effective; + void *security; + int argc, envc; +diff -upr linux-2.6.16.orig/include/linux/capability.h linux-2.6.16-026test015/include/linux/capability.h +--- linux-2.6.16.orig/include/linux/capability.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/capability.h 2006-07-04 14:41:38.000000000 +0400 +@@ -146,12 +146,9 @@ typedef __u32 kernel_cap_t; + + #define CAP_NET_BROADCAST 11 + +-/* Allow interface configuration */ + /* Allow administration of IP firewall, masquerading and accounting */ + /* Allow setting debug option on sockets */ + /* Allow modification of routing tables */ +-/* Allow setting arbitrary process / process group ownership on +- sockets */ + /* Allow binding to any address for transparent proxying */ + /* Allow setting TOS (type of service) */ + /* Allow setting promiscuous mode */ +@@ -200,24 +197,19 @@ typedef __u32 kernel_cap_t; + + /* Allow configuration of the secure attention key */ + /* Allow administration of the random device */ +-/* Allow examination and configuration of disk quotas */ + /* Allow configuring the kernel's syslog (printk behaviour) */ + /* Allow setting the domainname */ + /* Allow setting the hostname */ + /* Allow calling bdflush() */ +-/* Allow mount() and umount(), setting up new smb connection */ ++/* Allow setting up new smb connection */ + /* Allow some autofs root ioctls */ + /* Allow nfsservctl */ + /* Allow VM86_REQUEST_IRQ */ + /* Allow to read/write pci config on alpha */ + /* Allow irix_prctl on mips (setstacksize) */ + /* Allow flushing all cache on m68k (sys_cacheflush) 
*/ +-/* Allow removing semaphores */ +-/* Used instead of CAP_CHOWN to "chown" IPC message queues, semaphores +- and shared memory */ + /* Allow locking/unlocking of shared memory segment */ + /* Allow turning swap on/off */ +-/* Allow forged pids on socket credentials passing */ + /* Allow setting readahead and flushing buffers on block devices */ + /* Allow setting geometry in floppy driver */ + /* Allow turning DMA on/off in xd driver */ +@@ -235,6 +227,8 @@ typedef __u32 kernel_cap_t; + arbitrary SCSI commands */ + /* Allow setting encryption key on loopback filesystem */ + /* Allow setting zone reclaim policy */ ++/* Modify data journaling mode on ext3 filesystem (uses journaling ++ resources) */ + + #define CAP_SYS_ADMIN 21 + +@@ -254,8 +248,6 @@ typedef __u32 kernel_cap_t; + /* Override resource limits. Set resource limits. */ + /* Override quota limits. */ + /* Override reserved space on ext2 filesystem */ +-/* Modify data journaling mode on ext3 filesystem (uses journaling +- resources) */ + /* NOTE: ext2 honors fsuid when checking for resource overrides, so + you can override using fsuid too */ + /* Override size restrictions on IPC message queues */ +@@ -288,7 +280,52 @@ typedef __u32 kernel_cap_t; + + #define CAP_AUDIT_CONTROL 30 + ++/* ++ * Important note: VZ capabilities do intersect with CAP_AUDIT ++ * this is due to compatibility reasons. Nothing bad. ++ * Both VZ and Audit/SELinux caps are disabled in VPSs. ++ */ ++ ++/* Allow access to all information. In the other case some structures will be ++ hiding to ensure different Virtual Environment non-interaction on the same ++ node */ ++#define CAP_SETVEID 29 ++ ++#define CAP_VE_ADMIN 30 ++ + #ifdef __KERNEL__ ++ ++#include <linux/config.h> ++ ++#ifdef CONFIG_VE ++ ++/* Replacement for CAP_NET_ADMIN: ++ delegated rights to the Virtual environment of its network administration. 
++ For now the following rights have been delegated: ++ ++ Allow setting arbitrary process / process group ownership on sockets ++ Allow interface configuration ++ */ ++#define CAP_VE_NET_ADMIN CAP_VE_ADMIN ++ ++/* Replacement for CAP_SYS_ADMIN: ++ delegated rights to the Virtual environment of its administration. ++ For now the following rights have been delegated: ++ */ ++/* Allow mount/umount/remount */ ++/* Allow examination and configuration of disk quotas */ ++/* Allow removing semaphores */ ++/* Used instead of CAP_CHOWN to "chown" IPC message queues, semaphores ++ and shared memory */ ++/* Allow locking/unlocking of shared memory segment */ ++/* Allow forged pids on socket credentials passing */ ++ ++#define CAP_VE_SYS_ADMIN CAP_VE_ADMIN ++#else ++#define CAP_VE_NET_ADMIN CAP_NET_ADMIN ++#define CAP_VE_SYS_ADMIN CAP_SYS_ADMIN ++#endif ++ + /* + * Bounding set + */ +@@ -352,9 +389,14 @@ static inline kernel_cap_t cap_invert(ke + #define cap_issubset(a,set) (!(cap_t(a) & ~cap_t(set))) + + #define cap_clear(c) do { cap_t(c) = 0; } while(0) ++#ifndef CONFIG_VE + #define cap_set_full(c) do { cap_t(c) = ~0; } while(0) ++#else ++#define cap_set_full(c) \ ++ do {cap_t(c) = ve_is_super(get_exec_env()) ? 
~0 : \ ++ get_exec_env()->cap_default; } while(0) ++#endif + #define cap_mask(c,mask) do { cap_t(c) &= cap_t(mask); } while(0) +- + #define cap_is_fs_cap(c) (CAP_TO_MASK(c) & CAP_FS_MASK) + + extern int capable(int cap); +diff -upr linux-2.6.16.orig/include/linux/coda_linux.h linux-2.6.16-026test015/include/linux/coda_linux.h +--- linux-2.6.16.orig/include/linux/coda_linux.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/coda_linux.h 2006-07-04 14:41:37.000000000 +0400 +@@ -38,7 +38,8 @@ extern struct file_operations coda_ioctl + int coda_open(struct inode *i, struct file *f); + int coda_flush(struct file *f); + int coda_release(struct inode *i, struct file *f); +-int coda_permission(struct inode *inode, int mask, struct nameidata *nd); ++int coda_permission(struct inode *inode, int mask, struct nameidata *nd, ++ struct exec_perm *); + int coda_revalidate_inode(struct dentry *); + int coda_getattr(struct vfsmount *, struct dentry *, struct kstat *); + int coda_setattr(struct dentry *, struct iattr *); +diff -upr linux-2.6.16.orig/include/linux/compat.h linux-2.6.16-026test015/include/linux/compat.h +--- linux-2.6.16.orig/include/linux/compat.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/compat.h 2006-07-04 14:41:39.000000000 +0400 +@@ -181,5 +181,7 @@ static inline int compat_timespec_compar + return lhs->tv_nsec - rhs->tv_nsec; + } + ++extern long compat_nanosleep_restart(struct restart_block *restart); ++ + #endif /* CONFIG_COMPAT */ + #endif /* _LINUX_COMPAT_H */ +diff -upr linux-2.6.16.orig/include/linux/cpt_image.h linux-2.6.16-026test015/include/linux/cpt_image.h +--- linux-2.6.16.orig/include/linux/cpt_image.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/include/linux/cpt_image.h 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,1453 @@ ++/* ++ * ++ * include/linux/cpt_image.h ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. 
++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#ifndef __CPT_IMAGE_H_ ++#define __CPT_IMAGE_H_ 1 ++ ++#define CPT_NULL (~0ULL) ++#define CPT_NOINDEX (~0U) ++ ++/* ++ * Image file layout. ++ * ++ * - major header ++ * - sections[] ++ * ++ * Each section is: ++ * - section header ++ * - array of objects ++ * ++ * All data records are arch independent, 64 bit aligned. ++ */ ++ ++enum _cpt_object_type ++{ ++ CPT_OBJ_TASK = 0, ++ CPT_OBJ_MM, ++ CPT_OBJ_FS, ++ CPT_OBJ_FILES, ++ CPT_OBJ_FILE, ++ CPT_OBJ_SIGHAND_STRUCT, ++ CPT_OBJ_SIGNAL_STRUCT, ++ CPT_OBJ_TTY, ++ CPT_OBJ_SOCKET, ++ CPT_OBJ_SYSVSEM_UNDO, ++ CPT_OBJ_NAMESPACE, ++ CPT_OBJ_SYSV_SHM, ++ CPT_OBJ_INODE, ++ CPT_OBJ_UBC, ++ CPT_OBJ_SLM_SGREG, ++ CPT_OBJ_SLM_REGOBJ, ++ CPT_OBJ_SLM_MM, ++ CPT_OBJ_MAX, ++ /* The objects above are stored in memory while checkpointing */ ++ ++ CPT_OBJ_VMA = 1024, ++ CPT_OBJ_FILEDESC, ++ CPT_OBJ_SIGHANDLER, ++ CPT_OBJ_SIGINFO, ++ CPT_OBJ_LASTSIGINFO, ++ CPT_OBJ_SYSV_SEM, ++ CPT_OBJ_SKB, ++ CPT_OBJ_FLOCK, ++ CPT_OBJ_OPENREQ, ++ CPT_OBJ_VFSMOUNT, ++ CPT_OBJ_TRAILER, ++ CPT_OBJ_SYSVSEM_UNDO_REC, ++ CPT_OBJ_NET_DEVICE, ++ CPT_OBJ_NET_IFADDR, ++ CPT_OBJ_NET_ROUTE, ++ CPT_OBJ_NET_CONNTRACK, ++ CPT_OBJ_NET_CONNTRACK_EXPECT, ++ CPT_OBJ_AIO_CONTEXT, ++ CPT_OBJ_VEINFO, ++ CPT_OBJ_EPOLL, ++ CPT_OBJ_EPOLL_FILE, ++ CPT_OBJ_SKFILTER, ++ CPT_OBJ_SIGALTSTACK, ++ CPT_OBJ_SOCK_MCADDR, ++ ++ CPT_OBJ_X86_REGS = 4096, ++ CPT_OBJ_X86_64_REGS, ++ CPT_OBJ_PAGES, ++ CPT_OBJ_COPYPAGES, ++ CPT_OBJ_REMAPPAGES, ++ CPT_OBJ_LAZYPAGES, ++ CPT_OBJ_NAME, ++ CPT_OBJ_BITS, ++ CPT_OBJ_REF, ++}; ++ ++#define CPT_ALIGN(n) (((n)+7)&~7) ++ ++struct cpt_major_hdr ++{ ++ __u8 cpt_signature[4]; /* Magic number */ ++ __u16 cpt_hdrlen; /* Length of this header */ ++ __u16 cpt_image_version; /* Format of this file; mbz */ ++ __u16 cpt_os_arch; /* Architecture */ ++#define CPT_OS_ARCH_I386 0 ++#define CPT_OS_ARCH_EMT64 1 ++#define CPT_OS_ARCH_IA64 2 ++ __u16 __cpt_pad1; ++ __u32 cpt_os_version; /* 
Version of kernel, where image was done */ ++ __u32 cpt_os_features; /* Kernel features: SMP etc. */ ++ __u16 cpt_pagesize; /* Page size used by OS */ ++ __u16 cpt_hz; /* HZ used by OS */ ++ __u64 cpt_start_jiffies64; /* Jiffies */ ++ __u32 cpt_start_sec; /* Seconds */ ++ __u32 cpt_start_nsec; /* Nanoseconds */ ++ __u32 cpt_cpu_caps[4]; /* CPU capabilities */ ++ __u32 cpt_kernel_config[4]; /* Kernel config */ ++ __u64 cpt_iptables_mask; /* Used netfilter modules */ ++} __attribute__ ((aligned (8))); ++ ++#define CPT_SIGNATURE0 0x79 ++#define CPT_SIGNATURE1 0x1c ++#define CPT_SIGNATURE2 0x01 ++#define CPT_SIGNATURE3 0x63 ++ ++#define CPT_CPU_X86_CMOV 0 ++#define CPT_CPU_X86_FXSR 1 ++#define CPT_CPU_X86_SSE 2 ++#define CPT_CPU_X86_SSE2 3 ++#define CPT_CPU_X86_MMX 4 ++#define CPT_CPU_X86_3DNOW 5 ++#define CPT_CPU_X86_3DNOW2 6 ++#define CPT_CPU_X86_SEP 7 ++#define CPT_CPU_X86_EMT64 8 ++#define CPT_CPU_X86_IA64 9 ++ ++#define CPT_KERNEL_CONFIG_PAE 0 ++ ++struct cpt_section_hdr ++{ ++ __u64 cpt_next; ++ __u32 cpt_section; ++ __u16 cpt_hdrlen; ++ __u16 cpt_align; ++} __attribute__ ((aligned (8))); ++ ++enum ++{ ++ CPT_SECT_ERROR, /* Error section, content is string */ ++ CPT_SECT_VEINFO, ++ CPT_SECT_FILES, /* Files. Content is array of file objects */ ++ CPT_SECT_TASKS, ++ CPT_SECT_MM, ++ CPT_SECT_FILES_STRUCT, ++ CPT_SECT_FS, ++ CPT_SECT_SIGHAND_STRUCT, ++ CPT_SECT_TTY, ++ CPT_SECT_SOCKET, ++ CPT_SECT_NAMESPACE, ++ CPT_SECT_SYSVSEM_UNDO, ++ CPT_SECT_INODE, /* Inodes with i->i_nlink==0 and ++ * deleted dentires with inodes not ++ * referenced inside dumped process. 
++ */ ++ CPT_SECT_SYSV_SHM, ++ CPT_SECT_SYSV_SEM, ++ CPT_SECT_ORPHANS, ++ CPT_SECT_NET_DEVICE, ++ CPT_SECT_NET_IFADDR, ++ CPT_SECT_NET_ROUTE, ++ CPT_SECT_NET_IPTABLES, ++ CPT_SECT_NET_CONNTRACK, ++ CPT_SECT_NET_CONNTRACK_VE0, ++ CPT_SECT_UTSNAME, ++ CPT_SECT_TRAILER, ++ CPT_SECT_UBC, ++ CPT_SECT_SLM_SGREGS, ++ CPT_SECT_SLM_REGOBJS, ++/* Due to silly mistake we cannot index sections beyond this value */ ++#define CPT_SECT_MAX_INDEX (CPT_SECT_SLM_REGOBJS+1) ++ CPT_SECT_EPOLL, ++ CPT_SECT_MAX ++}; ++ ++struct cpt_major_tail ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u32 cpt_lazypages; ++ __u32 cpt_64bit; ++ __u64 cpt_sections[CPT_SECT_MAX_INDEX]; ++ __u32 cpt_nsect; ++ __u8 cpt_signature[4]; /* Magic number */ ++} __attribute__ ((aligned (8))); ++ ++ ++/* Common object header. */ ++struct cpt_object_hdr ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++} __attribute__ ((aligned (8))); ++ ++enum _cpt_content_type { ++ CPT_CONTENT_VOID, ++ CPT_CONTENT_ARRAY, ++ CPT_CONTENT_DATA, ++ CPT_CONTENT_NAME, ++ ++ CPT_CONTENT_STACK, ++ CPT_CONTENT_X86_FPUSTATE_OLD, ++ CPT_CONTENT_X86_FPUSTATE, ++ CPT_CONTENT_MM_CONTEXT, ++ CPT_CONTENT_SEMARRAY, ++ CPT_CONTENT_SEMUNDO, ++ CPT_CONTENT_NLMARRAY, ++ CPT_CONTENT_MAX ++}; ++ ++/* CPT_OBJ_BITS: encode array of bytes */ ++struct cpt_obj_bits ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u32 cpt_size; ++ __u32 __cpt_pad1; ++} __attribute__ ((aligned (8))); ++ ++/* CPT_OBJ_REF: a reference to another object */ ++struct cpt_obj_ref ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u64 cpt_pos; ++} __attribute__ ((aligned (8))); ++ ++/* CPT_OBJ_VEINFO: various ve specific data */ ++struct cpt_veinfo_image ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ /* ipc ctls */ ++ __u32 shm_ctl_max; ++ __u32 shm_ctl_all; ++ 
__u32 shm_ctl_mni; ++ __u32 msg_ctl_max; ++ __u32 msg_ctl_mni; ++ __u32 msg_ctl_mnb; ++ __u32 sem_ctl_arr[4]; ++ ++ /* start time */ ++ __u64 start_timespec_delta; ++ __u64 start_jiffies_delta; ++} __attribute__ ((aligned (8))); ++ ++/* CPT_OBJ_FILE: one struct file */ ++struct cpt_file_image ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u32 cpt_flags; ++ __u32 cpt_mode; ++ __u64 cpt_pos; ++ __u32 cpt_uid; ++ __u32 cpt_gid; ++ ++ __u32 cpt_i_mode; ++ __u32 cpt_lflags; ++#define CPT_DENTRY_DELETED 1 ++#define CPT_DENTRY_ROOT 2 ++#define CPT_DENTRY_CLONING 4 ++#define CPT_DENTRY_PROC 8 ++#define CPT_DENTRY_EPOLL 0x10 ++ __u64 cpt_inode; ++ __u64 cpt_priv; ++ ++ __u32 cpt_fown_fd; ++ __u32 cpt_fown_pid; ++ __u32 cpt_fown_uid; ++ __u32 cpt_fown_euid; ++ __u32 cpt_fown_signo; ++ __u32 __cpt_pad1; ++} __attribute__ ((aligned (8))); ++/* Followed by file name, encoded as CPT_OBJ_NAME */ ++ ++struct cpt_epoll_image ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u64 cpt_file; ++} __attribute__ ((aligned (8))); ++/* Followed by array of struct cpt_epoll_file */ ++ ++struct cpt_epoll_file_image ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u64 cpt_file; ++ __u32 cpt_fd; ++ __u32 cpt_events; ++ __u64 cpt_data; ++ __u32 cpt_revents; ++ __u32 cpt_ready; ++} __attribute__ ((aligned (8))); ++ ++ ++/* CPT_OBJ_FILEDESC: one file descriptor */ ++struct cpt_fd_image { ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u32 cpt_fd; ++ __u32 cpt_flags; ++#define CPT_FD_FLAG_CLOSEEXEC 1 ++ __u64 cpt_file; ++} __attribute__ ((aligned (8))); ++ ++/* CPT_OBJ_FILES: one files_struct */ ++struct cpt_files_struct_image { ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u32 cpt_index; ++ __u32 cpt_max_fds; ++ __u32 cpt_next_fd; ++ __u32 __cpt_pad1; ++} __attribute__ 
((aligned (8))); ++/* Followed by array of cpt_fd_image */ ++ ++/* CPT_OBJ_FS: one fs_struct */ ++struct cpt_fs_struct_image { ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u32 cpt_umask; ++ __u32 __cpt_pad1; ++} __attribute__ ((aligned (8))); ++/* Followed by two/three CPT_OBJ_FILENAME for root, pwd and, optionally, altroot */ ++ ++/* CPT_OBJ_INODE: one struct inode */ ++struct cpt_inode_image ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u64 cpt_dev; ++ __u64 cpt_ino; ++ __u32 cpt_mode; ++ __u32 cpt_nlink; ++ __u32 cpt_uid; ++ __u32 cpt_gid; ++ __u64 cpt_rdev; ++ __u64 cpt_size; ++ __u64 cpt_blksize; ++ __u64 cpt_atime; ++ __u64 cpt_mtime; ++ __u64 cpt_ctime; ++ __u64 cpt_blocks; ++ __u32 cpt_sb; ++ __u32 __cpt_pad1; ++} __attribute__ ((aligned (8))); ++ ++/* CPT_OBJ_VFSMOUNT: one vfsmount */ ++struct cpt_vfsmount_image { ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u32 cpt_mntflags; ++ __u32 cpt_flags; ++} __attribute__ ((aligned (8))); ++ ++ ++struct cpt_flock_image ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u32 cpt_owner; ++ __u32 cpt_pid; ++ __u64 cpt_start; ++ __u64 cpt_end; ++ __u32 cpt_flags; ++ __u32 cpt_type; ++} __attribute__ ((aligned (8))); ++ ++ ++struct cpt_tty_image ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u64 cpt_flags; ++ __u32 cpt_link; ++ __u32 cpt_index; ++ __u32 cpt_drv_type; ++ __u32 cpt_drv_subtype; ++ __u32 cpt_drv_flags; ++ __u8 cpt_packet; ++ __u8 cpt_stopped; ++ __u8 cpt_hw_stopped; ++ __u8 cpt_flow_stopped; ++ ++ __u32 cpt_canon_data; ++ __u32 cpt_canon_head; ++ __u32 cpt_canon_column; ++ __u32 cpt_column; ++ __u8 cpt_ctrl_status; ++ __u8 cpt_erasing; ++ __u8 cpt_lnext; ++ __u8 cpt_icanon; ++ __u8 cpt_raw; ++ __u8 cpt_real_raw; ++ __u8 cpt_closing; ++ __u8 __cpt_pad1; ++ __u16 
cpt_minimum_to_wake; ++ __u16 __cpt_pad2; ++ __u32 cpt_pgrp; ++ __u32 cpt_session; ++ __u32 cpt_c_line; ++ __u8 cpt_name[64]; ++ __u16 cpt_ws_row; ++ __u16 cpt_ws_col; ++ __u16 cpt_ws_prow; ++ __u16 cpt_ws_pcol; ++ __u8 cpt_c_cc[32]; ++ __u32 cpt_c_iflag; ++ __u32 cpt_c_oflag; ++ __u32 cpt_c_cflag; ++ __u32 cpt_c_lflag; ++ __u32 cpt_read_flags[4096/32]; ++} __attribute__ ((aligned (8))); ++ ++struct cpt_sock_image ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u64 cpt_file; ++ __u32 cpt_parent; ++ __u32 cpt_index; ++ ++ __u64 cpt_ssflags; ++ __u16 cpt_type; ++ __u16 cpt_family; ++ __u8 cpt_sstate; ++ __u8 cpt_passcred; ++ __u8 cpt_state; ++ __u8 cpt_reuse; ++ ++ __u8 cpt_zapped; ++ __u8 cpt_shutdown; ++ __u8 cpt_userlocks; ++ __u8 cpt_no_check; ++ __u8 cpt_debug; ++ __u8 cpt_rcvtstamp; ++ __u8 cpt_localroute; ++ __u8 cpt_protocol; ++ ++ __u32 cpt_err; ++ __u32 cpt_err_soft; ++ ++ __u16 cpt_max_ack_backlog; ++ __u16 __cpt_pad1; ++ __u32 cpt_priority; ++ ++ __u32 cpt_rcvlowat; ++ __u32 cpt_bound_dev_if; ++ ++ __u64 cpt_rcvtimeo; ++ __u64 cpt_sndtimeo; ++ __u32 cpt_rcvbuf; ++ __u32 cpt_sndbuf; ++ __u64 cpt_flags; ++ __u64 cpt_lingertime; ++ __u32 cpt_peer_pid; ++ __u32 cpt_peer_uid; ++ ++ __u32 cpt_peer_gid; ++ __u32 cpt_laddrlen; ++ __u32 cpt_laddr[128/4]; ++ __u32 cpt_raddrlen; ++ __u32 cpt_raddr[128/4]; ++ /* AF_UNIX */ ++ __u32 cpt_peer; ++ ++ __u8 cpt_socketpair; ++ __u8 cpt_deleted; ++ __u16 __cpt_pad4; ++ __u32 __cpt_pad5; ++/* ++ struct sk_filter *sk_filter; ++ */ ++ ++ __u64 cpt_stamp; ++ __u32 cpt_daddr; ++ __u16 cpt_dport; ++ __u16 cpt_sport; ++ ++ __u32 cpt_saddr; ++ __u32 cpt_rcv_saddr; ++ ++ __u32 cpt_uc_ttl; ++ __u32 cpt_tos; ++ ++ __u32 cpt_cmsg_flags; ++ __u32 cpt_mc_index; ++ ++ __u32 cpt_mc_addr; ++/* ++ struct ip_options *opt; ++ */ ++ __u8 cpt_hdrincl; ++ __u8 cpt_mc_ttl; ++ __u8 cpt_mc_loop; ++ __u8 cpt_pmtudisc; ++ ++ __u8 cpt_recverr; ++ __u8 cpt_freebind; ++ __u16 cpt_idcounter; ++ __u32 
cpt_cork_flags; ++ ++ __u32 cpt_cork_fragsize; ++ __u32 cpt_cork_length; ++ __u32 cpt_cork_addr; ++ __u32 cpt_cork_saddr; ++ __u32 cpt_cork_daddr; ++ __u32 cpt_cork_oif; ++ ++ __u32 cpt_udp_pending; ++ __u32 cpt_udp_corkflag; ++ __u16 cpt_udp_encap; ++ __u16 cpt_udp_len; ++ __u32 __cpt_pad7; ++ ++ __u64 cpt_saddr6[2]; ++ __u64 cpt_rcv_saddr6[2]; ++ __u64 cpt_daddr6[2]; ++ __u32 cpt_flow_label6; ++ __u32 cpt_frag_size6; ++ __u32 cpt_hop_limit6; ++ __u32 cpt_mcast_hops6; ++ ++ __u32 cpt_mcast_oif6; ++ __u8 cpt_rxopt6; ++ __u8 cpt_mc_loop6; ++ __u8 cpt_recverr6; ++ __u8 cpt_sndflow6; ++ ++ __u8 cpt_pmtudisc6; ++ __u8 cpt_ipv6only6; ++ __u8 cpt_mapped; ++ __u8 __cpt_pad8; ++ __u32 cpt_pred_flags; ++ ++ __u32 cpt_rcv_nxt; ++ __u32 cpt_snd_nxt; ++ ++ __u32 cpt_snd_una; ++ __u32 cpt_snd_sml; ++ ++ __u32 cpt_rcv_tstamp; ++ __u32 cpt_lsndtime; ++ ++ __u8 cpt_tcp_header_len; ++ __u8 cpt_ack_pending; ++ __u8 cpt_quick; ++ __u8 cpt_pingpong; ++ __u8 cpt_blocked; ++ __u8 __cpt_pad9; ++ __u16 __cpt_pad10; ++ ++ __u32 cpt_ato; ++ __u32 cpt_ack_timeout; ++ ++ __u32 cpt_lrcvtime; ++ __u16 cpt_last_seg_size; ++ __u16 cpt_rcv_mss; ++ ++ __u32 cpt_snd_wl1; ++ __u32 cpt_snd_wnd; ++ ++ __u32 cpt_max_window; ++ __u32 cpt_pmtu_cookie; ++ ++ __u32 cpt_mss_cache; ++ __u16 cpt_mss_cache_std; ++ __u16 cpt_mss_clamp; ++ ++ __u16 cpt_ext_header_len; ++ __u16 cpt_ext2_header_len; ++ __u8 cpt_ca_state; ++ __u8 cpt_retransmits; ++ __u8 cpt_reordering; ++ __u8 cpt_frto_counter; ++ ++ __u32 cpt_frto_highmark; ++ __u8 cpt_adv_cong; ++ __u8 cpt_defer_accept; ++ __u8 cpt_backoff; ++ __u8 __cpt_pad11; ++ ++ __u32 cpt_srtt; ++ __u32 cpt_mdev; ++ ++ __u32 cpt_mdev_max; ++ __u32 cpt_rttvar; ++ ++ __u32 cpt_rtt_seq; ++ __u32 cpt_rto; ++ ++ __u32 cpt_packets_out; ++ __u32 cpt_left_out; ++ ++ __u32 cpt_retrans_out; ++ __u32 cpt_snd_ssthresh; ++ ++ __u32 cpt_snd_cwnd; ++ __u16 cpt_snd_cwnd_cnt; ++ __u16 cpt_snd_cwnd_clamp; ++ ++ __u32 cpt_snd_cwnd_used; ++ __u32 cpt_snd_cwnd_stamp; ++ ++ __u32 cpt_timeout; ++ 
__u32 cpt_ka_timeout; ++ ++ __u32 cpt_rcv_wnd; ++ __u32 cpt_rcv_wup; ++ ++ __u32 cpt_write_seq; ++ __u32 cpt_pushed_seq; ++ ++ __u32 cpt_copied_seq; ++ __u8 cpt_tstamp_ok; ++ __u8 cpt_wscale_ok; ++ __u8 cpt_sack_ok; ++ __u8 cpt_saw_tstamp; ++ ++ __u8 cpt_snd_wscale; ++ __u8 cpt_rcv_wscale; ++ __u8 cpt_nonagle; ++ __u8 cpt_keepalive_probes; ++ __u32 cpt_rcv_tsval; ++ ++ __u32 cpt_rcv_tsecr; ++ __u32 cpt_ts_recent; ++ ++ __u64 cpt_ts_recent_stamp; ++ __u16 cpt_user_mss; ++ __u8 cpt_dsack; ++ __u8 cpt_eff_sacks; ++ __u32 cpt_sack_array[2*5]; ++ __u32 cpt_window_clamp; ++ ++ __u32 cpt_rcv_ssthresh; ++ __u8 cpt_probes_out; ++ __u8 cpt_num_sacks; ++ __u16 cpt_advmss; ++ ++ __u8 cpt_syn_retries; ++ __u8 cpt_ecn_flags; ++ __u16 cpt_prior_ssthresh; ++ __u32 cpt_lost_out; ++ ++ __u32 cpt_sacked_out; ++ __u32 cpt_fackets_out; ++ ++ __u32 cpt_high_seq; ++ __u32 cpt_retrans_stamp; ++ ++ __u32 cpt_undo_marker; ++ __u32 cpt_undo_retrans; ++ ++ __u32 cpt_urg_seq; ++ __u16 cpt_urg_data; ++ __u8 cpt_pending; ++ __u8 cpt_urg_mode; ++ ++ __u32 cpt_snd_up; ++ __u32 cpt_keepalive_time; ++ ++ __u32 cpt_keepalive_intvl; ++ __u32 cpt_linger2; ++ ++ __u32 cpt_rcvrtt_rtt; ++ __u32 cpt_rcvrtt_seq; ++ ++ __u32 cpt_rcvrtt_time; ++ __u32 __cpt_pad12; ++} __attribute__ ((aligned (8))); ++ ++struct cpt_sockmc_image { ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u16 cpt_family; ++ __u16 cpt_mode; ++ __u32 cpt_ifindex; ++ __u32 cpt_mcaddr[4]; ++} __attribute__ ((aligned (8))); ++/* Followed by array of source addresses, each zero padded to 16 bytes */ ++ ++struct cpt_openreq_image ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u32 cpt_rcv_isn; ++ __u32 cpt_snt_isn; ++ ++ __u16 cpt_rmt_port; ++ __u16 cpt_mss; ++ __u8 cpt_family; ++ __u8 cpt_retrans; ++ __u8 cpt_snd_wscale; ++ __u8 cpt_rcv_wscale; ++ ++ __u8 cpt_tstamp_ok; ++ __u8 cpt_sack_ok; ++ __u8 cpt_wscale_ok; ++ __u8 cpt_ecn_ok; ++ __u8 cpt_acked; ++ 
__u8 __cpt_pad1; ++ __u16 __cpt_pad2; ++ ++ __u32 cpt_window_clamp; ++ __u32 cpt_rcv_wnd; ++ __u32 cpt_ts_recent; ++ __u32 cpt_iif; ++ __u64 cpt_expires; ++ ++ __u64 cpt_loc_addr[2]; ++ __u64 cpt_rmt_addr[2]; ++/* ++ struct ip_options *opt; ++ */ ++ ++} __attribute__ ((aligned (8))); ++ ++struct cpt_skb_image ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u32 cpt_owner; ++ __u32 cpt_queue; ++#define CPT_SKB_NQ 0 ++#define CPT_SKB_RQ 1 ++#define CPT_SKB_WQ 2 ++#define CPT_SKB_OFOQ 3 ++ ++ __u64 cpt_stamp; ++ __u32 cpt_len; ++ __u32 cpt_hspace; ++ __u32 cpt_tspace; ++ __u32 cpt_h; ++ __u32 cpt_nh; ++ __u32 cpt_mac; ++ ++ __u64 cpt_cb[5]; ++ __u32 cpt_mac_len; ++ __u32 cpt_csum; ++ __u8 cpt_local_df; ++ __u8 cpt_pkt_type; ++ __u8 cpt_ip_summed; ++ __u8 __cpt_pad1; ++ __u32 cpt_priority; ++ __u16 cpt_protocol; ++ __u16 cpt_security; ++ __u16 cpt_tso_segs; ++ __u16 cpt_tso_size; ++} __attribute__ ((aligned (8))); ++ ++ ++struct cpt_sysvshm_image ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u64 cpt_key; ++ __u64 cpt_uid; ++ __u64 cpt_gid; ++ __u64 cpt_cuid; ++ __u64 cpt_cgid; ++ __u64 cpt_mode; ++ __u64 cpt_seq; ++ ++ __u32 cpt_id; ++ __u32 cpt_mlockuser; ++ __u64 cpt_segsz; ++ __u64 cpt_atime; ++ __u64 cpt_ctime; ++ __u64 cpt_dtime; ++ __u64 cpt_creator; ++ __u64 cpt_last; ++} __attribute__ ((aligned (8))); ++ ++ ++struct cpt_sysvsem_image ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u64 cpt_key; ++ __u64 cpt_uid; ++ __u64 cpt_gid; ++ __u64 cpt_cuid; ++ __u64 cpt_cgid; ++ __u64 cpt_mode; ++ __u64 cpt_seq; ++ __u32 cpt_id; ++ __u32 __cpt_pad1; ++ ++ __u64 cpt_otime; ++ __u64 cpt_ctime; ++} __attribute__ ((aligned (8))); ++/* Content is array of pairs semval/sempid */ ++ ++struct cpt_sysvsem_undo_image ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u32 cpt_id; ++ __u32 cpt_nsem; 
++} __attribute__ ((aligned (8))); ++ ++ ++struct cpt_mm_image { ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u64 cpt_start_code; ++ __u64 cpt_end_code; ++ __u64 cpt_start_data; ++ __u64 cpt_end_data; ++ __u64 cpt_start_brk; ++ __u64 cpt_brk; ++ __u64 cpt_start_stack; ++ __u64 cpt_start_arg; ++ __u64 cpt_end_arg; ++ __u64 cpt_start_env; ++ __u64 cpt_end_env; ++ __u64 cpt_def_flags; ++ __u64 cpt_mmub; ++ __u8 cpt_dumpable; ++ __u8 cpt_vps_dumpable; ++ __u8 cpt_used_hugetlb; ++ __u8 __cpt_pad; ++} __attribute__ ((aligned (8))); ++ ++struct cpt_page_block ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u64 cpt_start; ++ __u64 cpt_end; ++} __attribute__ ((aligned (8))); ++ ++struct cpt_remappage_block ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u64 cpt_start; ++ __u64 cpt_end; ++ __u64 cpt_pgoff; ++} __attribute__ ((aligned (8))); ++ ++struct cpt_copypage_block ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u64 cpt_start; ++ __u64 cpt_end; ++ __u64 cpt_source; ++} __attribute__ ((aligned (8))); ++ ++struct cpt_lazypage_block ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u64 cpt_start; ++ __u64 cpt_end; ++ __u64 cpt_index; ++} __attribute__ ((aligned (8))); ++ ++struct cpt_vma_image ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u64 cpt_file; ++ __u32 cpt_type; ++#define CPT_VMA_TYPE_0 0 ++#define CPT_VMA_TYPE_SHM 1 ++ __u32 cpt_anonvma; ++ __u64 cpt_anonvmaid; ++ ++ __u64 cpt_start; ++ __u64 cpt_end; ++ __u64 cpt_flags; ++ __u64 cpt_pgprot; ++ __u64 cpt_pgoff; ++} __attribute__ ((aligned (8))); ++ ++struct cpt_aio_ctx_image { ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u32 cpt_max_reqs; ++ __u32 cpt_ring_pages; ++ __u32 cpt_tail; ++ __u32 
cpt_nr; ++ __u64 cpt_mmap_base; ++ /* Data (io_event's) and struct aio_ring are stored in user space VM */ ++} __attribute__ ((aligned (8))); ++ ++ ++/* Format of MM section. ++ * ++ * It is array of MM objects (mm_struct). Each MM object is ++ * header, encoding mm_struct, followed by array of VMA objects. ++ * Each VMA consists of VMA header, encoding vm_area_struct, and ++ * if the VMA contains copied pages, the header is followed by ++ * array of tuples start-end each followed by data. ++ * ++ * ATTN: no block/page alignment. Only 64bit alignment. This might be not good? ++ */ ++ ++struct cpt_restart_block { ++ __u64 fn; ++#define CPT_RBL_0 0 ++#define CPT_RBL_NANOSLEEP 1 ++#define CPT_RBL_COMPAT_NANOSLEEP 2 ++ __u64 arg0; ++ __u64 arg1; ++ __u64 arg2; ++ __u64 arg3; ++} __attribute__ ((aligned (8))); ++ ++struct cpt_siginfo_image { ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u32 cpt_qflags; ++ __u32 cpt_signo; ++ __u32 cpt_errno; ++ __u32 cpt_code; ++ ++ __u64 cpt_sigval; ++ __u32 cpt_pid; ++ __u32 cpt_uid; ++ __u64 cpt_utime; ++ __u64 cpt_stime; ++ ++ __u64 cpt_user; ++} __attribute__ ((aligned (8))); ++ ++/* Portable presentaions for segment registers */ ++ ++#define CPT_SEG_ZERO 0 ++#define CPT_SEG_TLS1 1 ++#define CPT_SEG_TLS2 2 ++#define CPT_SEG_TLS3 3 ++#define CPT_SEG_USER32_DS 4 ++#define CPT_SEG_USER32_CS 5 ++#define CPT_SEG_USER64_DS 6 ++#define CPT_SEG_USER64_CS 7 ++#define CPT_SEG_LDT 256 ++ ++struct cpt_x86_regs ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u32 cpt_debugreg[8]; ++ __u32 cpt_fs; ++ __u32 cpt_gs; ++ ++ __u32 cpt_ebx; ++ __u32 cpt_ecx; ++ __u32 cpt_edx; ++ __u32 cpt_esi; ++ __u32 cpt_edi; ++ __u32 cpt_ebp; ++ __u32 cpt_eax; ++ __u32 cpt_xds; ++ __u32 cpt_xes; ++ __u32 cpt_orig_eax; ++ __u32 cpt_eip; ++ __u32 cpt_xcs; ++ __u32 cpt_eflags; ++ __u32 cpt_esp; ++ __u32 cpt_xss; ++ __u32 cpt_pad; ++}; ++ ++struct cpt_x86_64_regs ++{ ++ __u64 
cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u64 cpt_debugreg[8]; ++ ++ __u64 cpt_fsbase; ++ __u64 cpt_gsbase; ++ __u32 cpt_fsindex; ++ __u32 cpt_gsindex; ++ __u32 cpt_ds; ++ __u32 cpt_es; ++ ++ __u64 cpt_r15; ++ __u64 cpt_r14; ++ __u64 cpt_r13; ++ __u64 cpt_r12; ++ __u64 cpt_rbp; ++ __u64 cpt_rbx; ++ __u64 cpt_r11; ++ __u64 cpt_r10; ++ __u64 cpt_r9; ++ __u64 cpt_r8; ++ __u64 cpt_rax; ++ __u64 cpt_rcx; ++ __u64 cpt_rdx; ++ __u64 cpt_rsi; ++ __u64 cpt_rdi; ++ __u64 cpt_orig_rax; ++ __u64 cpt_rip; ++ __u64 cpt_cs; ++ __u64 cpt_eflags; ++ __u64 cpt_rsp; ++ __u64 cpt_ss; ++}; ++ ++struct cpt_task_image { ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u64 cpt_state; ++ __u64 cpt_flags; ++ __u64 cpt_ptrace; ++ __u32 cpt_prio; ++ __u32 cpt_static_prio; ++ __u32 cpt_policy; ++ __u32 cpt_rt_priority; ++ ++ /* struct thread_info */ ++ __u64 cpt_exec_domain; ++ __u64 cpt_thrflags; ++ __u64 cpt_thrstatus; ++ __u64 cpt_addr_limit; ++ ++ __u64 cpt_personality; ++ ++ __u64 cpt_mm; ++ __u64 cpt_files; ++ __u64 cpt_fs; ++ __u64 cpt_signal; ++ __u64 cpt_sighand; ++ __u64 cpt_sigblocked; ++ __u64 cpt_sigrblocked; ++ __u64 cpt_sigpending; ++ __u64 cpt_namespace; ++ __u64 cpt_sysvsem_undo; ++ __u32 cpt_pid; ++ __u32 cpt_tgid; ++ __u32 cpt_ppid; ++ __u32 cpt_rppid; ++ __u32 cpt_pgrp; ++ __u32 cpt_session; ++ __u32 cpt_old_pgrp; ++ __u32 __cpt_pad; ++ __u32 cpt_leader; ++ __u8 cpt_pn_state; ++ __u8 cpt_stopped_state; ++ __u8 cpt_sigsuspend_state; ++ __u8 cpt_64bit; ++ __u64 cpt_set_tid; ++ __u64 cpt_clear_tid; ++ __u32 cpt_exit_code; ++ __u32 cpt_exit_signal; ++ __u32 cpt_pdeath_signal; ++ __u32 cpt_user; ++ __u32 cpt_uid; ++ __u32 cpt_euid; ++ __u32 cpt_suid; ++ __u32 cpt_fsuid; ++ __u32 cpt_gid; ++ __u32 cpt_egid; ++ __u32 cpt_sgid; ++ __u32 cpt_fsgid; ++ __u32 cpt_ngids; ++ __u32 cpt_gids[32]; ++ __u32 __cpt_pad2; ++ __u64 cpt_ecap; ++ __u64 cpt_icap; ++ __u64 cpt_pcap; ++ __u8 cpt_comm[16]; ++ __u64 
cpt_tls[3]; ++ struct cpt_restart_block cpt_restart; ++ __u64 cpt_it_real_value; /* V0: jiffies, V1: nsec */ ++ __u64 cpt_it_real_incr; /* V0: jiffies, V1: nsec */ ++ __u64 cpt_it_prof_value; ++ __u64 cpt_it_prof_incr; ++ __u64 cpt_it_virt_value; ++ __u64 cpt_it_virt_incr; ++ ++ __u16 cpt_used_math; ++ __u8 cpt_keepcap; ++ __u8 cpt_did_exec; ++ __u32 cpt_ptrace_message; ++ ++ __u64 cpt_utime; ++ __u64 cpt_stime; ++ __u64 cpt_starttime; /* V0: jiffies, V1: timespec */ ++ __u64 cpt_nvcsw; ++ __u64 cpt_nivcsw; ++ __u64 cpt_min_flt; ++ __u64 cpt_maj_flt; ++ ++ __u64 cpt_sigsuspend_blocked; ++ __u64 cpt_cutime, cpt_cstime; ++ __u64 cpt_cnvcsw, cpt_cnivcsw; ++ __u64 cpt_cmin_flt, cpt_cmaj_flt; ++ ++#define CPT_RLIM_NLIMITS 16 ++ __u64 cpt_rlim_cur[CPT_RLIM_NLIMITS]; ++ __u64 cpt_rlim_max[CPT_RLIM_NLIMITS]; ++ ++ __u64 cpt_task_ub; ++ __u64 cpt_exec_ub; ++ __u64 cpt_mm_ub; ++ __u64 cpt_fork_sub; ++} __attribute__ ((aligned (8))); ++ ++struct cpt_sigaltstack_image { ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u64 cpt_stack; ++ __u32 cpt_stacksize; ++ __u32 __cpt_pad1; ++} __attribute__ ((aligned (8))); ++ ++struct cpt_signal_image { ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u32 cpt_leader; ++ __u8 cpt_pgrp_type; ++ __u8 cpt_old_pgrp_type; ++ __u8 cpt_session_type; ++#define CPT_PGRP_NORMAL 0 ++#define CPT_PGRP_ORPHAN 1 ++#define CPT_PGRP_STRAY 2 ++ __u8 __cpt_pad1; ++ __u64 cpt_pgrp; ++ __u64 cpt_old_pgrp; ++ __u64 cpt_session; ++ __u64 cpt_sigpending; ++ __u64 cpt_ctty; ++ ++ __u32 cpt_curr_target; ++ __u32 cpt_group_exit; ++ __u32 cpt_group_exit_code; ++ __u32 cpt_group_exit_task; ++ __u32 cpt_notify_count; ++ __u32 cpt_group_stop_count; ++ __u32 cpt_stop_state; ++ __u32 __cpt_pad2; ++ ++ __u64 cpt_utime, cpt_stime, cpt_cutime, cpt_cstime; ++ __u64 cpt_nvcsw, cpt_nivcsw, cpt_cnvcsw, cpt_cnivcsw; ++ __u64 cpt_min_flt, cpt_maj_flt, cpt_cmin_flt, cpt_cmaj_flt; ++ ++ __u64 
cpt_rlim_cur[CPT_RLIM_NLIMITS]; ++ __u64 cpt_rlim_max[CPT_RLIM_NLIMITS]; ++} __attribute__ ((aligned (8))); ++/* Followed by list of posix timers. */ ++ ++struct cpt_sighand_image { ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++} __attribute__ ((aligned (8))); ++/* Followed by list of sighandles. */ ++ ++struct cpt_sighandler_image { ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u32 cpt_signo; ++ __u32 __cpt_pad1; ++ __u64 cpt_handler; ++ __u64 cpt_restorer; ++ __u64 cpt_flags; ++ __u64 cpt_mask; ++} __attribute__ ((aligned (8))); ++ ++struct cpt_netdev_image { ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u32 cpt_index; ++ __u32 cpt_flags; ++ __u8 cpt_name[16]; ++} __attribute__ ((aligned (8))); ++ ++struct cpt_ifaddr_image { ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u32 cpt_index; ++ __u8 cpt_family; ++ __u8 cpt_masklen; ++ __u8 cpt_flags; ++ __u8 cpt_scope; ++ __u32 cpt_address[4]; ++ __u32 cpt_peer[4]; ++ __u32 cpt_broadcast[4]; ++ __u8 cpt_label[16]; ++} __attribute__ ((aligned (8))); ++ ++struct cpt_ipct_tuple ++{ ++ __u32 cpt_src; ++ __u16 cpt_srcport; ++ __u16 __cpt_pad1; ++ ++ __u32 cpt_dst; ++ __u16 cpt_dstport; ++ __u8 cpt_protonum; ++ __u8 cpt_dir; /* TEMPORARY HACK TO VALIDATE CODE */ ++} __attribute__ ((aligned (8))); ++ ++struct cpt_nat_manip ++{ ++ __u8 cpt_direction; ++ __u8 cpt_hooknum; ++ __u8 cpt_maniptype; ++ __u8 __cpt_pad1; ++ ++ __u32 cpt_manip_addr; ++ __u16 cpt_manip_port; ++ __u16 __cpt_pad2; ++ __u32 __cpt_pad3; ++} __attribute__ ((aligned (8))); ++ ++struct cpt_nat_seq ++{ ++ __u32 cpt_correction_pos; ++ __u32 cpt_offset_before; ++ __u32 cpt_offset_after; ++ __u32 __cpt_pad1; ++} __attribute__ ((aligned (8))); ++ ++struct cpt_ip_connexpect_image ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u64 cpt_timeout; ++ 
__u32 cpt_sibling_conntrack; /* Index of child conntrack */ ++ __u32 cpt_seq; /* id in 2.6.15 */ ++ ++ struct cpt_ipct_tuple cpt_ct_tuple; /* NU 2.6.15 */ ++ struct cpt_ipct_tuple cpt_tuple; ++ struct cpt_ipct_tuple cpt_mask; ++ ++ /* union ip_conntrack_expect_help. Used by ftp, irc, amanda */ ++ __u32 cpt_help[3]; /* NU 2.6.15 */ ++ __u16 cpt_manip_proto; ++ __u8 cpt_dir; ++ __u8 cpt_flags; ++} __attribute__ ((aligned (8))); ++ ++struct cpt_ip_conntrack_image ++{ ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ struct cpt_ipct_tuple cpt_tuple[2]; ++ __u64 cpt_status; ++ __u64 cpt_timeout; ++ __u32 cpt_index; ++ __u8 cpt_ct_helper; ++ __u8 cpt_nat_helper; ++ __u16 cpt_pad1; ++ ++ /* union ip_conntrack_proto. Used by tcp and icmp. */ ++ __u32 cpt_proto_data[12]; ++ ++ /* union ip_conntrack_help. Used by ftp and pptp helper. ++ * We do not support pptp... ++ */ ++ __u32 cpt_help_data[6]; ++ ++ /* nat info */ ++ __u32 cpt_initialized; /* NU 2.6.15 */ ++ __u32 cpt_num_manips; /* NU 2.6.15 */ ++ struct cpt_nat_manip cpt_nat_manips[6]; /* NU 2.6.15 */ ++ ++ struct cpt_nat_seq cpt_nat_seq[2]; ++ ++ __u32 cpt_masq_index; ++ __u32 cpt_id; ++ __u32 cpt_mark; ++} __attribute__ ((aligned (8))); ++ ++struct cpt_beancounter_image { ++ __u64 cpt_next; ++ __u32 cpt_object; ++ __u16 cpt_hdrlen; ++ __u16 cpt_content; ++ ++ __u64 cpt_parent; ++ __u32 cpt_id; ++ __u32 __cpt_pad; ++ __u64 cpt_parms[32 * 6 * 2]; ++} __attribute__ ((aligned (8))); ++ ++#ifdef __KERNEL__ ++ ++static inline void *cpt_ptr_import(__u64 ptr) ++{ ++ return (void*)(unsigned long)ptr; ++} ++ ++static inline __u64 cpt_ptr_export(void __user *ptr) ++{ ++ return (__u64)(unsigned long)ptr; ++} ++ ++static inline void cpt_sigset_import(sigset_t *sig, __u64 ptr) ++{ ++ memcpy(sig, &ptr, sizeof(*sig)); ++} ++ ++static inline __u64 cpt_sigset_export(sigset_t *sig) ++{ ++ return *(__u64*)sig; ++} ++ ++static inline __u64 cpt_timespec_export(struct timespec *tv) ++{ ++ return 
(((u64)tv->tv_sec) << 32) + tv->tv_nsec; ++} ++ ++static inline void cpt_timespec_import(struct timespec *tv, __u64 val) ++{ ++ tv->tv_sec = val>>32; ++ tv->tv_nsec = (val&0xFFFFFFFF); ++} ++ ++static inline __u64 cpt_timeval_export(struct timeval *tv) ++{ ++ return (((u64)tv->tv_sec) << 32) + tv->tv_usec; ++} ++ ++static inline void cpt_timeval_import(struct timeval *tv, __u64 val) ++{ ++ tv->tv_sec = val>>32; ++ tv->tv_usec = (val&0xFFFFFFFF); ++} ++ ++#endif ++ ++#endif /* __CPT_IMAGE_H_ */ +diff -upr linux-2.6.16.orig/include/linux/cpt_ioctl.h linux-2.6.16-026test015/include/linux/cpt_ioctl.h +--- linux-2.6.16.orig/include/linux/cpt_ioctl.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/include/linux/cpt_ioctl.h 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,41 @@ ++/* ++ * ++ * include/linux/cpt_ioctl.h ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#ifndef _CPT_IOCTL_H_ ++#define _CPT_IOCTL_H_ 1 ++ ++#include <linux/types.h> ++#include <linux/ioctl.h> ++ ++#define CPTCTLTYPE '-' ++#define CPT_SET_DUMPFD _IOW(CPTCTLTYPE, 1, int) ++#define CPT_SET_STATUSFD _IOW(CPTCTLTYPE, 2, int) ++#define CPT_SET_LOCKFD _IOW(CPTCTLTYPE, 3, int) ++#define CPT_SET_VEID _IOW(CPTCTLTYPE, 4, int) ++#define CPT_SUSPEND _IO(CPTCTLTYPE, 5) ++#define CPT_DUMP _IO(CPTCTLTYPE, 6) ++#define CPT_UNDUMP _IO(CPTCTLTYPE, 7) ++#define CPT_RESUME _IO(CPTCTLTYPE, 8) ++#define CPT_KILL _IO(CPTCTLTYPE, 9) ++#define CPT_JOIN_CONTEXT _IO(CPTCTLTYPE, 10) ++#define CPT_GET_CONTEXT _IOW(CPTCTLTYPE, 11, unsigned int) ++#define CPT_PUT_CONTEXT _IO(CPTCTLTYPE, 12) ++#define CPT_SET_PAGEINFDIN _IOW(CPTCTLTYPE, 13, int) ++#define CPT_SET_PAGEINFDOUT _IOW(CPTCTLTYPE, 14, int) ++#define CPT_PAGEIND _IO(CPTCTLTYPE, 15) ++#define CPT_VMPREP _IOW(CPTCTLTYPE, 16, int) ++#define CPT_SET_LAZY _IOW(CPTCTLTYPE, 17, int) ++#define CPT_SET_CPU_FLAGS _IOW(CPTCTLTYPE, 18, unsigned int) ++#define 
CPT_TEST_CAPS _IOW(CPTCTLTYPE, 19, unsigned int) ++#define CPT_TEST_VECAPS _IOW(CPTCTLTYPE, 20, unsigned int) ++#define CPT_SET_ERRORFD _IOW(CPTCTLTYPE, 21, int) ++ ++#endif +diff -upr linux-2.6.16.orig/include/linux/cpu.h linux-2.6.16-026test015/include/linux/cpu.h +--- linux-2.6.16.orig/include/linux/cpu.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/cpu.h 2006-07-04 14:41:36.000000000 +0400 +@@ -32,7 +32,7 @@ struct cpu { + }; + + extern int register_cpu(struct cpu *, int, struct node *); +-extern struct sys_device *get_cpu_sysdev(int cpu); ++extern struct sys_device *get_cpu_sysdev(unsigned cpu); + #ifdef CONFIG_HOTPLUG_CPU + extern void unregister_cpu(struct cpu *, struct node *); + #endif +diff -upr linux-2.6.16.orig/include/linux/cpumask.h linux-2.6.16-026test015/include/linux/cpumask.h +--- linux-2.6.16.orig/include/linux/cpumask.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/cpumask.h 2006-07-04 14:41:36.000000000 +0400 +@@ -408,6 +408,7 @@ extern cpumask_t cpu_present_map; + }) + + #define for_each_cpu(cpu) for_each_cpu_mask((cpu), cpu_possible_map) ++#define for_each_possible_cpu(cpu) for_each_cpu_mask((cpu), cpu_possible_map) + #define for_each_online_cpu(cpu) for_each_cpu_mask((cpu), cpu_online_map) + #define for_each_present_cpu(cpu) for_each_cpu_mask((cpu), cpu_present_map) + +diff -upr linux-2.6.16.orig/include/linux/dcache.h linux-2.6.16-026test015/include/linux/dcache.h +--- linux-2.6.16.orig/include/linux/dcache.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/dcache.h 2006-07-04 14:41:38.000000000 +0400 +@@ -9,6 +9,8 @@ + #include <linux/cache.h> + #include <linux/rcupdate.h> + ++#include <ub/ub_dcache.h> ++ + struct nameidata; + struct vfsmount; + +@@ -111,6 +113,9 @@ struct dentry { + struct dcookie_struct *d_cookie; /* cookie, if any */ + #endif + int d_mounted; ++#ifdef CONFIG_USER_RESOURCE ++ struct dentry_beancounter dentry_bc; ++#endif + 
unsigned char d_iname[DNAME_INLINE_LEN_MIN]; /* small names */ + }; + +@@ -161,7 +166,11 @@ d_iput: no no no yes + + #define DCACHE_REFERENCED 0x0008 /* Recently used, don't discard. */ + #define DCACHE_UNHASHED 0x0010 ++#define DCACHE_VIRTUAL 0x0100 /* ve accessible */ ++ ++extern void mark_tree_virtual(struct vfsmount *m, struct dentry *d); + ++extern kmem_cache_t *dentry_cache; + extern spinlock_t dcache_lock; + + /** +@@ -215,7 +224,7 @@ extern struct dentry * d_alloc_anon(stru + extern struct dentry * d_splice_alias(struct inode *, struct dentry *); + extern void shrink_dcache_sb(struct super_block *); + extern void shrink_dcache_parent(struct dentry *); +-extern void shrink_dcache_anon(struct hlist_head *); ++extern void shrink_dcache_anon(struct super_block *); + extern int d_invalidate(struct dentry *); + + /* only used at mount-time */ +@@ -277,6 +286,7 @@ extern struct dentry * __d_lookup(struct + /* validate "insecure" dentry pointer */ + extern int d_validate(struct dentry *, struct dentry *); + ++extern int d_root_check(struct dentry *, struct vfsmount *); + extern char * d_path(struct dentry *, struct vfsmount *, char *, int); + + /* Allocation counts.. 
*/ +@@ -297,6 +307,8 @@ extern char * d_path(struct dentry *, st + static inline struct dentry *dget(struct dentry *dentry) + { + if (dentry) { ++ if (ub_dget_testone(dentry)) ++ BUG(); + BUG_ON(!atomic_read(&dentry->d_count)); + atomic_inc(&dentry->d_count); + } +@@ -340,6 +352,8 @@ extern struct dentry *lookup_create(stru + + extern int sysctl_vfs_cache_pressure; + ++extern int check_area_access_ve(struct dentry *, struct vfsmount *); ++extern int check_area_execute_ve(struct dentry *, struct vfsmount *); + #endif /* __KERNEL__ */ + + #endif /* __LINUX_DCACHE_H */ +diff -upr linux-2.6.16.orig/include/linux/devpts_fs.h linux-2.6.16-026test015/include/linux/devpts_fs.h +--- linux-2.6.16.orig/include/linux/devpts_fs.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/devpts_fs.h 2006-07-04 14:41:38.000000000 +0400 +@@ -21,6 +21,15 @@ int devpts_pty_new(struct tty_struct *tt + struct tty_struct *devpts_get_tty(int number); /* get tty structure */ + void devpts_pty_kill(int number); /* unlink */ + ++struct devpts_config { ++ int setuid; ++ int setgid; ++ uid_t uid; ++ gid_t gid; ++ umode_t mode; ++}; ++ ++extern struct devpts_config devpts_config; + #else + + /* Dummy stubs in the no-pty case */ +diff -upr linux-2.6.16.orig/include/linux/elfcore.h linux-2.6.16-026test015/include/linux/elfcore.h +--- linux-2.6.16.orig/include/linux/elfcore.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/elfcore.h 2006-07-04 14:41:39.000000000 +0400 +@@ -7,6 +7,8 @@ + #include <linux/user.h> + #include <linux/ptrace.h> + ++extern int sysctl_at_vsyscall; ++ + struct elf_siginfo + { + int si_signo; /* signal number */ +diff -upr linux-2.6.16.orig/include/linux/eventpoll.h linux-2.6.16-026test015/include/linux/eventpoll.h +--- linux-2.6.16.orig/include/linux/eventpoll.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/eventpoll.h 2006-07-04 14:41:39.000000000 +0400 +@@ -85,6 +85,91 @@ static inline 
void eventpoll_release(str + eventpoll_release_file(file); + } + ++struct epoll_filefd { ++ struct file *file; ++ int fd; ++}; ++ ++/* ++ * This structure is stored inside the "private_data" member of the file ++ * structure and rapresent the main data sructure for the eventpoll ++ * interface. ++ */ ++struct eventpoll { ++ /* Protect the this structure access */ ++ rwlock_t lock; ++ ++ /* ++ * This semaphore is used to ensure that files are not removed ++ * while epoll is using them. This is read-held during the event ++ * collection loop and it is write-held during the file cleanup ++ * path, the epoll file exit code and the ctl operations. ++ */ ++ struct rw_semaphore sem; ++ ++ /* Wait queue used by sys_epoll_wait() */ ++ wait_queue_head_t wq; ++ ++ /* Wait queue used by file->poll() */ ++ wait_queue_head_t poll_wait; ++ ++ /* List of ready file descriptors */ ++ struct list_head rdllist; ++ ++ /* RB-Tree root used to store monitored fd structs */ ++ struct rb_root rbr; ++}; ++ ++/* ++ * Each file descriptor added to the eventpoll interface will ++ * have an entry of this type linked to the hash. ++ */ ++struct epitem { ++ /* RB-Tree node used to link this structure to the eventpoll rb-tree */ ++ struct rb_node rbn; ++ ++ /* List header used to link this structure to the eventpoll ready list */ ++ struct list_head rdllink; ++ ++ /* The file descriptor information this item refers to */ ++ struct epoll_filefd ffd; ++ ++ /* Number of active wait queue attached to poll operations */ ++ int nwait; ++ ++ /* List containing poll wait queues */ ++ struct list_head pwqlist; ++ ++ /* The "container" of this item */ ++ struct eventpoll *ep; ++ ++ /* The structure that describe the interested events and the source fd */ ++ struct epoll_event event; ++ ++ /* ++ * Used to keep track of the usage count of the structure. This avoids ++ * that the structure will desappear from underneath our processing. 
++ */ ++ atomic_t usecnt; ++ ++ /* List header used to link this item to the "struct file" items list */ ++ struct list_head fllink; ++ ++ /* List header used to link the item to the transfer list */ ++ struct list_head txlink; ++ ++ /* ++ * This is used during the collection/transfer of events to userspace ++ * to pin items empty events set. ++ */ ++ unsigned int revents; ++}; ++ ++extern struct semaphore epsem; ++struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd); ++int ep_insert(struct eventpoll *ep, struct epoll_event *event, ++ struct file *tfile, int fd); ++void ep_release_epitem(struct epitem *epi); + + #else + +diff -upr linux-2.6.16.orig/include/linux/fairsched.h linux-2.6.16-026test015/include/linux/fairsched.h +--- linux-2.6.16.orig/include/linux/fairsched.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/include/linux/fairsched.h 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,118 @@ ++#ifndef __LINUX_FAIRSCHED_H__ ++#define __LINUX_FAIRSCHED_H__ ++ ++/* ++ * Fair Scheduler ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include <linux/cache.h> ++#include <asm/timex.h> ++ ++#define FAIRSCHED_HAS_CPU_BINDING 0 ++ ++typedef struct { cycles_t t; } fschtag_t; ++typedef struct { unsigned long d; } fschdur_t; ++typedef struct { cycles_t v; } fschvalue_t; ++ ++struct vcpu_scheduler; ++ ++struct fairsched_node { ++ struct list_head runlist; ++ ++ /* ++ * Fair Scheduler fields ++ * ++ * nr_running >= nr_ready (!= if delayed) ++ */ ++ fschtag_t start_tag; ++ int nr_ready; ++ int nr_runnable; ++ int nr_pcpu; ++ ++ /* ++ * Rate limitator fields ++ */ ++ cycles_t last_updated_at; ++ fschvalue_t value; /* leaky function value */ ++ cycles_t delay; /* removed from schedule till */ ++ unsigned char delayed; ++ ++ /* ++ * Configuration ++ * ++ * Read-only most of the time. 
++ */ ++ unsigned weight ____cacheline_aligned_in_smp; ++ /* fairness weight */ ++ unsigned char rate_limited; ++ unsigned rate; /* max CPU share */ ++ fschtag_t max_latency; ++ unsigned min_weight; ++ ++ struct list_head nodelist; ++ int id; ++#ifdef CONFIG_VE ++ struct ve_struct *owner_env; ++#endif ++ struct vcpu_scheduler *vsched; ++}; ++ ++#ifdef CONFIG_FAIRSCHED ++ ++#define FSCHWEIGHT_MAX ((1 << 16) - 1) ++#define FSCHRATE_SHIFT 10 ++ ++/* ++ * Fairsched nodes used in boot process. ++ */ ++extern struct fairsched_node fairsched_init_node; ++extern struct fairsched_node fairsched_idle_node; ++ ++/* ++ * For proc output. ++ */ ++extern unsigned fairsched_nr_cpus; ++extern void fairsched_cpu_online_map(int id, cpumask_t *mask); ++ ++/* I hope vsched_id is always equal to fairsched node id --SAW */ ++#define task_fairsched_node_id(p) task_vsched_id(p) ++ ++/* ++ * Core functions. ++ */ ++extern void fairsched_incrun(struct fairsched_node *node); ++extern void fairsched_decrun(struct fairsched_node *node); ++extern void fairsched_inccpu(struct fairsched_node *node); ++extern void fairsched_deccpu(struct fairsched_node *node); ++extern struct fairsched_node *fairsched_schedule( ++ struct fairsched_node *prev_node, ++ struct fairsched_node *cur_node, ++ int cur_node_active, ++ cycles_t time); ++ ++/* ++ * Management functions. 
++ */ ++void fairsched_init_early(void); ++asmlinkage int sys_fairsched_mknod(unsigned int parent, unsigned int weight, ++ unsigned int newid); ++asmlinkage int sys_fairsched_rmnod(unsigned int id); ++asmlinkage int sys_fairsched_mvpr(pid_t pid, unsigned int nodeid); ++ ++#else /* CONFIG_FAIRSCHED */ ++ ++#define task_fairsched_node_id(p) 0 ++#define fairsched_incrun(p) do { } while (0) ++#define fairsched_decrun(p) do { } while (0) ++#define fairsched_deccpu(p) do { } while (0) ++#define fairsched_cpu_online_map(id, mask) do { *(mask) = cpu_online_map; } while (0) ++ ++#endif /* CONFIG_FAIRSCHED */ ++ ++#endif /* __LINUX_FAIRSCHED_H__ */ +diff -upr linux-2.6.16.orig/include/linux/faudit.h linux-2.6.16-026test015/include/linux/faudit.h +--- linux-2.6.16.orig/include/linux/faudit.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/include/linux/faudit.h 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,46 @@ ++/* ++ * include/linux/faudit.h ++ * ++ * Copyright (C) 2005 SWSoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#ifndef __FAUDIT_H_ ++#define __FAUDIT_H_ ++ ++#include <linux/config.h> ++#include <linux/virtinfo.h> ++ ++struct vfsmount; ++struct dentry; ++struct super_block; ++struct kstatfs; ++struct kstat; ++struct pt_regs; ++ ++struct faudit_regs_arg { ++ int err; ++ struct pt_regs *regs; ++}; ++ ++struct faudit_stat_arg { ++ int err; ++ struct vfsmount *mnt; ++ struct dentry *dentry; ++ struct kstat *stat; ++}; ++ ++struct faudit_statfs_arg { ++ int err; ++ struct super_block *sb; ++ struct kstatfs *stat; ++}; ++ ++#define VIRTINFO_FAUDIT (0) ++#define VIRTINFO_FAUDIT_STAT (VIRTINFO_FAUDIT + 0) ++#define VIRTINFO_FAUDIT_STATFS (VIRTINFO_FAUDIT + 1) ++ ++#endif +diff -upr linux-2.6.16.orig/include/linux/fb.h linux-2.6.16-026test015/include/linux/fb.h +--- linux-2.6.16.orig/include/linux/fb.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/fb.h 2006-07-04 14:41:36.000000000 +0400 +@@ -839,12 +839,10 @@ struct fb_info { + #define FB_LEFT_POS(bpp) (32 - bpp) + #define FB_SHIFT_HIGH(val, bits) ((val) >> (bits)) + #define FB_SHIFT_LOW(val, bits) ((val) << (bits)) +-#define FB_BIT_NR(b) (7 - (b)) + #else + #define FB_LEFT_POS(bpp) (0) + #define FB_SHIFT_HIGH(val, bits) ((val) << (bits)) + #define FB_SHIFT_LOW(val, bits) ((val) >> (bits)) +-#define FB_BIT_NR(b) (b) + #endif + + /* +diff -upr linux-2.6.16.orig/include/linux/fs.h linux-2.6.16-026test015/include/linux/fs.h +--- linux-2.6.16.orig/include/linux/fs.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/fs.h 2006-07-04 14:41:39.000000000 +0400 +@@ -7,6 +7,7 @@ + */ + + #include <linux/config.h> ++#include <linux/ve_owner.h> + #include <linux/limits.h> + #include <linux/ioctl.h> + +@@ -64,6 +65,7 @@ extern int dir_notify_enable; + #define FMODE_LSEEK 4 + #define FMODE_PREAD 8 + #define FMODE_PWRITE FMODE_PREAD /* These go hand in hand */ ++#define FMODE_QUOTACTL 4 + + #define RW_MASK 1 + #define RWA_MASK 2 +@@ -83,6 +85,7 @@ extern int 
dir_notify_enable; + /* public flags for file_system_type */ + #define FS_REQUIRES_DEV 1 + #define FS_BINARY_MOUNTDATA 2 ++#define FS_VIRTUALIZED 64 /* Can mount this fstype inside ve */ + #define FS_REVAL_DOT 16384 /* Check the paths ".", ".." for staleness */ + #define FS_ODD_RENAME 32768 /* Temporary stuff; will go away as soon + * as nfs_rename() will be cleaned up +@@ -297,6 +300,9 @@ struct iattr { + * Includes for diskquotas. + */ + #include <linux/quota.h> ++#if defined(CONFIG_VZ_QUOTA) || defined(CONFIG_VZ_QUOTA_MODULE) ++#include <linux/vzquota_qlnk.h> ++#endif + + /** + * enum positive_aop_returns - aop return codes with specific semantics +@@ -493,6 +499,9 @@ struct inode { + #ifdef CONFIG_QUOTA + struct dquot *i_dquot[MAXQUOTAS]; + #endif ++#if defined(CONFIG_VZ_QUOTA) || defined(CONFIG_VZ_QUOTA_MODULE) ++ struct vz_quota_ilink i_qlnk; ++#endif + /* These three should probably be a union */ + struct list_head i_devices; + struct pipe_inode_info *i_pipe; +@@ -527,6 +536,8 @@ struct inode { + #endif + }; + ++extern kmem_cache_t *inode_cachep; ++ + /* + * NOTE: in a 32bit arch with a preemptable kernel and + * an UP compile the i_size_read/write must be atomic +@@ -588,6 +599,20 @@ static inline unsigned imajor(struct ino + + extern struct block_device *I_BDEV(struct inode *inode); + ++struct exec_perm { ++ umode_t mode; ++ uid_t uid, gid; ++ int set; ++}; ++ ++static inline void set_exec_perm(struct exec_perm *perm, struct inode *ino) ++{ ++ perm->set = 1; ++ perm->mode = ino->i_mode; ++ perm->uid = ino->i_uid; ++ perm->gid = ino->i_gid; ++} ++ + struct fown_struct { + rwlock_t lock; /* protects pid, uid, euid fields */ + int pid; /* pid or -pgrp where SIGIO should be sent */ +@@ -646,7 +671,10 @@ struct file { + spinlock_t f_ep_lock; + #endif /* #ifdef CONFIG_EPOLL */ + struct address_space *f_mapping; ++ struct ve_struct *owner_env; + }; ++DCL_VE_OWNER_PROTO(FILP, struct file, owner_env) ++ + extern spinlock_t files_lock; + #define file_list_lock() 
spin_lock(&files_lock); + #define file_list_unlock() spin_unlock(&files_lock); +@@ -710,6 +738,9 @@ struct file_lock { + struct file *fl_file; + unsigned char fl_flags; + unsigned char fl_type; ++#ifdef CONFIG_USER_RESOURCE ++ unsigned char fl_charged; ++#endif + loff_t fl_start; + loff_t fl_end; + +@@ -902,7 +933,7 @@ static inline void unlock_super(struct s + /* + * VFS helper functions.. + */ +-extern int vfs_permission(struct nameidata *, int); ++extern int vfs_permission(struct nameidata *, int, struct exec_perm *); + extern int vfs_create(struct inode *, struct dentry *, int, struct nameidata *); + extern int vfs_mkdir(struct inode *, struct dentry *, int); + extern int vfs_mknod(struct inode *, struct dentry *, int, dev_t); +@@ -1041,7 +1072,8 @@ struct inode_operations { + void * (*follow_link) (struct dentry *, struct nameidata *); + void (*put_link) (struct dentry *, struct nameidata *, void *); + void (*truncate) (struct inode *); +- int (*permission) (struct inode *, int, struct nameidata *); ++ int (*permission) (struct inode *, int, struct nameidata *, ++ struct exec_perm *); + int (*setattr) (struct dentry *, struct iattr *); + int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); + int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); +@@ -1089,6 +1121,8 @@ struct super_operations { + + ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); + ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); ++ ++ struct inode *(*get_quota_root)(struct super_block *); + }; + + /* Inode state bits. Protected by inode_lock. 
*/ +@@ -1246,8 +1280,14 @@ struct file_system_type { + struct module *owner; + struct file_system_type * next; + struct list_head fs_supers; ++ struct ve_struct *owner_env; + }; + ++DCL_VE_OWNER_PROTO(FSTYPE, struct file_system_type, owner_env) ++ ++void get_filesystem(struct file_system_type *fs); ++void put_filesystem(struct file_system_type *fs); ++ + struct super_block *get_sb_bdev(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, + int (*fill_super)(struct super_block *, void *, int)); +@@ -1285,6 +1325,7 @@ extern struct vfsmount *kern_mount(struc + extern int may_umount_tree(struct vfsmount *); + extern int may_umount(struct vfsmount *); + extern void umount_tree(struct vfsmount *, int, struct list_head *); ++#define kern_umount mntput + extern void release_mounts(struct list_head *); + extern long do_mount(char *, char *, char *, unsigned long, void *); + extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int); +@@ -1292,6 +1333,7 @@ extern void mnt_set_mountpoint(struct vf + struct vfsmount *); + + extern int vfs_statfs(struct super_block *, struct kstatfs *); ++extern int faudit_statfs(struct super_block *, struct kstatfs *); + + /* /sys/fs */ + extern struct subsystem fs_subsys; +@@ -1383,6 +1425,7 @@ extern int bd_claim(struct block_device + extern void bd_release(struct block_device *); + + /* fs/char_dev.c */ ++#define CHRDEV_MAJOR_HASH_SIZE 255 + extern int alloc_chrdev_region(dev_t *, unsigned, unsigned, const char *); + extern int register_chrdev_region(dev_t, unsigned, const char *); + extern int register_chrdev(unsigned int, const char *, +@@ -1390,25 +1433,17 @@ extern int register_chrdev(unsigned int, + extern int unregister_chrdev(unsigned int, const char *); + extern void unregister_chrdev_region(dev_t, unsigned); + extern int chrdev_open(struct inode *, struct file *); +-extern int get_chrdev_list(char *); +-extern void *acquire_chrdev_list(void); +-extern int count_chrdev_list(void); 
+-extern void *get_next_chrdev(void *); +-extern int get_chrdev_info(void *, int *, char **); +-extern void release_chrdev_list(void *); ++extern void chrdev_show(struct seq_file *,off_t); + + /* fs/block_dev.c */ ++#define BLKDEV_MAJOR_HASH_SIZE 255 + #define BDEVNAME_SIZE 32 /* Largest string for a blockdev identifier */ + extern const char *__bdevname(dev_t, char *buffer); + extern const char *bdevname(struct block_device *bdev, char *buffer); +-extern struct block_device *lookup_bdev(const char *); ++extern struct block_device *lookup_bdev(const char *, int mode); + extern struct block_device *open_bdev_excl(const char *, int, void *); + extern void close_bdev_excl(struct block_device *); +-extern void *acquire_blkdev_list(void); +-extern int count_blkdev_list(void); +-extern void *get_next_blkdev(void *); +-extern int get_blkdev_info(void *, int *, char **); +-extern void release_blkdev_list(void *); ++extern void blkdev_show(struct seq_file *,off_t); + + extern void init_special_inode(struct inode *, umode_t, dev_t); + +@@ -1433,7 +1468,7 @@ extern int fs_may_remount_ro(struct supe + #define bio_data_dir(bio) ((bio)->bi_rw & 1) + + extern int check_disk_change(struct block_device *); +-extern int invalidate_inodes(struct super_block *); ++extern int invalidate_inodes(struct super_block *, int); + extern int __invalidate_device(struct block_device *); + extern int invalidate_partition(struct gendisk *, int); + unsigned long invalidate_mapping_pages(struct address_space *mapping, +@@ -1463,9 +1498,10 @@ extern int do_remount_sb(struct super_bl + void *data, int force); + extern sector_t bmap(struct inode *, sector_t); + extern int notify_change(struct dentry *, struct iattr *); +-extern int permission(struct inode *, int, struct nameidata *); ++extern int permission(struct inode *, int, struct nameidata *, ++ struct exec_perm *); + extern int generic_permission(struct inode *, int, +- int (*check_acl)(struct inode *, int)); ++ int (*check_acl)(struct inode *, 
int), struct exec_perm *); + + extern int get_write_access(struct inode *); + extern int deny_write_access(struct file *); +@@ -1484,7 +1520,9 @@ extern int open_namei(int dfd, const cha + extern int may_open(struct nameidata *, int, int); + + extern int kernel_read(struct file *, unsigned long, char *, unsigned long); +-extern struct file * open_exec(const char *); ++ ++struct linux_binprm; ++extern struct file * open_exec(const char *, struct linux_binprm *); + + /* fs/dcache.c -- generic fs support functions */ + extern int is_subdir(struct dentry *, struct dentry *); +diff -upr linux-2.6.16.orig/include/linux/genhd.h linux-2.6.16-026test015/include/linux/genhd.h +--- linux-2.6.16.orig/include/linux/genhd.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/genhd.h 2006-07-04 14:41:38.000000000 +0400 +@@ -421,6 +421,7 @@ static inline struct block_device *bdget + return bdget(MKDEV(disk->major, disk->first_minor) + index); + } + ++extern struct subsystem block_subsys; + #endif + + #endif +diff -upr linux-2.6.16.orig/include/linux/gfp.h linux-2.6.16-026test015/include/linux/gfp.h +--- linux-2.6.16.orig/include/linux/gfp.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/gfp.h 2006-07-04 14:41:37.000000000 +0400 +@@ -47,6 +47,8 @@ struct vm_area_struct; + #define __GFP_ZERO ((__force gfp_t)0x8000u)/* Return zeroed page on success */ + #define __GFP_NOMEMALLOC ((__force gfp_t)0x10000u) /* Don't use emergency reserves */ + #define __GFP_HARDWALL ((__force gfp_t)0x20000u) /* Enforce hardwall cpuset memory allocs */ ++#define __GFP_UBC ((__force gfp_t)0x40000u)/* charge kmem in buddy and slab */ ++#define __GFP_SOFT_UBC ((__force gfp_t)0x80000u)/* use soft charging */ + + #define __GFP_BITS_SHIFT 20 /* Room for 20 __GFP_FOO bits */ + #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) +@@ -55,14 +57,17 @@ struct vm_area_struct; + #define GFP_LEVEL_MASK 
(__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS| \ + __GFP_COLD|__GFP_NOWARN|__GFP_REPEAT| \ + __GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP| \ +- __GFP_NOMEMALLOC|__GFP_HARDWALL) ++ __GFP_NOMEMALLOC|__GFP_HARDWALL| \ ++ __GFP_UBC|__GFP_SOFT_UBC) + + /* GFP_ATOMIC means both !wait (__GFP_WAIT not set) and use emergency pool */ + #define GFP_ATOMIC (__GFP_HIGH) + #define GFP_NOIO (__GFP_WAIT) + #define GFP_NOFS (__GFP_WAIT | __GFP_IO) + #define GFP_KERNEL (__GFP_WAIT | __GFP_IO | __GFP_FS) ++#define GFP_KERNEL_UBC (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_UBC) + #define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL) ++#define GFP_USER_UBC (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL | __GFP_UBC) + #define GFP_HIGHUSER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL | \ + __GFP_HIGHMEM) + +diff -upr linux-2.6.16.orig/include/linux/hrtimer.h linux-2.6.16-026test015/include/linux/hrtimer.h +--- linux-2.6.16.orig/include/linux/hrtimer.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/hrtimer.h 2006-07-04 14:41:39.000000000 +0400 +@@ -140,4 +140,9 @@ extern void hrtimer_run_queues(void); + /* Bootup initialization: */ + extern void __init hrtimers_init(void); + ++extern long nanosleep_restart(struct restart_block *restart); ++ ++extern ktime_t schedule_hrtimer(struct hrtimer *timer, ++ const enum hrtimer_mode mode); ++ + #endif +diff -upr linux-2.6.16.orig/include/linux/i2o.h linux-2.6.16-026test015/include/linux/i2o.h +--- linux-2.6.16.orig/include/linux/i2o.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/i2o.h 2006-07-04 14:41:36.000000000 +0400 +@@ -1116,8 +1116,11 @@ static inline struct i2o_message *i2o_ms + + mmsg->mfa = readl(c->in_port); + if (unlikely(mmsg->mfa >= c->in_queue.len)) { ++ u32 mfa = mmsg->mfa; ++ + mempool_free(mmsg, c->in_msg.mempool); +- if(mmsg->mfa == I2O_QUEUE_EMPTY) ++ ++ if (mfa == I2O_QUEUE_EMPTY) + return ERR_PTR(-EBUSY); + return ERR_PTR(-EFAULT); + } 
+diff -upr linux-2.6.16.orig/include/linux/inetdevice.h linux-2.6.16-026test015/include/linux/inetdevice.h +--- linux-2.6.16.orig/include/linux/inetdevice.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/inetdevice.h 2006-07-04 14:41:38.000000000 +0400 +@@ -34,6 +34,12 @@ struct ipv4_devconf + }; + + extern struct ipv4_devconf ipv4_devconf; ++extern struct ipv4_devconf ipv4_devconf_dflt; ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++#define ve_ipv4_devconf (*(get_exec_env()->_ipv4_devconf)) ++#else ++#define ve_ipv4_devconf ipv4_devconf ++#endif + + struct in_device + { +@@ -60,29 +66,29 @@ struct in_device + }; + + #define IN_DEV_FORWARD(in_dev) ((in_dev)->cnf.forwarding) +-#define IN_DEV_MFORWARD(in_dev) (ipv4_devconf.mc_forwarding && (in_dev)->cnf.mc_forwarding) +-#define IN_DEV_RPFILTER(in_dev) (ipv4_devconf.rp_filter && (in_dev)->cnf.rp_filter) +-#define IN_DEV_SOURCE_ROUTE(in_dev) (ipv4_devconf.accept_source_route && (in_dev)->cnf.accept_source_route) +-#define IN_DEV_BOOTP_RELAY(in_dev) (ipv4_devconf.bootp_relay && (in_dev)->cnf.bootp_relay) +- +-#define IN_DEV_LOG_MARTIANS(in_dev) (ipv4_devconf.log_martians || (in_dev)->cnf.log_martians) +-#define IN_DEV_PROXY_ARP(in_dev) (ipv4_devconf.proxy_arp || (in_dev)->cnf.proxy_arp) +-#define IN_DEV_SHARED_MEDIA(in_dev) (ipv4_devconf.shared_media || (in_dev)->cnf.shared_media) +-#define IN_DEV_TX_REDIRECTS(in_dev) (ipv4_devconf.send_redirects || (in_dev)->cnf.send_redirects) +-#define IN_DEV_SEC_REDIRECTS(in_dev) (ipv4_devconf.secure_redirects || (in_dev)->cnf.secure_redirects) ++#define IN_DEV_MFORWARD(in_dev) (ve_ipv4_devconf.mc_forwarding && (in_dev)->cnf.mc_forwarding) ++#define IN_DEV_RPFILTER(in_dev) (ve_ipv4_devconf.rp_filter && (in_dev)->cnf.rp_filter) ++#define IN_DEV_SOURCE_ROUTE(in_dev) (ve_ipv4_devconf.accept_source_route && (in_dev)->cnf.accept_source_route) ++#define IN_DEV_BOOTP_RELAY(in_dev) (ve_ipv4_devconf.bootp_relay && 
(in_dev)->cnf.bootp_relay) ++ ++#define IN_DEV_LOG_MARTIANS(in_dev) (ve_ipv4_devconf.log_martians || (in_dev)->cnf.log_martians) ++#define IN_DEV_PROXY_ARP(in_dev) (ve_ipv4_devconf.proxy_arp || (in_dev)->cnf.proxy_arp) ++#define IN_DEV_SHARED_MEDIA(in_dev) (ve_ipv4_devconf.shared_media || (in_dev)->cnf.shared_media) ++#define IN_DEV_TX_REDIRECTS(in_dev) (ve_ipv4_devconf.send_redirects || (in_dev)->cnf.send_redirects) ++#define IN_DEV_SEC_REDIRECTS(in_dev) (ve_ipv4_devconf.secure_redirects || (in_dev)->cnf.secure_redirects) + #define IN_DEV_IDTAG(in_dev) ((in_dev)->cnf.tag) + #define IN_DEV_MEDIUM_ID(in_dev) ((in_dev)->cnf.medium_id) + #define IN_DEV_PROMOTE_SECONDARIES(in_dev) (ipv4_devconf.promote_secondaries || (in_dev)->cnf.promote_secondaries) + + #define IN_DEV_RX_REDIRECTS(in_dev) \ + ((IN_DEV_FORWARD(in_dev) && \ +- (ipv4_devconf.accept_redirects && (in_dev)->cnf.accept_redirects)) \ ++ (ve_ipv4_devconf.accept_redirects && (in_dev)->cnf.accept_redirects)) \ + || (!IN_DEV_FORWARD(in_dev) && \ +- (ipv4_devconf.accept_redirects || (in_dev)->cnf.accept_redirects))) ++ (ve_ipv4_devconf.accept_redirects || (in_dev)->cnf.accept_redirects))) + +-#define IN_DEV_ARPFILTER(in_dev) (ipv4_devconf.arp_filter || (in_dev)->cnf.arp_filter) +-#define IN_DEV_ARP_ANNOUNCE(in_dev) (max(ipv4_devconf.arp_announce, (in_dev)->cnf.arp_announce)) +-#define IN_DEV_ARP_IGNORE(in_dev) (max(ipv4_devconf.arp_ignore, (in_dev)->cnf.arp_ignore)) ++#define IN_DEV_ARPFILTER(in_dev) (ve_ipv4_devconf.arp_filter || (in_dev)->cnf.arp_filter) ++#define IN_DEV_ARP_ANNOUNCE(in_dev) (max(ve_ipv4_devconf.arp_announce, (in_dev)->cnf.arp_announce)) ++#define IN_DEV_ARP_IGNORE(in_dev) (max(ve_ipv4_devconf.arp_ignore, (in_dev)->cnf.arp_ignore)) + + struct in_ifaddr + { +@@ -113,6 +119,7 @@ extern u32 inet_select_addr(const struc + extern u32 inet_confirm_addr(const struct net_device *dev, u32 dst, u32 local, int scope); + extern struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, u32 prefix, u32 
mask); + extern void inet_forward_change(void); ++extern void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, int destroy); + + static __inline__ int inet_ifa_match(u32 addr, struct in_ifaddr *ifa) + { +@@ -180,6 +187,10 @@ static inline void in_dev_put(struct in_ + #define __in_dev_put(idev) atomic_dec(&(idev)->refcnt) + #define in_dev_hold(idev) atomic_inc(&(idev)->refcnt) + ++struct ve_struct; ++extern int devinet_sysctl_init(struct ve_struct *); ++extern void devinet_sysctl_fini(struct ve_struct *); ++extern void devinet_sysctl_free(struct ve_struct *); + #endif /* __KERNEL__ */ + + static __inline__ __u32 inet_make_mask(int logmask) +diff -upr linux-2.6.16.orig/include/linux/ipv6.h linux-2.6.16-026test015/include/linux/ipv6.h +--- linux-2.6.16.orig/include/linux/ipv6.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/ipv6.h 2006-07-04 14:41:39.000000000 +0400 +@@ -415,12 +415,13 @@ static inline struct raw6_sock *raw6_sk( + #define inet_v6_ipv6only(__sk) 0 + #endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */ + +-#define INET6_MATCH(__sk, __hash, __saddr, __daddr, __ports, __dif)\ ++#define INET6_MATCH(__sk, __hash, __saddr, __daddr, __ports, __dif,__ve)\ + (((__sk)->sk_hash == (__hash)) && \ + ((*((__u32 *)&(inet_sk(__sk)->dport))) == (__ports)) && \ + ((__sk)->sk_family == AF_INET6) && \ + ipv6_addr_equal(&inet6_sk(__sk)->daddr, (__saddr)) && \ + ipv6_addr_equal(&inet6_sk(__sk)->rcv_saddr, (__daddr)) && \ ++ ve_accessible_strict(VE_OWNER_SK(__sk), (__ve)) && \ + (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) + + #endif /* __KERNEL__ */ +diff -upr linux-2.6.16.orig/include/linux/jbd.h linux-2.6.16-026test015/include/linux/jbd.h +--- linux-2.6.16.orig/include/linux/jbd.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/jbd.h 2006-07-04 14:41:37.000000000 +0400 +@@ -245,10 +245,15 @@ typedef struct journal_superblock_s + #define J_ASSERT(assert) \ + 
do { \ + if (!(assert)) { \ ++ unsigned long stack; \ + printk (KERN_EMERG \ + "Assertion failure in %s() at %s:%d: \"%s\"\n", \ + __FUNCTION__, __FILE__, __LINE__, # assert); \ +- BUG(); \ ++ printk("Stack=%p current=%p pid=%d ve=%d comm='%s'\n", \ ++ &stack, current, current->pid, \ ++ get_exec_env()->veid, \ ++ current->comm); \ ++ dump_stack(); \ + } \ + } while (0) + +diff -upr linux-2.6.16.orig/include/linux/jiffies.h linux-2.6.16-026test015/include/linux/jiffies.h +--- linux-2.6.16.orig/include/linux/jiffies.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/jiffies.h 2006-07-04 14:41:39.000000000 +0400 +@@ -74,6 +74,7 @@ + */ + extern u64 __jiffy_data jiffies_64; + extern unsigned long volatile __jiffy_data jiffies; ++extern unsigned long cycles_per_jiffy, cycles_per_clock; + + #if (BITS_PER_LONG < 64) + u64 get_jiffies_64(void); +diff -upr linux-2.6.16.orig/include/linux/kdev_t.h linux-2.6.16-026test015/include/linux/kdev_t.h +--- linux-2.6.16.orig/include/linux/kdev_t.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/kdev_t.h 2006-07-04 14:41:38.000000000 +0400 +@@ -87,6 +87,57 @@ static inline unsigned sysv_minor(u32 de + return dev & 0x3ffff; + } + ++#define UNNAMED_MAJOR_COUNT 16 ++ ++#if UNNAMED_MAJOR_COUNT > 1 ++ ++extern int unnamed_dev_majors[UNNAMED_MAJOR_COUNT]; ++ ++static inline dev_t make_unnamed_dev(int idx) ++{ ++ /* ++ * Here we transfer bits from 8 to 8+log2(UNNAMED_MAJOR_COUNT) of the ++ * unnamed device index into major number. 
++ */ ++ return MKDEV(unnamed_dev_majors[(idx >> 8) & (UNNAMED_MAJOR_COUNT - 1)], ++ idx & ~((UNNAMED_MAJOR_COUNT - 1) << 8)); ++} ++ ++static inline int unnamed_dev_idx(dev_t dev) ++{ ++ int i; ++ for (i = 0; i < UNNAMED_MAJOR_COUNT && ++ MAJOR(dev) != unnamed_dev_majors[i]; i++); ++ return MINOR(dev) | (i << 8); ++} ++ ++static inline int is_unnamed_dev(dev_t dev) ++{ ++ int i; ++ for (i = 0; i < UNNAMED_MAJOR_COUNT && ++ MAJOR(dev) != unnamed_dev_majors[i]; i++); ++ return i < UNNAMED_MAJOR_COUNT; ++} ++ ++#else /* UNNAMED_MAJOR_COUNT */ ++ ++static inline dev_t make_unnamed_dev(int idx) ++{ ++ return MKDEV(0, idx); ++} ++ ++static inline int unnamed_dev_idx(dev_t dev) ++{ ++ return MINOR(dev); ++} ++ ++static inline int is_unnamed_dev(dev_t dev) ++{ ++ return MAJOR(dev) == 0; ++} ++ ++#endif /* UNNAMED_MAJOR_COUNT */ ++ + + #else /* __KERNEL__ */ + +diff -upr linux-2.6.16.orig/include/linux/kernel.h linux-2.6.16-026test015/include/linux/kernel.h +--- linux-2.6.16.orig/include/linux/kernel.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/kernel.h 2006-07-04 14:41:38.000000000 +0400 +@@ -132,6 +132,9 @@ asmlinkage int vprintk(const char *fmt, + __attribute__ ((format (printf, 1, 0))); + asmlinkage int printk(const char * fmt, ...) + __attribute__ ((format (printf, 1, 2))); ++asmlinkage int ve_printk(int, const char * fmt, ...) ++ __attribute__ ((format (printf, 2, 3))); ++void prepare_printk(void); + #else + static inline int vprintk(const char *s, va_list args) + __attribute__ ((format (printf, 1, 0))); +@@ -139,8 +142,16 @@ static inline int vprintk(const char *s, + static inline int printk(const char *s, ...) + __attribute__ ((format (printf, 1, 2))); + static inline int printk(const char *s, ...) { return 0; } ++static inline int ve_printk(int d, const char *s, ...) ++ __attribute__ ((format (printf, 1, 2))); ++static inline int printk(int d, const char *s, ...) 
{ return 0; } ++#define prepare_printk() do { } while (0) + #endif + ++#define VE0_LOG 1 ++#define VE_LOG 2 ++#define VE_LOG_BOTH (VE0_LOG | VE_LOG) ++ + unsigned long int_sqrt(unsigned long); + + static inline int __attribute_pure__ long_log2(unsigned long x) +@@ -159,9 +170,14 @@ static inline unsigned long __attribute_ + extern int printk_ratelimit(void); + extern int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst); + ++extern int console_silence_loglevel; ++ + static inline void console_silent(void) + { +- console_loglevel = 0; ++ if (console_loglevel > console_silence_loglevel) { ++ printk(KERN_EMERG "console shuts up ...\n"); ++ console_loglevel = 0; ++ } + } + + static inline void console_verbose(void) +@@ -171,10 +187,13 @@ static inline void console_verbose(void) + } + + extern void bust_spinlocks(int yes); ++extern void wake_up_klogd(void); + extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */ + extern __deprecated_for_modules int panic_timeout; + extern int panic_on_oops; ++extern int decode_call_traces; + extern int tainted; ++extern int kernel_text_csum_broken; + extern const char *print_tainted(void); + extern void add_taint(unsigned); + +diff -upr linux-2.6.16.orig/include/linux/kmem_cache.h linux-2.6.16-026test015/include/linux/kmem_cache.h +--- linux-2.6.16.orig/include/linux/kmem_cache.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/include/linux/kmem_cache.h 2006-07-04 14:41:37.000000000 +0400 +@@ -0,0 +1,199 @@ ++#ifndef __KMEM_CACHE_H__ ++#define __KMEM_CACHE_H__ ++#include <linux/threads.h> ++#include <linux/smp.h> ++#include <linux/spinlock.h> ++#include <linux/list.h> ++#include <linux/mm.h> ++#include <asm/atomic.h> ++ ++/* ++ * DEBUG - 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL, ++ * SLAB_RED_ZONE & SLAB_POISON. ++ * 0 for faster, smaller code (especially in the critical paths). ++ * ++ * STATS - 1 to collect stats for /proc/slabinfo. 
++ * 0 for faster, smaller code (especially in the critical paths). ++ * ++ * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible) ++ */ ++ ++#ifdef CONFIG_DEBUG_SLAB ++#define SLAB_DEBUG 1 ++#define SLAB_STATS 1 ++#define SLAB_FORCED_DEBUG 1 ++#else ++#define SLAB_DEBUG 0 ++#define SLAB_STATS 0 ++#define SLAB_FORCED_DEBUG 0 ++#endif ++ ++/* ++ * struct array_cache ++ * ++ * Purpose: ++ * - LIFO ordering, to hand out cache-warm objects from _alloc ++ * - reduce the number of linked list operations ++ * - reduce spinlock operations ++ * ++ * The limit is stored in the per-cpu structure to reduce the data cache ++ * footprint. ++ * ++ */ ++struct array_cache { ++ unsigned int avail; ++ unsigned int limit; ++ unsigned int batchcount; ++ unsigned int touched; ++ spinlock_t lock; ++ void *entry[0]; /* ++ * Must have this definition in here for the proper ++ * alignment of array_cache. Also simplifies accessing ++ * the entries. ++ * [0] is for gcc 2.95. It should really be []. ++ */ ++}; ++ ++/* bootstrap: The caches do not work without cpuarrays anymore, ++ * but the cpuarrays are allocated from the generic caches... ++ */ ++#define BOOT_CPUCACHE_ENTRIES 1 ++struct arraycache_init { ++ struct array_cache cache; ++ void *entries[BOOT_CPUCACHE_ENTRIES]; ++}; ++ ++/* ++ * The slab lists for all objects. ++ */ ++struct kmem_list3 { ++ struct list_head slabs_partial; /* partial list first, better asm code */ ++ struct list_head slabs_full; ++ struct list_head slabs_free; ++ unsigned long free_objects; ++ unsigned long next_reap; ++ int free_touched; ++ unsigned int free_limit; ++ unsigned int colour_next; /* Per-node cache coloring */ ++ spinlock_t list_lock; ++ struct array_cache *shared; /* shared per node */ ++ struct array_cache **alien; /* on other nodes */ ++}; ++ ++/* ++ * struct kmem_cache ++ * ++ * manages a cache. 
++ */ ++ ++struct kmem_cache { ++/* 1) per-cpu data, touched during every alloc/free */ ++ struct array_cache *array[NR_CPUS]; ++ unsigned int batchcount; ++ unsigned int limit; ++ unsigned int shared; ++ unsigned int buffer_size; ++/* 2) touched by every alloc & free from the backend */ ++ struct kmem_list3 *nodelists[MAX_NUMNODES]; ++ unsigned int flags; /* constant flags */ ++ unsigned int num; /* # of objs per slab */ ++ spinlock_t spinlock; ++ ++/* 3) cache_grow/shrink */ ++ /* order of pgs per slab (2^n) */ ++ unsigned int gfporder; ++ ++ /* force GFP flags, e.g. GFP_DMA */ ++ gfp_t gfpflags; ++ ++ size_t colour; /* cache colouring range */ ++ unsigned int colour_off; /* colour offset */ ++ struct kmem_cache *slabp_cache; ++ unsigned int slab_size; ++ unsigned int dflags; /* dynamic flags */ ++ ++ /* constructor func */ ++ void (*ctor) (void *, struct kmem_cache *, unsigned long); ++ ++ /* de-constructor func */ ++ void (*dtor) (void *, struct kmem_cache *, unsigned long); ++ ++/* 4) cache creation/removal */ ++ const char *name; ++ struct list_head next; ++ ++/* 5) statistics */ ++#if SLAB_STATS ++ unsigned long num_active; ++ unsigned long num_allocations; ++ unsigned long high_mark; ++ unsigned long grown; ++ unsigned long reaped; ++ unsigned long errors; ++ unsigned long max_freeable; ++ unsigned long node_allocs; ++ unsigned long node_frees; ++ atomic_t allochit; ++ atomic_t allocmiss; ++ atomic_t freehit; ++ atomic_t freemiss; ++#endif ++#if SLAB_DEBUG ++ /* ++ * If debugging is enabled, then the allocator can add additional ++ * fields and/or padding to every object. buffer_size contains the total ++ * object size including these internal fields, the following two ++ * variables contain the offset to the user object and its size. 
++ */ ++ int obj_offset; ++ int obj_size; ++#endif ++#ifdef CONFIG_USER_RESOURCE ++ unsigned int objuse; ++#endif ++}; ++ ++#define CFLGS_OFF_SLAB (0x80000000UL) ++#define CFLGS_ENVIDS (0x04000000UL) ++#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) ++#define ENVIDS(x) ((x)->flags & CFLGS_ENVIDS) ++#define kmem_mark_nocharge(c) do { (c)->flags |= SLAB_NO_CHARGE; } while (0) ++ ++struct slab; ++/* Functions for storing/retrieving the cachep and or slab from the ++ * global 'mem_map'. These are used to find the slab an obj belongs to. ++ * With kfree(), these are used to find the cache which an obj belongs to. ++ */ ++static inline void page_set_cache(struct page *page, struct kmem_cache *cache) ++{ ++ page->lru.next = (struct list_head *)cache; ++} ++ ++static inline struct kmem_cache *page_get_cache(struct page *page) ++{ ++ return (struct kmem_cache *)page->lru.next; ++} ++ ++static inline void page_set_slab(struct page *page, struct slab *slab) ++{ ++ page->lru.prev = (struct list_head *)slab; ++} ++ ++static inline struct slab *page_get_slab(struct page *page) ++{ ++ return (struct slab *)page->lru.prev; ++} ++ ++static inline struct kmem_cache *virt_to_cache(const void *obj) ++{ ++ struct page *page = virt_to_page(obj); ++ return page_get_cache(page); ++} ++ ++static inline struct slab *virt_to_slab(const void *obj) ++{ ++ struct page *page = virt_to_page(obj); ++ return page_get_slab(page); ++} ++ ++#endif +diff -upr linux-2.6.16.orig/include/linux/kmem_slab.h linux-2.6.16-026test015/include/linux/kmem_slab.h +--- linux-2.6.16.orig/include/linux/kmem_slab.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/include/linux/kmem_slab.h 2006-07-04 14:41:36.000000000 +0400 +@@ -0,0 +1,71 @@ ++#ifndef __KMEM_SLAB_H__ ++#define __KMEM_SLAB_H__ ++ ++/* ++ * kmem_bufctl_t: ++ * ++ * Bufctl's are used for linking objs within a slab ++ * linked offsets. 
++ * ++ * This implementation relies on "struct page" for locating the cache & ++ * slab an object belongs to. ++ * This allows the bufctl structure to be small (one int), but limits ++ * the number of objects a slab (not a cache) can contain when off-slab ++ * bufctls are used. The limit is the size of the largest general cache ++ * that does not use off-slab slabs. ++ * For 32bit archs with 4 kB pages, is this 56. ++ * This is not serious, as it is only for large objects, when it is unwise ++ * to have too many per slab. ++ * Note: This limit can be raised by introducing a general cache whose size ++ * is less than 512 (PAGE_SIZE<<3), but greater than 256. ++ */ ++ ++typedef unsigned int kmem_bufctl_t; ++#define BUFCTL_END (((kmem_bufctl_t)(~0U))-0) ++#define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1) ++#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-2) ++ ++/* ++ * struct slab ++ * ++ * Manages the objs in a slab. Placed either at the beginning of mem allocated ++ * for a slab, or allocated from an general cache. ++ * Slabs are chained into three list: fully used, partial, fully free slabs. ++ */ ++struct slab { ++ struct list_head list; ++ unsigned long colouroff; ++ void *s_mem; /* including colour offset */ ++ unsigned int inuse; /* num of objs active in slab */ ++ kmem_bufctl_t free; ++ unsigned short nodeid; ++}; ++ ++/* ++ * struct slab_rcu ++ * ++ * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to ++ * arrange for kmem_freepages to be called via RCU. This is useful if ++ * we need to approach a kernel structure obliquely, from its address ++ * obtained without the usual locking. We can lock the structure to ++ * stabilize it and check it's still at the given address, only if we ++ * can be sure that the memory has not been meanwhile reused for some ++ * other kind of object (which our subsystem's lock might corrupt). 
++ * ++ * rcu_read_lock before reading the address, then rcu_read_unlock after ++ * taking the spinlock within the structure expected at that address. ++ * ++ * We assume struct slab_rcu can overlay struct slab when destroying. ++ */ ++struct slab_rcu { ++ struct rcu_head head; ++ struct kmem_cache *cachep; ++ void *addr; ++}; ++ ++static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) ++{ ++ return (kmem_bufctl_t *) (slabp + 1); ++} ++ ++#endif +diff -upr linux-2.6.16.orig/include/linux/list.h linux-2.6.16-026test015/include/linux/list.h +--- linux-2.6.16.orig/include/linux/list.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/list.h 2006-07-04 14:41:38.000000000 +0400 +@@ -325,6 +325,9 @@ static inline void list_splice_init(stru + #define list_entry(ptr, type, member) \ + container_of(ptr, type, member) + ++#define list_first_entry(ptr, type, member) \ ++ container_of((ptr)->next, type, member) ++ + /** + * list_for_each - iterate over a list + * @pos: the &struct list_head to use as a loop counter. +@@ -411,6 +414,20 @@ static inline void list_splice_init(stru + pos = list_entry(pos->member.next, typeof(*pos), member)) + + /** ++ * list_for_each_entry_continue_reverse - iterate backwards over list of given ++ * type continuing after existing point ++ * @pos: the type * to use as a loop counter. ++ * @head: the head for your list. ++ * @member: the name of the list_struct within the struct. ++ */ ++#define list_for_each_entry_continue_reverse(pos, head, member) \ ++ for (pos = list_entry(pos->member.prev, typeof(*pos), member), \ ++ prefetch(pos->member.prev); \ ++ &pos->member != (head); \ ++ pos = list_entry(pos->member.prev, typeof(*pos), member), \ ++ prefetch(pos->member.prev)) ++ ++/** + * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry + * @pos: the type * to use as a loop counter. 
+ * @n: another type * to use as temporary storage +diff -upr linux-2.6.16.orig/include/linux/major.h linux-2.6.16-026test015/include/linux/major.h +--- linux-2.6.16.orig/include/linux/major.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/major.h 2006-07-04 14:41:38.000000000 +0400 +@@ -165,4 +165,7 @@ + + #define VIOTAPE_MAJOR 230 + ++#define UNNAMED_EXTRA_MAJOR 130 ++#define UNNAMED_EXTRA_MAJOR_COUNT 120 ++ + #endif +diff -upr linux-2.6.16.orig/include/linux/mm.h linux-2.6.16-026test015/include/linux/mm.h +--- linux-2.6.16.orig/include/linux/mm.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/mm.h 2006-07-04 14:41:39.000000000 +0400 +@@ -41,6 +41,27 @@ extern int sysctl_legacy_va_layout; + + #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) + ++#include <linux/mm_counter.h> ++ ++#ifdef CONFIG_USER_RESOURCE ++#define set_vma_rss(vma, v) set_mm_counter(vma, vm_rss, v) ++#define get_vma_rss(vma) get_mm_counter(vma, vm_rss) ++#define inc_vma_rss(vma) inc_mm_counter(vma, vm_rss) ++#define dec_vma_rss(vma) dec_mm_counter(vma, vm_rss) ++#define add_vma_rss(vma, v) add_mm_counter(vma, vm_rss, v) ++#define sub_vma_rss(vma, v) do { \ ++ if (unlikely(dec_mm_counter_chk(vma, vm_rss, v))) \ ++ warn_bad_rss(vma, v); \ ++ } while (0) ++#else ++#define set_vma_rss(vma, v) do { } while (0) ++#define get_vma_rss(vma) (0) ++#define inc_vma_rss(vma) do { } while (0) ++#define dec_vma_rss(vma) do { } while (0) ++#define add_vma_rss(vma, v) do { } while (0) ++#define sub_vma_rss(vma, v) do { } while (0) ++#endif ++ + /* + * Linux kernel virtual memory manager primitives. 
+ * The idea being to have a "virtual" mm in the same way +@@ -111,6 +132,9 @@ struct vm_area_struct { + #ifdef CONFIG_NUMA + struct mempolicy *vm_policy; /* NUMA policy for the VMA */ + #endif ++#ifdef CONFIG_USER_RESOURCE ++ mm_counter_t _vm_rss; ++#endif + }; + + /* +@@ -229,10 +253,9 @@ struct page { + unsigned long private; /* Mapping-private opaque data: + * usually used for buffer_heads + * if PagePrivate set; used for +- * swp_entry_t if PageSwapCache. +- * When page is free, this ++ * swp_entry_t if PageSwapCache; + * indicates order in the buddy +- * system. ++ * system if PG_buddy is set. + */ + struct address_space *mapping; /* If low bit clear, points to + * inode address_space, or NULL. +@@ -264,6 +287,12 @@ struct page { + void *virtual; /* Kernel virtual address (NULL if + not kmapped, ie. highmem) */ + #endif /* WANT_PAGE_VIRTUAL */ ++#ifdef CONFIG_USER_RESOURCE ++ union { ++ struct user_beancounter *page_ub; ++ struct page_beancounter *page_pb; ++ } bc; ++#endif + }; + + #define page_private(page) ((page)->private) +@@ -636,16 +665,9 @@ struct page *shmem_nopage(struct vm_area + int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new); + struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, + unsigned long addr); +-int shmem_lock(struct file *file, int lock, struct user_struct *user); + #else + #define shmem_nopage filemap_nopage + +-static inline int shmem_lock(struct file *file, int lock, +- struct user_struct *user) +-{ +- return 0; +-} +- + static inline int shmem_set_policy(struct vm_area_struct *vma, + struct mempolicy *new) + { +@@ -706,7 +728,9 @@ void free_pgd_range(struct mmu_gather ** + void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *start_vma, + unsigned long floor, unsigned long ceiling); + int copy_page_range(struct mm_struct *dst, struct mm_struct *src, +- struct vm_area_struct *vma); ++ struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); ++int __copy_page_range(struct 
vm_area_struct *dst_vma, struct vm_area_struct *vma, ++ unsigned long addr, size_t size); + int zeromap_page_range(struct vm_area_struct *vma, unsigned long from, + unsigned long size, pgprot_t prot); + void unmap_mapping_range(struct address_space *mapping, +diff -upr linux-2.6.16.orig/include/linux/mm_counter.h linux-2.6.16-026test015/include/linux/mm_counter.h +--- linux-2.6.16.orig/include/linux/mm_counter.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/include/linux/mm_counter.h 2006-07-04 14:41:37.000000000 +0400 +@@ -0,0 +1,32 @@ ++#ifndef __MM_COUNTER_H_ ++#define __MM_COUNTER_H_ ++#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS ++/* ++ * The mm counters are not protected by its page_table_lock, ++ * so must be incremented atomically. ++ */ ++#define set_mm_counter(mm, member, value) atomic_long_set(&(mm)->_##member, value) ++#define get_mm_counter(mm, member) ((unsigned long)atomic_long_read(&(mm)->_##member)) ++#define add_mm_counter(mm, member, value) atomic_long_add(value, &(mm)->_##member) ++#define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member) ++#define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member) ++#define dec_mm_counter_chk(mm, member, value) \ ++ atomic_long_add_negative(-(value), &(mm)->_##member) ++typedef atomic_long_t mm_counter_t; ++ ++#else /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ ++/* ++ * The mm counters are protected by its page_table_lock, ++ * so can be incremented directly. 
++ */ ++#define set_mm_counter(mm, member, value) (mm)->_##member = (value) ++#define get_mm_counter(mm, member) ((mm)->_##member) ++#define add_mm_counter(mm, member, value) (mm)->_##member += (value) ++#define inc_mm_counter(mm, member) (mm)->_##member++ ++#define dec_mm_counter(mm, member) (mm)->_##member-- ++#define dec_mm_counter_chk(mm, member, value) \ ++ (((mm)->_##member -= (value)) < 0) ++typedef unsigned long mm_counter_t; ++ ++#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ ++#endif +diff -upr linux-2.6.16.orig/include/linux/mount.h linux-2.6.16-026test015/include/linux/mount.h +--- linux-2.6.16.orig/include/linux/mount.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/mount.h 2006-07-04 14:41:38.000000000 +0400 +@@ -47,6 +47,7 @@ struct vfsmount { + struct vfsmount *mnt_master; /* slave is on master->mnt_slave_list */ + struct namespace *mnt_namespace; /* containing namespace */ + int mnt_pinned; ++ unsigned owner; + }; + + static inline struct vfsmount *mntget(struct vfsmount *mnt) +diff -upr linux-2.6.16.orig/include/linux/msg.h linux-2.6.16-026test015/include/linux/msg.h +--- linux-2.6.16.orig/include/linux/msg.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/msg.h 2006-07-04 14:41:39.000000000 +0400 +@@ -92,6 +92,8 @@ struct msg_queue { + struct list_head q_senders; + }; + ++int sysvipc_walk_msg(int (*func)(int, struct msg_queue*, void *), void *arg); ++ + #endif /* __KERNEL__ */ + + #endif /* _LINUX_MSG_H */ +diff -upr linux-2.6.16.orig/include/linux/namei.h linux-2.6.16-026test015/include/linux/namei.h +--- linux-2.6.16.orig/include/linux/namei.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/namei.h 2006-07-04 14:41:38.000000000 +0400 +@@ -48,12 +48,15 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LA + #define LOOKUP_PARENT 16 + #define LOOKUP_NOALT 32 + #define LOOKUP_REVAL 64 ++#define LOOKUP_STRICT 128 /* no symlinks or other filesystems */ ++ + /* + * 
Intent data + */ + #define LOOKUP_OPEN (0x0100) + #define LOOKUP_CREATE (0x0200) + #define LOOKUP_ACCESS (0x0400) ++#define LOOKUP_NOAREACHECK (0x0800) /* no area check on lookup */ + + extern int FASTCALL(__user_walk(const char __user *, unsigned, struct nameidata *)); + extern int FASTCALL(__user_walk_fd(int dfd, const char __user *, unsigned, struct nameidata *)); +diff -upr linux-2.6.16.orig/include/linux/namespace.h linux-2.6.16-026test015/include/linux/namespace.h +--- linux-2.6.16.orig/include/linux/namespace.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/namespace.h 2006-07-04 14:41:38.000000000 +0400 +@@ -13,6 +13,8 @@ struct namespace { + int event; + }; + ++extern struct rw_semaphore namespace_sem; ++ + extern int copy_namespace(int, struct task_struct *); + extern void __put_namespace(struct namespace *namespace); + extern struct namespace *dup_namespace(struct task_struct *, struct fs_struct *); +diff -upr linux-2.6.16.orig/include/linux/netdevice.h linux-2.6.16-026test015/include/linux/netdevice.h +--- linux-2.6.16.orig/include/linux/netdevice.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/netdevice.h 2006-07-04 14:41:39.000000000 +0400 +@@ -37,6 +37,7 @@ + #include <linux/config.h> + #include <linux/device.h> + #include <linux/percpu.h> ++#include <linux/ctype.h> + + struct divert_blk; + struct vlan_group; +@@ -233,6 +234,11 @@ enum netdev_state_t + __LINK_STATE_LINKWATCH_PENDING + }; + ++struct netdev_bc { ++ struct user_beancounter *exec_ub, *owner_ub; ++}; ++ ++#define netdev_bc(dev) (&(dev)->dev_bc) + + /* + * This structure holds at boot time configured netdevice settings. 
They +@@ -309,6 +315,8 @@ struct net_device + #define NETIF_F_TSO 2048 /* Can offload TCP/IP segmentation */ + #define NETIF_F_LLTX 4096 /* LockLess TX */ + #define NETIF_F_UFO 8192 /* Can offload UDP Large Send*/ ++#define NETIF_F_VIRTUAL 0x40000000 /* can be registered in ve */ ++#define NETIF_F_VENET 0x80000000 /* Device is VENET device */ + + struct net_device *next_sched; + +@@ -431,6 +439,7 @@ struct net_device + enum { NETREG_UNINITIALIZED=0, + NETREG_REGISTERING, /* called register_netdevice */ + NETREG_REGISTERED, /* completed register todo */ ++ NETREG_REGISTER_ERR, /* register todo failed */ + NETREG_UNREGISTERING, /* called unregister_netdevice */ + NETREG_UNREGISTERED, /* completed unregister todo */ + NETREG_RELEASED, /* called free_netdev */ +@@ -500,8 +509,18 @@ struct net_device + struct divert_blk *divert; + #endif /* CONFIG_NET_DIVERT */ + ++ unsigned orig_mtu; /* MTU value before move to VE */ ++ struct ve_struct *owner_env; /* Owner VE of the interface */ ++ struct netdev_bc dev_bc; ++ + /* class/net/name entry */ + struct class_device class_dev; ++ ++#ifdef CONFIG_VE ++ /* List entry in global devices list to keep track of their names ++ * assignment */ ++ struct list_head dev_global_list_entry; ++#endif + }; + + #define NETDEV_ALIGN 32 +@@ -535,9 +554,23 @@ struct packet_type { + #include <linux/notifier.h> + + extern struct net_device loopback_dev; /* The loopback */ ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++#define loopback_dev (*get_exec_env()->_loopback_dev) ++#define ve0_loopback (*get_ve0()->_loopback_dev) ++#define dev_base (get_exec_env()->_net_dev_base) ++#define visible_dev_head(x) (&(x)->_net_dev_head) ++#define visible_dev_index_head(x) (&(x)->_net_dev_index_head) ++#else + extern struct net_device *dev_base; /* All devices */ ++#define ve0_loopback loopback_dev ++#define visible_dev_head(x) NULL ++#define visible_dev_index_head(x) NULL ++#endif + extern rwlock_t dev_base_lock; /* Device list lock */ + 
++struct hlist_head *dev_name_hash(const char *name, struct ve_struct *env); ++struct hlist_head *dev_index_hash(int ifindex, struct ve_struct *env); ++ + extern int netdev_boot_setup_check(struct net_device *dev); + extern unsigned long netdev_boot_base(const char *prefix, int unit); + extern struct net_device *dev_getbyhwaddr(unsigned short type, char *hwaddr); +@@ -554,6 +587,7 @@ extern int dev_alloc_name(struct net_de + extern int dev_open(struct net_device *dev); + extern int dev_close(struct net_device *dev); + extern int dev_queue_xmit(struct sk_buff *skb); ++extern int dev_set_mtu(struct net_device *dev, int new_mtu); + extern int register_netdevice(struct net_device *dev); + extern int unregister_netdevice(struct net_device *dev); + extern void free_netdev(struct net_device *dev); +@@ -951,6 +985,18 @@ extern void dev_seq_stop(struct seq_file + + extern void linkwatch_run_queue(void); + ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++static inline int ve_is_dev_movable(struct net_device *dev) ++{ ++ return !(dev->features & NETIF_F_VIRTUAL); ++} ++#else ++static inline int ve_is_dev_movable(struct net_device *dev) ++{ ++ return 0; ++} ++#endif ++ + #endif /* __KERNEL__ */ + + #endif /* _LINUX_DEV_H */ +diff -upr linux-2.6.16.orig/include/linux/netfilter/nf_conntrack_ftp.h linux-2.6.16-026test015/include/linux/netfilter/nf_conntrack_ftp.h +--- linux-2.6.16.orig/include/linux/netfilter/nf_conntrack_ftp.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/netfilter/nf_conntrack_ftp.h 2006-07-04 14:41:39.000000000 +0400 +@@ -32,13 +32,22 @@ struct ip_conntrack_expect; + + /* For NAT to hook in when we find a packet which describes what other + * connection we should expect. 
*/ +-extern unsigned int (*ip_nat_ftp_hook)(struct sk_buff **pskb, ++typedef unsigned int (*ip_nat_helper_ftp_hook)(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + enum ip_ct_ftp_type type, + unsigned int matchoff, + unsigned int matchlen, + struct ip_conntrack_expect *exp, + u32 *seq); ++extern ip_nat_helper_ftp_hook ip_nat_ftp_hook; ++#ifdef CONFIG_VE_IPTABLES ++#include <linux/sched.h> ++#define ve_ip_nat_ftp_hook \ ++ ((ip_nat_helper_ftp_hook) \ ++ (get_exec_env()->_ip_conntrack->_ip_nat_ftp_hook)) ++#else ++#define ve_ip_nat_ftp_hook ip_nat_ftp_hook ++#endif + #endif /* __KERNEL__ */ + + #endif /* _NF_CONNTRACK_FTP_H */ +diff -upr linux-2.6.16.orig/include/linux/netfilter/x_tables.h linux-2.6.16-026test015/include/linux/netfilter/x_tables.h +--- linux-2.6.16.orig/include/linux/netfilter/x_tables.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/netfilter/x_tables.h 2006-07-04 14:41:39.000000000 +0400 +@@ -80,12 +80,19 @@ struct xt_counters_info + + #ifdef __KERNEL__ + ++#include <linux/config.h> + #include <linux/netdevice.h> + + #define ASSERT_READ_LOCK(x) + #define ASSERT_WRITE_LOCK(x) + #include <linux/netfilter_ipv4/listhelp.h> + ++#ifdef CONFIG_COMPAT ++#define COMPAT_TO_USER 1 ++#define COMPAT_FROM_USER -1 ++#define COMPAT_CALC_SIZE 0 ++#endif ++ + struct xt_match + { + struct list_head list; +@@ -118,6 +125,10 @@ struct xt_match + /* Called when entry of this type deleted. */ + void (*destroy)(void *matchinfo, unsigned int matchinfosize); + ++#ifdef CONFIG_COMPAT ++ /* Called when userspace align differs from kernel space one */ ++ int (*compat)(void *match, void **dstptr, int *size, int convert); ++#endif + /* Set this to THIS_MODULE if you are a module, otherwise NULL */ + struct module *me; + }; +@@ -154,6 +165,10 @@ struct xt_target + /* Called when entry of this type deleted. 
*/ + void (*destroy)(void *targinfo, unsigned int targinfosize); + ++#ifdef CONFIG_COMPAT ++ /* Called when userspace align differs from kernel space one */ ++ int (*compat)(void *target, void **dstptr, int *size, int convert); ++#endif + /* Set this to THIS_MODULE if you are a module, otherwise NULL */ + struct module *me; + }; +@@ -211,6 +226,10 @@ extern int xt_register_table(struct xt_t + struct xt_table_info *bootstrap, + struct xt_table_info *newinfo); + extern void *xt_unregister_table(struct xt_table *table); ++extern struct xt_table *virt_xt_register_table(struct xt_table *table, ++ struct xt_table_info *bootstrap, ++ struct xt_table_info *newinfo); ++extern void *virt_xt_unregister_table(struct xt_table *table); + + extern struct xt_table_info *xt_replace_table(struct xt_table *table, + unsigned int num_counters, +@@ -233,6 +252,34 @@ extern void xt_proto_fini(int af); + extern struct xt_table_info *xt_alloc_table_info(unsigned int size); + extern void xt_free_table_info(struct xt_table_info *info); + ++#ifdef CONFIG_COMPAT ++#include <net/compat.h> ++ ++/* FIXME: this works only on 32 bit tasks ++ * need to change whole approach in order to calculate align as function of ++ * current task alignment */ ++ ++struct compat_xt_counters ++{ ++ u_int32_t cnt[4]; ++}; ++ ++struct compat_xt_counters_info ++{ ++ char name[XT_TABLE_MAXNAMELEN]; ++ compat_uint_t num_counters; ++ struct compat_xt_counters counters[0]; ++}; ++ ++#define COMPAT_XT_ALIGN(s) (((s) + (__alignof__(struct compat_xt_counters)-1)) \ ++ & ~(__alignof__(struct compat_xt_counters)-1)) ++ ++extern int ipt_match_align_compat(void *match, void **dstptr, ++ int *size, int off, int convert); ++extern int ipt_target_align_compat(void *target, void **dstptr, ++ int *size, int off, int convert); ++ ++#endif /* CONFIG_COMPAT */ + #endif /* __KERNEL__ */ + + #endif /* _X_TABLES_H */ +diff -upr linux-2.6.16.orig/include/linux/netfilter/xt_conntrack.h 
linux-2.6.16-026test015/include/linux/netfilter/xt_conntrack.h +--- linux-2.6.16.orig/include/linux/netfilter/xt_conntrack.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/netfilter/xt_conntrack.h 2006-07-04 14:41:36.000000000 +0400 +@@ -5,6 +5,7 @@ + #ifndef _XT_CONNTRACK_H + #define _XT_CONNTRACK_H + ++#include <linux/config.h> + #include <linux/netfilter/nf_conntrack_tuple_common.h> + #include <linux/in.h> + +@@ -60,4 +61,21 @@ struct xt_conntrack_info + /* Inverse flags */ + u_int8_t invflags; + }; ++ ++#ifdef CONFIG_COMPAT ++struct compat_xt_conntrack_info ++{ ++ compat_uint_t statemask, statusmask; ++ ++ struct ip_conntrack_tuple tuple[IP_CT_DIR_MAX]; ++ struct in_addr sipmsk[IP_CT_DIR_MAX], dipmsk[IP_CT_DIR_MAX]; ++ ++ compat_ulong_t expires_min, expires_max; ++ ++ /* Flags word */ ++ u_int8_t flags; ++ /* Inverse flags */ ++ u_int8_t invflags; ++}; ++#endif + #endif /*_XT_CONNTRACK_H*/ +diff -upr linux-2.6.16.orig/include/linux/netfilter/xt_helper.h linux-2.6.16-026test015/include/linux/netfilter/xt_helper.h +--- linux-2.6.16.orig/include/linux/netfilter/xt_helper.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/netfilter/xt_helper.h 2006-07-04 14:41:36.000000000 +0400 +@@ -1,8 +1,17 @@ + #ifndef _XT_HELPER_H + #define _XT_HELPER_H + ++#include <linux/config.h> ++ + struct xt_helper_info { + int invert; + char name[30]; + }; ++ ++#ifdef CONFIG_COMPAT ++struct compat_xt_helper_info { ++ compat_int_t invert; ++ char name[30]; ++}; ++#endif + #endif /* _XT_HELPER_H */ +diff -upr linux-2.6.16.orig/include/linux/netfilter/xt_limit.h linux-2.6.16-026test015/include/linux/netfilter/xt_limit.h +--- linux-2.6.16.orig/include/linux/netfilter/xt_limit.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/netfilter/xt_limit.h 2006-07-04 14:41:36.000000000 +0400 +@@ -1,6 +1,8 @@ + #ifndef _XT_RATE_H + #define _XT_RATE_H + ++#include <linux/config.h> ++ + /* timings are in 
milliseconds. */ + #define XT_LIMIT_SCALE 10000 + +@@ -18,4 +20,19 @@ struct xt_rateinfo { + /* Ugly, ugly fucker. */ + struct xt_rateinfo *master; + }; ++ ++#ifdef CONFIG_COMPAT ++struct compat_xt_rateinfo { ++ u_int32_t avg; /* Average secs between packets * scale */ ++ u_int32_t burst; /* Period multiplier for upper limit. */ ++ ++ /* Used internally by the kernel */ ++ compat_ulong_t prev; ++ u_int32_t credit; ++ u_int32_t credit_cap, cost; ++ ++ /* Ugly, ugly fucker. */ ++ compat_uptr_t master; ++}; ++#endif + #endif /*_XT_RATE_H*/ +diff -upr linux-2.6.16.orig/include/linux/netfilter/xt_state.h linux-2.6.16-026test015/include/linux/netfilter/xt_state.h +--- linux-2.6.16.orig/include/linux/netfilter/xt_state.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/netfilter/xt_state.h 2006-07-04 14:41:36.000000000 +0400 +@@ -1,6 +1,8 @@ + #ifndef _XT_STATE_H + #define _XT_STATE_H + ++#include <linux/config.h> ++ + #define XT_STATE_BIT(ctinfo) (1 << ((ctinfo)%IP_CT_IS_REPLY+1)) + #define XT_STATE_INVALID (1 << 0) + +@@ -10,4 +12,11 @@ struct xt_state_info + { + unsigned int statemask; + }; ++ ++#ifdef CONFIG_COMPAT ++struct compat_xt_state_info ++{ ++ compat_uint_t statemask; ++}; ++#endif + #endif /*_XT_STATE_H*/ +diff -upr linux-2.6.16.orig/include/linux/netfilter.h linux-2.6.16-026test015/include/linux/netfilter.h +--- linux-2.6.16.orig/include/linux/netfilter.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/netfilter.h 2006-07-04 14:41:39.000000000 +0400 +@@ -107,12 +107,21 @@ struct nf_info + int nf_register_hook(struct nf_hook_ops *reg); + void nf_unregister_hook(struct nf_hook_ops *reg); + ++int virt_nf_register_hook(struct nf_hook_ops *reg); ++int virt_nf_unregister_hook(struct nf_hook_ops *reg); ++ + /* Functions to register get/setsockopt ranges (non-inclusive). You + need to check permissions yourself! 
*/ + int nf_register_sockopt(struct nf_sockopt_ops *reg); + void nf_unregister_sockopt(struct nf_sockopt_ops *reg); + ++#ifdef CONFIG_VE_IPTABLES ++#define ve_nf_hooks \ ++ ((struct list_head (*)[NF_MAX_HOOKS])(get_exec_env()->_nf_hooks)) ++#else + extern struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS]; ++#define ve_nf_hooks nf_hooks ++#endif + + /* those NF_LOG_* defines and struct nf_loginfo are legacy definitios that will + * disappear once iptables is replaced with pkttables. Please DO NOT use them +@@ -190,7 +199,7 @@ static inline int nf_hook_thresh(int pf, + if (!cond) + return 1; + #ifndef CONFIG_NETFILTER_DEBUG +- if (list_empty(&nf_hooks[pf][hook])) ++ if (list_empty(&ve_nf_hooks[pf][hook])) + return 1; + #endif + return nf_hook_slow(pf, hook, pskb, indev, outdev, okfn, thresh); +diff -upr linux-2.6.16.orig/include/linux/netfilter_ipv4/ip_conntrack.h linux-2.6.16-026test015/include/linux/netfilter_ipv4/ip_conntrack.h +--- linux-2.6.16.orig/include/linux/netfilter_ipv4/ip_conntrack.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/netfilter_ipv4/ip_conntrack.h 2006-07-04 14:41:39.000000000 +0400 +@@ -71,6 +71,11 @@ do { \ + + struct ip_conntrack_helper; + ++#ifdef CONFIG_VE_IPTABLES ++#include <linux/ve.h> ++#include <linux/ve_owner.h> ++#endif ++ + struct ip_conntrack + { + /* Usage count in here is 1 for hash table/destruct timer, 1 per skb, +@@ -122,8 +127,15 @@ struct ip_conntrack + /* Traversed often, so hopefully in different cacheline to top */ + /* These are my tuples; original and reply */ + struct ip_conntrack_tuple_hash tuplehash[IP_CT_DIR_MAX]; ++#ifdef CONFIG_VE_IPTABLES ++ struct ve_struct *ct_owner_env; ++#endif + }; + ++#ifdef CONFIG_VE_IPTABLES ++DCL_VE_OWNER_PROTO(CT, struct ip_conntrack, ct_owner_env) ++#endif ++ + struct ip_conntrack_expect + { + /* Internal linked list (global expectation list) */ +@@ -232,7 +244,15 @@ extern void ip_conntrack_tcp_update(stru + enum ip_conntrack_dir dir); + + /* Call me 
when a conntrack is destroyed. */ ++#ifdef CONFIG_VE_IPTABLES ++#include <linux/sched.h> ++#define ve_ip_conntrack_destroyed \ ++ (get_exec_env()->_ip_conntrack->_ip_conntrack_destroyed) ++#else + extern void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack); ++#define ve_ip_conntrack_destroyed ip_conntrack_destroyed ++#endif ++ + + /* Fake conntrack entry for untracked connections */ + extern struct ip_conntrack ip_conntrack_untracked; +@@ -261,7 +281,7 @@ extern void ip_conntrack_proto_put(struc + extern void ip_ct_remove_expectations(struct ip_conntrack *ct); + + extern struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *, +- struct ip_conntrack_tuple *); ++ struct ip_conntrack_tuple *, struct user_beancounter *); + + extern void ip_conntrack_free(struct ip_conntrack *ct); + +@@ -270,6 +290,8 @@ extern void ip_conntrack_hash_insert(str + extern struct ip_conntrack_expect * + __ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple); + ++extern void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp); ++ + extern struct ip_conntrack_expect * + ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple); + +@@ -291,6 +313,7 @@ static inline int is_dying(struct ip_con + } + + extern unsigned int ip_conntrack_htable_size; ++extern int ip_conntrack_disable_ve0; + + #define CONNTRACK_STAT_INC(count) (__get_cpu_var(ip_conntrack_stat).count++) + +@@ -341,6 +364,9 @@ ip_conntrack_event_cache(enum ip_conntra + struct ip_conntrack *ct = (struct ip_conntrack *)skb->nfct; + struct ip_conntrack_ecache *ecache; + ++ if (!ve_is_super(get_exec_env())) ++ return; ++ + local_bh_disable(); + ecache = &__get_cpu_var(ip_conntrack_ecache); + if (ct != ecache->ct) +@@ -352,7 +378,7 @@ ip_conntrack_event_cache(enum ip_conntra + static inline void ip_conntrack_event(enum ip_conntrack_events event, + struct ip_conntrack *ct) + { +- if (is_confirmed(ct) && !is_dying(ct)) ++ if (is_confirmed(ct) && !is_dying(ct) && ve_is_super(get_exec_env())) + 
notifier_call_chain(&ip_conntrack_chain, event, ct); + } + +@@ -360,7 +386,8 @@ static inline void + ip_conntrack_expect_event(enum ip_conntrack_expect_events event, + struct ip_conntrack_expect *exp) + { +- notifier_call_chain(&ip_conntrack_expect_chain, event, exp); ++ if (ve_is_super(get_exec_env())) ++ notifier_call_chain(&ip_conntrack_expect_chain, event, exp); + } + #else /* CONFIG_IP_NF_CONNTRACK_EVENTS */ + static inline void ip_conntrack_event_cache(enum ip_conntrack_events event, +diff -upr linux-2.6.16.orig/include/linux/netfilter_ipv4/ip_conntrack_core.h linux-2.6.16-026test015/include/linux/netfilter_ipv4/ip_conntrack_core.h +--- linux-2.6.16.orig/include/linux/netfilter_ipv4/ip_conntrack_core.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/netfilter_ipv4/ip_conntrack_core.h 2006-07-04 14:41:39.000000000 +0400 +@@ -3,7 +3,6 @@ + #include <linux/netfilter.h> + + #define MAX_IP_CT_PROTO 256 +-extern struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO]; + + /* This header is used to share core functionality between the + standalone connection tracking module, and the compatibility layer's use +@@ -54,8 +53,26 @@ static inline int ip_conntrack_confirm(s + + extern void ip_ct_unlink_expect(struct ip_conntrack_expect *exp); + ++#ifdef CONFIG_VE_IPTABLES ++#include <linux/sched.h> ++#define ve_ip_ct_protos \ ++ (get_exec_env()->_ip_conntrack->_ip_ct_protos) ++#define ve_ip_conntrack_hash \ ++ (get_exec_env()->_ip_conntrack->_ip_conntrack_hash) ++#define ve_ip_conntrack_expect_list \ ++ (get_exec_env()->_ip_conntrack->_ip_conntrack_expect_list) ++#define ve_ip_conntrack_vmalloc \ ++ (get_exec_env()->_ip_conntrack->_ip_conntrack_vmalloc) ++#else ++extern struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO]; + extern struct list_head *ip_conntrack_hash; + extern struct list_head ip_conntrack_expect_list; ++#define ve_ip_ct_protos ip_ct_protos ++#define ve_ip_conntrack_hash ip_conntrack_hash ++#define 
ve_ip_conntrack_expect_list ip_conntrack_expect_list ++#define ve_ip_conntrack_vmalloc ip_conntrack_vmalloc ++#endif /* CONFIG_VE_IPTABLES */ ++ + extern rwlock_t ip_conntrack_lock; + #endif /* _IP_CONNTRACK_CORE_H */ + +diff -upr linux-2.6.16.orig/include/linux/netfilter_ipv4/ip_conntrack_helper.h linux-2.6.16-026test015/include/linux/netfilter_ipv4/ip_conntrack_helper.h +--- linux-2.6.16.orig/include/linux/netfilter_ipv4/ip_conntrack_helper.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/netfilter_ipv4/ip_conntrack_helper.h 2006-07-04 14:41:39.000000000 +0400 +@@ -31,6 +31,9 @@ struct ip_conntrack_helper + extern int ip_conntrack_helper_register(struct ip_conntrack_helper *); + extern void ip_conntrack_helper_unregister(struct ip_conntrack_helper *); + ++extern int virt_ip_conntrack_helper_register(struct ip_conntrack_helper *); ++extern void virt_ip_conntrack_helper_unregister(struct ip_conntrack_helper *); ++ + /* Allocate space for an expectation: this is mandatory before calling + ip_conntrack_expect_related. You will have to call put afterwards. 
*/ + extern struct ip_conntrack_expect * +@@ -41,4 +44,5 @@ extern void ip_conntrack_expect_put(stru + extern int ip_conntrack_expect_related(struct ip_conntrack_expect *exp); + extern void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp); + ++extern struct list_head helpers; + #endif /*_IP_CONNTRACK_HELPER_H*/ +diff -upr linux-2.6.16.orig/include/linux/netfilter_ipv4/ip_conntrack_irc.h linux-2.6.16-026test015/include/linux/netfilter_ipv4/ip_conntrack_irc.h +--- linux-2.6.16.orig/include/linux/netfilter_ipv4/ip_conntrack_irc.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/netfilter_ipv4/ip_conntrack_irc.h 2006-07-04 14:41:39.000000000 +0400 +@@ -14,16 +14,26 @@ + #ifndef _IP_CONNTRACK_IRC_H + #define _IP_CONNTRACK_IRC_H + ++#include <linux/netfilter_ipv4/ip_conntrack_helper.h> ++ + /* This structure exists only once per master */ + struct ip_ct_irc_master { + }; + + #ifdef __KERNEL__ +-extern unsigned int (*ip_nat_irc_hook)(struct sk_buff **pskb, +- enum ip_conntrack_info ctinfo, +- unsigned int matchoff, +- unsigned int matchlen, +- struct ip_conntrack_expect *exp); ++typedef unsigned int (*ip_nat_helper_irc_hook)(struct sk_buff **, ++ enum ip_conntrack_info, unsigned int, unsigned int, ++ struct ip_conntrack_expect *); ++ ++extern ip_nat_helper_irc_hook ip_nat_irc_hook; ++#ifdef CONFIG_VE_IPTABLES ++#include <linux/sched.h> ++#define ve_ip_nat_irc_hook \ ++ ((ip_nat_helper_irc_hook) \ ++ (get_exec_env()->_ip_conntrack->_ip_nat_irc_hook)) ++#else ++#define ve_ip_nat_irc_hook ip_nat_irc_hook ++#endif + + #define IRC_PORT 6667 + +diff -upr linux-2.6.16.orig/include/linux/netfilter_ipv4/ip_conntrack_protocol.h linux-2.6.16-026test015/include/linux/netfilter_ipv4/ip_conntrack_protocol.h +--- linux-2.6.16.orig/include/linux/netfilter_ipv4/ip_conntrack_protocol.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/netfilter_ipv4/ip_conntrack_protocol.h 2006-07-04 14:41:39.000000000 +0400 +@@ -67,6 
+67,7 @@ struct ip_conntrack_protocol + /* Protocol registration. */ + extern int ip_conntrack_protocol_register(struct ip_conntrack_protocol *proto); + extern void ip_conntrack_protocol_unregister(struct ip_conntrack_protocol *proto); ++ + /* Existing built-in protocols */ + extern struct ip_conntrack_protocol ip_conntrack_protocol_tcp; + extern struct ip_conntrack_protocol ip_conntrack_protocol_udp; +@@ -74,6 +75,41 @@ extern struct ip_conntrack_protocol ip_c + extern struct ip_conntrack_protocol ip_conntrack_generic_protocol; + extern int ip_conntrack_protocol_tcp_init(void); + ++#if defined(CONFIG_VE_IPTABLES) && defined(CONFIG_SYSCTL) ++#include <linux/sched.h> ++#define ve_ip_ct_tcp_timeouts \ ++ (get_exec_env()->_ip_conntrack->_ip_ct_tcp_timeouts) ++#define ve_ip_ct_udp_timeout \ ++ (get_exec_env()->_ip_conntrack->_ip_ct_udp_timeout) ++#define ve_ip_ct_udp_timeout_stream \ ++ (get_exec_env()->_ip_conntrack->_ip_ct_udp_timeout_stream) ++#define ve_ip_ct_icmp_timeout \ ++ (get_exec_env()->_ip_conntrack->_ip_ct_icmp_timeout) ++#define ve_ip_ct_generic_timeout \ ++ (get_exec_env()->_ip_conntrack->_ip_ct_generic_timeout) ++#define ve_ip_ct_log_invalid \ ++ (get_exec_env()->_ip_conntrack->_ip_ct_log_invalid) ++#define ve_ip_ct_tcp_timeout_max_retrans \ ++ (get_exec_env()->_ip_conntrack->_ip_ct_tcp_timeout_max_retrans) ++#define ve_ip_ct_tcp_loose \ ++ (get_exec_env()->_ip_conntrack->_ip_ct_tcp_loose) ++#define ve_ip_ct_tcp_be_liberal \ ++ (get_exec_env()->_ip_conntrack->_ip_ct_tcp_be_liberal) ++#define ve_ip_ct_tcp_max_retrans \ ++ (get_exec_env()->_ip_conntrack->_ip_ct_tcp_max_retrans) ++#else ++#define ve_ip_ct_tcp_timeouts *tcp_timeouts ++#define ve_ip_ct_udp_timeout ip_ct_udp_timeout ++#define ve_ip_ct_udp_timeout_stream ip_ct_udp_timeout_stream ++#define ve_ip_ct_icmp_timeout ip_ct_icmp_timeout ++#define ve_ip_ct_generic_timeout ip_ct_generic_timeout ++#define ve_ip_ct_log_invalid ip_ct_log_invalid ++#define ve_ip_ct_tcp_timeout_max_retrans 
ip_ct_tcp_timeout_max_retrans ++#define ve_ip_ct_tcp_loose ip_ct_tcp_loose ++#define ve_ip_ct_tcp_be_liberal ip_ct_tcp_be_liberal ++#define ve_ip_ct_tcp_max_retrans ip_ct_tcp_max_retrans ++#endif ++ + /* Log invalid packets */ + extern unsigned int ip_ct_log_invalid; + +@@ -85,10 +121,10 @@ extern int ip_ct_port_nfattr_to_tuple(st + #ifdef CONFIG_SYSCTL + #ifdef DEBUG_INVALID_PACKETS + #define LOG_INVALID(proto) \ +- (ip_ct_log_invalid == (proto) || ip_ct_log_invalid == IPPROTO_RAW) ++ (ve_ip_ct_log_invalid == (proto) || ve_ip_ct_log_invalid == IPPROTO_RAW) + #else + #define LOG_INVALID(proto) \ +- ((ip_ct_log_invalid == (proto) || ip_ct_log_invalid == IPPROTO_RAW) \ ++ ((ve_ip_ct_log_invalid == (proto) || ve_ip_ct_log_invalid == IPPROTO_RAW) \ + && net_ratelimit()) + #endif + #else +diff -upr linux-2.6.16.orig/include/linux/netfilter_ipv4/ip_nat.h linux-2.6.16-026test015/include/linux/netfilter_ipv4/ip_nat.h +--- linux-2.6.16.orig/include/linux/netfilter_ipv4/ip_nat.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/netfilter_ipv4/ip_nat.h 2006-07-04 14:41:39.000000000 +0400 +@@ -1,5 +1,6 @@ + #ifndef _IP_NAT_H + #define _IP_NAT_H ++#include <linux/config.h> + #include <linux/netfilter_ipv4.h> + #include <linux/netfilter_ipv4/ip_conntrack_tuple.h> + +@@ -72,10 +73,29 @@ extern unsigned int ip_nat_setup_info(st + extern int ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple, + const struct ip_conntrack *ignored_conntrack); + ++extern void ip_nat_hash_conntrack(struct ip_conntrack *conntrack); ++ + /* Calculate relative checksum. 
*/ + extern u_int16_t ip_nat_cheat_check(u_int32_t oldvalinv, + u_int32_t newval, + u_int16_t oldcheck); ++ ++#ifdef CONFIG_COMPAT ++#include <net/compat.h> ++ ++struct compat_ip_nat_range ++{ ++ compat_uint_t flags; ++ u_int32_t min_ip, max_ip; ++ union ip_conntrack_manip_proto min, max; ++}; ++ ++struct compat_ip_nat_multi_range ++{ ++ compat_uint_t rangesize; ++ struct compat_ip_nat_range range[1]; ++}; ++#endif + #else /* !__KERNEL__: iptables wants this to compile. */ + #define ip_nat_multi_range ip_nat_multi_range_compat + #endif /*__KERNEL__*/ +diff -upr linux-2.6.16.orig/include/linux/netfilter_ipv4/ip_nat_rule.h linux-2.6.16-026test015/include/linux/netfilter_ipv4/ip_nat_rule.h +--- linux-2.6.16.orig/include/linux/netfilter_ipv4/ip_nat_rule.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/netfilter_ipv4/ip_nat_rule.h 2006-07-04 14:41:39.000000000 +0400 +@@ -6,7 +6,7 @@ + + #ifdef __KERNEL__ + +-extern int ip_nat_rule_init(void) __init; ++extern int ip_nat_rule_init(void); + extern void ip_nat_rule_cleanup(void); + extern int ip_nat_rule_find(struct sk_buff **pskb, + unsigned int hooknum, +diff -upr linux-2.6.16.orig/include/linux/netfilter_ipv4/ip_tables.h linux-2.6.16-026test015/include/linux/netfilter_ipv4/ip_tables.h +--- linux-2.6.16.orig/include/linux/netfilter_ipv4/ip_tables.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/netfilter_ipv4/ip_tables.h 2006-07-04 14:41:39.000000000 +0400 +@@ -16,6 +16,7 @@ + #define _IPTABLES_H + + #ifdef __KERNEL__ ++#include <linux/config.h> + #include <linux/if.h> + #include <linux/types.h> + #include <linux/in.h> +@@ -330,7 +331,7 @@ extern void ipt_init(void) __init; + //#define ipt_register_table(tbl, repl) xt_register_table(AF_INET, tbl, repl) + //#define ipt_unregister_table(tbl) xt_unregister_table(AF_INET, tbl) + +-extern int ipt_register_table(struct ipt_table *table, ++extern struct ipt_table *ipt_register_table(struct ipt_table *table, + const 
struct ipt_replace *repl); + extern void ipt_unregister_table(struct ipt_table *table); + +@@ -364,5 +365,62 @@ extern unsigned int ipt_do_table(struct + void *userdata); + + #define IPT_ALIGN(s) XT_ALIGN(s) ++ ++#ifdef CONFIG_COMPAT ++#include <net/compat.h> ++ ++struct compat_ipt_getinfo ++{ ++ char name[IPT_TABLE_MAXNAMELEN]; ++ compat_uint_t valid_hooks; ++ compat_uint_t hook_entry[NF_IP_NUMHOOKS]; ++ compat_uint_t underflow[NF_IP_NUMHOOKS]; ++ compat_uint_t num_entries; ++ compat_uint_t size; ++}; ++ ++struct compat_ipt_entry ++{ ++ struct ipt_ip ip; ++ compat_uint_t nfcache; ++ u_int16_t target_offset; ++ u_int16_t next_offset; ++ compat_uint_t comefrom; ++ struct compat_xt_counters counters; ++ unsigned char elems[0]; ++}; ++ ++struct compat_ipt_entry_match ++{ ++ union { ++ struct { ++ u_int16_t match_size; ++ char name[IPT_FUNCTION_MAXNAMELEN]; ++ } user; ++ u_int16_t match_size; ++ } u; ++ unsigned char data[0]; ++}; ++ ++struct compat_ipt_entry_target ++{ ++ union { ++ struct { ++ u_int16_t target_size; ++ char name[IPT_FUNCTION_MAXNAMELEN]; ++ } user; ++ u_int16_t target_size; ++ } u; ++ unsigned char data[0]; ++}; ++ ++#define COMPAT_IPT_ALIGN(s) COMPAT_XT_ALIGN(s) ++ ++extern int ipt_match_align_compat(void *match, void **dstptr, ++ int *size, int off, int convert); ++extern int ipt_target_align_compat(void *target, void **dstptr, ++ int *size, int off, int convert); ++ ++#endif /* CONFIG_COMPAT */ + #endif /*__KERNEL__*/ + #endif /* _IPTABLES_H */ +diff -upr linux-2.6.16.orig/include/linux/netfilter_ipv6/ip6_tables.h linux-2.6.16-026test015/include/linux/netfilter_ipv6/ip6_tables.h +--- linux-2.6.16.orig/include/linux/netfilter_ipv6/ip6_tables.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/netfilter_ipv6/ip6_tables.h 2006-07-04 14:41:39.000000000 +0400 +@@ -340,7 +340,7 @@ extern void ip6t_init(void) __init; + #define ip6t_register_match(match) xt_register_match(AF_INET6, match) + #define ip6t_unregister_match(match) 
xt_unregister_match(AF_INET6, match) + +-extern int ip6t_register_table(struct ip6t_table *table, ++extern struct ip6t_table *ip6t_register_table(struct ip6t_table *table, + const struct ip6t_replace *repl); + extern void ip6t_unregister_table(struct ip6t_table *table); + extern unsigned int ip6t_do_table(struct sk_buff **pskb, +diff -upr linux-2.6.16.orig/include/linux/nfcalls.h linux-2.6.16-026test015/include/linux/nfcalls.h +--- linux-2.6.16.orig/include/linux/nfcalls.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/include/linux/nfcalls.h 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,254 @@ ++/* ++ * include/linux/nfcalls.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#ifndef _LINUX_NFCALLS_H ++#define _LINUX_NFCALLS_H ++ ++#include <linux/rcupdate.h> ++ ++#ifdef CONFIG_MODULES ++extern struct module no_module; ++ ++#define DECL_KSYM_MODULE(name) \ ++ extern struct module *vz_mod_##name ++#define DECL_KSYM_CALL(type, name, args) \ ++ extern type (*vz_##name) args ++ ++#define INIT_KSYM_MODULE(name) \ ++ struct module *vz_mod_##name = &no_module; \ ++ EXPORT_SYMBOL(vz_mod_##name) ++#define INIT_KSYM_CALL(type, name, args) \ ++ type (*vz_##name) args; \ ++ EXPORT_SYMBOL(vz_##name) ++ ++#define __KSYMERRCALL(err, type, mod, name, args) \ ++({ \ ++ type ret = (type)err; \ ++ if (!__vzksym_module_get(vz_mod_##mod)) { \ ++ if (vz_##name) \ ++ ret = ((*vz_##name)args); \ ++ __vzksym_module_put(vz_mod_##mod); \ ++ } \ ++ ret; \ ++}) ++#define __KSYMSAFECALL_VOID(mod, name, args) \ ++do { \ ++ if (!__vzksym_module_get(vz_mod_##mod)) { \ ++ if (vz_##name) \ ++ ((*vz_##name)args); \ ++ __vzksym_module_put(vz_mod_##mod); \ ++ } \ ++} while (0) ++#else ++#define DECL_KSYM_CALL(type, name, args) \ ++ extern type name args ++#define INIT_KSYM_MODULE(name) ++#define INIT_KSYM_CALL(type, name, args) \ ++ type name args ++#define __KSYMERRCALL(err, type, mod, name, 
args) ((*name)args) ++#define __KSYMSAFECALL_VOID(mod, name, args) ((*name)args) ++#endif ++ ++#define KSYMERRCALL(err, mod, name, args) \ ++ __KSYMERRCALL(err, int, mod, name, args) ++#define KSYMSAFECALL(type, mod, name, args) \ ++ __KSYMERRCALL(0, type, mod, name, args) ++#define KSYMSAFECALL_VOID(mod, name, args) \ ++ __KSYMSAFECALL_VOID(mod, name, args) ++ ++#if defined(CONFIG_VE) && defined(CONFIG_MODULES) ++/* should be called _after_ KSYMRESOLVE's */ ++#define KSYMMODRESOLVE(name) \ ++ __vzksym_modresolve(&vz_mod_##name, THIS_MODULE) ++#define KSYMMODUNRESOLVE(name) \ ++ __vzksym_modunresolve(&vz_mod_##name) ++ ++#define KSYMRESOLVE(name) \ ++ vz_##name = &name ++#define KSYMUNRESOLVE(name) \ ++ vz_##name = NULL ++#else ++#define KSYMRESOLVE(name) do { } while (0) ++#define KSYMUNRESOLVE(name) do { } while (0) ++#define KSYMMODRESOLVE(name) do { } while (0) ++#define KSYMMODUNRESOLVE(name) do { } while (0) ++#endif ++ ++#ifdef CONFIG_MODULES ++static inline void __vzksym_modresolve(struct module **modp, struct module *mod) ++{ ++ /* ++ * we want to be sure, that pointer updates are visible first: ++ * 1. wmb() is here just to be on the safe side ++ * (note, no rmb() in KSYMSAFECALL) ++ * 2. synchronize_sched() guarantees that updates are visible ++ * on all cpus and allows us to remove rmb() in KSYMSAFECALL ++ */ ++ wmb(); synchronize_sched(); ++ *modp = mod; ++ /* just to be sure, our changes are visible as soon as possible */ ++ wmb(); synchronize_sched(); ++} ++ ++static inline void __vzksym_modunresolve(struct module **modp) ++{ ++ /* ++ * try_module_get() in KSYMSAFECALL should fail at this moment since ++ * THIS_MODULE is in unloading state (we should be called from fini), ++ * no need to synchronize pointers/ve_module updates. 
++ */ ++ *modp = &no_module; ++ /* ++ * synchronize_sched() guarantees here that we see ++ * updated module pointer before the module really gets away ++ */ ++ synchronize_sched(); ++} ++ ++static inline int __vzksym_module_get(struct module *mod) ++{ ++ /* ++ * we want to avoid rmb(), so use synchronize_sched() in KSYMUNRESOLVE ++ * and smp_read_barrier_depends() here... ++ */ ++ smp_read_barrier_depends(); /* for module loading */ ++ if (!try_module_get(mod)) ++ return -EBUSY; ++ ++ return 0; ++} ++ ++static inline void __vzksym_module_put(struct module *mod) ++{ ++ module_put(mod); ++} ++#endif ++ ++#if defined(CONFIG_VE) ++#ifdef CONFIG_MODULES ++DECL_KSYM_MODULE(x_tables); ++DECL_KSYM_MODULE(xt_tcpudp); ++DECL_KSYM_MODULE(ip_tables); ++DECL_KSYM_MODULE(ip6_tables); ++DECL_KSYM_MODULE(iptable_filter); ++DECL_KSYM_MODULE(ip6table_filter); ++DECL_KSYM_MODULE(iptable_mangle); ++DECL_KSYM_MODULE(ip6table_mangle); ++DECL_KSYM_MODULE(xt_limit); ++DECL_KSYM_MODULE(ipt_multiport); ++DECL_KSYM_MODULE(ip6t_multiport); ++DECL_KSYM_MODULE(ipt_tos); ++DECL_KSYM_MODULE(ipt_TOS); ++DECL_KSYM_MODULE(ipt_REJECT); ++DECL_KSYM_MODULE(ip6t_REJECT); ++DECL_KSYM_MODULE(ipt_TCPMSS); ++DECL_KSYM_MODULE(xt_tcpmss); ++DECL_KSYM_MODULE(ipt_ttl); ++DECL_KSYM_MODULE(ipt_LOG); ++DECL_KSYM_MODULE(ip6t_LOG); ++DECL_KSYM_MODULE(xt_length); ++DECL_KSYM_MODULE(ip_conntrack); ++DECL_KSYM_MODULE(ip_conntrack_ftp); ++DECL_KSYM_MODULE(ip_conntrack_irc); ++DECL_KSYM_MODULE(xt_conntrack); ++DECL_KSYM_MODULE(xt_state); ++DECL_KSYM_MODULE(xt_helper); ++DECL_KSYM_MODULE(ip_nat); ++DECL_KSYM_MODULE(iptable_nat); ++DECL_KSYM_MODULE(ip_nat_ftp); ++DECL_KSYM_MODULE(ip_nat_irc); ++DECL_KSYM_MODULE(ipt_REDIRECT); ++#endif ++ ++struct sk_buff; ++ ++DECL_KSYM_CALL(int, init_netfilter, (void)); ++DECL_KSYM_CALL(int, init_xtables, (void)); ++DECL_KSYM_CALL(int, init_xt_tcpudp, (void)); ++DECL_KSYM_CALL(int, init_iptables, (void)); ++DECL_KSYM_CALL(int, init_ip6tables, (void)); ++DECL_KSYM_CALL(int, 
init_iptable_filter, (void)); ++DECL_KSYM_CALL(int, init_ip6table_filter, (void)); ++DECL_KSYM_CALL(int, init_iptable_mangle, (void)); ++DECL_KSYM_CALL(int, init_ip6table_mangle, (void)); ++DECL_KSYM_CALL(int, init_xt_limit, (void)); ++DECL_KSYM_CALL(int, init_iptable_multiport, (void)); ++DECL_KSYM_CALL(int, init_ip6table_multiport, (void)); ++DECL_KSYM_CALL(int, init_iptable_tos, (void)); ++DECL_KSYM_CALL(int, init_iptable_TOS, (void)); ++DECL_KSYM_CALL(int, init_iptable_REJECT, (void)); ++DECL_KSYM_CALL(int, init_ip6table_REJECT, (void)); ++DECL_KSYM_CALL(int, init_iptable_TCPMSS, (void)); ++DECL_KSYM_CALL(int, init_xt_tcpmss, (void)); ++DECL_KSYM_CALL(int, init_iptable_ttl, (void)); ++DECL_KSYM_CALL(int, init_iptable_LOG, (void)); ++DECL_KSYM_CALL(int, init_ip6table_LOG, (void)); ++DECL_KSYM_CALL(int, init_xt_length, (void)); ++DECL_KSYM_CALL(int, init_iptable_conntrack, (void)); ++DECL_KSYM_CALL(int, init_iptable_ftp, (void)); ++DECL_KSYM_CALL(int, init_iptable_irc, (void)); ++DECL_KSYM_CALL(int, init_xt_conntrack_match, (void)); ++DECL_KSYM_CALL(int, init_xt_state, (void)); ++DECL_KSYM_CALL(int, init_xt_helper, (void)); ++DECL_KSYM_CALL(int, ip_nat_init, (void)); ++DECL_KSYM_CALL(int, init_iptable_nat, (void)); ++DECL_KSYM_CALL(int, init_iptable_nat_ftp, (void)); ++DECL_KSYM_CALL(int, init_iptable_nat_irc, (void)); ++DECL_KSYM_CALL(int, init_iptable_REDIRECT, (void)); ++DECL_KSYM_CALL(void, fini_iptable_nat_irc, (void)); ++DECL_KSYM_CALL(void, fini_iptable_nat_ftp, (void)); ++DECL_KSYM_CALL(void, fini_iptable_nat, (void)); ++DECL_KSYM_CALL(void, ip_nat_cleanup, (void)); ++DECL_KSYM_CALL(void, fini_xt_helper, (void)); ++DECL_KSYM_CALL(void, fini_xt_state, (void)); ++DECL_KSYM_CALL(void, fini_xt_conntrack_match, (void)); ++DECL_KSYM_CALL(void, fini_iptable_irc, (void)); ++DECL_KSYM_CALL(void, fini_iptable_ftp, (void)); ++DECL_KSYM_CALL(void, fini_iptable_conntrack, (void)); ++DECL_KSYM_CALL(void, fini_xt_length, (void)); ++DECL_KSYM_CALL(void, 
fini_ip6table_LOG, (void)); ++DECL_KSYM_CALL(void, fini_iptable_LOG, (void)); ++DECL_KSYM_CALL(void, fini_iptable_ttl, (void)); ++DECL_KSYM_CALL(void, fini_xt_tcpmss, (void)); ++DECL_KSYM_CALL(void, fini_iptable_TCPMSS, (void)); ++DECL_KSYM_CALL(void, fini_ip6table_REJECT, (void)); ++DECL_KSYM_CALL(void, fini_iptable_REJECT, (void)); ++DECL_KSYM_CALL(void, fini_iptable_TOS, (void)); ++DECL_KSYM_CALL(void, fini_iptable_tos, (void)); ++DECL_KSYM_CALL(void, fini_ip6table_multiport, (void)); ++DECL_KSYM_CALL(void, fini_iptable_multiport, (void)); ++DECL_KSYM_CALL(void, fini_xt_limit, (void)); ++DECL_KSYM_CALL(void, fini_iptable_filter, (void)); ++DECL_KSYM_CALL(void, fini_ip6table_filter, (void)); ++DECL_KSYM_CALL(void, fini_iptable_mangle, (void)); ++DECL_KSYM_CALL(void, fini_ip6table_mangle, (void)); ++DECL_KSYM_CALL(void, fini_ip6tables, (void)); ++DECL_KSYM_CALL(void, fini_iptables, (void)); ++DECL_KSYM_CALL(void, fini_xt_tcpudp, (void)); ++DECL_KSYM_CALL(void, fini_xtables, (void)); ++DECL_KSYM_CALL(void, fini_netfilter, (void)); ++DECL_KSYM_CALL(void, fini_iptable_REDIRECT, (void)); ++ ++#include <linux/netfilter/x_tables.h> ++ ++DECL_KSYM_CALL(void, ipt_flush_table, (struct xt_table *table)); ++DECL_KSYM_CALL(void, ip6t_flush_table, (struct xt_table *table)); ++#endif /* CONFIG_VE */ ++ ++#ifdef CONFIG_VE_CALLS_MODULE ++DECL_KSYM_MODULE(vzmon); ++DECL_KSYM_CALL(int, real_get_device_perms_ve, ++ (int dev_type, dev_t dev, int access_mode)); ++DECL_KSYM_CALL(void, real_do_env_cleanup, (struct ve_struct *env)); ++DECL_KSYM_CALL(void, real_do_env_free, (struct ve_struct *env)); ++DECL_KSYM_CALL(void, real_update_load_avg_ve, (void)); ++#endif ++ ++#endif /* _LINUX_NFCALLS_H */ +diff -upr linux-2.6.16.orig/include/linux/nfs_fs.h linux-2.6.16-026test015/include/linux/nfs_fs.h +--- linux-2.6.16.orig/include/linux/nfs_fs.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/nfs_fs.h 2006-07-04 14:41:37.000000000 +0400 +@@ -296,7 +296,7 @@ 
extern struct inode *nfs_fhget(struct su + extern int nfs_refresh_inode(struct inode *, struct nfs_fattr *); + extern int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr); + extern int nfs_getattr(struct vfsmount *, struct dentry *, struct kstat *); +-extern int nfs_permission(struct inode *, int, struct nameidata *); ++extern int nfs_permission(struct inode *, int, struct nameidata *, struct exec_perm *); + extern int nfs_access_get_cached(struct inode *, struct rpc_cred *, struct nfs_access_entry *); + extern void nfs_access_add_cache(struct inode *, struct nfs_access_entry *); + extern int nfs_open(struct inode *, struct file *); +diff -upr linux-2.6.16.orig/include/linux/notifier.h linux-2.6.16-026test015/include/linux/notifier.h +--- linux-2.6.16.orig/include/linux/notifier.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/notifier.h 2006-07-04 14:41:39.000000000 +0400 +@@ -27,8 +27,9 @@ extern int notifier_call_chain(struct no + + #define NOTIFY_DONE 0x0000 /* Don't care */ + #define NOTIFY_OK 0x0001 /* Suits me */ ++#define NOTIFY_FAIL 0x0002 /* Reject */ + #define NOTIFY_STOP_MASK 0x8000 /* Don't call further */ +-#define NOTIFY_BAD (NOTIFY_STOP_MASK|0x0002) /* Bad/Veto action */ ++#define NOTIFY_BAD (NOTIFY_STOP_MASK|NOTIFY_FAIL) /* Bad/Veto action */ + /* + * Clean way to return from the notifier and stop further calls. 
+ */ +diff -upr linux-2.6.16.orig/include/linux/page-flags.h linux-2.6.16-026test015/include/linux/page-flags.h +--- linux-2.6.16.orig/include/linux/page-flags.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/page-flags.h 2006-07-04 14:41:36.000000000 +0400 +@@ -74,7 +74,9 @@ + #define PG_mappedtodisk 16 /* Has blocks allocated on-disk */ + #define PG_reclaim 17 /* To be reclaimed asap */ + #define PG_nosave_free 18 /* Free, should not be written */ +-#define PG_uncached 19 /* Page has been mapped as uncached */ ++#define PG_buddy 19 /* Page is free, on buddy lists */ ++ ++#define PG_uncached 20 /* Page has been mapped as uncached */ + + /* + * Global page accounting. One instance per CPU. Only unsigned longs are +@@ -319,6 +321,10 @@ extern void __mod_page_state_offset(unsi + #define SetPageNosaveFree(page) set_bit(PG_nosave_free, &(page)->flags) + #define ClearPageNosaveFree(page) clear_bit(PG_nosave_free, &(page)->flags) + ++#define PageBuddy(page) test_bit(PG_buddy, &(page)->flags) ++#define __SetPageBuddy(page) __set_bit(PG_buddy, &(page)->flags) ++#define __ClearPageBuddy(page) __clear_bit(PG_buddy, &(page)->flags) ++ + #define PageMappedToDisk(page) test_bit(PG_mappedtodisk, &(page)->flags) + #define SetPageMappedToDisk(page) set_bit(PG_mappedtodisk, &(page)->flags) + #define ClearPageMappedToDisk(page) clear_bit(PG_mappedtodisk, &(page)->flags) +diff -upr linux-2.6.16.orig/include/linux/pid.h linux-2.6.16-026test015/include/linux/pid.h +--- linux-2.6.16.orig/include/linux/pid.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/pid.h 2006-07-04 14:41:38.000000000 +0400 +@@ -1,6 +1,18 @@ + #ifndef _LINUX_PID_H + #define _LINUX_PID_H + ++#define VPID_BIT 10 ++#define VPID_DIV (1<<VPID_BIT) ++ ++#ifdef CONFIG_VE ++#define __is_virtual_pid(pid) ((pid) & VPID_DIV) ++#define is_virtual_pid(pid) \ ++ (__is_virtual_pid(pid) || ((pid)==1 && !ve_is_super(get_exec_env()))) ++#else ++#define 
__is_virtual_pid(pid) 0 ++#define is_virtual_pid(pid) 0 ++#endif ++ + enum pid_type + { + PIDTYPE_PID, +@@ -15,6 +27,9 @@ struct pid + /* Try to keep pid_chain in the same cacheline as nr for find_pid */ + int nr; + struct hlist_node pid_chain; ++#ifdef CONFIG_VE ++ int vnr; ++#endif + /* list of pids with the same nr, only one of them is in the hash */ + struct list_head pid_list; + }; +@@ -40,16 +55,89 @@ extern int alloc_pidmap(void); + extern void FASTCALL(free_pidmap(int)); + extern void switch_exec_pids(struct task_struct *leader, struct task_struct *thread); + +-#define do_each_task_pid(who, type, task) \ +- if ((task = find_task_by_pid_type(type, who))) { \ ++#ifndef CONFIG_VE ++ ++#define vpid_to_pid(pid) (pid) ++#define __vpid_to_pid(pid) (pid) ++#define pid_type_to_vpid(type, pid) (pid) ++#define __pid_type_to_vpid(type, pid) (pid) ++ ++#define comb_vpid_to_pid(pid) (pid) ++#define comb_pid_to_vpid(pid) (pid) ++ ++#else ++ ++struct ve_struct; ++extern void free_vpid(int vpid, struct ve_struct *ve); ++extern int alloc_vpid(int pid, int vpid); ++extern int vpid_to_pid(int pid); ++extern int __vpid_to_pid(int pid); ++extern pid_t pid_type_to_vpid(int type, pid_t pid); ++extern pid_t _pid_type_to_vpid(int type, pid_t pid); ++ ++static inline int comb_vpid_to_pid(int vpid) ++{ ++ int pid = vpid; ++ ++ if (vpid > 0) { ++ pid = vpid_to_pid(vpid); ++ if (unlikely(pid < 0)) ++ return 0; ++ } else if (vpid < 0) { ++ pid = vpid_to_pid(-vpid); ++ if (unlikely(pid < 0)) ++ return 0; ++ pid = -pid; ++ } ++ return pid; ++} ++ ++static inline int comb_pid_to_vpid(int pid) ++{ ++ int vpid = pid; ++ ++ if (pid > 0) { ++ vpid = pid_type_to_vpid(PIDTYPE_PID, pid); ++ if (unlikely(vpid < 0)) ++ return 0; ++ } else if (pid < 0) { ++ vpid = pid_type_to_vpid(PIDTYPE_PGID, -pid); ++ if (unlikely(vpid < 0)) ++ return 0; ++ vpid = -vpid; ++ } ++ return vpid; ++} ++#endif ++ ++#define do_each_task_pid_all(who, type, task) \ ++ if ((task = find_task_by_pid_type_all(type, who))) { \ 
+ prefetch((task)->pids[type].pid_list.next); \ + do { + +-#define while_each_task_pid(who, type, task) \ ++#define while_each_task_pid_all(who, type, task) \ + } while (task = pid_task((task)->pids[type].pid_list.next,\ + type), \ + prefetch((task)->pids[type].pid_list.next), \ + hlist_unhashed(&(task)->pids[type].pid_chain)); \ + } \ + ++#ifndef CONFIG_VE ++#define __do_each_task_pid_ve(who, type, task, owner) \ ++ do_each_task_pid_all(who, type, task) ++#define __while_each_task_pid_ve(who, type, task, owner) \ ++ while_each_task_pid_all(who, type, task) ++#else /* CONFIG_VE */ ++#define __do_each_task_pid_ve(who, type, task, owner) \ ++ do_each_task_pid_all(who, type, task) \ ++ if (ve_accessible(VE_TASK_INFO(task)->owner_env, owner)) ++#define __while_each_task_pid_ve(who, type, task, owner) \ ++ while_each_task_pid_all(who, type, task) ++#endif /* CONFIG_VE */ ++ ++#define do_each_task_pid_ve(who, type, task) \ ++ __do_each_task_pid_ve(who, type, task, get_exec_env()); ++#define while_each_task_pid_ve(who, type, task) \ ++ __while_each_task_pid_ve(who, type, task, get_exec_env()); ++ + #endif /* _LINUX_PID_H */ +diff -upr linux-2.6.16.orig/include/linux/proc_fs.h linux-2.6.16-026test015/include/linux/proc_fs.h +--- linux-2.6.16.orig/include/linux/proc_fs.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/proc_fs.h 2006-07-04 14:41:38.000000000 +0400 +@@ -78,7 +78,7 @@ struct kcore_list { + struct vmcore { + struct list_head list; + unsigned long long paddr; +- unsigned long size; ++ unsigned long long size; + loff_t offset; + }; + +@@ -86,8 +86,14 @@ struct vmcore { + + extern struct proc_dir_entry proc_root; + extern struct proc_dir_entry *proc_root_fs; ++#ifdef CONFIG_VE ++#include <linux/sched.h> ++#define proc_net (get_exec_env()->_proc_net) ++#define proc_net_stat (get_exec_env()->_proc_net_stat) ++#else + extern struct proc_dir_entry *proc_net; + extern struct proc_dir_entry *proc_net_stat; ++#endif + extern struct 
proc_dir_entry *proc_bus; + extern struct proc_dir_entry *proc_root_driver; + extern struct proc_dir_entry *proc_root_kcore; +@@ -98,8 +104,8 @@ extern void proc_misc_init(void); + struct mm_struct; + + struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *); +-struct dentry *proc_pid_unhash(struct task_struct *p); +-void proc_pid_flush(struct dentry *proc_dentry); ++void proc_pid_unhash(struct task_struct *p, struct dentry * [2]); ++void proc_pid_flush(struct dentry *proc_dentry[2]); + int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir); + unsigned long task_vsize(struct mm_struct *); + int task_statm(struct mm_struct *, int *, int *, int *, int *); +@@ -107,7 +113,11 @@ char *task_mem(struct mm_struct *, char + + extern struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode, + struct proc_dir_entry *parent); ++extern struct proc_dir_entry *create_proc_glob_entry(const char *name, ++ mode_t mode, ++ struct proc_dir_entry *parent); + extern void remove_proc_entry(const char *name, struct proc_dir_entry *parent); ++extern void remove_proc_glob_entry(const char *name, struct proc_dir_entry *parent); + + extern struct vfsmount *proc_mnt; + extern int proc_fill_super(struct super_block *,void *,int); +@@ -194,6 +204,15 @@ static inline struct proc_dir_entry *pro + return res; + } + ++static inline struct proc_dir_entry *proc_glob_fops_create(const char *name, ++ mode_t mode, struct file_operations *fops) ++{ ++ struct proc_dir_entry *res = create_proc_glob_entry(name, mode, NULL); ++ if (res) ++ res->proc_fops = fops; ++ return res; ++} ++ + static inline void proc_net_remove(const char *name) + { + remove_proc_entry(name,proc_net); +@@ -206,16 +225,21 @@ static inline void proc_net_remove(const + #define proc_bus NULL + + #define proc_net_fops_create(name, mode, fops) ({ (void)(mode), NULL; }) ++#define proc_glob_fops_create(name, mode, fops) ({ (void)(mode), NULL; }) + #define 
proc_net_create(name, mode, info) ({ (void)(mode), NULL; }) + static inline void proc_net_remove(const char *name) {} + +-static inline struct dentry *proc_pid_unhash(struct task_struct *p) { return NULL; } +-static inline void proc_pid_flush(struct dentry *proc_dentry) { } ++static inline struct dentry *proc_pid_unhash(struct task_struct *p, ++ struct dentry *d[2]) { return NULL; } ++static inline void proc_pid_flush(struct dentry *proc_dentry[2]) { } + + static inline struct proc_dir_entry *create_proc_entry(const char *name, + mode_t mode, struct proc_dir_entry *parent) { return NULL; } ++static inline struct proc_dir_entry *create_proc_glob_entry(const char *name, ++ mode_t mode, struct proc_dir_entry *parent) { return NULL; } + + #define remove_proc_entry(name, parent) do {} while (0) ++#define remove_proc_glob_entry(name, parent) do {} while (0) + + static inline struct proc_dir_entry *proc_symlink(const char *name, + struct proc_dir_entry *parent,const char *dest) {return NULL;} +@@ -266,4 +290,18 @@ static inline struct proc_dir_entry *PDE + return PROC_I(inode)->pde; + } + ++static inline struct proc_dir_entry * de_get(struct proc_dir_entry *de) ++{ ++ if (de) ++ atomic_inc(&de->count); ++ return de; ++} ++ ++extern void de_put(struct proc_dir_entry *); ++ ++#define LPDE(inode) (PROC_I((inode))->pde) ++#ifdef CONFIG_VE ++#define GPDE(inode) (*(struct proc_dir_entry **)(&(inode)->i_pipe)) ++#endif ++ + #endif /* _LINUX_PROC_FS_H */ +diff -upr linux-2.6.16.orig/include/linux/quota.h linux-2.6.16-026test015/include/linux/quota.h +--- linux-2.6.16.orig/include/linux/quota.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/quota.h 2006-07-04 14:41:39.000000000 +0400 +@@ -37,7 +37,6 @@ + + #include <linux/errno.h> + #include <linux/types.h> +-#include <linux/spinlock.h> + + #define __DQUOT_VERSION__ "dquot_6.5.1" + #define __DQUOT_NUM_VERSION__ 6*10000+5*100+1 +@@ -45,8 +44,6 @@ + typedef __kernel_uid32_t qid_t; /* Type in which we 
store ids in memory */ + typedef __u64 qsize_t; /* Type in which we store sizes */ + +-extern spinlock_t dq_data_lock; +- + /* Size of blocks in which are counted size limits */ + #define QUOTABLOCK_BITS 10 + #define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS) +@@ -133,6 +130,10 @@ struct if_dqinfo { + + #ifdef __KERNEL__ + ++#include <linux/spinlock.h> ++ ++extern spinlock_t dq_data_lock; ++ + #include <linux/dqblk_xfs.h> + #include <linux/dqblk_v1.h> + #include <linux/dqblk_v2.h> +@@ -242,6 +243,8 @@ struct quota_format_ops { + int (*release_dqblk)(struct dquot *dquot); /* Called when last reference to dquot is being dropped */ + }; + ++struct inode; ++struct iattr; + /* Operations working with dquots */ + struct dquot_operations { + int (*initialize) (struct inode *, int); +@@ -256,9 +259,11 @@ struct dquot_operations { + int (*release_dquot) (struct dquot *); /* Quota is going to be deleted from disk */ + int (*mark_dirty) (struct dquot *); /* Dquot is marked dirty */ + int (*write_info) (struct super_block *, int); /* Write of quota "superblock" */ ++ int (*rename) (struct inode *, struct inode *, struct inode *); + }; + + /* Operations handling requests from userspace */ ++struct v2_disk_dqblk; + struct quotactl_ops { + int (*quota_on)(struct super_block *, int, int, char *); + int (*quota_off)(struct super_block *, int); +@@ -271,6 +276,9 @@ struct quotactl_ops { + int (*set_xstate)(struct super_block *, unsigned int, int); + int (*get_xquota)(struct super_block *, int, qid_t, struct fs_disk_quota *); + int (*set_xquota)(struct super_block *, int, qid_t, struct fs_disk_quota *); ++#ifdef CONFIG_QUOTA_COMPAT ++ int (*get_quoti)(struct super_block *, int, unsigned int, struct v2_disk_dqblk *); ++#endif + }; + + struct quota_format_type { +@@ -291,6 +299,10 @@ struct quota_info { + struct inode *files[MAXQUOTAS]; /* inodes of quotafiles */ + struct mem_dqinfo info[MAXQUOTAS]; /* Information for each quota type */ + struct quota_format_ops *ops[MAXQUOTAS]; /* 
Operations for each type */ ++#if defined(CONFIG_VZ_QUOTA) || defined(CONFIG_VZ_QUOTA_MODULE) ++ struct vz_quota_master *vzdq_master; ++ int vzdq_count; ++#endif + }; + + /* Inline would be better but we need to dereference super_block which is not defined yet */ +diff -upr linux-2.6.16.orig/include/linux/quotaops.h linux-2.6.16-026test015/include/linux/quotaops.h +--- linux-2.6.16.orig/include/linux/quotaops.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/quotaops.h 2006-07-04 14:41:39.000000000 +0400 +@@ -171,6 +171,19 @@ static __inline__ int DQUOT_TRANSFER(str + return 0; + } + ++static __inline__ int DQUOT_RENAME(struct inode *inode, ++ struct inode *old_dir, struct inode *new_dir) ++{ ++ struct dquot_operations *q_op; ++ ++ q_op = inode->i_sb->dq_op; ++ if (q_op && q_op->rename) { ++ if (q_op->rename(inode, old_dir, new_dir) == NO_QUOTA) ++ return 1; ++ } ++ return 0; ++} ++ + /* The following two functions cannot be called inside a transaction */ + #define DQUOT_SYNC(sb) sync_dquots(sb, -1) + +@@ -197,6 +210,7 @@ static __inline__ int DQUOT_OFF(struct s + #define DQUOT_SYNC(sb) do { } while(0) + #define DQUOT_OFF(sb) do { } while(0) + #define DQUOT_TRANSFER(inode, iattr) (0) ++#define DQUOT_RENAME(inode, old_dir, new_dir) (0) + static inline int DQUOT_PREALLOC_SPACE_NODIRTY(struct inode *inode, qsize_t nr) + { + inode_add_bytes(inode, nr); +diff -upr linux-2.6.16.orig/include/linux/raid/raid1.h linux-2.6.16-026test015/include/linux/raid/raid1.h +--- linux-2.6.16.orig/include/linux/raid/raid1.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/raid/raid1.h 2006-07-04 14:41:36.000000000 +0400 +@@ -130,6 +130,6 @@ struct r1bio_s { + * with failure when last write completes (and all failed). + * Record that bi_end_io was called with this flag... 
+ */ +-#define R1BIO_Returned 4 ++#define R1BIO_Returned 6 + + #endif +diff -upr linux-2.6.16.orig/include/linux/reiserfs_xattr.h linux-2.6.16-026test015/include/linux/reiserfs_xattr.h +--- linux-2.6.16.orig/include/linux/reiserfs_xattr.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/reiserfs_xattr.h 2006-07-04 14:41:37.000000000 +0400 +@@ -42,7 +42,8 @@ int reiserfs_removexattr(struct dentry * + int reiserfs_delete_xattrs(struct inode *inode); + int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs); + int reiserfs_xattr_init(struct super_block *sb, int mount_flags); +-int reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd); ++int reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd, ++ struct exec_perm *); + + int reiserfs_xattr_del(struct inode *, const char *); + int reiserfs_xattr_get(const struct inode *, const char *, void *, size_t); +diff -upr linux-2.6.16.orig/include/linux/rmap.h linux-2.6.16-026test015/include/linux/rmap.h +--- linux-2.6.16.orig/include/linux/rmap.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/rmap.h 2006-07-04 14:41:39.000000000 +0400 +@@ -74,6 +74,7 @@ void page_add_anon_rmap(struct page *, s + void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); + void page_add_file_rmap(struct page *); + void page_remove_rmap(struct page *); ++struct anon_vma *page_lock_anon_vma(struct page *page); + + /** + * page_dup_rmap - duplicate pte mapping to a page +diff -upr linux-2.6.16.orig/include/linux/rtc.h linux-2.6.16-026test015/include/linux/rtc.h +--- linux-2.6.16.orig/include/linux/rtc.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/rtc.h 2006-07-04 14:41:36.000000000 +0400 +@@ -11,8 +11,6 @@ + #ifndef _LINUX_RTC_H_ + #define _LINUX_RTC_H_ + +-#include <linux/interrupt.h> +- + /* + * The struct used to pass data via the following ioctl. 
Similar to the + * struct tm in <time.h>, but it needs to be here so that the kernel +@@ -95,6 +93,8 @@ struct rtc_pll_info { + + #ifdef __KERNEL__ + ++#include <linux/interrupt.h> ++ + typedef struct rtc_task { + void (*func)(void *private_data); + void *private_data; +diff -upr linux-2.6.16.orig/include/linux/sched.h linux-2.6.16-026test015/include/linux/sched.h +--- linux-2.6.16.orig/include/linux/sched.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/sched.h 2006-07-04 14:41:39.000000000 +0400 +@@ -38,7 +38,10 @@ + + #include <linux/auxvec.h> /* For AT_VECTOR_SIZE */ + ++#include <ub/ub_task.h> ++ + struct exec_domain; ++struct ve_struct; + + /* + * cloning flags: +@@ -92,15 +95,34 @@ extern unsigned long avenrun[]; /* Load + load += n*(FIXED_1-exp); \ + load >>= FSHIFT; + ++#define LOAD_INT(x) ((x) >> FSHIFT) ++#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) ++ + extern unsigned long total_forks; + extern int nr_threads; + extern int last_pid; + DECLARE_PER_CPU(unsigned long, process_counts); + extern int nr_processes(void); ++ ++extern unsigned long nr_sleeping(void); ++extern unsigned long nr_stopped(void); ++extern unsigned long nr_zombie; ++extern atomic_t nr_dead; + extern unsigned long nr_running(void); + extern unsigned long nr_uninterruptible(void); + extern unsigned long nr_iowait(void); + ++#ifdef CONFIG_VE ++struct ve_struct; ++extern unsigned long nr_running_ve(struct ve_struct *); ++extern unsigned long nr_iowait_ve(struct ve_struct *); ++extern unsigned long nr_uninterruptible_ve(struct ve_struct *); ++#else ++#define nr_running_ve(ve) 0 ++#define nr_iowait_ve(ve) 0 ++#define nr_uninterruptible_ve(ve) 0 ++#endif ++ + #include <linux/time.h> + #include <linux/param.h> + #include <linux/resource.h> +@@ -189,6 +211,8 @@ extern cpumask_t nohz_cpu_mask; + + extern void show_state(void); + extern void show_regs(struct pt_regs *); ++extern void smp_show_regs(struct pt_regs *, void *); ++extern void 
show_vsched(void); + + /* + * TASK is a pointer to the task whose backtrace we want to see (or NULL for current +@@ -252,31 +276,7 @@ arch_get_unmapped_area_topdown(struct fi + extern void arch_unmap_area(struct mm_struct *, unsigned long); + extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); + +-#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS +-/* +- * The mm counters are not protected by its page_table_lock, +- * so must be incremented atomically. +- */ +-#define set_mm_counter(mm, member, value) atomic_long_set(&(mm)->_##member, value) +-#define get_mm_counter(mm, member) ((unsigned long)atomic_long_read(&(mm)->_##member)) +-#define add_mm_counter(mm, member, value) atomic_long_add(value, &(mm)->_##member) +-#define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member) +-#define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member) +-typedef atomic_long_t mm_counter_t; +- +-#else /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ +-/* +- * The mm counters are protected by its page_table_lock, +- * so can be incremented directly. 
+- */ +-#define set_mm_counter(mm, member, value) (mm)->_##member = (value) +-#define get_mm_counter(mm, member) ((mm)->_##member) +-#define add_mm_counter(mm, member, value) (mm)->_##member += (value) +-#define inc_mm_counter(mm, member) (mm)->_##member++ +-#define dec_mm_counter(mm, member) (mm)->_##member-- +-typedef unsigned long mm_counter_t; +- +-#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ ++#include <linux/mm_counter.h> + + #define get_mm_rss(mm) \ + (get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss)) +@@ -332,6 +332,7 @@ struct mm_struct { + unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ + + unsigned dumpable:2; ++ unsigned vps_dumpable:1; + cpumask_t cpu_vm_mask; + + /* Architecture-specific MM context */ +@@ -348,6 +349,9 @@ struct mm_struct { + /* aio bits */ + rwlock_t ioctx_list_lock; + struct kioctx *ioctx_list; ++#ifdef CONFIG_USER_RESOURCE ++ struct user_beancounter *mm_ub; ++#endif + }; + + struct sighand_struct { +@@ -364,6 +368,9 @@ static inline void sighand_free(struct s + call_rcu(&sp->rcu, sighand_free_cb); + } + ++#include <linux/ve.h> ++#include <linux/ve_task.h> ++ + /* + * NOTE! "signal_struct" does not have it's own + * locking, because a shared signal_struct always +@@ -688,6 +695,8 @@ static inline void prefetch_stack(struct + + struct audit_context; /* See audit.c */ + struct mempolicy; ++struct vcpu_scheduler; ++struct vcpu_info; + + struct task_struct { + volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ +@@ -701,6 +710,14 @@ struct task_struct { + #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) + int oncpu; + #endif ++#ifdef CONFIG_SCHED_VCPU ++ struct vcpu_scheduler *vsched; ++ struct vcpu_info *vcpu; ++ ++ /* id's are saved to avoid locking (e.g. 
on vsched->id access) */ ++ int vsched_id; ++ int vcpu_id; ++#endif + int prio, static_prio; + struct list_head run_list; + prio_array_t *array; +@@ -846,6 +863,11 @@ struct task_struct { + + unsigned long ptrace_message; + siginfo_t *last_siginfo; /* For ptrace use. */ ++ ++/* state tracking for suspend */ ++ __u8 pn_state; ++ __u8 stopped_state:1; ++ + /* + * current io wait handle: wait queue entry to use for io waits + * If this thread is processing aio, this points at the waitqueue +@@ -871,6 +893,16 @@ struct task_struct { + #endif + atomic_t fs_excl; /* holding fs exclusive resources */ + struct rcu_head rcu; ++#ifdef CONFIG_USER_RESOURCE ++ struct task_beancounter task_bc; ++#endif ++#ifdef CONFIG_VE ++ struct ve_task_info ve_task_info; ++#endif ++#if defined(CONFIG_VZ_QUOTA) || defined(CONFIG_VZ_QUOTA_MODULE) ++ unsigned long magic; ++ struct inode *ino; ++#endif + }; + + static inline pid_t process_group(struct task_struct *tsk) +@@ -929,6 +961,43 @@ static inline void put_task_struct(struc + #define PF_RANDOMIZE 0x00800000 /* randomize virtual address space */ + #define PF_SWAPWRITE 0x01000000 /* Allowed to write to swap */ + ++#ifndef CONFIG_VE ++#define set_pn_state(tsk, state) do { } while(0) ++#define clear_pn_state(tsk) do { } while(0) ++#define set_stop_state(tsk) do { } while(0) ++#define clear_stop_state(tsk) do { } while(0) ++#else ++#define PN_STOP_TF 1 /* was not in 2.6.8 */ ++#define PN_STOP_TF_RT 2 /* was not in 2.6.8 */ ++#define PN_STOP_ENTRY 3 ++#define PN_STOP_FORK 4 ++#define PN_STOP_VFORK 5 ++#define PN_STOP_SIGNAL 6 ++#define PN_STOP_EXIT 7 ++#define PN_STOP_EXEC 8 ++#define PN_STOP_LEAVE 9 ++ ++static inline void set_pn_state(struct task_struct *tsk, int state) ++{ ++ tsk->pn_state = state; ++} ++ ++static inline void clear_pn_state(struct task_struct *tsk) ++{ ++ tsk->pn_state = 0; ++} ++ ++static inline void set_stop_state(struct task_struct *tsk) ++{ ++ tsk->stopped_state = 1; ++} ++ ++static inline void clear_stop_state(struct 
task_struct *tsk) ++{ ++ tsk->stopped_state = 0; ++} ++#endif ++ + /* + * Only the _current_ task can read/write to tsk->flags, but other + * tasks can access tsk->flags in readonly mode for example +@@ -968,6 +1037,21 @@ static inline int set_cpus_allowed(task_ + extern unsigned long long sched_clock(void); + extern unsigned long long current_sched_time(const task_t *current_task); + ++static inline unsigned long cycles_to_clocks(cycles_t cycles) ++{ ++ extern unsigned long cycles_per_clock; ++ do_div(cycles, cycles_per_clock); ++ return cycles; ++} ++ ++static inline u64 cycles_to_jiffies(cycles_t cycles) ++{ ++ extern unsigned long cycles_per_jiffy; ++ do_div(cycles, cycles_per_jiffy); ++ return cycles; ++} ++ ++ + /* sched_exec is called by processes performing an exec */ + #ifdef CONFIG_SMP + extern void sched_exec(void); +@@ -1020,12 +1104,237 @@ extern struct task_struct init_task; + + extern struct mm_struct init_mm; + +-#define find_task_by_pid(nr) find_task_by_pid_type(PIDTYPE_PID, nr) +-extern struct task_struct *find_task_by_pid_type(int type, int pid); ++#define find_task_by_pid_all(nr) \ ++ find_task_by_pid_type_all(PIDTYPE_PID, nr) ++extern struct task_struct *find_task_by_pid_type_all(int type, int pid); + extern void set_special_pids(pid_t session, pid_t pgrp); + extern void __set_special_pids(pid_t session, pid_t pgrp); + ++#ifndef CONFIG_VE ++#define find_task_by_pid_ve find_task_by_pid_all ++ ++#define get_exec_env() ((struct ve_struct *)NULL) ++#define set_exec_env(new_env) ((struct ve_struct *)NULL) ++ ++#define ve_is_super(env) 1 ++#define ve_accessible(target, owner) 1 ++#define ve_accessible_strict(target, owner) 1 ++#define ve_accessible_veid(target, owner) 1 ++#define ve_accessible_strict_veid(target, owner) 1 ++ ++#define VEID(envid) 0 ++#define get_ve0() NULL ++ ++static inline pid_t virt_pid(struct task_struct *tsk) ++{ ++ return tsk->pid; ++} ++ ++static inline pid_t virt_tgid(struct task_struct *tsk) ++{ ++ return tsk->tgid; ++} ++ 
++static inline pid_t virt_pgid(struct task_struct *tsk) ++{ ++ return tsk->signal->pgrp; ++} ++ ++static inline pid_t virt_sid(struct task_struct *tsk) ++{ ++ return tsk->signal->session; ++} ++ ++#define get_task_pid_ve(tsk, ve) get_task_pid(tsk) ++ ++static inline pid_t get_task_pid(struct task_struct *tsk) ++{ ++ return tsk->pid; ++} ++ ++static inline pid_t get_task_tgid(struct task_struct *tsk) ++{ ++ return tsk->tgid; ++} ++ ++static inline pid_t get_task_pgid(struct task_struct *tsk) ++{ ++ return tsk->signal->pgrp; ++} ++ ++static inline pid_t get_task_sid(struct task_struct *tsk) ++{ ++ return tsk->signal->session; ++} ++ ++static inline void set_virt_pid(struct task_struct *tsk, pid_t pid) ++{ ++} ++ ++static inline void set_virt_tgid(struct task_struct *tsk, pid_t pid) ++{ ++} ++ ++static inline void set_virt_pgid(struct task_struct *tsk, pid_t pid) ++{ ++} ++ ++static inline void set_virt_sid(struct task_struct *tsk, pid_t pid) ++{ ++} ++ ++static inline pid_t get_task_ppid(struct task_struct *p) ++{ ++ return pid_alive(p) ? 
p->group_leader->real_parent->tgid : 0; ++} ++ ++#else /* CONFIG_VE */ ++ ++#include <asm/current.h> ++#include <linux/ve.h> ++ ++extern struct ve_struct ve0; ++ ++#define find_task_by_pid_ve(nr) \ ++ find_task_by_pid_type_ve(PIDTYPE_PID, nr) ++ ++extern struct task_struct *find_task_by_pid_type_ve(int type, int pid); ++ ++#define get_ve0() (&ve0) ++#define VEID(envid) ((envid)->veid) ++ ++#define get_exec_env() (VE_TASK_INFO(current)->exec_env) ++static inline struct ve_struct *set_exec_env(struct ve_struct *new_env) ++{ ++ struct ve_struct *old_env; ++ ++ old_env = VE_TASK_INFO(current)->exec_env; ++ VE_TASK_INFO(current)->exec_env = new_env; ++ ++ return old_env; ++} ++ ++#define ve_is_super(env) ((env) == get_ve0()) ++#define ve_accessible_strict(target, owner) ((target) == (owner)) ++static inline int ve_accessible(struct ve_struct *target, ++ struct ve_struct *owner) { ++ return ve_is_super(owner) || ve_accessible_strict(target, owner); ++} ++ ++#define ve_accessible_strict_veid(target, owner) ((target) == (owner)) ++static inline int ve_accessible_veid(envid_t target, envid_t owner) ++{ ++ return get_ve0()->veid == owner || ++ ve_accessible_strict_veid(target, owner); ++} ++ ++static inline pid_t virt_pid(struct task_struct *tsk) ++{ ++ return tsk->pids[PIDTYPE_PID].vnr; ++} ++ ++static inline pid_t virt_tgid(struct task_struct *tsk) ++{ ++ return tsk->pids[PIDTYPE_TGID].vnr; ++} ++ ++static inline pid_t virt_pgid(struct task_struct *tsk) ++{ ++ return tsk->pids[PIDTYPE_PGID].vnr; ++} ++ ++static inline pid_t virt_sid(struct task_struct *tsk) ++{ ++ return tsk->pids[PIDTYPE_SID].vnr; ++} ++ ++static inline pid_t get_task_pid_ve(struct task_struct *tsk, struct ve_struct *env) ++{ ++ return ve_is_super(env) ? 
tsk->pid : virt_pid(tsk); ++} ++ ++static inline pid_t get_task_pid(struct task_struct *tsk) ++{ ++ return get_task_pid_ve(tsk, get_exec_env()); ++} ++ ++static inline pid_t get_task_tgid(struct task_struct *tsk) ++{ ++ return ve_is_super(get_exec_env()) ? tsk->tgid : virt_tgid(tsk); ++} ++ ++static inline pid_t get_task_pgid(struct task_struct *tsk) ++{ ++ return ve_is_super(get_exec_env()) ? tsk->signal->pgrp : virt_pgid(tsk); ++} ++ ++static inline pid_t get_task_sid(struct task_struct *tsk) ++{ ++ return ve_is_super(get_exec_env()) ? tsk->signal->session : virt_sid(tsk); ++} ++ ++static inline void set_virt_pid(struct task_struct *tsk, pid_t pid) ++{ ++ tsk->pids[PIDTYPE_PID].vnr = pid; ++} ++ ++static inline void set_virt_tgid(struct task_struct *tsk, pid_t pid) ++{ ++ tsk->pids[PIDTYPE_TGID].vnr = pid; ++} ++ ++static inline void set_virt_pgid(struct task_struct *tsk, pid_t pid) ++{ ++ tsk->pids[PIDTYPE_PGID].vnr = pid; ++} ++ ++static inline void set_virt_sid(struct task_struct *tsk, pid_t pid) ++{ ++ tsk->pids[PIDTYPE_SID].vnr = pid; ++} ++ ++static inline pid_t get_task_ppid(struct task_struct *p) ++{ ++ struct task_struct *parent; ++ struct ve_struct *env; ++ ++ if (!pid_alive(p)) ++ return 0; ++ env = get_exec_env(); ++ if (get_task_pid_ve(p, env) == 1) ++ return 0; ++ parent = p->group_leader->real_parent; ++ return ve_accessible(VE_TASK_INFO(parent)->owner_env, env) ? 
++ get_task_tgid(parent) : 1; ++} ++ ++void ve_sched_get_cpu_stat(struct ve_struct *envid, cycles_t *idle, ++ cycles_t *strv, unsigned int cpu); ++void ve_sched_attach(struct ve_struct *envid); ++ ++#endif /* CONFIG_VE */ ++ ++ ++#ifdef CONFIG_VE ++extern cycles_t ve_sched_get_idle_time(struct ve_struct *, int); ++extern cycles_t ve_sched_get_iowait_time(struct ve_struct *, int); ++#else ++#define ve_sched_get_idle_time(ve, cpu) 0 ++#define ve_sched_get_iowait_time(ve, cpu) 0 ++#endif ++ ++#ifdef CONFIG_SCHED_VCPU ++struct vcpu_scheduler; ++extern void fastcall vsched_cpu_online_map(struct vcpu_scheduler *sched, ++ cpumask_t *mask); ++#else ++#define vsched_cpu_online_map(vsched, mask) do { \ ++ *mask = cpu_online_map; \ ++ } while (0) ++#endif ++ + /* per-UID process charging. */ ++extern int set_user(uid_t new_ruid, int dumpclear); + extern struct user_struct * alloc_uid(uid_t); + static inline struct user_struct *get_uid(struct user_struct *u) + { +@@ -1043,7 +1352,7 @@ extern int FASTCALL(wake_up_state(struct + extern int FASTCALL(wake_up_process(struct task_struct * tsk)); + extern void FASTCALL(wake_up_new_task(struct task_struct * tsk, + unsigned long clone_flags)); +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) || defined (CONFIG_SCHED_VCPU) + extern void kick_process(struct task_struct *tsk); + #else + static inline void kick_process(struct task_struct *tsk) { } +@@ -1161,12 +1470,19 @@ extern task_t *child_reaper; + + extern int do_execve(char *, char __user * __user *, char __user * __user *, struct pt_regs *); + extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *); ++extern long do_fork_pid(unsigned long clone_flags, ++ unsigned long stack_start, ++ struct pt_regs *regs, ++ unsigned long stack_size, ++ int __user *parent_tidptr, ++ int __user *child_tidptr, ++ long pid0); + task_t *fork_idle(int); + + extern void set_task_comm(struct task_struct *tsk, char *from); + extern void get_task_comm(char 
*to, struct task_struct *tsk); + +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) || defined (CONFIG_SCHED_VCPU) + extern void wait_task_inactive(task_t * p); + #else + #define wait_task_inactive(p) do { } while (0) +@@ -1187,22 +1503,100 @@ extern void wait_task_inactive(task_t * + add_parent(p, (p)->parent); \ + } while (0) + +-#define next_task(p) list_entry((p)->tasks.next, struct task_struct, tasks) +-#define prev_task(p) list_entry((p)->tasks.prev, struct task_struct, tasks) ++#define next_task_all(p) list_entry((p)->tasks.next, struct task_struct, tasks) ++#define prev_task_all(p) list_entry((p)->tasks.prev, struct task_struct, tasks) + +-#define for_each_process(p) \ +- for (p = &init_task ; (p = next_task(p)) != &init_task ; ) ++#define for_each_process_all(p) \ ++ for (p = &init_task ; (p = next_task_all(p)) != &init_task ; ) + + /* + * Careful: do_each_thread/while_each_thread is a double loop so + * 'break' will not work as expected - use goto instead. + */ +-#define do_each_thread(g, t) \ +- for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do ++#define do_each_thread_all(g, t) \ ++ for (g = t = &init_task ; (g = t = next_task_all(g)) != &init_task ; ) do + +-#define while_each_thread(g, t) \ ++#define while_each_thread_all(g, t) \ + while ((t = next_thread(t)) != g) + ++#ifndef CONFIG_VE ++ ++#define SET_VE_LINKS(p) ++#define REMOVE_VE_LINKS(p) ++#define for_each_process_ve(p) for_each_process_all(p) ++#define do_each_thread_ve(g, t) do_each_thread_all(g, t) ++#define while_each_thread_ve(g, t) while_each_thread_all(g, t) ++#define first_task_ve() next_task_ve(&init_task) ++#define __first_task_ve(owner) next_task_ve(&init_task) ++#define __next_task_ve(owner, p) next_task_ve(p) ++#define next_task_ve(p) \ ++ (next_task_all(p) != &init_task ? 
next_task_all(p) : NULL) ++ ++#else /* CONFIG_VE */ ++ ++#define SET_VE_LINKS(p) \ ++ do { \ ++ if (thread_group_leader(p)) \ ++ list_add_tail(&VE_TASK_INFO(p)->vetask_list, \ ++ &VE_TASK_INFO(p)->owner_env->vetask_lh); \ ++ } while (0) ++ ++#define REMOVE_VE_LINKS(p) \ ++ do { \ ++ if (thread_group_leader(p)) \ ++ list_del(&VE_TASK_INFO(p)->vetask_list); \ ++ } while(0) ++ ++static inline task_t* __first_task_ve(struct ve_struct *ve) ++{ ++ task_t *tsk; ++ ++ if (unlikely(ve_is_super(ve))) { ++ tsk = next_task_all(&init_task); ++ if (tsk == &init_task) ++ tsk = NULL; ++ } else { ++ /* probably can return ve->init_entry, but it's more clear */ ++ BUG_ON(list_empty(&ve->vetask_lh)); ++ tsk = VE_TASK_LIST_2_TASK(ve->vetask_lh.next); ++ } ++ return tsk; ++} ++ ++static inline task_t* __next_task_ve(struct ve_struct *ve, task_t *tsk) ++{ ++ if (unlikely(ve_is_super(ve))) { ++ tsk = next_task_all(tsk); ++ if (tsk == &init_task) ++ tsk = NULL; ++ } else { ++ struct list_head *tmp; ++ ++ BUG_ON(VE_TASK_INFO(tsk)->owner_env != ve); ++ tmp = VE_TASK_INFO(tsk)->vetask_list.next; ++ if (tmp == &ve->vetask_lh) ++ tsk = NULL; ++ else ++ tsk = VE_TASK_LIST_2_TASK(tmp); ++ } ++ return tsk; ++} ++ ++#define first_task_ve() __first_task_ve(get_exec_env()) ++#define next_task_ve(p) __next_task_ve(get_exec_env(), p) ++/* no one uses prev_task_ve(), copy next_task_ve() if needed */ ++ ++#define for_each_process_ve(p) \ ++ for (p = first_task_ve(); p != NULL ; p = next_task_ve(p)) ++ ++#define do_each_thread_ve(g, t) \ ++ for (g = t = first_task_ve() ; g != NULL; g = t = next_task_ve(g)) do ++ ++#define while_each_thread_ve(g, t) \ ++ while ((t = next_thread(t)) != g) ++ ++#endif /* CONFIG_VE */ ++ + extern task_t * FASTCALL(next_thread(const task_t *p)); + + #define thread_group_leader(p) (p->pid == p->tgid) +@@ -1348,28 +1742,63 @@ extern void signal_wake_up(struct task_s + */ + #ifdef CONFIG_SMP + +-static inline unsigned int task_cpu(const struct task_struct *p) ++static inline 
unsigned int task_pcpu(const struct task_struct *p) + { + return task_thread_info(p)->cpu; + } + +-static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) ++static inline void set_task_pcpu(struct task_struct *p, unsigned int cpu) + { + task_thread_info(p)->cpu = cpu; + } + + #else + ++static inline unsigned int task_pcpu(const struct task_struct *p) ++{ ++ return 0; ++} ++ ++static inline void set_task_pcpu(struct task_struct *p, unsigned int cpu) ++{ ++} ++ ++#endif /* CONFIG_SMP */ ++ ++#ifdef CONFIG_SCHED_VCPU ++ ++static inline unsigned int task_vsched_id(const struct task_struct *p) ++{ ++ return p->vsched_id; ++} ++ + static inline unsigned int task_cpu(const struct task_struct *p) + { ++ return p->vcpu_id; ++} ++ ++extern void set_task_cpu(struct task_struct *p, unsigned int vcpu); ++extern int vcpu_online(int cpu); ++ ++#else ++ ++static inline unsigned int task_vsched_id(const struct task_struct *p) ++{ + return 0; + } + ++static inline unsigned int task_cpu(const struct task_struct *p) ++{ ++ return task_pcpu(p); ++} ++ + static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) + { ++ set_task_pcpu(p, cpu); + } + +-#endif /* CONFIG_SMP */ ++#define vcpu_online(cpu) cpu_online(cpu) ++#endif /* CONFIG_SCHED_VCPU */ + + #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT + extern void arch_pick_mmap_layout(struct mm_struct *mm); +@@ -1401,7 +1830,7 @@ static inline int frozen(struct task_str + */ + static inline int freezing(struct task_struct *p) + { +- return p->flags & PF_FREEZE; ++ return test_tsk_thread_flag(p, TIF_FREEZE); + } + + /* +@@ -1410,7 +1839,7 @@ static inline int freezing(struct task_s + */ + static inline void freeze(struct task_struct *p) + { +- p->flags |= PF_FREEZE; ++ set_tsk_thread_flag(p, TIF_FREEZE); + } + + /* +@@ -1431,7 +1860,8 @@ static inline int thaw_process(struct ta + */ + static inline void frozen_process(struct task_struct *p) + { +- p->flags = (p->flags & ~PF_FREEZE) | PF_FROZEN; ++ clear_tsk_thread_flag(p, 
TIF_FREEZE); ++ p->flags |= PF_FROZEN; + } + + extern void refrigerator(void); +diff -upr linux-2.6.16.orig/include/linux/sem.h linux-2.6.16-026test015/include/linux/sem.h +--- linux-2.6.16.orig/include/linux/sem.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/sem.h 2006-07-04 14:41:39.000000000 +0400 +@@ -155,6 +155,9 @@ static inline void exit_sem(struct task_ + } + #endif + ++int sysvipc_walk_sem(int (*func)(int, struct sem_array*, void *), void *arg); ++int sysvipc_setup_sem(key_t key, int semid, size_t size, int semflg); ++ + #endif /* __KERNEL__ */ + + #endif /* _LINUX_SEM_H */ +diff -upr linux-2.6.16.orig/include/linux/shm.h linux-2.6.16-026test015/include/linux/shm.h +--- linux-2.6.16.orig/include/linux/shm.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/shm.h 2006-07-04 14:41:39.000000000 +0400 +@@ -86,6 +86,7 @@ struct shmid_kernel /* private to the ke + pid_t shm_cprid; + pid_t shm_lprid; + struct user_struct *mlock_user; ++ struct ipc_ids *_shm_ids; + }; + + /* shm_mode upper byte flags */ +@@ -104,6 +105,9 @@ static inline long do_shmat(int shmid, c + } + #endif + ++int sysvipc_walk_shm(int (*func)(struct shmid_kernel*, void *), void *arg); ++struct file * sysvipc_setup_shm(key_t key, int shmid, size_t size, int shmflg); ++ + #endif /* __KERNEL__ */ + + #endif /* _LINUX_SHM_H_ */ +diff -upr linux-2.6.16.orig/include/linux/shmem_fs.h linux-2.6.16-026test015/include/linux/shmem_fs.h +--- linux-2.6.16.orig/include/linux/shmem_fs.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/shmem_fs.h 2006-07-04 14:41:37.000000000 +0400 +@@ -19,6 +19,9 @@ struct shmem_inode_info { + swp_entry_t i_direct[SHMEM_NR_DIRECT]; /* first blocks */ + struct list_head swaplist; /* chain of maybes on swap */ + struct inode vfs_inode; ++#ifdef CONFIG_USER_RESOURCE ++ struct user_beancounter *shmi_ub; ++#endif + }; + + struct shmem_sb_info { +diff -upr 
linux-2.6.16.orig/include/linux/signal.h linux-2.6.16-026test015/include/linux/signal.h +--- linux-2.6.16.orig/include/linux/signal.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/signal.h 2006-07-04 14:41:39.000000000 +0400 +@@ -3,6 +3,7 @@ + + #include <linux/list.h> + #include <linux/spinlock.h> ++#include <linux/slab.h> + #include <asm/signal.h> + #include <asm/siginfo.h> + +@@ -41,6 +42,9 @@ struct sigqueue { + int flags; + siginfo_t info; + struct user_struct *user; ++#ifdef CONFIG_USER_RESOURCE ++ struct user_beancounter *sig_ub; ++#endif + }; + + /* flags values. */ +@@ -263,6 +267,8 @@ extern int sigprocmask(int, sigset_t *, + struct pt_regs; + extern int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, struct pt_regs *regs, void *cookie); + ++extern kmem_cache_t *sigqueue_cachep; ++ + #endif /* __KERNEL__ */ + + #endif /* _LINUX_SIGNAL_H */ +diff -upr linux-2.6.16.orig/include/linux/skbuff.h linux-2.6.16-026test015/include/linux/skbuff.h +--- linux-2.6.16.orig/include/linux/skbuff.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/skbuff.h 2006-07-04 14:41:38.000000000 +0400 +@@ -19,6 +19,7 @@ + #include <linux/compiler.h> + #include <linux/time.h> + #include <linux/cache.h> ++#include <linux/ve_owner.h> + + #include <asm/atomic.h> + #include <asm/types.h> +@@ -211,6 +212,8 @@ enum { + * @tc_verd: traffic control verdict + */ + ++#include <ub/ub_sk.h> ++ + struct sk_buff { + /* These two members must be first. 
*/ + struct sk_buff *next; +@@ -294,13 +297,18 @@ struct sk_buff { + *data, + *tail, + *end; ++ struct skb_beancounter skb_bc; ++ struct ve_struct *owner_env; + }; + ++DCL_VE_OWNER_PROTO(SKB, struct sk_buff, owner_env) ++ + #ifdef __KERNEL__ + /* + * Handling routines are only of interest to the kernel + */ + #include <linux/slab.h> ++#include <ub/ub_net.h> + + #include <asm/system.h> + +@@ -1007,6 +1015,8 @@ static inline int pskb_trim(struct sk_bu + */ + static inline void skb_orphan(struct sk_buff *skb) + { ++ ub_skb_uncharge(skb); ++ + if (skb->destructor) + skb->destructor(skb); + skb->destructor = NULL; +diff -upr linux-2.6.16.orig/include/linux/slab.h linux-2.6.16-026test015/include/linux/slab.h +--- linux-2.6.16.orig/include/linux/slab.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/slab.h 2006-07-04 14:41:37.000000000 +0400 +@@ -48,6 +48,26 @@ typedef struct kmem_cache kmem_cache_t; + #define SLAB_PANIC 0x00040000UL /* panic if kmem_cache_create() fails */ + #define SLAB_DESTROY_BY_RCU 0x00080000UL /* defer freeing pages to RCU */ + ++/* ++ * allocation rules: __GFP_UBC 0 ++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ * cache (SLAB_UBC) charge charge ++ * (usual caches: mm, vma, task_struct, ...) ++ * ++ * cache (SLAB_UBC | SLAB_NO_CHARGE) charge --- ++ * (ub_kmalloc) (kmalloc) ++ * ++ * cache (no UB flags) BUG() --- ++ * (nonub caches, mempools) ++ * ++ * pages charge --- ++ * (ub_vmalloc, (vmalloc, ++ * poll, fdsets, ...) non-ub allocs) ++ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ */ ++#define SLAB_UBC 0x20000000UL /* alloc space for ubs ... */ ++#define SLAB_NO_CHARGE 0x40000000UL /* ... 
but don't charge */ ++ + /* flags passed to a constructor func */ + #define SLAB_CTOR_CONSTRUCTOR 0x001UL /* if not set, then deconstructor */ + #define SLAB_CTOR_ATOMIC 0x002UL /* tell constructor it can't sleep */ +@@ -108,6 +128,8 @@ found: + return __kmalloc(size, flags); + } + ++#define ub_kmalloc(size, flags) kmalloc(size, ((flags) | __GFP_UBC)) ++ + extern void *kzalloc(size_t, gfp_t); + + /** +diff -upr linux-2.6.16.orig/include/linux/smp.h linux-2.6.16-026test015/include/linux/smp.h +--- linux-2.6.16.orig/include/linux/smp.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/smp.h 2006-07-04 14:41:37.000000000 +0400 +@@ -10,6 +10,9 @@ + + extern void cpu_idle(void); + ++struct pt_regs; ++typedef void (*smp_nmi_function)(struct pt_regs *regs, void *info); ++ + #ifdef CONFIG_SMP + + #include <linux/preempt.h> +@@ -49,6 +52,8 @@ extern int __cpu_up(unsigned int cpunum) + */ + extern void smp_cpus_done(unsigned int max_cpus); + ++extern int smp_nmi_call_function(smp_nmi_function func, void *info, int wait); ++ + /* + * Call a function on all other processors + */ +@@ -99,6 +104,12 @@ static inline void smp_send_reschedule(i + #define num_booting_cpus() 1 + #define smp_prepare_boot_cpu() do {} while (0) + ++static inline int smp_nmi_call_function(smp_nmi_function func, ++ void *info, int wait) ++{ ++ return 0; ++} ++ + #endif /* !SMP */ + + /* +diff -upr linux-2.6.16.orig/include/linux/socket.h linux-2.6.16-026test015/include/linux/socket.h +--- linux-2.6.16.orig/include/linux/socket.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/socket.h 2006-07-04 14:41:38.000000000 +0400 +@@ -300,6 +300,7 @@ extern int memcpy_toiovec(struct iovec * + extern int move_addr_to_user(void *kaddr, int klen, void __user *uaddr, int __user *ulen); + extern int move_addr_to_kernel(void __user *uaddr, int ulen, void *kaddr); + extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data); ++extern int 
vz_security_proto_check(int family, int type, int protocol); + + #endif + #endif /* not kernel and not glibc */ +diff -upr linux-2.6.16.orig/include/linux/swap.h linux-2.6.16-026test015/include/linux/swap.h +--- linux-2.6.16.orig/include/linux/swap.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/swap.h 2006-07-04 14:41:37.000000000 +0400 +@@ -80,6 +80,7 @@ struct address_space; + struct sysinfo; + struct writeback_control; + struct zone; ++struct user_beancounter; + + /* + * A swap extent maps a range of a swapfile's PAGE_SIZE pages onto a range of +@@ -119,6 +120,7 @@ enum { + /* + * The in-memory structure used to track swap areas. + */ ++struct user_beancounter; + struct swap_info_struct { + unsigned int flags; + int prio; /* swap priority */ +@@ -136,6 +138,9 @@ struct swap_info_struct { + unsigned int max; + unsigned int inuse_pages; + int next; /* next entry on swap list */ ++#ifdef CONFIG_USER_SWAP_ACCOUNTING ++ struct user_beancounter **swap_ubs; ++#endif + }; + + struct swap_list_t { +@@ -240,7 +245,7 @@ extern long total_swap_pages; + extern unsigned int nr_swapfiles; + extern struct swap_info_struct swap_info[]; + extern void si_swapinfo(struct sysinfo *); +-extern swp_entry_t get_swap_page(void); ++extern swp_entry_t get_swap_page(struct user_beancounter *); + extern swp_entry_t get_swap_page_of_type(int type); + extern int swap_duplicate(swp_entry_t); + extern int valid_swaphandles(swp_entry_t, unsigned long *); +@@ -253,7 +258,9 @@ extern int remove_exclusive_swap_page(st + struct backing_dev_info; + + extern spinlock_t swap_lock; +-extern int remove_vma_swap(struct vm_area_struct *vma, struct page *page); ++struct page_beancounter; ++extern int remove_vma_swap(struct vm_area_struct *vma, struct page *page, ++ struct page_beancounter **pb); + + /* linux/mm/thrash.c */ + extern struct mm_struct * swap_token_mm; +@@ -310,7 +317,7 @@ static inline int remove_exclusive_swap_ + return 0; + } + +-static inline swp_entry_t 
get_swap_page(void) ++static inline swp_entry_t get_swap_page(struct user_beancounter *ub) + { + swp_entry_t entry; + entry.val = 0; +diff -upr linux-2.6.16.orig/include/linux/sysctl.h linux-2.6.16-026test015/include/linux/sysctl.h +--- linux-2.6.16.orig/include/linux/sysctl.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/sysctl.h 2006-07-04 14:41:39.000000000 +0400 +@@ -148,6 +148,13 @@ enum + KERN_SPIN_RETRY=70, /* int: number of spinlock retries */ + KERN_ACPI_VIDEO_FLAGS=71, /* int: flags for setting up video after ACPI sleep */ + KERN_IA64_UNALIGNED=72, /* int: ia64 unaligned userland trap enable */ ++ KERN_SILENCE_LEVEL=200, /* int: Console silence loglevel */ ++ KERN_ALLOC_FAIL_WARN=201, /* int: whether we'll print "alloc failure" */ ++ KERN_VIRT_PIDS=202, /* int: VE pids virtualization */ ++ KERN_VIRT_OSRELEASE=205,/* virtualization of utsname.release */ ++ KERN_FAIRSCHED_MAX_LATENCY=201, /* int: Max start_tag delta */ ++ KERN_VCPU_SCHED_TIMESLICE=202, ++ KERN_VCPU_TIMESLICE=203, + }; + + +@@ -397,10 +404,12 @@ enum + NET_TCP_CONG_CONTROL=110, + NET_TCP_ABC=111, + NET_IPV4_IPFRAG_MAX_DIST=112, ++ NET_TCP_USE_SG=245, + }; + + enum { + NET_IPV4_ROUTE_FLUSH=1, ++ NET_IPV4_ROUTE_SRC_CHECK=188, + NET_IPV4_ROUTE_MIN_DELAY=2, + NET_IPV4_ROUTE_MAX_DELAY=3, + NET_IPV4_ROUTE_GC_THRESH=4, +@@ -760,6 +769,12 @@ enum + FS_AIO_NR=18, /* current system-wide number of aio requests */ + FS_AIO_MAX_NR=19, /* system-wide maximum number of aio requests */ + FS_INOTIFY=20, /* inotify submenu */ ++ FS_AT_VSYSCALL=21, /* int: to announce vsyscall data */ ++}; ++ ++/* /proc/sys/debug */ ++enum { ++ DBG_DECODE_CALLTRACES = 1, /* int: decode call traces on oops */ + }; + + /* /proc/sys/fs/quota/ */ +@@ -900,6 +915,8 @@ extern int proc_doulongvec_minmax(ctl_ta + void __user *, size_t *, loff_t *); + extern int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int, + struct file *, void __user *, size_t *, loff_t *); ++extern int 
proc_doutsstring(ctl_table *table, int write, struct file *, ++ void __user *, size_t *, loff_t *); + + extern int do_sysctl (int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, +@@ -954,6 +971,8 @@ extern ctl_handler sysctl_ms_jiffies; + */ + + /* A sysctl table is an array of struct ctl_table: */ ++struct ve_struct; ++ + struct ctl_table + { + int ctl_name; /* Binary ID */ +@@ -967,6 +986,7 @@ struct ctl_table + struct proc_dir_entry *de; /* /proc control block */ + void *extra1; + void *extra2; ++ struct ve_struct *owner_env; + }; + + /* struct ctl_table_header is used to maintain dynamic lists of +@@ -983,6 +1003,9 @@ struct ctl_table_header * register_sysct + int insert_at_head); + void unregister_sysctl_table(struct ctl_table_header * table); + ++ctl_table *clone_sysctl_template(ctl_table *tmpl, int nr); ++void free_sysctl_clone(ctl_table *clone); ++ + #else /* __KERNEL__ */ + + #endif /* __KERNEL__ */ +diff -upr linux-2.6.16.orig/include/linux/tty.h linux-2.6.16-026test015/include/linux/tty.h +--- linux-2.6.16.orig/include/linux/tty.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/tty.h 2006-07-04 14:41:38.000000000 +0400 +@@ -238,8 +238,11 @@ struct tty_struct { + spinlock_t read_lock; + /* If the tty has a pending do_SAK, queue it here - akpm */ + struct work_struct SAK_work; ++ struct ve_struct *owner_env; + }; + ++DCL_VE_OWNER_PROTO(TTY, struct tty_struct, owner_env) ++ + /* tty magic number */ + #define TTY_MAGIC 0x5401 + +@@ -266,6 +269,7 @@ struct tty_struct { + #define TTY_PTY_LOCK 16 /* pty private */ + #define TTY_NO_WRITE_SPLIT 17 /* Preserve write boundaries to driver */ + #define TTY_HUPPED 18 /* Post driver->hangup() */ ++#define TTY_CHARGED 19 /* Charged as ub resource */ + + #define TTY_WRITE_FLUSH(tty) tty_write_flush((tty)) + +diff -upr linux-2.6.16.orig/include/linux/tty_driver.h linux-2.6.16-026test015/include/linux/tty_driver.h +--- linux-2.6.16.orig/include/linux/tty_driver.h 
2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/tty_driver.h 2006-07-04 14:41:38.000000000 +0400 +@@ -115,6 +115,7 @@ + * character to the device. + */ + ++#include <linux/ve_owner.h> + #include <linux/fs.h> + #include <linux/list.h> + #include <linux/cdev.h> +@@ -214,9 +215,18 @@ struct tty_driver { + unsigned int set, unsigned int clear); + + struct list_head tty_drivers; ++ struct ve_struct *owner_env; + }; + ++DCL_VE_OWNER_PROTO(TTYDRV, struct tty_driver, owner_env) ++ ++#ifdef CONFIG_LEGACY_PTYS ++extern struct tty_driver *pty_driver; ++extern struct tty_driver *pty_slave_driver; ++#endif ++ + extern struct list_head tty_drivers; ++extern rwlock_t tty_driver_guard; + + struct tty_driver *alloc_tty_driver(int lines); + void put_tty_driver(struct tty_driver *driver); +diff -upr linux-2.6.16.orig/include/linux/ve.h linux-2.6.16-026test015/include/linux/ve.h +--- linux-2.6.16.orig/include/linux/ve.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/include/linux/ve.h 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,337 @@ ++/* ++ * include/linux/ve.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#ifndef _LINUX_VE_H ++#define _LINUX_VE_H ++ ++#include <linux/config.h> ++ ++#ifndef __ENVID_T_DEFINED__ ++typedef unsigned envid_t; ++#define __ENVID_T_DEFINED__ ++#endif ++ ++#include <linux/types.h> ++#include <linux/capability.h> ++#include <linux/utsname.h> ++#include <linux/sysctl.h> ++#include <linux/vzstat.h> ++#include <linux/kobject.h> ++ ++#ifdef VZMON_DEBUG ++# define VZTRACE(fmt,args...) \ ++ printk(KERN_DEBUG fmt, ##args) ++#else ++# define VZTRACE(fmt,args...) 
++#endif /* VZMON_DEBUG */ ++ ++struct tty_driver; ++struct devpts_config; ++struct task_struct; ++struct new_utsname; ++struct file_system_type; ++struct icmp_mib; ++struct ip_mib; ++struct tcp_mib; ++struct udp_mib; ++struct linux_mib; ++struct fib_info; ++struct fib_rule; ++struct veip_struct; ++struct ve_monitor; ++ ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++struct fib_table; ++struct devcnfv4_struct; ++#ifdef CONFIG_VE_IPTABLES ++struct xt_af; ++struct xt_table; ++struct xt_target; ++struct ip_conntrack; ++typedef unsigned int (*ip_nat_helper_func)(void); ++struct ve_ip_conntrack { ++ struct list_head *_ip_conntrack_hash; ++ struct list_head _ip_conntrack_expect_list; ++ struct list_head _ip_conntrack_unconfirmed; ++ struct ip_conntrack_protocol ** _ip_ct_protos; ++ struct list_head _ip_conntrack_helpers; ++ int _ip_conntrack_max; ++ int _ip_conntrack_vmalloc; ++ atomic_t _ip_conntrack_count; ++ void (*_ip_conntrack_destroyed)(struct ip_conntrack *conntrack); ++#ifdef CONFIG_SYSCTL ++ unsigned long _ip_ct_tcp_timeouts[10]; ++ unsigned long _ip_ct_udp_timeout; ++ unsigned long _ip_ct_udp_timeout_stream; ++ unsigned long _ip_ct_icmp_timeout; ++ unsigned long _ip_ct_generic_timeout; ++ unsigned int _ip_ct_log_invalid; ++ unsigned long _ip_ct_tcp_timeout_max_retrans; ++ int _ip_ct_tcp_loose; ++ int _ip_ct_tcp_be_liberal; ++ int _ip_ct_tcp_max_retrans; ++ struct ctl_table_header *_ip_ct_sysctl_header; ++ ctl_table *_ip_ct_net_table; ++ ctl_table *_ip_ct_ipv4_table; ++ ctl_table *_ip_ct_netfilter_table; ++ ctl_table *_ip_ct_sysctl_table; ++#endif /*CONFIG_SYSCTL*/ ++ ++ struct ip_nat_protocol **_ip_nat_protos; ++ ip_nat_helper_func _ip_nat_ftp_hook; ++ ip_nat_helper_func _ip_nat_irc_hook; ++ struct list_head *_ip_nat_bysource; ++ struct xt_table *_ip_nat_table; ++ ++ /* resource accounting */ ++ struct user_beancounter *ub; ++}; ++#endif ++#endif ++ ++#define UIDHASH_BITS_VE 6 ++#define UIDHASH_SZ_VE (1 << UIDHASH_BITS_VE) ++ ++struct 
ve_cpu_stats { ++ cycles_t idle_time; ++ cycles_t iowait_time; ++ cycles_t strt_idle_time; ++ cycles_t used_time; ++ seqcount_t stat_lock; ++ int nr_running; ++ int nr_unint; ++ int nr_iowait; ++ cputime64_t user; ++ cputime64_t nice; ++ cputime64_t system; ++} ____cacheline_aligned; ++ ++struct ve_struct { ++ struct ve_struct *prev; ++ struct ve_struct *next; ++ ++ envid_t veid; ++ struct task_struct *init_entry; ++ struct list_head vetask_lh; ++ kernel_cap_t cap_default; ++ atomic_t pcounter; ++ /* ref counter to ve from ipc */ ++ atomic_t counter; ++ unsigned int class_id; ++ struct veip_struct *veip; ++ struct rw_semaphore op_sem; ++ int is_running; ++ int is_locked; ++ int virt_pids; ++ /* see vzcalluser.h for VE_FEATURE_XXX definitions */ ++ __u64 features; ++ ++/* VE's root */ ++ struct vfsmount *fs_rootmnt; ++ struct dentry *fs_root; ++ ++/* sysctl */ ++ struct new_utsname *utsname; ++ struct list_head sysctl_lh; ++ struct ctl_table_header *kern_header; ++ struct ctl_table *kern_table; ++ struct ctl_table_header *quota_header; ++ struct ctl_table *quota_table; ++ struct file_system_type *proc_fstype; ++ struct vfsmount *proc_mnt; ++ struct proc_dir_entry *proc_root; ++ struct proc_dir_entry *proc_sys_root; ++ struct proc_dir_entry *_proc_net; ++ struct proc_dir_entry *_proc_net_stat; ++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) ++ struct proc_dir_entry *_proc_net_devsnmp6; ++#endif ++ ++/* SYSV IPC */ ++ struct ipc_ids *_shm_ids; ++ struct ipc_ids *_msg_ids; ++ struct ipc_ids *_sem_ids; ++ int _used_sems; ++ int _shm_tot; ++ size_t _shm_ctlmax; ++ size_t _shm_ctlall; ++ int _shm_ctlmni; ++ int _msg_ctlmax; ++ int _msg_ctlmni; ++ int _msg_ctlmnb; ++ int _sem_ctls[4]; ++ ++/* BSD pty's */ ++ struct tty_driver *pty_driver; ++ struct tty_driver *pty_slave_driver; ++ ++#ifdef CONFIG_UNIX98_PTYS ++ struct tty_driver *ptm_driver; ++ struct tty_driver *pts_driver; ++ struct idr *allocated_ptys; ++ struct file_system_type *devpts_fstype; ++ struct 
vfsmount *devpts_mnt; ++ struct dentry *devpts_root; ++ struct devpts_config *devpts_config; ++#endif ++ ++ struct file_system_type *shmem_fstype; ++ struct vfsmount *shmem_mnt; ++#ifdef CONFIG_SYSFS ++ struct file_system_type *sysfs_fstype; ++ struct vfsmount *sysfs_mnt; ++ struct super_block *sysfs_sb; ++ struct sysfs_dirent *sysfs_root; ++#endif ++ struct subsystem *class_subsys; ++ struct subsystem *class_obj_subsys; ++ struct class *net_class; ++ ++/* User uids hash */ ++ struct list_head uidhash_table[UIDHASH_SZ_VE]; ++ ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++ struct hlist_head _net_dev_head; ++ struct hlist_head _net_dev_index_head; ++ struct net_device *_net_dev_base, **_net_dev_tail; ++ int ifindex; ++ struct net_device *_loopback_dev; ++ struct net_device *_venet_dev; ++ struct ipv4_devconf *_ipv4_devconf; ++ struct ipv4_devconf *_ipv4_devconf_dflt; ++ struct ctl_table_header *forward_header; ++ struct ctl_table *forward_table; ++ ++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) ++ struct ipv6_devconf *_ipv6_devconf; ++ struct ipv6_devconf *_ipv6_devconf_dflt; ++#endif ++#endif ++ unsigned long rt_flush_required; ++ ++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) ++ struct neigh_table *ve_nd_tbl; ++#endif ++ struct neigh_table *ve_arp_tbl; ++ ++/* per VE CPU stats*/ ++ struct timespec start_timespec; ++ u64 start_jiffies; ++ cycles_t start_cycles; ++ unsigned long avenrun[3]; /* loadavg data */ ++ ++ cycles_t cpu_used_ve; ++ struct kstat_lat_pcpu_struct sched_lat_ve; ++ ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++ struct hlist_head *_fib_info_hash; ++ struct hlist_head *_fib_info_laddrhash; ++ int _fib_hash_size; ++ int _fib_info_cnt; ++ ++ struct fib_rule *_local_rule; ++ struct fib_rule *_fib_rules; ++#ifdef CONFIG_IP_MULTIPLE_TABLES ++ /* XXX: why a magic constant? 
*/ ++ struct fib_table *_fib_tables[256]; /* RT_TABLE_MAX - for now */ ++#else ++ struct fib_table *_main_table; ++ struct fib_table *_local_table; ++#endif ++ struct icmp_mib *_icmp_statistics[2]; ++ struct ipstats_mib *_ip_statistics[2]; ++ struct tcp_mib *_tcp_statistics[2]; ++ struct udp_mib *_udp_statistics[2]; ++ struct linux_mib *_net_statistics[2]; ++ struct venet_stat *stat; ++#ifdef CONFIG_VE_IPTABLES ++/* core/netfilter.c virtualization */ ++ void *_nf_hooks; ++ struct xt_table *_ve_ipt_filter_pf; /* packet_filter struct */ ++ struct xt_table *_ve_ip6t_filter_pf; ++ struct xt_table *_ipt_mangle_table; ++ struct xt_table *_ip6t_mangle_table; ++ struct xt_af *_xt; ++ struct xt_target *_ipt_standard_target; ++ struct xt_target *_ip6t_standard_target; ++ ++ __u64 _iptables_modules; ++ struct ve_ip_conntrack *_ip_conntrack; ++#endif /* CONFIG_VE_IPTABLES */ ++ ++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) ++ struct fib6_table *_fib6_table; ++ struct ipstats_mib *_ipv6_statistics[2]; ++ struct icmpv6_mib *_icmpv6_statistics[2]; ++ struct udp_mib *_udp_stats_in6[2]; ++#endif ++#endif ++ wait_queue_head_t *_log_wait; ++ unsigned long *_log_start; ++ unsigned long *_log_end; ++ unsigned long *_logged_chars; ++ char *log_buf; ++#define VE_DEFAULT_LOG_BUF_LEN 4096 ++ ++ struct ve_cpu_stats ve_cpu_stats[NR_CPUS] ____cacheline_aligned; ++ unsigned long down_at; ++ struct list_head cleanup_list; ++ ++ unsigned long jiffies_fixup; ++ unsigned char disable_net; ++ unsigned char sparse_vpid; ++ struct ve_monitor *monitor; ++ struct proc_dir_entry *monitor_proc; ++ unsigned long meminfo_val; ++}; ++ ++#define VE_CPU_STATS(ve, cpu) (&((ve)->ve_cpu_stats[(cpu)])) ++ ++extern int nr_ve; ++ ++#ifdef CONFIG_VE ++ ++int get_device_perms_ve(int dev_type, dev_t dev, int access_mode); ++void do_env_cleanup(struct ve_struct *envid); ++void do_update_load_avg_ve(void); ++void do_env_free(struct ve_struct *ptr); ++ ++#define ve_utsname (*get_exec_env()->utsname) ++ 
++static inline struct ve_struct *get_ve(struct ve_struct *ptr) ++{ ++ if (ptr != NULL) ++ atomic_inc(&ptr->counter); ++ return ptr; ++} ++ ++static inline void put_ve(struct ve_struct *ptr) ++{ ++ if (ptr && atomic_dec_and_test(&ptr->counter)) { ++ if (atomic_read(&ptr->pcounter) > 0) ++ BUG(); ++ if (ptr->is_running) ++ BUG(); ++ do_env_free(ptr); ++ } ++} ++ ++#ifdef CONFIG_FAIRSCHED ++#define ve_cpu_online_map(ve, mask) fairsched_cpu_online_map(ve->veid, mask) ++#else ++#define ve_cpu_online_map(ve, mask) do { *(mask) = cpu_online_map; } while (0) ++#endif ++#else /* CONFIG_VE */ ++#define ve_utsname system_utsname ++#define get_ve(ve) (NULL) ++#define put_ve(ve) do { } while (0) ++#endif /* CONFIG_VE */ ++ ++#endif /* _LINUX_VE_H */ +diff -upr linux-2.6.16.orig/include/linux/ve_owner.h linux-2.6.16-026test015/include/linux/ve_owner.h +--- linux-2.6.16.orig/include/linux/ve_owner.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/include/linux/ve_owner.h 2006-07-04 14:41:37.000000000 +0400 +@@ -0,0 +1,32 @@ ++/* ++ * include/linux/ve_owner.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#ifndef __VE_OWNER_H__ ++#define __VE_OWNER_H__ ++ ++#include <linux/config.h> ++#include <linux/vmalloc.h> ++ ++ ++#define DCL_VE_OWNER(name, type, member) ++ /* prototype declares static inline functions */ ++ ++#define DCL_VE_OWNER_PROTO(name, type, member) \ ++type; \ ++static inline struct ve_struct *VE_OWNER_##name(const type *obj) \ ++{ \ ++ return obj->member; \ ++} \ ++static inline void SET_VE_OWNER_##name(type *obj, struct ve_struct *ve) \ ++{ \ ++ obj->member = ve; \ ++} ++ ++#endif /* __VE_OWNER_H__ */ +diff -upr linux-2.6.16.orig/include/linux/ve_proto.h linux-2.6.16-026test015/include/linux/ve_proto.h +--- linux-2.6.16.orig/include/linux/ve_proto.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/include/linux/ve_proto.h 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,75 @@ ++/* ++ * include/linux/ve_proto.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#ifndef __VE_H__ ++#define __VE_H__ ++ ++#ifdef CONFIG_VE ++ ++extern struct semaphore ve_call_guard; ++extern rwlock_t ve_call_lock; ++ ++#ifdef CONFIG_SYSVIPC ++extern void prepare_ipc(void); ++extern int init_ve_ipc(struct ve_struct *); ++extern void fini_ve_ipc(struct ve_struct *); ++extern void ve_ipc_cleanup(void); ++#endif ++ ++#ifdef CONFIG_UNIX98_PTYS ++extern struct tty_driver *ptm_driver; /* Unix98 pty masters; for /dev/ptmx */ ++extern struct tty_driver *pts_driver; /* Unix98 pty slaves; for /dev/ptmx */ ++#endif ++ ++extern rwlock_t tty_driver_guard; ++ ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++void ip_fragment_cleanup(struct ve_struct *envid); ++void tcp_v4_kill_ve_sockets(struct ve_struct *envid); ++struct fib_table * fib_hash_init(int id); ++int move_addr_to_kernel(void *uaddr, int ulen, void *kaddr); ++extern int main_loopback_init(struct net_device*); ++int venet_init(void); ++#endif ++ ++extern struct ve_struct *ve_list_head; 
++extern rwlock_t ve_list_guard; ++extern struct ve_struct *get_ve_by_id(envid_t); ++extern struct ve_struct *__find_ve_by_id(envid_t); ++ ++struct env_create_param2; ++extern int real_env_create(envid_t veid, unsigned flags, u32 class_id, ++ struct env_create_param2 *data, int datalen); ++ ++extern int do_setdevperms(envid_t veid, unsigned type, ++ dev_t dev, unsigned mask); ++ ++#define VE_HOOK_INIT 0 ++#define VE_HOOK_FINI 1 ++#define VE_MAX_HOOKS 2 ++ ++typedef int ve_hookfn(unsigned int hooknum, void *data); ++ ++struct ve_hook ++{ ++ struct list_head list; ++ ve_hookfn *hook; ++ ve_hookfn *undo; ++ struct module *owner; ++ int hooknum; ++ /* Functions are called in ascending priority. */ ++ int priority; ++}; ++ ++extern int ve_hook_register(struct ve_hook *vh); ++extern void ve_hook_unregister(struct ve_hook *vh); ++ ++#endif ++#endif +diff -upr linux-2.6.16.orig/include/linux/ve_task.h linux-2.6.16-026test015/include/linux/ve_task.h +--- linux-2.6.16.orig/include/linux/ve_task.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/include/linux/ve_task.h 2006-07-04 14:41:37.000000000 +0400 +@@ -0,0 +1,34 @@ ++/* ++ * include/linux/ve_task.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#ifndef __VE_TASK_H__ ++#define __VE_TASK_H__ ++ ++#include <linux/seqlock.h> ++ ++struct ve_task_info { ++/* virtualization */ ++ struct ve_struct *owner_env; ++ struct ve_struct *exec_env; ++ struct list_head vetask_list; ++ struct dentry *glob_proc_dentry; ++/* statistics: scheduling latency */ ++ cycles_t sleep_time; ++ cycles_t sched_time; ++ cycles_t sleep_stamp; ++ cycles_t wakeup_stamp; ++ seqcount_t wakeup_lock; ++}; ++ ++#define VE_TASK_INFO(task) (&(task)->ve_task_info) ++#define VE_TASK_LIST_2_TASK(lh) \ ++ list_entry(lh, struct task_struct, ve_task_info.vetask_list) ++ ++#endif /* __VE_TASK_H__ */ +diff -upr linux-2.6.16.orig/include/linux/venet.h linux-2.6.16-026test015/include/linux/venet.h +--- linux-2.6.16.orig/include/linux/venet.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/include/linux/venet.h 2006-07-04 14:41:37.000000000 +0400 +@@ -0,0 +1,70 @@ ++/* ++ * include/linux/venet.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#ifndef _VENET_H ++#define _VENET_H ++ ++#include <linux/list.h> ++#include <linux/spinlock.h> ++#include <linux/vzcalluser.h> ++ ++#define VEIP_HASH_SZ 512 ++ ++struct ve_struct; ++struct venet_stat; ++struct ip_entry_struct ++{ ++ __u32 key[4]; ++ int family; ++ struct ve_struct *active_env; ++ struct venet_stat *stat; ++ struct veip_struct *veip; ++ struct list_head ip_hash; ++ struct list_head ve_list; ++}; ++ ++struct veip_struct ++{ ++ struct list_head src_lh; ++ struct list_head dst_lh; ++ struct list_head ip_lh; ++ struct list_head list; ++ envid_t veid; ++}; ++ ++/* veip_hash_lock should be taken for write by caller */ ++void ip_entry_hash(struct ip_entry_struct *entry, struct veip_struct *veip); ++/* veip_hash_lock should be taken for write by caller */ ++void ip_entry_unhash(struct ip_entry_struct *entry); ++/* veip_hash_lock should be taken for read by caller */ ++struct ip_entry_struct *ip_entry_lookup(u32 addr); ++struct ip_entry_struct *venet_entry_lookup(u32 *addr, int family); ++ ++/* veip_hash_lock should be taken for read by caller */ ++struct veip_struct *veip_find(envid_t veid); ++/* veip_hash_lock should be taken for write by caller */ ++struct veip_struct *veip_findcreate(envid_t veid); ++/* veip_hash_lock should be taken for write by caller */ ++void veip_put(struct veip_struct *veip); ++ ++int veip_start(struct ve_struct *ve); ++void veip_stop(struct ve_struct *ve); ++int veip_entry_add(struct ve_struct *ve, struct sockaddr *addr); ++int veip_entry_del(envid_t veid, struct sockaddr *addr); ++int venet_change_skb_owner(struct sk_buff *skb); ++ ++extern struct list_head ip_entry_hash_table[]; ++extern rwlock_t veip_hash_lock; ++ ++#ifdef CONFIG_PROC_FS ++int veip_seq_show(struct seq_file *m, void *v); ++#endif ++ ++#endif +diff -upr linux-2.6.16.orig/include/linux/veprintk.h linux-2.6.16-026test015/include/linux/veprintk.h +--- linux-2.6.16.orig/include/linux/veprintk.h 2006-07-04 14:41:41.000000000 +0400 ++++ 
linux-2.6.16-026test015/include/linux/veprintk.h 2006-07-04 14:41:38.000000000 +0400 +@@ -0,0 +1,38 @@ ++/* ++ * include/linux/veprintk.h ++ * ++ * Copyright (C) 2006 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#ifndef __VE_PRINTK_H__ ++#define __VE_PRINTK_H__ ++ ++#ifdef CONFIG_VE ++ ++#define ve_log_wait (*(get_exec_env()->_log_wait)) ++#define ve_log_start (*(get_exec_env()->_log_start)) ++#define ve_log_end (*(get_exec_env()->_log_end)) ++#define ve_logged_chars (*(get_exec_env()->_logged_chars)) ++#define ve_log_buf (get_exec_env()->log_buf) ++#define ve_log_buf_len (ve_is_super(get_exec_env()) ? \ ++ log_buf_len : VE_DEFAULT_LOG_BUF_LEN) ++#define VE_LOG_BUF_MASK (ve_log_buf_len - 1) ++#define VE_LOG_BUF(idx) (ve_log_buf[(idx) & VE_LOG_BUF_MASK]) ++ ++#else ++ ++#define ve_log_wait log_wait ++#define ve_log_start log_start ++#define ve_log_end log_end ++#define ve_logged_chars logged_chars ++#define ve_log_buf log_buf ++#define ve_log_buf_len log_buf_len ++#define VE_LOG_BUF_MASK LOG_BUF_MASK ++#define VE_LOG_BUF(idx) LOG_BUF(idx) ++ ++#endif /* CONFIG_VE */ ++#endif /* __VE_PRINTK_H__ */ +diff -upr linux-2.6.16.orig/include/linux/virtinfo.h linux-2.6.16-026test015/include/linux/virtinfo.h +--- linux-2.6.16.orig/include/linux/virtinfo.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/include/linux/virtinfo.h 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,52 @@ ++/* ++ * include/linux/virtinfo.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#ifndef __LINUX_VIRTINFO_H ++#define __LINUX_VIRTINFO_H ++ ++#include <linux/kernel.h> ++#include <linux/page-flags.h> ++#include <linux/rwsem.h> ++#include <linux/notifier.h> ++ ++struct vnotifier_block ++{ ++ int (*notifier_call)(struct vnotifier_block *self, ++ unsigned long, void *, int); ++ struct vnotifier_block *next; ++ int priority; ++}; ++ ++void virtinfo_notifier_register(int type, struct vnotifier_block *nb); ++void virtinfo_notifier_unregister(int type, struct vnotifier_block *nb); ++int virtinfo_notifier_call(int type, unsigned long n, void *data); ++ ++struct meminfo { ++ struct sysinfo si; ++ unsigned long active, inactive; ++ unsigned long cache, swapcache; ++ unsigned long committed_space; ++ unsigned long allowed; ++ struct page_state ps; ++ unsigned long vmalloc_total, vmalloc_used, vmalloc_largest; ++}; ++ ++#define VIRTINFO_MEMINFO 0 ++#define VIRTINFO_ENOUGHMEM 1 ++ ++enum virt_info_types { ++ VITYPE_GENERAL, ++ VITYPE_FAUDIT, ++ VITYPE_QUOTA, ++ ++ VIRT_TYPES ++}; ++ ++#endif /* __LINUX_VIRTINFO_H */ +diff -upr linux-2.6.16.orig/include/linux/vmalloc.h linux-2.6.16-026test015/include/linux/vmalloc.h +--- linux-2.6.16.orig/include/linux/vmalloc.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/linux/vmalloc.h 2006-07-04 14:41:37.000000000 +0400 +@@ -18,6 +18,10 @@ + #define IOREMAP_MAX_ORDER (7 + PAGE_SHIFT) /* 128 pages */ + #endif + ++/* align size to 2^n page boundary */ ++#define POWER2_PAGE_ALIGN(size) \ ++ ((typeof(size))(1UL << (PAGE_SHIFT + get_order(size)))) ++ + struct vm_struct { + void *addr; + unsigned long size; +@@ -32,10 +36,14 @@ struct vm_struct { + * Highlevel APIs for driver use + */ + extern void *vmalloc(unsigned long size); ++extern void *ub_vmalloc(unsigned long size); + extern void *vmalloc_node(unsigned long size, int node); ++extern void *ub_vmalloc_node(unsigned long size, int node); + extern void *vmalloc_exec(unsigned long size); + extern void *vmalloc_32(unsigned long 
size); + extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot); ++extern void *vmalloc_best(unsigned long size); ++extern void *ub_vmalloc_best(unsigned long size); + extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, + pgprot_t prot); + extern void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, +@@ -52,6 +60,9 @@ extern void vunmap(void *addr); + extern struct vm_struct *get_vm_area(unsigned long size, unsigned long flags); + extern struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, + unsigned long start, unsigned long end); ++extern struct vm_struct * get_vm_area_best(unsigned long size, ++ unsigned long flags); ++extern void vprintstat(void); + extern struct vm_struct *get_vm_area_node(unsigned long size, + unsigned long flags, int node); + extern struct vm_struct *remove_vm_area(void *addr); +diff -upr linux-2.6.16.orig/include/linux/vsched.h linux-2.6.16-026test015/include/linux/vsched.h +--- linux-2.6.16.orig/include/linux/vsched.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/include/linux/vsched.h 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,26 @@ ++/* ++ * include/linux/vsched.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#ifndef __VSCHED_H__ ++#define __VSCHED_H__ ++ ++#include <linux/config.h> ++#include <linux/cache.h> ++#include <linux/fairsched.h> ++#include <linux/sched.h> ++ ++extern int vsched_create(int id, struct fairsched_node *node); ++extern int vsched_destroy(struct vcpu_scheduler *vsched); ++ ++extern int vsched_mvpr(struct task_struct *p, struct vcpu_scheduler *vsched); ++ ++extern int vcpu_online(int cpu); ++ ++#endif +diff -upr linux-2.6.16.orig/include/linux/vzcalluser.h linux-2.6.16-026test015/include/linux/vzcalluser.h +--- linux-2.6.16.orig/include/linux/vzcalluser.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/include/linux/vzcalluser.h 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,228 @@ ++/* ++ * include/linux/vzcalluser.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#ifndef _LINUX_VZCALLUSER_H ++#define _LINUX_VZCALLUSER_H ++ ++#include <linux/types.h> ++#include <linux/ioctl.h> ++ ++#define KERN_VZ_PRIV_RANGE 51 ++ ++#ifndef __ENVID_T_DEFINED__ ++typedef unsigned envid_t; ++#define __ENVID_T_DEFINED__ ++#endif ++ ++/* ++ * VE management ioctls ++ */ ++ ++struct vzctl_old_env_create { ++ envid_t veid; ++ unsigned flags; ++#define VE_CREATE 1 /* Create VE, VE_ENTER added automatically */ ++#define VE_EXCLUSIVE 2 /* Fail if exists */ ++#define VE_ENTER 4 /* Enter existing VE */ ++#define VE_TEST 8 /* Test if VE exists */ ++#define VE_LOCK 16 /* Do not allow entering created VE */ ++#define VE_SKIPLOCK 32 /* Allow entering embrion VE */ ++ __u32 addr; ++}; ++ ++struct vzctl_mark_env_to_down { ++ envid_t veid; ++}; ++ ++struct vzctl_setdevperms { ++ envid_t veid; ++ unsigned type; ++#define VE_USE_MAJOR 010 /* Test MAJOR supplied in rule */ ++#define VE_USE_MINOR 030 /* Test MINOR supplied in rule */ ++#define VE_USE_MASK 030 /* Testing mask, VE_USE_MAJOR|VE_USE_MINOR */ ++ unsigned dev; ++ unsigned mask; ++}; ++ ++struct 
vzctl_ve_netdev { ++ envid_t veid; ++ int op; ++#define VE_NETDEV_ADD 1 ++#define VE_NETDEV_DEL 2 ++ char *dev_name; ++}; ++ ++struct vzctl_ve_meminfo { ++ envid_t veid; ++ unsigned long val; ++}; ++ ++/* these masks represent modules */ ++#define VE_IP_IPTABLES_MOD (1U<<0) ++#define VE_IP_FILTER_MOD (1U<<1) ++#define VE_IP_MANGLE_MOD (1U<<2) ++#define VE_IP_MATCH_LIMIT_MOD (1U<<3) ++#define VE_IP_MATCH_MULTIPORT_MOD (1U<<4) ++#define VE_IP_MATCH_TOS_MOD (1U<<5) ++#define VE_IP_TARGET_TOS_MOD (1U<<6) ++#define VE_IP_TARGET_REJECT_MOD (1U<<7) ++#define VE_IP_TARGET_TCPMSS_MOD (1U<<8) ++#define VE_IP_MATCH_TCPMSS_MOD (1U<<9) ++#define VE_IP_MATCH_TTL_MOD (1U<<10) ++#define VE_IP_TARGET_LOG_MOD (1U<<11) ++#define VE_IP_MATCH_LENGTH_MOD (1U<<12) ++#define VE_IP_CONNTRACK_MOD (1U<<14) ++#define VE_IP_CONNTRACK_FTP_MOD (1U<<15) ++#define VE_IP_CONNTRACK_IRC_MOD (1U<<16) ++#define VE_IP_MATCH_CONNTRACK_MOD (1U<<17) ++#define VE_IP_MATCH_STATE_MOD (1U<<18) ++#define VE_IP_MATCH_HELPER_MOD (1U<<19) ++#define VE_IP_NAT_MOD (1U<<20) ++#define VE_IP_NAT_FTP_MOD (1U<<21) ++#define VE_IP_NAT_IRC_MOD (1U<<22) ++#define VE_IP_TARGET_REDIRECT_MOD (1U<<23) ++ ++/* these masks represent modules with their dependences */ ++#define VE_IP_IPTABLES (VE_IP_IPTABLES_MOD) ++#define VE_IP_FILTER (VE_IP_FILTER_MOD \ ++ | VE_IP_IPTABLES) ++#define VE_IP_MANGLE (VE_IP_MANGLE_MOD \ ++ | VE_IP_IPTABLES) ++#define VE_IP_MATCH_LIMIT (VE_IP_MATCH_LIMIT_MOD \ ++ | VE_IP_IPTABLES) ++#define VE_IP_MATCH_MULTIPORT (VE_IP_MATCH_MULTIPORT_MOD \ ++ | VE_IP_IPTABLES) ++#define VE_IP_MATCH_TOS (VE_IP_MATCH_TOS_MOD \ ++ | VE_IP_IPTABLES) ++#define VE_IP_TARGET_TOS (VE_IP_TARGET_TOS_MOD \ ++ | VE_IP_IPTABLES) ++#define VE_IP_TARGET_REJECT (VE_IP_TARGET_REJECT_MOD \ ++ | VE_IP_IPTABLES) ++#define VE_IP_TARGET_TCPMSS (VE_IP_TARGET_TCPMSS_MOD \ ++ | VE_IP_IPTABLES) ++#define VE_IP_MATCH_TCPMSS (VE_IP_MATCH_TCPMSS_MOD \ ++ | VE_IP_IPTABLES) ++#define VE_IP_MATCH_TTL (VE_IP_MATCH_TTL_MOD \ ++ | VE_IP_IPTABLES) 
++#define VE_IP_TARGET_LOG (VE_IP_TARGET_LOG_MOD \ ++ | VE_IP_IPTABLES) ++#define VE_IP_MATCH_LENGTH (VE_IP_MATCH_LENGTH_MOD \ ++ | VE_IP_IPTABLES) ++#define VE_IP_CONNTRACK (VE_IP_CONNTRACK_MOD \ ++ | VE_IP_IPTABLES) ++#define VE_IP_CONNTRACK_FTP (VE_IP_CONNTRACK_FTP_MOD \ ++ | VE_IP_CONNTRACK) ++#define VE_IP_CONNTRACK_IRC (VE_IP_CONNTRACK_IRC_MOD \ ++ | VE_IP_CONNTRACK) ++#define VE_IP_MATCH_CONNTRACK (VE_IP_MATCH_CONNTRACK_MOD \ ++ | VE_IP_CONNTRACK) ++#define VE_IP_MATCH_STATE (VE_IP_MATCH_STATE_MOD \ ++ | VE_IP_CONNTRACK) ++#define VE_IP_MATCH_HELPER (VE_IP_MATCH_HELPER_MOD \ ++ | VE_IP_CONNTRACK) ++#define VE_IP_NAT (VE_IP_NAT_MOD \ ++ | VE_IP_CONNTRACK) ++#define VE_IP_NAT_FTP (VE_IP_NAT_FTP_MOD \ ++ | VE_IP_NAT | VE_IP_CONNTRACK_FTP) ++#define VE_IP_NAT_IRC (VE_IP_NAT_IRC_MOD \ ++ | VE_IP_NAT | VE_IP_CONNTRACK_IRC) ++#define VE_IP_TARGET_REDIRECT (VE_IP_TARGET_REDIRECT_MOD \ ++ | VE_IP_NAT) ++ ++/* safe iptables mask to be used by default */ ++#define VE_IP_DEFAULT \ ++ (VE_IP_IPTABLES | \ ++ VE_IP_FILTER | VE_IP_MANGLE | \ ++ VE_IP_MATCH_LIMIT | VE_IP_MATCH_MULTIPORT | \ ++ VE_IP_MATCH_TOS | VE_IP_TARGET_REJECT | \ ++ VE_IP_TARGET_TCPMSS | VE_IP_MATCH_TCPMSS | \ ++ VE_IP_MATCH_TTL | VE_IP_MATCH_LENGTH) ++ ++#define VE_IPT_CMP(x,y) (((x) & (y)) == (y)) ++ ++struct vzctl_env_create_cid { ++ envid_t veid; ++ unsigned flags; ++ __u32 class_id; ++}; ++ ++struct vzctl_env_create { ++ envid_t veid; ++ unsigned flags; ++ __u32 class_id; ++}; ++ ++struct env_create_param { ++ __u64 iptables_mask; ++}; ++ ++#define VZCTL_ENV_CREATE_DATA_MINLEN sizeof(struct env_create_param) ++ ++struct env_create_param2 { ++ __u64 iptables_mask; ++ __u64 feature_mask; ++#define VE_FEATURE_SYSFS (1ULL << 0) ++ __u32 total_vcpus; /* 0 - don't care, same as in host */ ++}; ++#define VZCTL_ENV_CREATE_DATA_MAXLEN sizeof(struct env_create_param2) ++ ++typedef struct env_create_param2 env_create_param_t; ++ ++struct vzctl_env_create_data { ++ envid_t veid; ++ unsigned flags; ++ __u32 
class_id; ++ env_create_param_t *data; ++ int datalen; ++}; ++ ++struct vz_load_avg { ++ int val_int; ++ int val_frac; ++}; ++ ++struct vz_cpu_stat { ++ unsigned long user_jif; ++ unsigned long nice_jif; ++ unsigned long system_jif; ++ unsigned long uptime_jif; ++ __u64 idle_clk; ++ __u64 strv_clk; ++ __u64 uptime_clk; ++ struct vz_load_avg avenrun[3]; /* loadavg data */ ++}; ++ ++struct vzctl_cpustatctl { ++ envid_t veid; ++ struct vz_cpu_stat *cpustat; ++}; ++ ++#define VZCTLTYPE '.' ++#define VZCTL_OLD_ENV_CREATE _IOW(VZCTLTYPE, 0, \ ++ struct vzctl_old_env_create) ++#define VZCTL_MARK_ENV_TO_DOWN _IOW(VZCTLTYPE, 1, \ ++ struct vzctl_mark_env_to_down) ++#define VZCTL_SETDEVPERMS _IOW(VZCTLTYPE, 2, \ ++ struct vzctl_setdevperms) ++#define VZCTL_ENV_CREATE_CID _IOW(VZCTLTYPE, 4, \ ++ struct vzctl_env_create_cid) ++#define VZCTL_ENV_CREATE _IOW(VZCTLTYPE, 5, \ ++ struct vzctl_env_create) ++#define VZCTL_GET_CPU_STAT _IOW(VZCTLTYPE, 6, \ ++ struct vzctl_cpustatctl) ++#define VZCTL_ENV_CREATE_DATA _IOW(VZCTLTYPE, 10, \ ++ struct vzctl_env_create_data) ++#define VZCTL_VE_NETDEV _IOW(VZCTLTYPE, 11, \ ++ struct vzctl_ve_netdev) ++#define VZCTL_VE_MEMINFO _IOW(VZCTLTYPE, 13, \ ++ struct vzctl_ve_meminfo) ++ ++ ++#endif +diff -upr linux-2.6.16.orig/include/linux/vzctl.h linux-2.6.16-026test015/include/linux/vzctl.h +--- linux-2.6.16.orig/include/linux/vzctl.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/include/linux/vzctl.h 2006-07-04 14:41:37.000000000 +0400 +@@ -0,0 +1,30 @@ ++/* ++ * include/linux/vzctl.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#ifndef _LINUX_VZCTL_H ++#define _LINUX_VZCTL_H ++ ++#include <linux/list.h> ++ ++struct module; ++struct inode; ++struct file; ++struct vzioctlinfo { ++ unsigned type; ++ int (*func)(struct inode *, struct file *, ++ unsigned int, unsigned long); ++ struct module *owner; ++ struct list_head list; ++}; ++ ++extern void vzioctl_register(struct vzioctlinfo *inf); ++extern void vzioctl_unregister(struct vzioctlinfo *inf); ++ ++#endif +diff -upr linux-2.6.16.orig/include/linux/vzctl_quota.h linux-2.6.16-026test015/include/linux/vzctl_quota.h +--- linux-2.6.16.orig/include/linux/vzctl_quota.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/include/linux/vzctl_quota.h 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,43 @@ ++/* ++ * include/linux/vzctl_quota.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#ifndef __LINUX_VZCTL_QUOTA_H__ ++#define __LINUX_VZCTL_QUOTA_H__ ++ ++/* ++ * Quota management ioctl ++ */ ++ ++struct vz_quota_stat; ++struct vzctl_quotactl { ++ int cmd; ++ unsigned int quota_id; ++ struct vz_quota_stat *qstat; ++ char *ve_root; ++}; ++ ++struct vzctl_quotaugidctl { ++ int cmd; /* subcommand */ ++ unsigned int quota_id; /* quota id where it applies to */ ++ unsigned int ugid_index;/* for reading statistic. 
index of first ++ uid/gid record to read */ ++ unsigned int ugid_size; /* size of ugid_buf array */ ++ void *addr; /* user-level buffer */ ++}; ++ ++#define VZDQCTLTYPE '+' ++#define VZCTL_QUOTA_CTL _IOWR(VZDQCTLTYPE, 1, \ ++ struct vzctl_quotactl) ++#define VZCTL_QUOTA_NEW_CTL _IOWR(VZDQCTLTYPE, 2, \ ++ struct vzctl_quotactl) ++#define VZCTL_QUOTA_UGID_CTL _IOWR(VZDQCTLTYPE, 3, \ ++ struct vzctl_quotaugidctl) ++ ++#endif /* __LINUX_VZCTL_QUOTA_H__ */ +diff -upr linux-2.6.16.orig/include/linux/vzctl_venet.h linux-2.6.16-026test015/include/linux/vzctl_venet.h +--- linux-2.6.16.orig/include/linux/vzctl_venet.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/include/linux/vzctl_venet.h 2006-07-04 14:41:37.000000000 +0400 +@@ -0,0 +1,36 @@ ++/* ++ * include/linux/vzctl_venet.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#ifndef _VZCTL_VENET_H ++#define _VZCTL_VENET_H ++ ++#include <linux/types.h> ++#include <linux/ioctl.h> ++ ++#ifndef __ENVID_T_DEFINED__ ++typedef unsigned envid_t; ++#define __ENVID_T_DEFINED__ ++#endif ++ ++struct vzctl_ve_ip_map { ++ envid_t veid; ++ int op; ++#define VE_IP_ADD 1 ++#define VE_IP_DEL 2 ++ struct sockaddr *addr; ++ int addrlen; ++}; ++ ++#define VENETCTLTYPE '(' ++ ++#define VENETCTL_VE_IP_MAP _IOW(VENETCTLTYPE, 3, \ ++ struct vzctl_ve_ip_map) ++ ++#endif +diff -upr linux-2.6.16.orig/include/linux/vzctl_veth.h linux-2.6.16-026test015/include/linux/vzctl_veth.h +--- linux-2.6.16.orig/include/linux/vzctl_veth.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/include/linux/vzctl_veth.h 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,40 @@ ++/* ++ * include/linux/vzctl_veth.h ++ * ++ * Copyright (C) 2006 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#ifndef _VZCTL_VETH_H ++#define _VZCTL_VETH_H ++ ++#include <linux/types.h> ++#include <linux/ioctl.h> ++ ++#ifndef __ENVID_T_DEFINED__ ++typedef unsigned envid_t; ++#define __ENVID_T_DEFINED__ ++#endif ++ ++struct vzctl_ve_hwaddr { ++ envid_t veid; ++ int op; ++#define VE_ETH_ADD 1 ++#define VE_ETH_DEL 2 ++ unsigned char dev_addr[6]; ++ int addrlen; ++ char dev_name[16]; ++ unsigned char dev_addr_ve[6]; ++ int addrlen_ve; ++ char dev_name_ve[16]; ++}; ++ ++#define VETHCTLTYPE '[' ++ ++#define VETHCTL_VE_HWADDR _IOW(VETHCTLTYPE, 3, \ ++ struct vzctl_ve_hwaddr) ++ ++#endif +diff -upr linux-2.6.16.orig/include/linux/vzdq_tree.h linux-2.6.16-026test015/include/linux/vzdq_tree.h +--- linux-2.6.16.orig/include/linux/vzdq_tree.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/include/linux/vzdq_tree.h 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,99 @@ ++/* ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ * This file contains Virtuozzo disk quota tree definition ++ */ ++ ++#ifndef _VZDQ_TREE_H ++#define _VZDQ_TREE_H ++ ++#include <linux/list.h> ++#include <asm/string.h> ++ ++typedef unsigned int quotaid_t; ++#define QUOTAID_BITS 32 ++#define QUOTAID_BBITS 4 ++#define QUOTAID_EBITS 8 ++ ++#if QUOTAID_EBITS % QUOTAID_BBITS ++#error Quota bit assumption failure ++#endif ++ ++#define QUOTATREE_BSIZE (1 << QUOTAID_BBITS) ++#define QUOTATREE_BMASK (QUOTATREE_BSIZE - 1) ++#define QUOTATREE_DEPTH ((QUOTAID_BITS + QUOTAID_BBITS - 1) \ ++ / QUOTAID_BBITS) ++#define QUOTATREE_EDEPTH ((QUOTAID_BITS + QUOTAID_EBITS - 1) \ ++ / QUOTAID_EBITS) ++#define QUOTATREE_BSHIFT(lvl) ((QUOTATREE_DEPTH - (lvl) - 1) * QUOTAID_BBITS) ++ ++/* ++ * Depth of keeping unused node (not inclusive). ++ * 0 means release all nodes including root, ++ * QUOTATREE_DEPTH means never release nodes. 
++ * Current value: release all nodes strictly after QUOTATREE_EDEPTH ++ * (measured in external shift units). ++ */ ++#define QUOTATREE_CDEPTH (QUOTATREE_DEPTH \ ++ - 2 * QUOTATREE_DEPTH / QUOTATREE_EDEPTH \ ++ + 1) ++ ++/* ++ * Levels 0..(QUOTATREE_DEPTH-1) are tree nodes. ++ * On level i the maximal number of nodes is 2^(i*QUOTAID_BBITS), ++ * and each node contains 2^QUOTAID_BBITS pointers. ++ * Level 0 is a (single) tree root node. ++ * ++ * Nodes of level (QUOTATREE_DEPTH-1) contain pointers to caller's data. ++ * Nodes of lower levels contain pointers to nodes. ++ * ++ * Double pointer in array of i-level node, pointing to a (i+1)-level node ++ * (such as inside quotatree_find_state) are marked by level (i+1), not i. ++ * Level 0 double pointer is a pointer to root inside tree struct. ++ * ++ * The tree is permanent, i.e. all index blocks allocated are keeped alive to ++ * preserve the blocks numbers in the quota file tree to keep its changes ++ * locally. ++ */ ++struct quotatree_node { ++ struct list_head list; ++ quotaid_t num; ++ void *blocks[QUOTATREE_BSIZE]; ++}; ++ ++struct quotatree_level { ++ struct list_head usedlh, freelh; ++ quotaid_t freenum; ++}; ++ ++struct quotatree_tree { ++ struct quotatree_level levels[QUOTATREE_DEPTH]; ++ struct quotatree_node *root; ++ unsigned int leaf_num; ++}; ++ ++struct quotatree_find_state { ++ void **block; ++ int level; ++}; ++ ++/* number of leafs (objects) and leaf level of the tree */ ++#define QTREE_LEAFNUM(tree) ((tree)->leaf_num) ++#define QTREE_LEAFLVL(tree) (&(tree)->levels[QUOTATREE_DEPTH - 1]) ++ ++struct quotatree_tree *quotatree_alloc(void); ++void *quotatree_find(struct quotatree_tree *tree, quotaid_t id, ++ struct quotatree_find_state *st); ++int quotatree_insert(struct quotatree_tree *tree, quotaid_t id, ++ struct quotatree_find_state *st, void *data); ++void quotatree_remove(struct quotatree_tree *tree, quotaid_t id); ++void quotatree_free(struct quotatree_tree *tree, void (*dtor)(void *)); ++void 
*quotatree_get_next(struct quotatree_tree *tree, quotaid_t id); ++void *quotatree_leaf_byindex(struct quotatree_tree *tree, unsigned int index); ++ ++#endif /* _VZDQ_TREE_H */ ++ +diff -upr linux-2.6.16.orig/include/linux/vzquota.h linux-2.6.16-026test015/include/linux/vzquota.h +--- linux-2.6.16.orig/include/linux/vzquota.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/include/linux/vzquota.h 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,291 @@ ++/* ++ * ++ * Copyright (C) 2001-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ * This file contains Virtuozzo disk quota implementation ++ */ ++ ++#ifndef _VZDQUOTA_H ++#define _VZDQUOTA_H ++ ++#include <linux/types.h> ++#include <linux/quota.h> ++ ++/* vzquotactl syscall commands */ ++#define VZ_DQ_CREATE 5 /* create quota master block */ ++#define VZ_DQ_DESTROY 6 /* destroy qmblk */ ++#define VZ_DQ_ON 7 /* mark dentry with already created qmblk */ ++#define VZ_DQ_OFF 8 /* remove mark, don't destroy qmblk */ ++#define VZ_DQ_SETLIMIT 9 /* set new limits */ ++#define VZ_DQ_GETSTAT 10 /* get usage statistic */ ++/* set of syscalls to maintain UGID quotas */ ++#define VZ_DQ_UGID_GETSTAT 1 /* get usage/limits for ugid(s) */ ++#define VZ_DQ_UGID_ADDSTAT 2 /* set usage/limits statistic for ugid(s) */ ++#define VZ_DQ_UGID_GETGRACE 3 /* get expire times */ ++#define VZ_DQ_UGID_SETGRACE 4 /* set expire times */ ++#define VZ_DQ_UGID_GETCONFIG 5 /* get ugid_max limit, cnt, flags of qmblk */ ++#define VZ_DQ_UGID_SETCONFIG 6 /* set ugid_max limit, flags of qmblk */ ++#define VZ_DQ_UGID_SETLIMIT 7 /* set ugid B/I limits */ ++#define VZ_DQ_UGID_SETINFO 8 /* set ugid info */ ++ ++/* common structure for vz and ugid quota */ ++struct dq_stat { ++ /* blocks limits */ ++ __u64 bhardlimit; /* absolute limit in bytes */ ++ __u64 bsoftlimit; /* preferred limit in bytes */ ++ time_t btime; /* time limit for excessive disk use */ ++ __u64 bcurrent; /* current bytes 
count */ ++ /* inodes limits */ ++ __u32 ihardlimit; /* absolute limit on allocated inodes */ ++ __u32 isoftlimit; /* preferred inode limit */ ++ time_t itime; /* time limit for excessive inode use */ ++ __u32 icurrent; /* current # allocated inodes */ ++}; ++ ++/* One second resolution for grace times */ ++#define CURRENT_TIME_SECONDS (get_seconds()) ++ ++/* Values for dq_info->flags */ ++#define VZ_QUOTA_INODES 0x01 /* inodes limit warning printed */ ++#define VZ_QUOTA_SPACE 0x02 /* space limit warning printed */ ++ ++struct dq_info { ++ time_t bexpire; /* expire timeout for excessive disk use */ ++ time_t iexpire; /* expire timeout for excessive inode use */ ++ unsigned flags; /* see previos defines */ ++}; ++ ++struct vz_quota_stat { ++ struct dq_stat dq_stat; ++ struct dq_info dq_info; ++}; ++ ++/* UID/GID interface record - for user-kernel level exchange */ ++struct vz_quota_iface { ++ unsigned int qi_id; /* UID/GID this applies to */ ++ unsigned int qi_type; /* USRQUOTA|GRPQUOTA */ ++ struct dq_stat qi_stat; /* limits, options, usage stats */ ++}; ++ ++/* values for flags and dq_flags */ ++/* this flag is set if the userspace has been unable to provide usage ++ * information about all ugids ++ * if the flag is set, we don't allocate new UG quota blocks (their ++ * current usage is unknown) or free existing UG quota blocks (not to ++ * lose information that this block is ok) */ ++#define VZDQUG_FIXED_SET 0x01 ++/* permit to use ugid quota */ ++#define VZDQUG_ON 0x02 ++#define VZDQ_USRQUOTA 0x10 ++#define VZDQ_GRPQUOTA 0x20 ++#define VZDQ_NOACT 0x1000 /* not actual */ ++#define VZDQ_NOQUOT 0x2000 /* not under quota tree */ ++ ++struct vz_quota_ugid_stat { ++ unsigned int limit; /* max amount of ugid records */ ++ unsigned int count; /* amount of ugid records */ ++ unsigned int flags; ++}; ++ ++struct vz_quota_ugid_setlimit { ++ unsigned int type; /* quota type (USR/GRP) */ ++ unsigned int id; /* ugid */ ++ struct if_dqblk dqb; /* limits info */ ++}; ++ 
++struct vz_quota_ugid_setinfo { ++ unsigned int type; /* quota type (USR/GRP) */ ++ struct if_dqinfo dqi; /* grace info */ ++}; ++ ++#ifdef __KERNEL__ ++#include <linux/list.h> ++#include <asm/atomic.h> ++#include <asm/semaphore.h> ++#include <linux/time.h> ++#include <linux/vzquota_qlnk.h> ++#include <linux/vzdq_tree.h> ++ ++/* Values for dq_info flags */ ++#define VZ_QUOTA_INODES 0x01 /* inodes limit warning printed */ ++#define VZ_QUOTA_SPACE 0x02 /* space limit warning printed */ ++ ++/* values for dq_state */ ++#define VZDQ_STARTING 0 /* created, not turned on yet */ ++#define VZDQ_WORKING 1 /* quota created, turned on */ ++#define VZDQ_STOPING 2 /* created, turned on and off */ ++ ++/* master quota record - one per veid */ ++struct vz_quota_master { ++ struct list_head dq_hash; /* next quota in hash list */ ++ atomic_t dq_count; /* inode reference count */ ++ unsigned int dq_flags; /* see VZDQUG_FIXED_SET */ ++ unsigned int dq_state; /* see values above */ ++ unsigned int dq_id; /* VEID this applies to */ ++ struct dq_stat dq_stat; /* limits, grace, usage stats */ ++ struct dq_info dq_info; /* grace times and flags */ ++ spinlock_t dq_data_lock; /* for dq_stat */ ++ ++ struct semaphore dq_sem; /* semaphore to protect ++ ugid tree */ ++ ++ struct list_head dq_ilink_list; /* list of vz_quota_ilink */ ++ struct quotatree_tree *dq_uid_tree; /* vz_quota_ugid tree for UIDs */ ++ struct quotatree_tree *dq_gid_tree; /* vz_quota_ugid tree for GIDs */ ++ unsigned int dq_ugid_count; /* amount of ugid records */ ++ unsigned int dq_ugid_max; /* max amount of ugid records */ ++ struct dq_info dq_ugid_info[MAXQUOTAS]; /* ugid grace times */ ++ ++ struct dentry *dq_root_dentry;/* dentry of fs tree */ ++ struct vfsmount *dq_root_mnt; /* vfsmnt of this dentry */ ++ struct super_block *dq_sb; /* superblock of our quota root */ ++}; ++ ++/* UID/GID quota record - one per pair (quota_master, uid or gid) */ ++struct vz_quota_ugid { ++ unsigned int qugid_id; /* UID/GID this 
applies to */ ++ struct dq_stat qugid_stat; /* limits, options, usage stats */ ++ int qugid_type; /* USRQUOTA|GRPQUOTA */ ++ atomic_t qugid_count; /* reference count */ ++}; ++ ++#define VZ_QUOTA_UGBAD ((struct vz_quota_ugid *)0xfeafea11) ++ ++struct vz_quota_datast { ++ struct vz_quota_ilink qlnk; ++}; ++ ++#define VIRTINFO_QUOTA_GETSTAT 0 ++#define VIRTINFO_QUOTA_ON 1 ++#define VIRTINFO_QUOTA_OFF 2 ++ ++struct virt_info_quota { ++ struct super_block *super; ++ struct dq_stat *qstat; ++}; ++ ++/* ++ * Interface to VZ quota core ++ */ ++#define INODE_QLNK(inode) (&(inode)->i_qlnk) ++#define QLNK_INODE(qlnk) container_of((qlnk), struct inode, i_qlnk) ++ ++#define VZ_QUOTA_BAD ((struct vz_quota_master *)0xefefefef) ++ ++#define VZ_QUOTAO_SETE 1 ++#define VZ_QUOTAO_INIT 2 ++#define VZ_QUOTAO_DESTR 3 ++#define VZ_QUOTAO_SWAP 4 ++#define VZ_QUOTAO_INICAL 5 ++#define VZ_QUOTAO_DRCAL 6 ++#define VZ_QUOTAO_QSET 7 ++#define VZ_QUOTAO_TRANS 8 ++#define VZ_QUOTAO_ACT 9 ++#define VZ_QUOTAO_DTREE 10 ++#define VZ_QUOTAO_DET 11 ++#define VZ_QUOTAO_ON 12 ++ ++extern struct semaphore vz_quota_sem; ++void inode_qmblk_lock(struct super_block *sb); ++void inode_qmblk_unlock(struct super_block *sb); ++void qmblk_data_read_lock(struct vz_quota_master *qmblk); ++void qmblk_data_read_unlock(struct vz_quota_master *qmblk); ++void qmblk_data_write_lock(struct vz_quota_master *qmblk); ++void qmblk_data_write_unlock(struct vz_quota_master *qmblk); ++ ++/* for quota operations */ ++void vzquota_inode_init_call(struct inode *inode); ++void vzquota_inode_drop_call(struct inode *inode); ++int vzquota_inode_transfer_call(struct inode *, struct iattr *); ++struct vz_quota_master *vzquota_inode_data(struct inode *inode, ++ struct vz_quota_datast *); ++void vzquota_data_unlock(struct inode *inode, struct vz_quota_datast *); ++int vzquota_rename_check(struct inode *inode, ++ struct inode *old_dir, struct inode *new_dir); ++struct vz_quota_master *vzquota_inode_qmblk(struct inode *inode); ++/* for 
second-level quota */ ++struct vz_quota_master *vzquota_find_qmblk(struct super_block *); ++/* for management operations */ ++struct vz_quota_master *vzquota_alloc_master(unsigned int quota_id, ++ struct vz_quota_stat *qstat); ++void vzquota_free_master(struct vz_quota_master *); ++struct vz_quota_master *vzquota_find_master(unsigned int quota_id); ++int vzquota_on_qmblk(struct super_block *sb, struct inode *inode, ++ struct vz_quota_master *qmblk); ++int vzquota_off_qmblk(struct super_block *sb, struct vz_quota_master *qmblk); ++int vzquota_get_super(struct super_block *sb); ++void vzquota_put_super(struct super_block *sb); ++ ++static inline struct vz_quota_master *qmblk_get(struct vz_quota_master *qmblk) ++{ ++ if (!atomic_read(&qmblk->dq_count)) ++ BUG(); ++ atomic_inc(&qmblk->dq_count); ++ return qmblk; ++} ++ ++static inline void __qmblk_put(struct vz_quota_master *qmblk) ++{ ++ atomic_dec(&qmblk->dq_count); ++} ++ ++static inline void qmblk_put(struct vz_quota_master *qmblk) ++{ ++ if (!atomic_dec_and_test(&qmblk->dq_count)) ++ return; ++ vzquota_free_master(qmblk); ++} ++ ++extern struct list_head vzquota_hash_table[]; ++extern int vzquota_hash_size; ++ ++/* ++ * Interface to VZ UGID quota ++ */ ++extern struct quotactl_ops vz_quotactl_operations; ++extern struct dquot_operations vz_quota_operations2; ++extern struct quota_format_type vz_quota_empty_v2_format; ++ ++#define QUGID_TREE(qmblk, type) (((type) == USRQUOTA) ? 
\ ++ qmblk->dq_uid_tree : \ ++ qmblk->dq_gid_tree) ++ ++#define VZDQUG_FIND_DONT_ALLOC 1 ++#define VZDQUG_FIND_FAKE 2 ++struct vz_quota_ugid *vzquota_find_ugid(struct vz_quota_master *qmblk, ++ unsigned int quota_id, int type, int flags); ++struct vz_quota_ugid *__vzquota_find_ugid(struct vz_quota_master *qmblk, ++ unsigned int quota_id, int type, int flags); ++struct vz_quota_ugid *vzquota_get_ugid(struct vz_quota_ugid *qugid); ++void vzquota_put_ugid(struct vz_quota_master *qmblk, ++ struct vz_quota_ugid *qugid); ++void vzquota_kill_ugid(struct vz_quota_master *qmblk); ++int vzquota_ugid_init(void); ++void vzquota_ugid_release(void); ++int vzquota_transfer_usage(struct inode *inode, int mask, ++ struct vz_quota_ilink *qlnk); ++ ++struct vzctl_quotaugidctl; ++long do_vzquotaugidctl(struct vzctl_quotaugidctl *qub); ++ ++/* ++ * Other VZ quota parts ++ */ ++extern struct dquot_operations vz_quota_operations; ++ ++long do_vzquotactl(int cmd, unsigned int quota_id, ++ struct vz_quota_stat *qstat, const char *ve_root); ++int vzquota_proc_init(void); ++void vzquota_proc_release(void); ++struct vz_quota_master *vzquota_find_qmblk(struct super_block *); ++extern struct semaphore vz_quota_sem; ++ ++void vzaquota_init(void); ++void vzaquota_fini(void); ++ ++#endif /* __KERNEL__ */ ++ ++#endif /* _VZDQUOTA_H */ +diff -upr linux-2.6.16.orig/include/linux/vzquota_qlnk.h linux-2.6.16-026test015/include/linux/vzquota_qlnk.h +--- linux-2.6.16.orig/include/linux/vzquota_qlnk.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/include/linux/vzquota_qlnk.h 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,25 @@ ++/* ++ * include/linux/vzquota_qlnk.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#ifndef _VZDQUOTA_QLNK_H ++#define _VZDQUOTA_QLNK_H ++ ++struct vz_quota_master; ++struct vz_quota_ugid; ++ ++/* inode link, used to track inodes using quota via dq_ilink_list */ ++struct vz_quota_ilink { ++ struct vz_quota_master *qmblk; ++ struct vz_quota_ugid *qugid[MAXQUOTAS]; ++ struct list_head list; ++ unsigned char origin; ++}; ++ ++#endif /* _VZDQUOTA_QLNK_H */ +diff -upr linux-2.6.16.orig/include/linux/vzratelimit.h linux-2.6.16-026test015/include/linux/vzratelimit.h +--- linux-2.6.16.orig/include/linux/vzratelimit.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/include/linux/vzratelimit.h 2006-07-04 14:41:37.000000000 +0400 +@@ -0,0 +1,28 @@ ++/* ++ * include/linux/vzratelimit.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#ifndef __VZ_RATELIMIT_H__ ++#define __VZ_RATELIMIT_H__ ++ ++/* ++ * Generic ratelimiting stuff. ++ */ ++ ++struct vz_rate_info { ++ int burst; ++ int interval; /* jiffy_t per event */ ++ int bucket; /* kind of leaky bucket */ ++ unsigned long last; /* last event */ ++}; ++ ++/* Return true if rate limit permits. */ ++int vz_ratelimit(struct vz_rate_info *p); ++ ++#endif /* __VZ_RATELIMIT_H__ */ +diff -upr linux-2.6.16.orig/include/linux/vzstat.h linux-2.6.16-026test015/include/linux/vzstat.h +--- linux-2.6.16.orig/include/linux/vzstat.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/include/linux/vzstat.h 2006-07-04 14:41:37.000000000 +0400 +@@ -0,0 +1,182 @@ ++/* ++ * include/linux/vzstat.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#ifndef __VZSTAT_H__ ++#define __VZSTAT_H__ ++ ++struct swap_cache_info_struct { ++ unsigned long add_total; ++ unsigned long del_total; ++ unsigned long find_success; ++ unsigned long find_total; ++ unsigned long noent_race; ++ unsigned long exist_race; ++ unsigned long remove_race; ++}; ++ ++struct kstat_lat_snap_struct { ++ cycles_t maxlat, totlat; ++ unsigned long count; ++}; ++struct kstat_lat_pcpu_snap_struct { ++ cycles_t maxlat, totlat; ++ unsigned long count; ++ seqcount_t lock; ++} ____cacheline_aligned_in_smp; ++ ++struct kstat_lat_struct { ++ struct kstat_lat_snap_struct cur, last; ++ cycles_t avg[3]; ++}; ++struct kstat_lat_pcpu_struct { ++ struct kstat_lat_pcpu_snap_struct cur[NR_CPUS]; ++ cycles_t max_snap; ++ struct kstat_lat_snap_struct last; ++ cycles_t avg[3]; ++}; ++ ++struct kstat_perf_snap_struct { ++ cycles_t wall_tottime, cpu_tottime; ++ cycles_t wall_maxdur, cpu_maxdur; ++ unsigned long count; ++}; ++struct kstat_perf_struct { ++ struct kstat_perf_snap_struct cur, last; ++}; ++ ++struct kstat_zone_avg { ++ unsigned long free_pages_avg[3], ++ nr_active_avg[3], ++ nr_inactive_avg[3]; ++}; ++ ++#define KSTAT_ALLOCSTAT_NR 5 ++ ++struct kernel_stat_glob { ++ unsigned long nr_unint_avg[3]; ++ ++ unsigned long alloc_fails[KSTAT_ALLOCSTAT_NR]; ++ struct kstat_lat_struct alloc_lat[KSTAT_ALLOCSTAT_NR]; ++ struct kstat_lat_pcpu_struct sched_lat; ++ struct kstat_lat_struct swap_in; ++ ++ struct kstat_perf_struct ttfp, cache_reap, ++ refill_inact, shrink_icache, shrink_dcache; ++ ++ struct kstat_zone_avg zone_avg[3]; /* MAX_NR_ZONES */ ++} ____cacheline_aligned; ++ ++extern struct kernel_stat_glob kstat_glob ____cacheline_aligned; ++extern spinlock_t kstat_glb_lock; ++ ++#ifdef CONFIG_VE ++#define KSTAT_PERF_ENTER(name) \ ++ unsigned long flags; \ ++ cycles_t start, sleep_time; \ ++ \ ++ start = get_cycles(); \ ++ sleep_time = VE_TASK_INFO(current)->sleep_time; \ ++ ++#define KSTAT_PERF_LEAVE(name) \ ++ spin_lock_irqsave(&kstat_glb_lock, 
flags); \ ++ kstat_glob.name.cur.count++; \ ++ start = get_cycles() - start; \ ++ if (kstat_glob.name.cur.wall_maxdur < start) \ ++ kstat_glob.name.cur.wall_maxdur = start;\ ++ kstat_glob.name.cur.wall_tottime += start; \ ++ start -= VE_TASK_INFO(current)->sleep_time - \ ++ sleep_time; \ ++ if (kstat_glob.name.cur.cpu_maxdur < start) \ ++ kstat_glob.name.cur.cpu_maxdur = start; \ ++ kstat_glob.name.cur.cpu_tottime += start; \ ++ spin_unlock_irqrestore(&kstat_glb_lock, flags); \ ++ ++#else ++#define KSTAT_PERF_ENTER(name) ++#define KSTAT_PERF_LEAVE(name) ++#endif ++ ++/* ++ * Add another statistics reading. ++ * Serialization is the caller's due. ++ */ ++static inline void KSTAT_LAT_ADD(struct kstat_lat_struct *p, ++ cycles_t dur) ++{ ++ p->cur.count++; ++ if (p->cur.maxlat < dur) ++ p->cur.maxlat = dur; ++ p->cur.totlat += dur; ++} ++ ++static inline void KSTAT_LAT_PCPU_ADD(struct kstat_lat_pcpu_struct *p, int cpu, ++ cycles_t dur) ++{ ++ struct kstat_lat_pcpu_snap_struct *cur; ++ ++ cur = &p->cur[cpu]; ++ write_seqcount_begin(&cur->lock); ++ cur->count++; ++ if (cur->maxlat < dur) ++ cur->maxlat = dur; ++ cur->totlat += dur; ++ write_seqcount_end(&cur->lock); ++} ++ ++/* ++ * Move current statistics to last, clear last. ++ * Serialization is the caller's due. 
++ */ ++static inline void KSTAT_LAT_UPDATE(struct kstat_lat_struct *p) ++{ ++ cycles_t m; ++ memcpy(&p->last, &p->cur, sizeof(p->last)); ++ p->cur.maxlat = 0; ++ m = p->last.maxlat; ++ CALC_LOAD(p->avg[0], EXP_1, m) ++ CALC_LOAD(p->avg[1], EXP_5, m) ++ CALC_LOAD(p->avg[2], EXP_15, m) ++} ++ ++static inline void KSTAT_LAT_PCPU_UPDATE(struct kstat_lat_pcpu_struct *p) ++{ ++ unsigned i, cpu; ++ struct kstat_lat_pcpu_snap_struct snap, *cur; ++ cycles_t m; ++ ++ memset(&p->last, 0, sizeof(p->last)); ++ for (cpu = 0; cpu < NR_CPUS; cpu++) { ++ cur = &p->cur[cpu]; ++ do { ++ i = read_seqcount_begin(&cur->lock); ++ memcpy(&snap, cur, sizeof(snap)); ++ } while (read_seqcount_retry(&cur->lock, i)); ++ /* ++ * read above and this update of maxlat is not atomic, ++ * but this is OK, since it happens rarely and losing ++ * a couple of peaks is not essential. xemul ++ */ ++ cur->maxlat = 0; ++ ++ p->last.count += snap.count; ++ p->last.totlat += snap.totlat; ++ if (p->last.maxlat < snap.maxlat) ++ p->last.maxlat = snap.maxlat; ++ } ++ ++ m = (p->last.maxlat > p->max_snap ? 
p->last.maxlat : p->max_snap); ++ CALC_LOAD(p->avg[0], EXP_1, m); ++ CALC_LOAD(p->avg[1], EXP_5, m); ++ CALC_LOAD(p->avg[2], EXP_15, m); ++ /* reset max_snap to calculate it correctly next time */ ++ p->max_snap = 0; ++} ++ ++#endif /* __VZSTAT_H__ */ +diff -upr linux-2.6.16.orig/include/net/addrconf.h linux-2.6.16-026test015/include/net/addrconf.h +--- linux-2.6.16.orig/include/net/addrconf.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/net/addrconf.h 2006-07-04 14:41:39.000000000 +0400 +@@ -244,5 +244,14 @@ extern int if6_proc_init(void); + extern void if6_proc_exit(void); + #endif + ++int addrconf_ifdown(struct net_device *dev, int how); ++int inet6_addr_add(int ifindex, struct in6_addr *pfx, int plen); ++ ++#ifdef CONFIG_VE ++int addrconf_sysctl_init(struct ve_struct *ve); ++void addrconf_sysctl_fini(struct ve_struct *ve); ++void addrconf_sysctl_free(struct ve_struct *ve); ++#endif ++ + #endif + #endif +diff -upr linux-2.6.16.orig/include/net/af_unix.h linux-2.6.16-026test015/include/net/af_unix.h +--- linux-2.6.16.orig/include/net/af_unix.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/net/af_unix.h 2006-07-04 14:41:38.000000000 +0400 +@@ -19,23 +19,37 @@ extern atomic_t unix_tot_inflight; + + static inline struct sock *first_unix_socket(int *i) + { ++ struct sock *s; ++ struct ve_struct *ve; ++ ++ ve = get_exec_env(); + for (*i = 0; *i <= UNIX_HASH_SIZE; (*i)++) { +- if (!hlist_empty(&unix_socket_table[*i])) +- return __sk_head(&unix_socket_table[*i]); ++ for (s = sk_head(&unix_socket_table[*i]); ++ s != NULL && !ve_accessible(s->sk_owner_env, ve); ++ s = sk_next(s)); ++ if (s != NULL) ++ return s; + } + return NULL; + } + + static inline struct sock *next_unix_socket(int *i, struct sock *s) + { +- struct sock *next = sk_next(s); +- /* More in this chain? 
*/ +- if (next) +- return next; ++ struct ve_struct *ve; ++ ++ ve = get_exec_env(); ++ for (s = sk_next(s); s != NULL; s = sk_next(s)) { ++ if (!ve_accessible(s->sk_owner_env, ve)) ++ continue; ++ return s; ++ } + /* Look for next non-empty chain. */ + for ((*i)++; *i <= UNIX_HASH_SIZE; (*i)++) { +- if (!hlist_empty(&unix_socket_table[*i])) +- return __sk_head(&unix_socket_table[*i]); ++ for (s = sk_head(&unix_socket_table[*i]); ++ s != NULL && !ve_accessible(s->sk_owner_env, ve); ++ s = sk_next(s)); ++ if (s != NULL) ++ return s; + } + return NULL; + } +diff -upr linux-2.6.16.orig/include/net/arp.h linux-2.6.16-026test015/include/net/arp.h +--- linux-2.6.16.orig/include/net/arp.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/net/arp.h 2006-07-04 14:41:39.000000000 +0400 +@@ -7,7 +7,14 @@ + + #define HAVE_ARP_CREATE + +-extern struct neigh_table arp_tbl; ++#ifdef CONFIG_VE ++#define arp_tbl (*(get_exec_env()->ve_arp_tbl)) ++extern int ve_arp_init(struct ve_struct *ve); ++extern void ve_arp_fini(struct ve_struct *ve); ++#else ++struct neigh_table global_arp_tbl; ++#define arp_tbl global_arp_tbl ++#endif + + extern void arp_init(void); + extern int arp_rcv(struct sk_buff *skb, struct net_device *dev, +diff -upr linux-2.6.16.orig/include/net/compat.h linux-2.6.16-026test015/include/net/compat.h +--- linux-2.6.16.orig/include/net/compat.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/net/compat.h 2006-07-04 14:41:36.000000000 +0400 +@@ -23,6 +23,14 @@ struct compat_cmsghdr { + compat_int_t cmsg_type; + }; + ++#if defined(CONFIG_X86_64) ++#define is_current_32bits() (current_thread_info()->flags & _TIF_IA32) ++#elif defined(CONFIG_IA64) ++#define is_current_32bits() (IS_IA32_PROCESS(ia64_task_regs(current))) ++#else ++#define is_current_32bits() 0 ++#endif ++ + #else /* defined(CONFIG_COMPAT) */ + #define compat_msghdr msghdr /* to avoid compiler warnings */ + #endif /* defined(CONFIG_COMPAT) */ +diff -upr 
linux-2.6.16.orig/include/net/flow.h linux-2.6.16-026test015/include/net/flow.h +--- linux-2.6.16.orig/include/net/flow.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/net/flow.h 2006-07-04 14:41:38.000000000 +0400 +@@ -10,6 +10,7 @@ + #include <linux/in6.h> + #include <asm/atomic.h> + ++struct ve_struct; + struct flowi { + int oif; + int iif; +@@ -78,6 +79,9 @@ struct flowi { + #define fl_icmp_type uli_u.icmpt.type + #define fl_icmp_code uli_u.icmpt.code + #define fl_ipsec_spi uli_u.spi ++#ifdef CONFIG_VE ++ struct ve_struct *owner_env; ++#endif + } __attribute__((__aligned__(BITS_PER_LONG/8))); + + #define FLOW_DIR_IN 0 +diff -upr linux-2.6.16.orig/include/net/icmp.h linux-2.6.16-026test015/include/net/icmp.h +--- linux-2.6.16.orig/include/net/icmp.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/net/icmp.h 2006-07-04 14:41:38.000000000 +0400 +@@ -31,9 +31,14 @@ struct icmp_err { + + extern struct icmp_err icmp_err_convert[]; + DECLARE_SNMP_STAT(struct icmp_mib, icmp_statistics); +-#define ICMP_INC_STATS(field) SNMP_INC_STATS(icmp_statistics, field) +-#define ICMP_INC_STATS_BH(field) SNMP_INC_STATS_BH(icmp_statistics, field) +-#define ICMP_INC_STATS_USER(field) SNMP_INC_STATS_USER(icmp_statistics, field) ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++#define ve_icmp_statistics (get_exec_env()->_icmp_statistics) ++#else ++#define ve_icmp_statistics icmp_statistics ++#endif ++#define ICMP_INC_STATS(field) SNMP_INC_STATS(ve_icmp_statistics, field) ++#define ICMP_INC_STATS_BH(field) SNMP_INC_STATS_BH(ve_icmp_statistics, field) ++#define ICMP_INC_STATS_USER(field) SNMP_INC_STATS_USER(ve_icmp_statistics, field) + + struct dst_entry; + struct net_proto_family; +diff -upr linux-2.6.16.orig/include/net/if_inet6.h linux-2.6.16-026test015/include/net/if_inet6.h +--- linux-2.6.16.orig/include/net/if_inet6.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/net/if_inet6.h 
2006-07-04 14:41:39.000000000 +0400 +@@ -194,7 +194,14 @@ struct inet6_dev + unsigned long tstamp; /* ipv6InterfaceTable update timestamp */ + }; + +-extern struct ipv6_devconf ipv6_devconf; ++extern struct ipv6_devconf global_ipv6_devconf; ++extern struct ipv6_devconf global_ipv6_devconf_dflt; ++ ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++#define ve_ipv6_devconf (*(get_exec_env()->_ipv6_devconf)) ++#else ++#define ve_ipv6_devconf global_ipv6_devconf ++#endif + + static inline void ipv6_eth_mc_map(struct in6_addr *addr, char *buf) + { +diff -upr linux-2.6.16.orig/include/net/inet6_hashtables.h linux-2.6.16-026test015/include/net/inet6_hashtables.h +--- linux-2.6.16.orig/include/net/inet6_hashtables.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/net/inet6_hashtables.h 2006-07-04 14:41:39.000000000 +0400 +@@ -27,11 +27,13 @@ struct inet_hashinfo; + + /* I have no idea if this is a good hash for v6 or not. -DaveM */ + static inline unsigned int inet6_ehashfn(const struct in6_addr *laddr, const u16 lport, +- const struct in6_addr *faddr, const u16 fport) ++ const struct in6_addr *faddr, const u16 fport, ++ const envid_t veid) + { + unsigned int hashent = (lport ^ fport); + + hashent ^= (laddr->s6_addr32[3] ^ faddr->s6_addr32[3]); ++ hashent ^= (veid ^ (veid >> 16)); + hashent ^= hashent >> 16; + hashent ^= hashent >> 8; + return hashent; +@@ -45,7 +47,7 @@ static inline int inet6_sk_ehashfn(const + const struct in6_addr *faddr = &np->daddr; + const __u16 lport = inet->num; + const __u16 fport = inet->dport; +- return inet6_ehashfn(laddr, lport, faddr, fport); ++ return inet6_ehashfn(laddr, lport, faddr, fport, VEID(VE_OWNER_SK(sk))); + } + + static inline void __inet6_hash(struct inet_hashinfo *hashinfo, +@@ -94,14 +96,15 @@ static inline struct sock * + /* Optimize here for direct hit, only listening connections can + * have wildcards anyways. 
+ */ +- unsigned int hash = inet6_ehashfn(daddr, hnum, saddr, sport); ++ struct ve_struct *env = get_exec_env(); ++ unsigned int hash = inet6_ehashfn(daddr, hnum, saddr, sport, VEID(env)); + struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash); + + prefetch(head->chain.first); + read_lock(&head->lock); + sk_for_each(sk, node, &head->chain) { + /* For IPV6 do the cheaper port and family tests first. */ +- if (INET6_MATCH(sk, hash, saddr, daddr, ports, dif)) ++ if (INET6_MATCH(sk, hash, saddr, daddr, ports, dif, env)) + goto hit; /* You sunk my battleship! */ + } + /* Must check for a TIME_WAIT'er before going to listener hash. */ +@@ -114,6 +117,7 @@ static inline struct sock * + + if (ipv6_addr_equal(&tw6->tw_v6_daddr, saddr) && + ipv6_addr_equal(&tw6->tw_v6_rcv_saddr, daddr) && ++ ve_accessible_strict(tw->tw_owner_env, VEID(env)) && + (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dif)) + goto hit; + } +diff -upr linux-2.6.16.orig/include/net/inet_hashtables.h linux-2.6.16-026test015/include/net/inet_hashtables.h +--- linux-2.6.16.orig/include/net/inet_hashtables.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/net/inet_hashtables.h 2006-07-04 14:41:38.000000000 +0400 +@@ -24,6 +24,7 @@ + #include <linux/spinlock.h> + #include <linux/types.h> + #include <linux/wait.h> ++#include <linux/ve_owner.h> + + #include <net/inet_connection_sock.h> + #include <net/inet_sock.h> +@@ -75,11 +76,13 @@ struct inet_ehash_bucket { + * ports are created in O(1) time? I thought so. 
;-) -DaveM + */ + struct inet_bind_bucket { ++ struct ve_struct *owner_env; + unsigned short port; + signed short fastreuse; + struct hlist_node node; + struct hlist_head owners; + }; ++DCL_VE_OWNER_PROTO(TB, struct inet_bind_bucket, owner_env) + + #define inet_bind_bucket_for_each(tb, node, head) \ + hlist_for_each_entry(tb, node, head, node) +@@ -139,37 +142,43 @@ static inline struct inet_ehash_bucket * + extern struct inet_bind_bucket * + inet_bind_bucket_create(kmem_cache_t *cachep, + struct inet_bind_hashbucket *head, +- const unsigned short snum); ++ const unsigned short snum, ++ struct ve_struct *env); + extern void inet_bind_bucket_destroy(kmem_cache_t *cachep, + struct inet_bind_bucket *tb); + +-static inline int inet_bhashfn(const __u16 lport, const int bhash_size) ++static inline int inet_bhashfn(const __u16 lport, const int bhash_size, ++ unsigned veid) + { +- return lport & (bhash_size - 1); ++ return ((lport + (veid ^ (veid >> 16))) & (bhash_size - 1)); + } + + extern void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, + const unsigned short snum); + + /* These can have wildcards, don't try too hard. */ +-static inline int inet_lhashfn(const unsigned short num) ++static inline int inet_lhashfn(const unsigned short num, unsigned veid) + { +- return num & (INET_LHTABLE_SIZE - 1); ++ return ((num + (veid ^ (veid >> 16))) & (INET_LHTABLE_SIZE - 1)); + } + + static inline int inet_sk_listen_hashfn(const struct sock *sk) + { +- return inet_lhashfn(inet_sk(sk)->num); ++ return inet_lhashfn(inet_sk(sk)->num, VEID(VE_OWNER_SK(sk))); + } + + /* Caller must disable local BH processing. 
*/ + static inline void __inet_inherit_port(struct inet_hashinfo *table, + struct sock *sk, struct sock *child) + { +- const int bhash = inet_bhashfn(inet_sk(child)->num, table->bhash_size); +- struct inet_bind_hashbucket *head = &table->bhash[bhash]; ++ int bhash; ++ struct inet_bind_hashbucket *head; + struct inet_bind_bucket *tb; + ++ bhash = inet_bhashfn(inet_sk(child)->num, table->bhash_size, ++ VEID(VE_OWNER_SK(child))); ++ head = &table->bhash[bhash]; ++ + spin_lock(&head->lock); + tb = inet_csk(sk)->icsk_bind_hash; + sk_add_bind_node(child, &tb->owners); +@@ -275,7 +284,8 @@ static inline int inet_iif(const struct + extern struct sock *__inet_lookup_listener(const struct hlist_head *head, + const u32 daddr, + const unsigned short hnum, +- const int dif); ++ const int dif, ++ struct ve_struct *env); + + /* Optimize the common listener case. */ + static inline struct sock * +@@ -285,18 +295,21 @@ static inline struct sock * + { + struct sock *sk = NULL; + const struct hlist_head *head; ++ struct ve_struct *env; + ++ env = get_exec_env(); + read_lock(&hashinfo->lhash_lock); +- head = &hashinfo->listening_hash[inet_lhashfn(hnum)]; ++ head = &hashinfo->listening_hash[inet_lhashfn(hnum, VEID(env))]; + if (!hlist_empty(head)) { + const struct inet_sock *inet = inet_sk((sk = __sk_head(head))); + + if (inet->num == hnum && !sk->sk_node.next && ++ ve_accessible_strict(VE_OWNER_SK(sk), env) && + (!inet->rcv_saddr || inet->rcv_saddr == daddr) && + (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) && + !sk->sk_bound_dev_if) + goto sherry_cache; +- sk = __inet_lookup_listener(head, daddr, hnum, dif); ++ sk = __inet_lookup_listener(head, daddr, hnum, dif, env); + } + if (sk) { + sherry_cache: +@@ -323,25 +336,25 @@ sherry_cache: + #define INET_ADDR_COOKIE(__name, __saddr, __daddr) \ + const __u64 __name = (((__u64)(__daddr)) << 32) | ((__u64)(__saddr)); + #endif /* __BIG_ENDIAN */ +-#define INET_MATCH(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\ ++#define 
INET_MATCH_ALLVE(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\ + (((__sk)->sk_hash == (__hash)) && \ + ((*((__u64 *)&(inet_sk(__sk)->daddr))) == (__cookie)) && \ + ((*((__u32 *)&(inet_sk(__sk)->dport))) == (__ports)) && \ + (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) +-#define INET_TW_MATCH(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\ ++#define INET_TW_MATCH_ALLVE(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\ + (((__sk)->sk_hash == (__hash)) && \ + ((*((__u64 *)&(inet_twsk(__sk)->tw_daddr))) == (__cookie)) && \ + ((*((__u32 *)&(inet_twsk(__sk)->tw_dport))) == (__ports)) && \ + (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) + #else /* 32-bit arch */ + #define INET_ADDR_COOKIE(__name, __saddr, __daddr) +-#define INET_MATCH(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif) \ ++#define INET_MATCH_ALLVE(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif) \ + (((__sk)->sk_hash == (__hash)) && \ + (inet_sk(__sk)->daddr == (__saddr)) && \ + (inet_sk(__sk)->rcv_saddr == (__daddr)) && \ + ((*((__u32 *)&(inet_sk(__sk)->dport))) == (__ports)) && \ + (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) +-#define INET_TW_MATCH(__sk, __hash,__cookie, __saddr, __daddr, __ports, __dif) \ ++#define INET_TW_MATCH_ALLVE(__sk, __hash,__cookie, __saddr, __daddr, __ports, __dif) \ + (((__sk)->sk_hash == (__hash)) && \ + (inet_twsk(__sk)->tw_daddr == (__saddr)) && \ + (inet_twsk(__sk)->tw_rcv_saddr == (__daddr)) && \ +@@ -349,6 +362,18 @@ sherry_cache: + (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) + #endif /* 64-bit arch */ + ++#define INET_MATCH(__sk, __hash, __cookie, __saddr, \ ++ __daddr, __ports, __dif, __ve) \ ++ (INET_MATCH_ALLVE((__sk), (__hash), (__cookie), (__saddr), \ ++ (__daddr), (__ports), (__dif)) \ ++ && ve_accessible_strict(VE_OWNER_SK(__sk), (__ve))) ++ ++#define INET_TW_MATCH(__sk, __hash, __cookie, __saddr, \ ++ __daddr, 
__ports, __dif, __ve) \ ++ (INET_TW_MATCH_ALLVE((__sk), (__hash), (__cookie), (__saddr), \ ++ (__daddr), (__ports), (__dif)) \ ++ && ve_accessible_strict(inet_twsk(__sk)->tw_owner_env, VEID(__ve))) ++ + /* + * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so we need + * not check it for lookups anymore, thanks Alexey. -DaveM +@@ -368,19 +393,25 @@ static inline struct sock * + /* Optimize here for direct hit, only listening connections can + * have wildcards anyways. + */ +- unsigned int hash = inet_ehashfn(daddr, hnum, saddr, sport); +- struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash); +- ++ unsigned int hash; ++ struct inet_ehash_bucket *head; ++ struct ve_struct *env; ++ ++ env = get_exec_env(); ++ hash = inet_ehashfn(daddr, hnum, saddr, sport, VEID(env)); ++ head = inet_ehash_bucket(hashinfo, hash); + prefetch(head->chain.first); + read_lock(&head->lock); + sk_for_each(sk, node, &head->chain) { +- if (INET_MATCH(sk, hash, acookie, saddr, daddr, ports, dif)) ++ if (INET_MATCH(sk, hash, acookie, saddr, daddr, ++ ports, dif, env)) + goto hit; /* You sunk my battleship! */ + } + + /* Must check for a TIME_WAIT'er before going to listener hash. 
*/ + sk_for_each(sk, node, &(head + hashinfo->ehash_size)->chain) { +- if (INET_TW_MATCH(sk, hash, acookie, saddr, daddr, ports, dif)) ++ if (INET_TW_MATCH(sk, hash, acookie, saddr, daddr, ++ ports, dif, env)) + goto hit; + } + sk = NULL; +diff -upr linux-2.6.16.orig/include/net/inet_sock.h linux-2.6.16-026test015/include/net/inet_sock.h +--- linux-2.6.16.orig/include/net/inet_sock.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/net/inet_sock.h 2006-07-04 14:41:38.000000000 +0400 +@@ -171,9 +171,10 @@ static inline void inet_sk_copy_descenda + extern int inet_sk_rebuild_header(struct sock *sk); + + static inline unsigned int inet_ehashfn(const __u32 laddr, const __u16 lport, +- const __u32 faddr, const __u16 fport) ++ const __u32 faddr, const __u16 fport, ++ const envid_t veid) + { +- unsigned int h = (laddr ^ lport) ^ (faddr ^ fport); ++ int h = (laddr ^ lport) ^ (faddr ^ fport) ^ (veid ^ (veid >> 16)); + h ^= h >> 16; + h ^= h >> 8; + return h; +@@ -186,8 +187,9 @@ static inline int inet_sk_ehashfn(const + const __u16 lport = inet->num; + const __u32 faddr = inet->daddr; + const __u16 fport = inet->dport; ++ envid_t veid = VEID(VE_OWNER_SK(sk)); + +- return inet_ehashfn(laddr, lport, faddr, fport); ++ return inet_ehashfn(laddr, lport, faddr, fport, veid); + } + + #endif /* _INET_SOCK_H */ +diff -upr linux-2.6.16.orig/include/net/inet_timewait_sock.h linux-2.6.16-026test015/include/net/inet_timewait_sock.h +--- linux-2.6.16.orig/include/net/inet_timewait_sock.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/net/inet_timewait_sock.h 2006-07-04 14:41:38.000000000 +0400 +@@ -134,6 +134,7 @@ struct inet_timewait_sock { + unsigned long tw_ttd; + struct inet_bind_bucket *tw_tb; + struct hlist_node tw_death_node; ++ envid_t tw_owner_env; + }; + + static inline void inet_twsk_add_node(struct inet_timewait_sock *tw, +diff -upr linux-2.6.16.orig/include/net/ip.h linux-2.6.16-026test015/include/net/ip.h +--- 
linux-2.6.16.orig/include/net/ip.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/net/ip.h 2006-07-04 14:41:38.000000000 +0400 +@@ -95,6 +95,7 @@ extern int ip_local_deliver(struct sk_b + extern int ip_mr_input(struct sk_buff *skb); + extern int ip_output(struct sk_buff *skb); + extern int ip_mc_output(struct sk_buff *skb); ++extern int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)); + extern int ip_do_nat(struct sk_buff *skb); + extern void ip_send_check(struct iphdr *ip); + extern int ip_queue_xmit(struct sk_buff *skb, int ipfragok); +@@ -152,15 +153,25 @@ struct ipv4_config + + extern struct ipv4_config ipv4_config; + DECLARE_SNMP_STAT(struct ipstats_mib, ip_statistics); +-#define IP_INC_STATS(field) SNMP_INC_STATS(ip_statistics, field) +-#define IP_INC_STATS_BH(field) SNMP_INC_STATS_BH(ip_statistics, field) +-#define IP_INC_STATS_USER(field) SNMP_INC_STATS_USER(ip_statistics, field) ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++#define ve_ip_statistics (get_exec_env()->_ip_statistics) ++#else ++#define ve_ip_statistics ip_statistics ++#endif ++#define IP_INC_STATS(field) SNMP_INC_STATS(ve_ip_statistics, field) ++#define IP_INC_STATS_BH(field) SNMP_INC_STATS_BH(ve_ip_statistics, field) ++#define IP_INC_STATS_USER(field) SNMP_INC_STATS_USER(ve_ip_statistics, field) + DECLARE_SNMP_STAT(struct linux_mib, net_statistics); +-#define NET_INC_STATS(field) SNMP_INC_STATS(net_statistics, field) +-#define NET_INC_STATS_BH(field) SNMP_INC_STATS_BH(net_statistics, field) +-#define NET_INC_STATS_USER(field) SNMP_INC_STATS_USER(net_statistics, field) +-#define NET_ADD_STATS_BH(field, adnd) SNMP_ADD_STATS_BH(net_statistics, field, adnd) +-#define NET_ADD_STATS_USER(field, adnd) SNMP_ADD_STATS_USER(net_statistics, field, adnd) ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++#define ve_net_statistics (get_exec_env()->_net_statistics) ++#else ++#define ve_net_statistics net_statistics 
++#endif ++#define NET_INC_STATS(field) SNMP_INC_STATS(ve_net_statistics, field) ++#define NET_INC_STATS_BH(field) SNMP_INC_STATS_BH(ve_net_statistics, field) ++#define NET_INC_STATS_USER(field) SNMP_INC_STATS_USER(ve_net_statistics, field) ++#define NET_ADD_STATS_BH(field, adnd) SNMP_ADD_STATS_BH(ve_net_statistics, field, adnd) ++#define NET_ADD_STATS_USER(field, adnd) SNMP_ADD_STATS_USER(ve_net_statistics, field, adnd) + + extern int sysctl_local_port_range[2]; + extern int sysctl_ip_default_ttl; +@@ -380,4 +391,11 @@ extern int ip_misc_proc_init(void); + + extern struct ctl_table ipv4_table[]; + ++#ifdef CONFIG_SYSCTL ++extern int ipv4_sysctl_forward(ctl_table *ctl, int write, struct file * filp, ++ void __user *buffer, size_t *lenp, loff_t *ppos); ++extern int ipv4_sysctl_forward_strategy(ctl_table *table, int __user *name, ++ int nlen, void __user *oldval, size_t __user *oldlenp, ++ void __user *newval, size_t newlen, void **context); ++#endif + #endif /* _IP_H */ +diff -upr linux-2.6.16.orig/include/net/ip6_fib.h linux-2.6.16-026test015/include/net/ip6_fib.h +--- linux-2.6.16.orig/include/net/ip6_fib.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/net/ip6_fib.h 2006-07-04 14:41:39.000000000 +0400 +@@ -78,6 +78,15 @@ struct rt6_info + u8 rt6i_protocol; + }; + ++struct fib6_table ++{ ++ struct list_head list; ++ struct fib6_node root; ++ struct ve_struct *owner_env; ++}; ++ ++extern struct list_head fib6_table_list; ++ + struct fib6_walker_t + { + struct fib6_walker_t *prev, *next; +@@ -143,7 +152,7 @@ struct rt6_statistics { + + typedef void (*f_pnode)(struct fib6_node *fn, void *); + +-extern struct fib6_node ip6_routing_table; ++extern struct fib6_node ve0_ip6_routing_table; + + /* + * exported functions +diff -upr linux-2.6.16.orig/include/net/ip6_route.h linux-2.6.16-026test015/include/net/ip6_route.h +--- linux-2.6.16.orig/include/net/ip6_route.h 2006-03-20 08:53:29.000000000 +0300 ++++ 
linux-2.6.16-026test015/include/net/ip6_route.h 2006-07-04 14:41:39.000000000 +0400 +@@ -139,5 +139,10 @@ static inline int ipv6_unicast_destinati + return rt->rt6i_flags & RTF_LOCAL; + } + ++#ifdef CONFIG_VE ++int init_ve_route6(struct ve_struct *ve); ++void fini_ve_route6(struct ve_struct *ve); ++#endif ++ + #endif + #endif +diff -upr linux-2.6.16.orig/include/net/ip_fib.h linux-2.6.16-026test015/include/net/ip_fib.h +--- linux-2.6.16.orig/include/net/ip_fib.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/net/ip_fib.h 2006-07-04 14:41:38.000000000 +0400 +@@ -168,10 +168,22 @@ struct fib_table { + unsigned char tb_data[0]; + }; + ++struct fn_zone; ++struct fn_hash ++{ ++ struct fn_zone *fn_zones[33]; ++ struct fn_zone *fn_zone_list; ++}; ++ + #ifndef CONFIG_IP_MULTIPLE_TABLES + ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++#define ip_fib_local_table get_exec_env()->_local_table ++#define ip_fib_main_table get_exec_env()->_main_table ++#else + extern struct fib_table *ip_fib_local_table; + extern struct fib_table *ip_fib_main_table; ++#endif + + static inline struct fib_table *fib_get_table(int id) + { +@@ -203,7 +215,12 @@ static inline void fib_select_default(co + #define ip_fib_local_table (fib_tables[RT_TABLE_LOCAL]) + #define ip_fib_main_table (fib_tables[RT_TABLE_MAIN]) + ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++#define fib_tables get_exec_env()->_fib_tables ++#else + extern struct fib_table * fib_tables[RT_TABLE_MAX+1]; ++#endif ++ + extern int fib_lookup(const struct flowi *flp, struct fib_result *res); + extern struct fib_table *__fib_new_table(int id); + extern void fib_rule_put(struct fib_rule *r); +@@ -250,10 +267,19 @@ extern u32 __fib_res_prefsrc(struct fib + + /* Exported by fib_hash.c */ + extern struct fib_table *fib_hash_init(int id); ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++struct ve_struct; ++extern int init_ve_route(struct ve_struct 
*ve); ++extern void fini_ve_route(struct ve_struct *ve); ++#else ++#define init_ve_route(ve) (0) ++#define fini_ve_route(ve) do { } while (0) ++#endif + + #ifdef CONFIG_IP_MULTIPLE_TABLES + /* Exported by fib_rules.c */ +- ++extern int fib_rules_create(void); ++extern void fib_rules_destroy(void); + extern int inet_rtm_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg); + extern int inet_rtm_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg); + extern int inet_dump_rules(struct sk_buff *skb, struct netlink_callback *cb); +diff -upr linux-2.6.16.orig/include/net/ipv6.h linux-2.6.16-026test015/include/net/ipv6.h +--- linux-2.6.16.orig/include/net/ipv6.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/net/ipv6.h 2006-07-04 14:41:39.000000000 +0400 +@@ -113,39 +113,48 @@ extern int sysctl_mld_max_msf; + + /* MIBs */ + DECLARE_SNMP_STAT(struct ipstats_mib, ipv6_statistics); +-#define IP6_INC_STATS(field) SNMP_INC_STATS(ipv6_statistics, field) +-#define IP6_INC_STATS_BH(field) SNMP_INC_STATS_BH(ipv6_statistics, field) +-#define IP6_INC_STATS_USER(field) SNMP_INC_STATS_USER(ipv6_statistics, field) ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++#define ve_ipv6_statistics (get_exec_env()->_ipv6_statistics) ++#define ve_icmpv6_statistics (get_exec_env()->_icmpv6_statistics) ++#define ve_udp_stats_in6 (get_exec_env()->_udp_stats_in6) ++#else ++#define ve_ipv6_statistics ipv6_statistics ++#define ve_icmpv6_statistics icmpv6_statistics ++#define ve_udp_stats_in6 udp_stats_in6 ++#endif ++#define IP6_INC_STATS(field) SNMP_INC_STATS(ve_ipv6_statistics, field) ++#define IP6_INC_STATS_BH(field) SNMP_INC_STATS_BH(ve_ipv6_statistics, field) ++#define IP6_INC_STATS_USER(field) SNMP_INC_STATS_USER(ve_ipv6_statistics, field) + DECLARE_SNMP_STAT(struct icmpv6_mib, icmpv6_statistics); + #define ICMP6_INC_STATS(idev, field) ({ \ + struct inet6_dev *_idev = (idev); \ + if (likely(_idev != NULL)) \ + 
SNMP_INC_STATS(idev->stats.icmpv6, field); \ +- SNMP_INC_STATS(icmpv6_statistics, field); \ ++ SNMP_INC_STATS(ve_icmpv6_statistics, field); \ + }) + #define ICMP6_INC_STATS_BH(idev, field) ({ \ + struct inet6_dev *_idev = (idev); \ + if (likely(_idev != NULL)) \ + SNMP_INC_STATS_BH((_idev)->stats.icmpv6, field); \ +- SNMP_INC_STATS_BH(icmpv6_statistics, field); \ ++ SNMP_INC_STATS_BH(ve_icmpv6_statistics, field); \ + }) + #define ICMP6_INC_STATS_USER(idev, field) ({ \ + struct inet6_dev *_idev = (idev); \ + if (likely(_idev != NULL)) \ + SNMP_INC_STATS_USER(_idev->stats.icmpv6, field); \ +- SNMP_INC_STATS_USER(icmpv6_statistics, field); \ ++ SNMP_INC_STATS_USER(ve_icmpv6_statistics, field); \ + }) + #define ICMP6_INC_STATS_OFFSET_BH(idev, field, offset) ({ \ + struct inet6_dev *_idev = idev; \ + __typeof__(offset) _offset = (offset); \ + if (likely(_idev != NULL)) \ + SNMP_INC_STATS_OFFSET_BH(_idev->stats.icmpv6, field, _offset); \ +- SNMP_INC_STATS_OFFSET_BH(icmpv6_statistics, field, _offset); \ ++ SNMP_INC_STATS_OFFSET_BH(ve_icmpv6_statistics, field, _offset); \ + }) + DECLARE_SNMP_STAT(struct udp_mib, udp_stats_in6); +-#define UDP6_INC_STATS(field) SNMP_INC_STATS(udp_stats_in6, field) +-#define UDP6_INC_STATS_BH(field) SNMP_INC_STATS_BH(udp_stats_in6, field) +-#define UDP6_INC_STATS_USER(field) SNMP_INC_STATS_USER(udp_stats_in6, field) ++#define UDP6_INC_STATS(field) SNMP_INC_STATS(ve_udp_stats_in6, field) ++#define UDP6_INC_STATS_BH(field) SNMP_INC_STATS_BH(ve_udp_stats_in6, field) ++#define UDP6_INC_STATS_USER(field) SNMP_INC_STATS_USER(ve_udp_stats_in6, field) + + int snmp6_register_dev(struct inet6_dev *idev); + int snmp6_unregister_dev(struct inet6_dev *idev); +@@ -154,6 +163,11 @@ int snmp6_free_dev(struct inet6_dev *ide + int snmp6_mib_init(void *ptr[2], size_t mibsize, size_t mibalign); + void snmp6_mib_free(void *ptr[2]); + ++#ifdef CONFIG_VE ++int ve_snmp_proc_init(void); ++void ve_snmp_proc_fini(void); ++#endif ++ + struct ip6_ra_chain + { + struct 
ip6_ra_chain *next; +diff -upr linux-2.6.16.orig/include/net/ndisc.h linux-2.6.16-026test015/include/net/ndisc.h +--- linux-2.6.16.orig/include/net/ndisc.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/net/ndisc.h 2006-07-04 14:41:39.000000000 +0400 +@@ -50,7 +50,14 @@ struct net_device; + struct net_proto_family; + struct sk_buff; + +-extern struct neigh_table nd_tbl; ++#ifdef CONFIG_VE ++#define nd_tbl (*(get_exec_env()->ve_nd_tbl)) ++extern int ve_ndisc_init(struct ve_struct *ve); ++extern void ve_ndisc_fini(struct ve_struct *ve); ++#else ++extern struct neigh_table global_nd_tbl; ++#define nd_tbl global_nd_tbl ++#endif + + struct nd_msg { + struct icmp6hdr icmph; +@@ -128,6 +135,7 @@ extern int ndisc_ifinfo_sysctl_change + extern void inet6_ifinfo_notify(int event, + struct inet6_dev *idev); + ++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + static inline struct neighbour * ndisc_get_neigh(struct net_device *dev, struct in6_addr *addr) + { + +@@ -136,6 +144,7 @@ static inline struct neighbour * ndisc_g + + return NULL; + } ++#endif + + + #endif /* __KERNEL__ */ +diff -upr linux-2.6.16.orig/include/net/neighbour.h linux-2.6.16-026test015/include/net/neighbour.h +--- linux-2.6.16.orig/include/net/neighbour.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/net/neighbour.h 2006-07-04 14:41:39.000000000 +0400 +@@ -191,6 +191,8 @@ struct neigh_table + atomic_t entries; + rwlock_t lock; + unsigned long last_rand; ++ struct ve_struct *owner_env; ++ struct user_beancounter *owner_ub; + kmem_cache_t *kmem_cachep; + struct neigh_statistics *stats; + struct neighbour **hash_buckets; +@@ -210,7 +212,7 @@ struct neigh_table + #define NEIGH_UPDATE_F_ISROUTER 0x40000000 + #define NEIGH_UPDATE_F_ADMIN 0x80000000 + +-extern void neigh_table_init(struct neigh_table *tbl); ++extern int neigh_table_init(struct neigh_table *tbl); + extern int neigh_table_clear(struct neigh_table *tbl); + extern struct neighbour * 
neigh_lookup(struct neigh_table *tbl, + const void *pkey, +diff -upr linux-2.6.16.orig/include/net/netlink_sock.h linux-2.6.16-026test015/include/net/netlink_sock.h +--- linux-2.6.16.orig/include/net/netlink_sock.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/include/net/netlink_sock.h 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,22 @@ ++#ifndef __NET_NETLINK_SOCK_H ++#define __NET_NETLINK_SOCK_H ++ ++struct netlink_sock { ++ /* struct sock has to be the first member of netlink_sock */ ++ struct sock sk; ++ u32 pid; ++ u32 dst_pid; ++ u32 dst_group; ++ u32 flags; ++ u32 subscriptions; ++ u32 ngroups; ++ unsigned long *groups; ++ unsigned long state; ++ wait_queue_head_t wait; ++ struct netlink_callback *cb; ++ spinlock_t cb_lock; ++ void (*data_ready)(struct sock *sk, int bytes); ++ struct module *module; ++}; ++ ++#endif /* __NET_NETLINK_SOCK_H */ +diff -upr linux-2.6.16.orig/include/net/route.h linux-2.6.16-026test015/include/net/route.h +--- linux-2.6.16.orig/include/net/route.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/net/route.h 2006-07-04 14:41:38.000000000 +0400 +@@ -201,4 +201,14 @@ static inline struct inet_peer *rt_get_p + + extern ctl_table ipv4_route_table[]; + ++#ifdef CONFIG_SYSCTL ++extern int ipv4_flush_delay; ++extern int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, ++ struct file *filp, void __user *buffer, size_t *lenp, ++ loff_t *ppos); ++extern int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, ++ int __user *name, int nlen, void __user *oldval, ++ size_t __user *oldlenp, void __user *newval, ++ size_t newlen, void **context); ++#endif + #endif /* _ROUTE_H */ +diff -upr linux-2.6.16.orig/include/net/scm.h linux-2.6.16-026test015/include/net/scm.h +--- linux-2.6.16.orig/include/net/scm.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/net/scm.h 2006-07-04 14:41:38.000000000 +0400 +@@ -40,7 +40,7 @@ static __inline__ int scm_send(struct so + memset(scm, 
0, sizeof(*scm)); + scm->creds.uid = current->uid; + scm->creds.gid = current->gid; +- scm->creds.pid = current->tgid; ++ scm->creds.pid = virt_tgid(current); + if (msg->msg_controllen <= 0) + return 0; + return __scm_send(sock, msg, scm); +diff -upr linux-2.6.16.orig/include/net/sctp/sctp.h linux-2.6.16-026test015/include/net/sctp/sctp.h +--- linux-2.6.16.orig/include/net/sctp/sctp.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/net/sctp/sctp.h 2006-07-04 14:41:36.000000000 +0400 +@@ -461,12 +461,12 @@ static inline int sctp_frag_point(const + * there is room for a param header too. + */ + #define sctp_walk_params(pos, chunk, member)\ +-_sctp_walk_params((pos), (chunk), WORD_ROUND(ntohs((chunk)->chunk_hdr.length)), member) ++_sctp_walk_params((pos), (chunk), ntohs((chunk)->chunk_hdr.length), member) + + #define _sctp_walk_params(pos, chunk, end, member)\ + for (pos.v = chunk->member;\ + pos.v <= (void *)chunk + end - sizeof(sctp_paramhdr_t) &&\ +- pos.v <= (void *)chunk + end - WORD_ROUND(ntohs(pos.p->length)) &&\ ++ pos.v <= (void *)chunk + end - ntohs(pos.p->length) &&\ + ntohs(pos.p->length) >= sizeof(sctp_paramhdr_t);\ + pos.v += WORD_ROUND(ntohs(pos.p->length))) + +@@ -477,7 +477,7 @@ _sctp_walk_errors((err), (chunk_hdr), nt + for (err = (sctp_errhdr_t *)((void *)chunk_hdr + \ + sizeof(sctp_chunkhdr_t));\ + (void *)err <= (void *)chunk_hdr + end - sizeof(sctp_errhdr_t) &&\ +- (void *)err <= (void *)chunk_hdr + end - WORD_ROUND(ntohs(err->length)) &&\ ++ (void *)err <= (void *)chunk_hdr + end - ntohs(err->length) &&\ + ntohs(err->length) >= sizeof(sctp_errhdr_t); \ + err = (sctp_errhdr_t *)((void *)err + WORD_ROUND(ntohs(err->length)))) + +diff -upr linux-2.6.16.orig/include/net/sctp/structs.h linux-2.6.16-026test015/include/net/sctp/structs.h +--- linux-2.6.16.orig/include/net/sctp/structs.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/net/sctp/structs.h 2006-07-04 14:41:36.000000000 +0400 +@@ -702,6 
+702,7 @@ struct sctp_chunk { + __u8 tsn_gap_acked; /* Is this chunk acked by a GAP ACK? */ + __s8 fast_retransmit; /* Is this chunk fast retransmitted? */ + __u8 tsn_missing_report; /* Data chunk missing counter. */ ++ __u8 data_accepted; /* At least 1 chunk in this packet accepted */ + }; + + void sctp_chunk_hold(struct sctp_chunk *); +diff -upr linux-2.6.16.orig/include/net/sock.h linux-2.6.16-026test015/include/net/sock.h +--- linux-2.6.16.orig/include/net/sock.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/net/sock.h 2006-07-04 14:41:38.000000000 +0400 +@@ -55,6 +55,8 @@ + #include <net/dst.h> + #include <net/checksum.h> + ++#include <ub/ub_net.h> ++ + /* + * This structure really needs to be cleaned up. + * Most of it is for TCP, and not used by any of +@@ -251,8 +253,12 @@ struct sock { + int (*sk_backlog_rcv)(struct sock *sk, + struct sk_buff *skb); + void (*sk_destruct)(struct sock *sk); ++ struct sock_beancounter sk_bc; ++ struct ve_struct *sk_owner_env; + }; + ++DCL_VE_OWNER_PROTO(SK, struct sock, sk_owner_env) ++ + /* + * Hashed lists helper routines + */ +@@ -485,7 +491,8 @@ static inline void sk_add_backlog(struct + }) + + extern int sk_stream_wait_connect(struct sock *sk, long *timeo_p); +-extern int sk_stream_wait_memory(struct sock *sk, long *timeo_p); ++extern int sk_stream_wait_memory(struct sock *sk, long *timeo_p, ++ unsigned long amount); + extern void sk_stream_wait_close(struct sock *sk, long timeo_p); + extern int sk_stream_error(struct sock *sk, int flags, int err); + extern void sk_stream_kill_queues(struct sock *sk); +@@ -706,8 +713,11 @@ static inline void sk_stream_writequeue_ + + static inline int sk_stream_rmem_schedule(struct sock *sk, struct sk_buff *skb) + { +- return (int)skb->truesize <= sk->sk_forward_alloc || +- sk_stream_mem_schedule(sk, skb->truesize, 1); ++ if ((int)skb->truesize > sk->sk_forward_alloc && ++ !sk_stream_mem_schedule(sk, skb->truesize, 1)) ++ /* The situation is bad according to 
mainstream. Den */ ++ return 0; ++ return ub_tcprcvbuf_charge(sk, skb) == 0; + } + + static inline int sk_stream_wmem_schedule(struct sock *sk, int size) +@@ -765,6 +775,11 @@ extern struct sk_buff *sock_alloc_send + unsigned long size, + int noblock, + int *errcode); ++extern struct sk_buff *sock_alloc_send_skb2(struct sock *sk, ++ unsigned long size, ++ unsigned long size2, ++ int noblock, ++ int *errcode); + extern void *sock_kmalloc(struct sock *sk, int size, + gfp_t priority); + extern void sock_kfree_s(struct sock *sk, void *mem, int size); +@@ -1062,12 +1077,16 @@ sk_dst_check(struct sock *sk, u32 cookie + + static inline void sk_setup_caps(struct sock *sk, struct dst_entry *dst) + { ++ extern int sysctl_tcp_use_sg; ++ + __sk_dst_set(sk, dst); + sk->sk_route_caps = dst->dev->features; + if (sk->sk_route_caps & NETIF_F_TSO) { + if (sock_flag(sk, SOCK_NO_LARGESEND) || dst->header_len) + sk->sk_route_caps &= ~NETIF_F_TSO; + } ++ if (!sysctl_tcp_use_sg) ++ sk->sk_route_caps &= ~NETIF_F_SG; + } + + static inline void sk_charge_skb(struct sock *sk, struct sk_buff *skb) +@@ -1142,6 +1161,10 @@ static inline int sock_queue_rcv_skb(str + goto out; + } + ++ err = ub_sockrcvbuf_charge(sk, skb); ++ if (err < 0) ++ goto out; ++ + /* It would be deadlock, if sock_queue_rcv_skb is used + with socket lock! We assume that users of this + function are lock free. 
+diff -upr linux-2.6.16.orig/include/net/tcp.h linux-2.6.16-026test015/include/net/tcp.h +--- linux-2.6.16.orig/include/net/tcp.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/net/tcp.h 2006-07-04 14:41:39.000000000 +0400 +@@ -40,6 +40,7 @@ + #include <net/tcp_states.h> + + #include <linux/seq_file.h> ++#include <ub/ub_net.h> + + extern struct inet_hashinfo tcp_hashinfo; + +@@ -219,6 +220,7 @@ extern int sysctl_tcp_nometrics_save; + extern int sysctl_tcp_moderate_rcvbuf; + extern int sysctl_tcp_tso_win_divisor; + extern int sysctl_tcp_abc; ++extern int sysctl_tcp_use_sg; + + extern atomic_t tcp_memory_allocated; + extern atomic_t tcp_sockets_allocated; +@@ -250,12 +252,17 @@ static inline int between(__u32 seq1, __ + extern struct proto tcp_prot; + + DECLARE_SNMP_STAT(struct tcp_mib, tcp_statistics); +-#define TCP_INC_STATS(field) SNMP_INC_STATS(tcp_statistics, field) +-#define TCP_INC_STATS_BH(field) SNMP_INC_STATS_BH(tcp_statistics, field) +-#define TCP_INC_STATS_USER(field) SNMP_INC_STATS_USER(tcp_statistics, field) +-#define TCP_DEC_STATS(field) SNMP_DEC_STATS(tcp_statistics, field) +-#define TCP_ADD_STATS_BH(field, val) SNMP_ADD_STATS_BH(tcp_statistics, field, val) +-#define TCP_ADD_STATS_USER(field, val) SNMP_ADD_STATS_USER(tcp_statistics, field, val) ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++#define ve_tcp_statistics (get_exec_env()->_tcp_statistics) ++#else ++#define ve_tcp_statistics tcp_statistics ++#endif ++#define TCP_INC_STATS(field) SNMP_INC_STATS(ve_tcp_statistics, field) ++#define TCP_INC_STATS_BH(field) SNMP_INC_STATS_BH(ve_tcp_statistics, field) ++#define TCP_INC_STATS_USER(field) SNMP_INC_STATS_USER(ve_tcp_statistics, field) ++#define TCP_DEC_STATS(field) SNMP_DEC_STATS(ve_tcp_statistics, field) ++#define TCP_ADD_STATS_BH(field, val) SNMP_ADD_STATS_BH(ve_tcp_statistics, field, val) ++#define TCP_ADD_STATS_USER(field, val) SNMP_ADD_STATS_USER(ve_tcp_statistics, field, val) + + extern void 
tcp_v4_err(struct sk_buff *skb, u32); + +@@ -493,7 +500,7 @@ extern u32 __tcp_select_window(struct so + * to use only the low 32-bits of jiffies and hide the ugly + * casts with the following macro. + */ +-#define tcp_time_stamp ((__u32)(jiffies)) ++#define tcp_time_stamp ((__u32)(jiffies + get_exec_env()->jiffies_fixup)) + + /* This is what the send packet queuing engine uses to pass + * TCP per-packet control information to the transmission +diff -upr linux-2.6.16.orig/include/net/udp.h linux-2.6.16-026test015/include/net/udp.h +--- linux-2.6.16.orig/include/net/udp.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/include/net/udp.h 2006-07-04 14:41:38.000000000 +0400 +@@ -39,13 +39,19 @@ extern rwlock_t udp_hash_lock; + + extern int udp_port_rover; + +-static inline int udp_lport_inuse(u16 num) ++static inline int udp_hashfn(u16 num, unsigned veid) ++{ ++ return ((num + (veid ^ (veid >> 16))) & (UDP_HTABLE_SIZE - 1)); ++} ++ ++static inline int udp_lport_inuse(u16 num, struct ve_struct *env) + { + struct sock *sk; + struct hlist_node *node; + +- sk_for_each(sk, node, &udp_hash[num & (UDP_HTABLE_SIZE - 1)]) +- if (inet_sk(sk)->num == num) ++ sk_for_each(sk, node, &udp_hash[udp_hashfn(num, VEID(env))]) ++ if (inet_sk(sk)->num == num && ++ ve_accessible_strict(sk->sk_owner_env, env)) + return 1; + return 0; + } +@@ -75,9 +81,14 @@ extern unsigned int udp_poll(struct file + poll_table *wait); + + DECLARE_SNMP_STAT(struct udp_mib, udp_statistics); +-#define UDP_INC_STATS(field) SNMP_INC_STATS(udp_statistics, field) +-#define UDP_INC_STATS_BH(field) SNMP_INC_STATS_BH(udp_statistics, field) +-#define UDP_INC_STATS_USER(field) SNMP_INC_STATS_USER(udp_statistics, field) ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++#define ve_udp_statistics (get_exec_env()->_udp_statistics) ++#else ++#define ve_udp_statistics udp_statistics ++#endif ++#define UDP_INC_STATS(field) SNMP_INC_STATS(ve_udp_statistics, field) ++#define 
UDP_INC_STATS_BH(field) SNMP_INC_STATS_BH(ve_udp_statistics, field) ++#define UDP_INC_STATS_USER(field) SNMP_INC_STATS_USER(ve_udp_statistics, field) + + /* /proc */ + struct udp_seq_afinfo { +diff -upr linux-2.6.16.orig/include/ub/beancounter.h linux-2.6.16-026test015/include/ub/beancounter.h +--- linux-2.6.16.orig/include/ub/beancounter.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/include/ub/beancounter.h 2006-07-04 14:41:37.000000000 +0400 +@@ -0,0 +1,329 @@ ++/* ++ * include/ub/beancounter.h ++ * ++ * Copyright (C) 1999-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ * Andrey Savochkin saw@sw-soft.com ++ * ++ */ ++ ++#ifndef _LINUX_BEANCOUNTER_H ++#define _LINUX_BEANCOUNTER_H ++ ++#include <linux/config.h> ++ ++/* ++ * Generic ratelimiting stuff. ++ */ ++ ++struct ub_rate_info { ++ int burst; ++ int interval; /* jiffy_t per event */ ++ int bucket; /* kind of leaky bucket */ ++ unsigned long last; /* last event */ ++}; ++ ++/* Return true if rate limit permits. */ ++int ub_ratelimit(struct ub_rate_info *); ++ ++ ++/* ++ * This magic is used to distinuish user beancounter and pages beancounter ++ * in struct page. page_ub and page_bc are placed in union and MAGIC ++ * ensures us that we don't use pbc as ubc in ub_page_uncharge(). ++ */ ++#define UB_MAGIC 0x62756275 ++ ++/* ++ * Resource list. ++ */ ++ ++#define UB_KMEMSIZE 0 /* Unswappable kernel memory size including ++ * struct task, page directories, etc. ++ */ ++#define UB_LOCKEDPAGES 1 /* Mlock()ed pages. */ ++#define UB_PRIVVMPAGES 2 /* Total number of pages, counting potentially ++ * private pages as private and used. ++ */ ++#define UB_SHMPAGES 3 /* IPC SHM segment size. */ ++#define UB_ZSHMPAGES 4 /* Anonymous shared memory. */ ++#define UB_NUMPROC 5 /* Number of processes. */ ++#define UB_PHYSPAGES 6 /* All resident pages, for swapout guarantee. 
*/ ++#define UB_VMGUARPAGES 7 /* Guarantee for memory allocation, ++ * checked against PRIVVMPAGES. ++ */ ++#define UB_OOMGUARPAGES 8 /* Guarantees against OOM kill. ++ * Only limit is used, no accounting. ++ */ ++#define UB_NUMTCPSOCK 9 /* Number of TCP sockets. */ ++#define UB_NUMFLOCK 10 /* Number of file locks. */ ++#define UB_NUMPTY 11 /* Number of PTYs. */ ++#define UB_NUMSIGINFO 12 /* Number of siginfos. */ ++#define UB_TCPSNDBUF 13 /* Total size of tcp send buffers. */ ++#define UB_TCPRCVBUF 14 /* Total size of tcp receive buffers. */ ++#define UB_OTHERSOCKBUF 15 /* Total size of other socket ++ * send buffers (all buffers for PF_UNIX). ++ */ ++#define UB_DGRAMRCVBUF 16 /* Total size of other socket ++ * receive buffers. ++ */ ++#define UB_NUMOTHERSOCK 17 /* Number of other sockets. */ ++#define UB_DCACHESIZE 18 /* Size of busy dentry/inode cache. */ ++#define UB_NUMFILE 19 /* Number of open files. */ ++ ++#define UB_RESOURCES 24 ++ ++#define UB_UNUSEDPRIVVM (UB_RESOURCES + 0) ++#define UB_TMPFSPAGES (UB_RESOURCES + 1) ++#define UB_SWAPPAGES (UB_RESOURCES + 2) ++#define UB_HELDPAGES (UB_RESOURCES + 3) ++ ++struct ubparm { ++ /* ++ * A barrier over which resource allocations are failed gracefully. ++ * If the amount of consumed memory is over the barrier further sbrk() ++ * or mmap() calls fail, the existing processes are not killed. ++ */ ++ unsigned long barrier; ++ /* hard resource limit */ ++ unsigned long limit; ++ /* consumed resources */ ++ unsigned long held; ++ /* maximum amount of consumed resources through the last period */ ++ unsigned long maxheld; ++ /* minimum amount of consumed resources through the last period */ ++ unsigned long minheld; ++ /* count of failed charges */ ++ unsigned long failcnt; ++}; ++ ++/* ++ * Kernel internal part. 
++ */ ++ ++#ifdef __KERNEL__ ++ ++#include <ub/ub_debug.h> ++#include <linux/interrupt.h> ++#include <asm/atomic.h> ++#include <linux/spinlock.h> ++#include <linux/cache.h> ++#include <linux/threads.h> ++ ++/* ++ * UB_MAXVALUE is essentially LONG_MAX declared in a cross-compiling safe form. ++ */ ++#define UB_MAXVALUE ( (1UL << (sizeof(unsigned long)*8-1)) - 1) ++ ++ ++/* ++ * Resource management structures ++ * Serialization issues: ++ * beancounter list management is protected via ub_hash_lock ++ * task pointers are set only for current task and only once ++ * refcount is managed atomically ++ * value and limit comparison and change are protected by per-ub spinlock ++ */ ++ ++struct page_beancounter; ++struct task_beancounter; ++struct sock_beancounter; ++ ++struct page_private { ++ unsigned long ubp_unused_privvmpages; ++ unsigned long ubp_tmpfs_respages; ++ unsigned long ubp_swap_pages; ++ unsigned long long ubp_held_pages; ++}; ++ ++struct sock_private { ++ unsigned long ubp_rmem_thres; ++ unsigned long ubp_wmem_pressure; ++ unsigned long ubp_maxadvmss; ++ unsigned long ubp_rmem_pressure; ++#define UB_RMEM_EXPAND 0 ++#define UB_RMEM_KEEP 1 ++#define UB_RMEM_SHRINK 2 ++ struct list_head ubp_other_socks; ++ struct list_head ubp_tcp_socks; ++ atomic_t ubp_orphan_count; ++}; ++ ++struct ub_perfstat { ++ unsigned long unmap; ++ unsigned long swapin; ++ ++#ifdef CONFIG_UBC_DEBUG_KMEM ++ long pages_charged; ++ long vmalloc_charged; ++ long pbcs; ++#endif ++} ____cacheline_aligned_in_smp; ++ ++struct user_beancounter ++{ ++ unsigned long ub_magic; ++ atomic_t ub_refcount; ++ struct user_beancounter *ub_next; ++ spinlock_t ub_lock; ++ uid_t ub_uid; ++ ++ struct ub_rate_info ub_limit_rl; ++ int ub_oom_noproc; ++ ++ struct page_private ppriv; ++#define ub_unused_privvmpages ppriv.ubp_unused_privvmpages ++#define ub_tmpfs_respages ppriv.ubp_tmpfs_respages ++#define ub_swap_pages ppriv.ubp_swap_pages ++#define ub_held_pages ppriv.ubp_held_pages ++ struct sock_private 
spriv; ++#define ub_rmem_thres spriv.ubp_rmem_thres ++#define ub_maxadvmss spriv.ubp_maxadvmss ++#define ub_rmem_pressure spriv.ubp_rmem_pressure ++#define ub_wmem_pressure spriv.ubp_wmem_pressure ++#define ub_tcp_sk_list spriv.ubp_tcp_socks ++#define ub_other_sk_list spriv.ubp_other_socks ++#define ub_orphan_count spriv.ubp_orphan_count ++ ++ struct user_beancounter *parent; ++ void *private_data; ++ ++ /* resources statistic and settings */ ++ struct ubparm ub_parms[UB_RESOURCES]; ++ /* resources statistic for last interval */ ++ struct ubparm ub_store[UB_RESOURCES]; ++ ++ struct ub_perfstat ub_stat[NR_CPUS]; ++ ++#ifdef CONFIG_UBC_DEBUG_KMEM ++ struct list_head ub_cclist; ++#endif ++}; ++ ++enum severity { UB_HARD, UB_SOFT, UB_FORCE }; ++ ++static inline int ub_barrier_hit(struct user_beancounter *ub, int resource) ++{ ++ return ub->ub_parms[resource].held > ub->ub_parms[resource].barrier; ++} ++ ++static inline int ub_hfbarrier_hit(struct user_beancounter *ub, int resource) ++{ ++ return (ub->ub_parms[resource].held > ++ ((ub->ub_parms[resource].barrier) >> 1)); ++} ++ ++#ifndef CONFIG_USER_RESOURCE ++ ++extern inline struct user_beancounter *get_beancounter_byuid ++ (uid_t uid, int create) { return NULL; } ++extern inline struct user_beancounter *get_beancounter ++ (struct user_beancounter *ub) { return NULL; } ++extern inline void put_beancounter(struct user_beancounter *ub) {;} ++ ++static inline void ub_init_cache(unsigned long mempages) { }; ++static inline void ub_init_ub0(void) { }; ++ ++#define get_ub0() NULL ++ ++#else /* CONFIG_USER_RESOURCE */ ++ ++/* ++ * Charge/uncharge operations ++ */ ++ ++extern int __charge_beancounter_locked(struct user_beancounter *ub, ++ int resource, unsigned long val, enum severity strict); ++ ++extern void __uncharge_beancounter_locked(struct user_beancounter *ub, ++ int resource, unsigned long val); ++ ++extern void __put_beancounter(struct user_beancounter *ub); ++ ++extern void uncharge_warn(struct user_beancounter 
*ub, int resource, ++ unsigned long val, unsigned long held); ++ ++extern const char *ub_rnames[]; ++/* ++ * Put a beancounter reference ++ */ ++ ++static inline void put_beancounter(struct user_beancounter *ub) ++{ ++ if (unlikely(ub == NULL)) ++ return; ++ ++ __put_beancounter(ub); ++} ++ ++/* ++ * Create a new beancounter reference ++ */ ++extern struct user_beancounter *get_beancounter_byuid(uid_t uid, int create); ++ ++static inline ++struct user_beancounter *get_beancounter(struct user_beancounter *ub) ++{ ++ if (unlikely(ub == NULL)) ++ return NULL; ++ ++ atomic_inc(&ub->ub_refcount); ++ return ub; ++} ++ ++extern struct user_beancounter *get_subbeancounter_byid( ++ struct user_beancounter *, ++ int id, int create); ++extern struct user_beancounter *subbeancounter_findcreate( ++ struct user_beancounter *p, int id); ++ ++extern struct user_beancounter ub0; ++ ++extern void ub_init_cache(unsigned long); ++extern void ub_init_ub0(void); ++#define get_ub0() (&ub0) ++ ++extern void print_ub_uid(struct user_beancounter *ub, char *buf, int size); ++ ++/* ++ * Resource charging ++ * Change user's account and compare against limits ++ */ ++ ++static inline void ub_adjust_maxheld(struct user_beancounter *ub, int resource) ++{ ++ if (ub->ub_parms[resource].maxheld < ub->ub_parms[resource].held) ++ ub->ub_parms[resource].maxheld = ub->ub_parms[resource].held; ++ if (ub->ub_parms[resource].minheld > ub->ub_parms[resource].held) ++ ub->ub_parms[resource].minheld = ub->ub_parms[resource].held; ++} ++ ++#endif /* CONFIG_USER_RESOURCE */ ++ ++#include <ub/ub_decl.h> ++UB_DECLARE_FUNC(int, charge_beancounter(struct user_beancounter *ub, ++ int resource, unsigned long val, enum severity strict)); ++UB_DECLARE_VOID_FUNC(uncharge_beancounter(struct user_beancounter *ub, ++ int resource, unsigned long val)); ++ ++UB_DECLARE_VOID_FUNC(charge_beancounter_notop(struct user_beancounter *ub, ++ int resource, unsigned long val)); ++UB_DECLARE_VOID_FUNC(uncharge_beancounter_notop(struct 
user_beancounter *ub, ++ int resource, unsigned long val)); ++ ++#ifndef CONFIG_USER_RESOURCE_PROC ++static inline void ub_init_proc(void) { }; ++#else ++extern void ub_init_proc(void); ++#endif ++ ++#ifdef CONFIG_USER_RSS_ACCOUNTING ++extern void ub_init_pbc(void); ++#else ++static inline void ub_ini_pbc(void) { } ++#endif ++#endif /* __KERNEL__ */ ++#endif /* _LINUX_BEANCOUNTER_H */ +diff -upr linux-2.6.16.orig/include/ub/ub_dcache.h linux-2.6.16-026test015/include/ub/ub_dcache.h +--- linux-2.6.16.orig/include/ub/ub_dcache.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/include/ub/ub_dcache.h 2006-07-04 14:41:37.000000000 +0400 +@@ -0,0 +1,57 @@ ++/* ++ * include/ub/ub_dcache.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#ifndef __UB_DCACHE_H_ ++#define __UB_DCACHE_H_ ++ ++#include <ub/ub_decl.h> ++ ++/* ++ * UB_DCACHESIZE accounting ++ */ ++ ++struct dentry_beancounter ++{ ++ /* ++ * d_inuse = ++ * <number of external refs> + ++ * <number of 'used' childs> ++ * ++ * d_inuse == -1 means that dentry is unused ++ * state change -1 => 0 causes charge ++ * state change 0 => -1 causes uncharge ++ */ ++ atomic_t d_inuse; ++ /* charged size, including name length if name is not inline */ ++ unsigned long d_ubsize; ++ struct user_beancounter *d_ub; ++}; ++ ++struct dentry; ++ ++UB_DECLARE_FUNC(int, ub_dentry_alloc(struct dentry *d)) ++UB_DECLARE_VOID_FUNC(ub_dentry_charge_nofail(struct dentry *d)) ++UB_DECLARE_VOID_FUNC(ub_dentry_uncharge(struct dentry *d)) ++ ++#ifdef CONFIG_USER_RESOURCE ++UB_DECLARE_FUNC(int, ub_dentry_charge(struct dentry *d)) ++#define ub_dget_testone(d) (atomic_inc_and_test(&(d)->dentry_bc.d_inuse)) ++#define ub_dput_testzero(d) (atomic_add_negative(-1, &(d)->dentry_bc.d_inuse)) ++#define INUSE_INIT 0 ++#else ++#define ub_dentry_charge(d) ({ \ ++ spin_unlock(&d->d_lock); \ ++ rcu_read_unlock(); \ ++ 0; \ ++ }) ++#define 
ub_dget_testone(d) (0) ++#define ub_dput_testzero(d) (0) ++#endif ++#endif +diff -upr linux-2.6.16.orig/include/ub/ub_debug.h linux-2.6.16-026test015/include/ub/ub_debug.h +--- linux-2.6.16.orig/include/ub/ub_debug.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/include/ub/ub_debug.h 2006-07-04 14:41:37.000000000 +0400 +@@ -0,0 +1,95 @@ ++/* ++ * include/ub/ub_debug.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#ifndef __UB_DEBUG_H_ ++#define __UB_DEBUG_H_ ++ ++/* ++ * general debugging ++ */ ++ ++#define UBD_ALLOC 0x1 ++#define UBD_CHARGE 0x2 ++#define UBD_LIMIT 0x4 ++#define UBD_TRACE 0x8 ++ ++/* ++ * ub_net debugging ++ */ ++ ++#define UBD_NET_SOCKET 0x10 ++#define UBD_NET_SLEEP 0x20 ++#define UBD_NET_SEND 0x40 ++#define UBD_NET_RECV 0x80 ++ ++/* ++ * Main routines ++ */ ++ ++#define UB_DEBUG (0) ++#define DEBUG_RESOURCE (0ULL) ++ ++#define ub_dbg_cond(__cond, __str, args...) \ ++ do { \ ++ if ((__cond) != 0) \ ++ printk(__str, ##args); \ ++ } while(0) ++ ++#define ub_debug(__section, __str, args...) \ ++ ub_dbg_cond(UB_DEBUG & (__section), __str, ##args) ++ ++#define ub_debug_resource(__resource, __str, args...) 
\ ++ ub_dbg_cond((UB_DEBUG & UBD_CHARGE) && \ ++ (DEBUG_RESOURCE & (1 << (__resource))), \ ++ __str, ##args) ++ ++#if UB_DEBUG & UBD_TRACE ++#define ub_debug_trace(__cond, __b, __r) \ ++ do { \ ++ static struct ub_rate_info ri = { __b, __r }; \ ++ if ((__cond) != 0 && ub_ratelimit(&ri)) \ ++ dump_stack(); \ ++ } while(0) ++#else ++#define ub_debug_trace(__cond, __burst, __rate) ++#endif ++ ++#include <linux/config.h> ++ ++#ifdef CONFIG_UBC_DEBUG_KMEM ++#include <linux/list.h> ++#include <linux/kmem_cache.h> ++ ++struct user_beancounter; ++struct ub_cache_counter { ++ struct list_head ulist; ++ struct ub_cache_counter *next; ++ struct user_beancounter *ub; ++ kmem_cache_t *cachep; ++ unsigned long counter; ++}; ++ ++extern spinlock_t cc_lock; ++extern void init_cache_counters(void); ++extern void ub_free_counters(struct user_beancounter *); ++extern void ub_kmemcache_free(kmem_cache_t *cachep); ++ ++struct vm_struct; ++extern void inc_vmalloc_charged(struct vm_struct *, int); ++extern void dec_vmalloc_charged(struct vm_struct *); ++#else ++#define init_cache_counters() do { } while (0) ++#define inc_vmalloc_charged(vm, f) do { } while (0) ++#define dec_vmalloc_charged(vm) do { } while (0) ++#define ub_free_counters(ub) do { } while (0) ++#define ub_kmemcache_free(cachep) do { } while (0) ++#endif ++ ++#endif +diff -upr linux-2.6.16.orig/include/ub/ub_decl.h linux-2.6.16-026test015/include/ub/ub_decl.h +--- linux-2.6.16.orig/include/ub/ub_decl.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/include/ub/ub_decl.h 2006-07-04 14:41:37.000000000 +0400 +@@ -0,0 +1,40 @@ ++/* ++ * include/ub/ub_decl.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#ifndef __UB_DECL_H_ ++#define __UB_DECL_H_ ++ ++#include <linux/config.h> ++ ++/* ++ * Naming convension: ++ * ub_<section|object>_<operation> ++ */ ++ ++#ifdef CONFIG_USER_RESOURCE ++ ++#define UB_DECLARE_FUNC(ret_type, decl) extern ret_type decl; ++#define UB_DECLARE_VOID_FUNC(decl) extern void decl; ++ ++#else /* CONFIG_USER_RESOURCE */ ++ ++#define UB_DECLARE_FUNC(ret_type, decl) \ ++ static inline ret_type decl \ ++ { \ ++ return (ret_type)0; \ ++ } ++#define UB_DECLARE_VOID_FUNC(decl) \ ++ static inline void decl \ ++ { \ ++ } ++ ++#endif /* CONFIG_USER_RESOURCE */ ++ ++#endif +diff -upr linux-2.6.16.orig/include/ub/ub_hash.h linux-2.6.16-026test015/include/ub/ub_hash.h +--- linux-2.6.16.orig/include/ub/ub_hash.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/include/ub/ub_hash.h 2006-07-04 14:41:37.000000000 +0400 +@@ -0,0 +1,41 @@ ++/* ++ * include/ub/ub_hash.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#ifndef _LINUX_UBHASH_H ++#define _LINUX_UBHASH_H ++ ++#ifdef __KERNEL__ ++ ++#define UB_HASH_SIZE 256 ++ ++struct ub_hash_slot { ++ struct user_beancounter *ubh_beans; ++}; ++ ++extern struct ub_hash_slot ub_hash[]; ++extern spinlock_t ub_hash_lock; ++ ++#ifdef CONFIG_USER_RESOURCE ++ ++/* ++ * Iterate over beancounters ++ * @__slot - hash slot ++ * @__ubp - beancounter ptr ++ * Can use break :) ++ */ ++#define for_each_beancounter(__slot, __ubp) \ ++ for (__slot = 0, __ubp = NULL; \ ++ __slot < UB_HASH_SIZE && __ubp == NULL; __slot++) \ ++ for (__ubp = ub_hash[__slot].ubh_beans; __ubp; \ ++ __ubp = __ubp->ub_next) ++ ++#endif /* CONFIG_USER_RESOURCE */ ++#endif /* __KERNEL__ */ ++#endif /* _LINUX_UBHASH_H */ +diff -upr linux-2.6.16.orig/include/ub/ub_mem.h linux-2.6.16-026test015/include/ub/ub_mem.h +--- linux-2.6.16.orig/include/ub/ub_mem.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/include/ub/ub_mem.h 2006-07-04 14:41:37.000000000 +0400 +@@ -0,0 +1,76 @@ ++/* ++ * include/ub/ub_mem.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#ifndef __UB_SLAB_H_ ++#define __UB_SLAB_H_ ++ ++#include <linux/config.h> ++#include <linux/kmem_slab.h> ++#include <ub/beancounter.h> ++#include <ub/ub_decl.h> ++ ++/* ++ * UB_KMEMSIZE accounting ++ */ ++ ++#ifdef CONFIG_UBC_DEBUG_ITEMS ++#define CHARGE_ORDER(__o) (1 << __o) ++#define CHARGE_SIZE(__s) 1 ++#else ++#define CHARGE_ORDER(__o) (PAGE_SIZE << (__o)) ++#define CHARGE_SIZE(__s) (__s) ++#endif ++ ++#define page_ub(__page) ((__page)->bc.page_ub) ++ ++struct mm_struct; ++struct page; ++ ++UB_DECLARE_FUNC(struct user_beancounter *, slab_ub(void *obj)) ++UB_DECLARE_FUNC(struct user_beancounter *, vmalloc_ub(void *obj)) ++UB_DECLARE_FUNC(struct user_beancounter *, mem_ub(void *obj)) ++ ++UB_DECLARE_FUNC(int, ub_page_charge(struct page *page, int order, int mask)) ++UB_DECLARE_VOID_FUNC(ub_page_uncharge(struct page *page, int order)) ++UB_DECLARE_FUNC(int, ub_slab_charge(void *objp, int flags)) ++UB_DECLARE_VOID_FUNC(ub_slab_uncharge(void *obj)) ++ ++#define slab_ubcs(cachep, slabp) ((struct user_beancounter **)\ ++ (ALIGN((unsigned long)(slab_bufctl(slabp) + (cachep)->num),\ ++ sizeof(void *)))) ++ ++#ifdef CONFIG_USER_RESOURCE ++extern struct user_beancounter *ub_select_worst(long *); ++ ++/* mm/slab.c needed stuff */ ++#define UB_ALIGN(flags) (flags & SLAB_UBC ? sizeof(void *) : 1) ++#define UB_EXTRA(flags) (flags & SLAB_UBC ? 
sizeof(void *) : 0) ++#define set_cache_objuse(cachep) do { \ ++ (cachep)->objuse = ((PAGE_SIZE << (cachep)->gfporder) + \ ++ (cachep)->num - 1) / (cachep)->num; \ ++ if (!OFF_SLAB(cachep)) \ ++ break; \ ++ (cachep)->objuse += ((cachep)->slabp_cache->objuse + \ ++ (cachep)->num - 1) / (cachep)->num; \ ++ } while (0) ++#define init_slab_ubps(cachep, slabp) do { \ ++ if (!((cachep)->flags & SLAB_UBC)) \ ++ break; \ ++ memset(slab_ubcs(cachep, slabp), 0, \ ++ (cachep)->num * sizeof(void *)); \ ++ } while (0) ++#define kmem_obj_memusage(o) (virt_to_cache(o)->objuse) ++#else ++#define UB_ALIGN(flags) 1 ++#define UB_EXTRA(flags) 0 ++#define set_cache_objuse(c) do { } while (0) ++#define init_slab_ubps(c, s) do { } while (0) ++#endif ++#endif /* __UB_SLAB_H_ */ +diff -upr linux-2.6.16.orig/include/ub/ub_misc.h linux-2.6.16-026test015/include/ub/ub_misc.h +--- linux-2.6.16.orig/include/ub/ub_misc.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/include/ub/ub_misc.h 2006-07-04 14:41:37.000000000 +0400 +@@ -0,0 +1,54 @@ ++/* ++ * include/ub/ub_misc.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#ifndef __UB_MISC_H_ ++#define __UB_MISC_H_ ++ ++#include <ub/ub_decl.h> ++ ++struct tty_struct; ++struct file; ++struct file_lock; ++struct sigqueue; ++ ++UB_DECLARE_FUNC(int, ub_file_charge(struct file *f)) ++UB_DECLARE_VOID_FUNC(ub_file_uncharge(struct file *f)) ++UB_DECLARE_FUNC(int, ub_flock_charge(struct file_lock *fl, int hard)) ++UB_DECLARE_VOID_FUNC(ub_flock_uncharge(struct file_lock *fl)) ++UB_DECLARE_FUNC(int, ub_siginfo_charge(struct sigqueue *q, ++ struct user_beancounter *ub)) ++UB_DECLARE_VOID_FUNC(ub_siginfo_uncharge(struct sigqueue *q)) ++UB_DECLARE_FUNC(int, ub_task_charge(struct task_struct *parent, ++ struct task_struct *task)) ++UB_DECLARE_VOID_FUNC(ub_task_uncharge(struct task_struct *task)) ++UB_DECLARE_FUNC(int, ub_pty_charge(struct tty_struct *tty)) ++UB_DECLARE_VOID_FUNC(ub_pty_uncharge(struct tty_struct *tty)) ++ ++#ifdef CONFIG_USER_RESOURCE ++#define set_flock_charged(fl) do { (fl)->fl_charged = 1; } while (0) ++#define unset_flock_charged(fl) do { \ ++ WARN_ON((fl)->fl_charged == 0); \ ++ (fl)->fl_charged = 0; \ ++ } while (0) ++#define set_mm_ub(mm, tsk) do { \ ++ (mm)->mm_ub = get_beancounter(tsk ? \ ++ tsk->task_bc.task_ub : get_exec_ub()); \ ++ } while (0) ++#define put_mm_ub(mm) do { \ ++ put_beancounter((mm)->mm_ub); \ ++ (mm)->mm_ub = NULL; \ ++ } while (0) ++#else ++#define set_flock_charged(fl) do { } while (0) ++#define ubset_flock_charged(fl) do { } while (0) ++#define set_mm_ub(mm, tsk) do { } while (0) ++#define put_mm_ub(mm) do { } while (0) ++#endif ++#endif +diff -upr linux-2.6.16.orig/include/ub/ub_net.h linux-2.6.16-026test015/include/ub/ub_net.h +--- linux-2.6.16.orig/include/ub/ub_net.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/include/ub/ub_net.h 2006-07-04 14:41:37.000000000 +0400 +@@ -0,0 +1,141 @@ ++/* ++ * include/ub/ub_net.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#ifndef __UB_NET_H_ ++#define __UB_NET_H_ ++ ++/* ++ * UB_NUMXXXSOCK, UB_XXXBUF accounting ++ */ ++ ++#include <ub/ub_decl.h> ++#include <ub/ub_sk.h> ++ ++#define bid2sid(__bufid) \ ++ ((__bufid) == UB_TCPSNDBUF ? UB_NUMTCPSOCK : UB_NUMOTHERSOCK) ++ ++#define SOCK_MIN_UBCSPACE ((int)((2048 - sizeof(struct skb_shared_info)) & \ ++ ~(SMP_CACHE_BYTES-1))) ++#define SOCK_MIN_UBCSPACE_CH skb_charge_size(SOCK_MIN_UBCSPACE) ++ ++ ++#define IS_TCP_SOCK(__family, __type) \ ++ (((__family) == PF_INET || (__family) == PF_INET6) && (__type) == SOCK_STREAM) ++ ++UB_DECLARE_FUNC(int, ub_sock_charge(struct sock *sk, int family, int type)) ++UB_DECLARE_FUNC(int, ub_tcp_sock_charge(struct sock *sk)) ++UB_DECLARE_FUNC(int, ub_other_sock_charge(struct sock *sk)) ++UB_DECLARE_VOID_FUNC(ub_sock_uncharge(struct sock *sk)) ++UB_DECLARE_VOID_FUNC(ub_skb_uncharge(struct sk_buff *skb)) ++UB_DECLARE_FUNC(int, ub_skb_alloc_bc(struct sk_buff *skb, int gfp_mask)) ++UB_DECLARE_VOID_FUNC(ub_skb_free_bc(struct sk_buff *skb)) ++UB_DECLARE_FUNC(int, ub_nlrcvbuf_charge(struct sk_buff *skb, struct sock *sk)) ++UB_DECLARE_FUNC(int, ub_sockrcvbuf_charge(struct sock *sk, struct sk_buff *skb)) ++UB_DECLARE_VOID_FUNC(ub_sock_snd_queue_add(struct sock *sk, int resource, ++ unsigned long size)) ++UB_DECLARE_FUNC(long, ub_sock_wait_for_space(struct sock *sk, long timeo, ++ unsigned long size)) ++ ++UB_DECLARE_FUNC(int, ub_tcprcvbuf_charge(struct sock *sk, struct sk_buff *skb)) ++UB_DECLARE_FUNC(int, ub_tcprcvbuf_charge_forced(struct sock *sk, ++ struct sk_buff *skb)) ++UB_DECLARE_FUNC(int, ub_tcpsndbuf_charge(struct sock *sk, struct sk_buff *skb)) ++UB_DECLARE_FUNC(int, ub_tcpsndbuf_charge_forced(struct sock *sk, ++ struct sk_buff *skb)) ++ ++/* Charge size */ ++static inline unsigned long skb_charge_datalen(unsigned long chargesize) ++{ ++#ifdef CONFIG_USER_RESOURCE ++ unsigned long slabsize; ++ ++ chargesize -= sizeof(struct sk_buff); ++ slabsize = 64; ++ do { ++ slabsize <<= 1; ++ } while 
(slabsize <= chargesize); ++ ++ slabsize >>= 1; ++ return (slabsize - sizeof(struct skb_shared_info)) & ++ ~(SMP_CACHE_BYTES-1); ++#else ++ return 0; ++#endif ++} ++ ++static inline unsigned long skb_charge_size_gen(unsigned long size) ++{ ++#ifdef CONFIG_USER_RESOURCE ++ unsigned int slabsize; ++ ++ size = SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info); ++ slabsize = 32; /* min size is 64 because of skb_shared_info */ ++ do { ++ slabsize <<= 1; ++ } while (slabsize < size); ++ ++ return slabsize + sizeof(struct sk_buff); ++#else ++ return 0; ++#endif ++ ++} ++ ++static inline unsigned long skb_charge_size_const(unsigned long size) ++{ ++#ifdef CONFIG_USER_RESOURCE ++ unsigned int ret; ++ if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 64) ++ ret = 64 + sizeof(struct sk_buff); ++ else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 128) ++ ret = 128 + sizeof(struct sk_buff); ++ else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 256) ++ ret = 256 + sizeof(struct sk_buff); ++ else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 512) ++ ret = 512 + sizeof(struct sk_buff); ++ else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 1024) ++ ret = 1024 + sizeof(struct sk_buff); ++ else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 2048) ++ ret = 2048 + sizeof(struct sk_buff); ++ else if (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info) <= 4096) ++ ret = 4096 + sizeof(struct sk_buff); ++ else ++ ret = skb_charge_size_gen(size); ++ return ret; ++#else ++ return 0; ++#endif ++} ++ ++ ++#define skb_charge_size(__size) \ ++ (__builtin_constant_p(__size) ? 
\ ++ skb_charge_size_const(__size) : \ ++ skb_charge_size_gen(__size)) ++ ++UB_DECLARE_FUNC(int, skb_charge_fullsize(struct sk_buff *skb)) ++UB_DECLARE_VOID_FUNC(ub_skb_set_charge(struct sk_buff *skb, ++ struct sock *sk, unsigned long size, int res)) ++ ++/* Poll reserv */ ++UB_DECLARE_FUNC(int, ub_sock_makewres_other(struct sock *sk, unsigned long sz)) ++UB_DECLARE_FUNC(int, ub_sock_makewres_tcp(struct sock *sk, unsigned long size)) ++UB_DECLARE_FUNC(int, ub_sock_getwres_other(struct sock *sk, unsigned long size)) ++UB_DECLARE_FUNC(int, ub_sock_getwres_tcp(struct sock *sk, unsigned long size)) ++UB_DECLARE_VOID_FUNC(ub_sock_retwres_other(struct sock *sk, unsigned long size, ++ unsigned long ressize)) ++UB_DECLARE_VOID_FUNC(ub_sock_retwres_tcp(struct sock *sk, unsigned long size, ++ unsigned long ressize)) ++UB_DECLARE_VOID_FUNC(ub_sock_sndqueueadd_other(struct sock *sk, ++ unsigned long size)) ++UB_DECLARE_VOID_FUNC(ub_sock_sndqueueadd_tcp(struct sock *sk, unsigned long sz)) ++UB_DECLARE_VOID_FUNC(ub_sock_sndqueuedel(struct sock *sk)) ++ ++#endif +diff -upr linux-2.6.16.orig/include/ub/ub_orphan.h linux-2.6.16-026test015/include/ub/ub_orphan.h +--- linux-2.6.16.orig/include/ub/ub_orphan.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/include/ub/ub_orphan.h 2006-07-04 14:41:37.000000000 +0400 +@@ -0,0 +1,56 @@ ++/* ++ * include/ub/ub_orphan.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#ifndef __UB_ORPHAN_H_ ++#define __UB_ORPHAN_H_ ++ ++#include <net/tcp.h> ++ ++#include "ub/beancounter.h" ++#include "ub/ub_net.h" ++ ++ ++static inline atomic_t *__ub_get_orphan_count_ptr(struct sock *sk) ++{ ++#ifdef CONFIG_USER_RESOURCE ++ if (sock_has_ubc(sk)) ++ return &sock_bc(sk)->ub->ub_orphan_count; ++#endif ++ return sk->sk_prot->orphan_count; ++} ++ ++static inline void ub_inc_orphan_count(struct sock *sk) ++{ ++ atomic_inc(__ub_get_orphan_count_ptr(sk)); ++} ++ ++static inline void ub_dec_orphan_count(struct sock *sk) ++{ ++ atomic_dec(__ub_get_orphan_count_ptr(sk)); ++} ++ ++static inline int ub_get_orphan_count(struct sock *sk) ++{ ++ return atomic_read(__ub_get_orphan_count_ptr(sk)); ++} ++ ++extern int __ub_too_many_orphans(struct sock *sk, int count); ++static inline int ub_too_many_orphans(struct sock *sk, int count) ++{ ++#ifdef CONFIG_USER_RESOURCE ++ if (__ub_too_many_orphans(sk, count)) ++ return 1; ++#endif ++ return (ub_get_orphan_count(sk) > sysctl_tcp_max_orphans || ++ (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && ++ atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])); ++} ++ ++#endif +diff -upr linux-2.6.16.orig/include/ub/ub_page.h linux-2.6.16-026test015/include/ub/ub_page.h +--- linux-2.6.16.orig/include/ub/ub_page.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/include/ub/ub_page.h 2006-07-04 14:41:37.000000000 +0400 +@@ -0,0 +1,48 @@ ++/* ++ * include/ub/ub_page.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#ifndef __UB_PAGE_H_ ++#define __UB_PAGE_H_ ++ ++#include <linux/config.h> ++ ++/* ++ * Page_beancounters ++ */ ++ ++struct page; ++struct user_beancounter; ++ ++#define PB_MAGIC 0x62700001UL ++ ++struct page_beancounter { ++ unsigned long pb_magic; ++ struct page *page; ++ struct user_beancounter *ub; ++ struct page_beancounter *next_hash; ++ unsigned refcount; ++ struct list_head page_list; ++}; ++ ++#define PB_REFCOUNT_BITS 24 ++#define PB_SHIFT_GET(c) ((c) >> PB_REFCOUNT_BITS) ++#define PB_SHIFT_INC(c) ((c) += (1 << PB_REFCOUNT_BITS)) ++#define PB_SHIFT_DEC(c) ((c) -= (1 << PB_REFCOUNT_BITS)) ++#define PB_COUNT_GET(c) ((c) & ((1 << PB_REFCOUNT_BITS) - 1)) ++#define PB_COUNT_INC(c) ((c)++) ++#define PB_COUNT_DEC(c) ((c)--) ++#define PB_REFCOUNT_MAKE(s, c) (((s) << PB_REFCOUNT_BITS) + (c)) ++ ++#define page_pbc(__page) ((__page)->bc.page_pb) ++ ++struct address_space; ++extern int is_shmem_mapping(struct address_space *); ++ ++#endif +diff -upr linux-2.6.16.orig/include/ub/ub_sk.h linux-2.6.16-026test015/include/ub/ub_sk.h +--- linux-2.6.16.orig/include/ub/ub_sk.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/include/ub/ub_sk.h 2006-07-04 14:41:37.000000000 +0400 +@@ -0,0 +1,43 @@ ++/* ++ * include/ub/ub_sk.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#ifndef __UB_SK_H_ ++#define __UB_SK_H_ ++ ++#include <linux/config.h> ++#include <ub/ub_task.h> ++ ++struct sock; ++struct sk_buff; ++ ++struct skb_beancounter { ++ struct user_beancounter *ub; ++ unsigned long charged:27, resource:5; ++}; ++ ++struct sock_beancounter { ++ /* ++ * already charged for future sends, to make poll work; ++ * changes are protected by bc spinlock, read is under socket ++ * semaphore for sends and unprotected in poll ++ */ ++ unsigned long poll_reserv; ++ unsigned long ub_waitspc; /* space waiting for */ ++ unsigned long ub_wcharged; ++ struct list_head ub_sock_list; ++ struct user_beancounter *ub; ++}; ++ ++#define sock_bc(__sk) (&(__sk)->sk_bc) ++#define skb_bc(__skb) (&(__skb)->skb_bc) ++#define skbc_sock(__skbc) (container_of(__skbc, struct sock, sk_bc)) ++#define sock_has_ubc(__sk) (sock_bc(__sk)->ub != NULL) ++ ++#endif +diff -upr linux-2.6.16.orig/include/ub/ub_stat.h linux-2.6.16-026test015/include/ub/ub_stat.h +--- linux-2.6.16.orig/include/ub/ub_stat.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/include/ub/ub_stat.h 2006-07-04 14:41:37.000000000 +0400 +@@ -0,0 +1,70 @@ ++/* ++ * include/ub/ub_stat.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#ifndef __UB_STAT_H_ ++#define __UB_STAT_H_ ++ ++/* sys_ubstat commands list */ ++#define UBSTAT_READ_ONE 0x010000 ++#define UBSTAT_READ_ALL 0x020000 ++#define UBSTAT_READ_FULL 0x030000 ++#define UBSTAT_UBLIST 0x040000 ++#define UBSTAT_UBPARMNUM 0x050000 ++#define UBSTAT_GETTIME 0x060000 ++ ++#define UBSTAT_CMD(func) ((func) & 0xF0000) ++#define UBSTAT_PARMID(func) ((func) & 0x0FFFF) ++ ++#define TIME_MAX_SEC (LONG_MAX / HZ) ++#define TIME_MAX_JIF (TIME_MAX_SEC * HZ) ++ ++typedef unsigned long ubstattime_t; ++ ++typedef struct { ++ ubstattime_t start_time; ++ ubstattime_t end_time; ++ ubstattime_t cur_time; ++} ubgettime_t; ++ ++typedef struct { ++ long maxinterval; ++ int signum; ++} ubnotifrq_t; ++ ++typedef struct { ++ unsigned long maxheld; ++ unsigned long failcnt; ++} ubstatparm_t; ++ ++typedef struct { ++ unsigned long barrier; ++ unsigned long limit; ++ unsigned long held; ++ unsigned long maxheld; ++ unsigned long minheld; ++ unsigned long failcnt; ++ unsigned long __unused1; ++ unsigned long __unused2; ++} ubstatparmf_t; ++ ++typedef struct { ++ ubstattime_t start_time; ++ ubstattime_t end_time; ++ ubstatparmf_t param[0]; ++} ubstatfull_t; ++ ++#ifdef __KERNEL__ ++struct ub_stat_notify { ++ struct list_head list; ++ struct task_struct *task; ++ int signum; ++}; ++#endif ++#endif +diff -upr linux-2.6.16.orig/include/ub/ub_task.h linux-2.6.16-026test015/include/ub/ub_task.h +--- linux-2.6.16.orig/include/ub/ub_task.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/include/ub/ub_task.h 2006-07-04 14:41:37.000000000 +0400 +@@ -0,0 +1,49 @@ ++/* ++ * include/ub/ub_task.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#ifndef __UB_TASK_H_ ++#define __UB_TASK_H_ ++ ++#include <linux/config.h> ++ ++struct user_beancounter; ++ ++ ++#ifdef CONFIG_USER_RESOURCE ++ ++struct task_beancounter { ++ struct user_beancounter *exec_ub; ++ struct user_beancounter *task_ub; ++ struct user_beancounter *fork_sub; ++ void *task_fnode, *task_freserv; ++ unsigned long oom_generation; ++ unsigned long task_data[4]; ++}; ++ ++#define get_exec_ub() (current->task_bc.exec_ub) ++#define get_task_ub(__task) ((__task)->task_bc.task_ub) ++#define set_exec_ub(__newub) \ ++({ \ ++ struct user_beancounter *old; \ ++ struct task_beancounter *tbc; \ ++ tbc = ¤t->task_bc; \ ++ old = tbc->exec_ub; \ ++ tbc->exec_ub = __newub; \ ++ old; \ ++}) ++ ++#else /* CONFIG_USER_RESOURCE */ ++ ++#define get_exec_ub() (NULL) ++#define get_task_ub(task) (NULL) ++#define set_exec_ub(__ub) (NULL) ++ ++#endif /* CONFIG_USER_RESOURCE */ ++#endif /* __UB_TASK_H_ */ +diff -upr linux-2.6.16.orig/include/ub/ub_tcp.h linux-2.6.16-026test015/include/ub/ub_tcp.h +--- linux-2.6.16.orig/include/ub/ub_tcp.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/include/ub/ub_tcp.h 2006-07-04 14:41:37.000000000 +0400 +@@ -0,0 +1,79 @@ ++/* ++ * include/ub/ub_tcp.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#ifndef __UB_TCP_H_ ++#define __UB_TCP_H_ ++ ++/* ++ * UB_NUMXXXSOCK, UB_XXXBUF accounting ++ */ ++ ++#include <ub/ub_sk.h> ++#include <ub/beancounter.h> ++ ++static inline void ub_tcp_update_maxadvmss(struct sock *sk) ++{ ++#ifdef CONFIG_USER_RESOURCE ++ if (!sock_has_ubc(sk)) ++ return; ++ if (sock_bc(sk)->ub->ub_maxadvmss >= tcp_sk(sk)->advmss) ++ return; ++ ++ sock_bc(sk)->ub->ub_maxadvmss = ++ skb_charge_size(MAX_HEADER + sizeof(struct iphdr) ++ + sizeof(struct tcphdr) + tcp_sk(sk)->advmss); ++#endif ++} ++ ++static inline int ub_tcp_rmem_allows_expand(struct sock *sk) ++{ ++ if (tcp_memory_pressure) ++ return 0; ++#ifdef CONFIG_USER_RESOURCE ++ if (sock_has_ubc(sk)) { ++ struct user_beancounter *ub; ++ ++ ub = sock_bc(sk)->ub; ++ if (ub->ub_rmem_pressure == UB_RMEM_EXPAND) ++ return 1; ++ if (ub->ub_rmem_pressure == UB_RMEM_SHRINK) ++ return 0; ++ return sk->sk_rcvbuf <= ub->ub_rmem_thres; ++ } ++#endif ++ return 1; ++} ++ ++static inline int ub_tcp_memory_pressure(struct sock *sk) ++{ ++ if (tcp_memory_pressure) ++ return 1; ++#ifdef CONFIG_USER_RESOURCE ++ if (sock_has_ubc(sk)) ++ return sock_bc(sk)->ub->ub_rmem_pressure != UB_RMEM_EXPAND; ++#endif ++ return 0; ++} ++ ++static inline int ub_tcp_shrink_rcvbuf(struct sock *sk) ++{ ++ if (tcp_memory_pressure) ++ return 1; ++#ifdef CONFIG_USER_RESOURCE ++ if (sock_has_ubc(sk)) ++ return sock_bc(sk)->ub->ub_rmem_pressure == UB_RMEM_SHRINK; ++#endif ++ return 0; ++} ++ ++UB_DECLARE_FUNC(int, ub_sock_tcp_chargepage(struct sock *sk)) ++UB_DECLARE_VOID_FUNC(ub_sock_tcp_detachpage(struct sock *sk)) ++ ++#endif +diff -upr linux-2.6.16.orig/include/ub/ub_vmpages.h linux-2.6.16-026test015/include/ub/ub_vmpages.h +--- linux-2.6.16.orig/include/ub/ub_vmpages.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/include/ub/ub_vmpages.h 2006-07-04 14:41:37.000000000 +0400 +@@ -0,0 +1,167 @@ ++/* ++ * include/ub/ub_vmpages.h ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. 
++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#ifndef __UB_PAGES_H_ ++#define __UB_PAGES_H_ ++ ++#include <linux/linkage.h> ++#include <linux/config.h> ++#include <ub/beancounter.h> ++#include <ub/ub_decl.h> ++ ++/* ++ * Check whether vma has private or copy-on-write mapping. ++ * Should match checks in ub_protected_charge(). ++ */ ++#define VM_UB_PRIVATE(__flags, __file) \ ++ ( ((__flags) & VM_WRITE) ? \ ++ (__file) == NULL || !((__flags) & VM_SHARED) : \ ++ 0 \ ++ ) ++ ++/* Mprotect charging result */ ++#define PRIVVM_ERROR -1 ++#define PRIVVM_NO_CHARGE 0 /* UB_DECLARE_FUNC retval with ubc off */ ++#define PRIVVM_TO_PRIVATE 1 ++#define PRIVVM_TO_SHARED 2 ++ ++UB_DECLARE_FUNC(int, ub_protected_charge(struct mm_struct *mm, ++ unsigned long size, ++ unsigned long newflags, ++ struct vm_area_struct *vma)) ++ ++UB_DECLARE_VOID_FUNC(ub_unused_privvm_add(struct mm_struct *mm, ++ struct vm_area_struct *vma, ++ unsigned long num)) ++#define ub_unused_privvm_inc(mm, vma) ub_unused_privvm_add(mm, vma, 1) ++UB_DECLARE_VOID_FUNC(ub_unused_privvm_sub(struct mm_struct *mm, ++ struct vm_area_struct *vma, ++ unsigned long num)) ++#define ub_unused_privvm_dec(mm, vma) ub_unused_privvm_sub(mm, vma, 1) ++ ++UB_DECLARE_VOID_FUNC(__ub_unused_privvm_dec(struct mm_struct *mm, ++ long sz)) ++ ++UB_DECLARE_FUNC(int, ub_memory_charge(struct mm_struct *mm, ++ unsigned long size, ++ unsigned vm_flags, ++ struct file *vm_file, ++ int strict)) ++UB_DECLARE_VOID_FUNC(ub_memory_uncharge(struct mm_struct *mm, ++ unsigned long size, ++ unsigned vm_flags, ++ struct file *vm_file)) ++ ++struct shmem_inode_info; ++UB_DECLARE_FUNC(int, ub_shmpages_charge(struct shmem_inode_info *i, ++ unsigned long sz)) ++UB_DECLARE_VOID_FUNC(ub_shmpages_uncharge(struct shmem_inode_info *i, ++ unsigned long sz)) ++UB_DECLARE_VOID_FUNC(ub_tmpfs_respages_inc(struct shmem_inode_info *shi)) ++UB_DECLARE_VOID_FUNC(ub_tmpfs_respages_sub(struct shmem_inode_info *shi, ++ unsigned long size)) 
++#define ub_tmpfs_respages_dec(shi) ub_tmpfs_respages_sub(shi, 1) ++ ++#ifdef CONFIG_USER_RESOURCE ++#define shmi_ub_set(shi, ub) do { \ ++ (shi)->shmi_ub = get_beancounter(ub); \ ++ } while (0) ++#define shmi_ub_put(shi) do { \ ++ put_beancounter((shi)->shmi_ub); \ ++ (shi)->shmi_ub = NULL; \ ++ } while (0) ++#else ++#define shmi_ub_set(shi, ub) do { } while (0) ++#define shmi_ub_put(shi) do { } while (0) ++#endif ++ ++UB_DECLARE_FUNC(int, ub_locked_charge(struct mm_struct *mm, ++ unsigned long size)) ++UB_DECLARE_VOID_FUNC(ub_locked_uncharge(struct mm_struct *mm, ++ unsigned long size)) ++UB_DECLARE_FUNC(int, ub_lockedshm_charge(struct shmem_inode_info *shi, ++ unsigned long size)) ++UB_DECLARE_VOID_FUNC(ub_lockedshm_uncharge(struct shmem_inode_info *shi, ++ unsigned long size)) ++ ++UB_DECLARE_FUNC(unsigned long, pages_in_vma_range(struct vm_area_struct *vma, ++ unsigned long addr, unsigned long end)) ++UB_DECLARE_VOID_FUNC(warn_bad_rss(struct vm_area_struct *vma, ++ unsigned long freed)) ++#define pages_in_vma(vma) (pages_in_vma_range(vma, \ ++ vma->vm_start, vma->vm_end)) ++ ++#define UB_PAGE_WEIGHT_SHIFT 24 ++#define UB_PAGE_WEIGHT (1 << UB_PAGE_WEIGHT_SHIFT) ++ ++struct page_beancounter; ++#define PBC_COPY_SAME ((struct page_beancounter *) 1) ++ ++/* Mprotect charging result */ ++#define PRIVVM_ERROR -1 ++#define PRIVVM_NO_CHARGE 0 ++#define PRIVVM_TO_PRIVATE 1 ++#define PRIVVM_TO_SHARED 2 ++ ++extern void fastcall __ub_update_physpages(struct user_beancounter *ub); ++extern void fastcall __ub_update_oomguarpages(struct user_beancounter *ub); ++extern void fastcall __ub_update_privvm(struct user_beancounter *ub); ++ ++#ifdef CONFIG_USER_RSS_ACCOUNTING ++#define PB_DECLARE_FUNC(ret, decl) UB_DECLARE_FUNC(ret, decl) ++#define PB_DECLARE_VOID_FUNC(decl) UB_DECLARE_VOID_FUNC(decl) ++#else ++#define PB_DECLARE_FUNC(ret, decl) static inline ret decl {return (ret)0;} ++#define PB_DECLARE_VOID_FUNC(decl) static inline void decl { } ++#endif ++ 
++PB_DECLARE_FUNC(int, pb_alloc(struct page_beancounter **pbc)) ++PB_DECLARE_FUNC(int, pb_alloc_list(struct page_beancounter **pbc, int num)) ++PB_DECLARE_FUNC(int, pb_alloc_all(struct page_beancounter **pbc)) ++PB_DECLARE_VOID_FUNC(pb_add_ref(struct page *page, ++ struct mm_struct *mm, ++ struct page_beancounter **pbc)) ++PB_DECLARE_VOID_FUNC(pb_dup_ref(struct page *page, ++ struct mm_struct *mm, ++ struct page_beancounter **pbc)) ++PB_DECLARE_VOID_FUNC(pb_free_list(struct page_beancounter **pb)) ++PB_DECLARE_VOID_FUNC(pb_free(struct page_beancounter **pb)) ++PB_DECLARE_VOID_FUNC(pb_remove_ref(struct page *page, ++ struct mm_struct *mm)) ++ ++PB_DECLARE_FUNC(struct user_beancounter *, pb_grab_page_ub(struct page *page)) ++#endif ++ ++#ifdef CONFIG_USER_SWAP_ACCOUNTING ++#define SWP_DECLARE_FUNC(ret, decl) UB_DECLARE_FUNC(ret, decl) ++#define SWP_DECLARE_VOID_FUNC(decl) UB_DECLARE_VOID_FUNC(decl) ++#else ++#define SWP_DECLARE_FUNC(ret, decl) static inline ret decl {return (ret)0;} ++#define SWP_DECLARE_VOID_FUNC(decl) static inline void decl { } ++#endif ++ ++struct swap_info_struct; ++SWP_DECLARE_FUNC(int, ub_swap_init(struct swap_info_struct *si, pgoff_t n)) ++SWP_DECLARE_VOID_FUNC(ub_swap_fini(struct swap_info_struct *si)) ++SWP_DECLARE_VOID_FUNC(ub_swapentry_inc(struct swap_info_struct *si, pgoff_t n, ++ struct user_beancounter *ub)) ++SWP_DECLARE_VOID_FUNC(ub_swapentry_dec(struct swap_info_struct *si, pgoff_t n)) ++ ++#ifdef CONFIG_USER_RESOURCE ++#define ub_unmap_inc(mm) do { \ ++ (mm)->mm_ub->ub_stat[smp_processor_id()].unmap++; \ ++ } while (0) ++#define ub_swapin_inc(mm) do { \ ++ (mm)->mm_ub->ub_stat[smp_processor_id()].swapin++; \ ++ } while (0) ++#else ++#define ub_unmap_inc(mm) do { } while (0) ++#define ub_swapin_inc(mm) do { } while (0) ++#endif +diff -upr linux-2.6.16.orig/init/calibrate.c linux-2.6.16-026test015/init/calibrate.c +--- linux-2.6.16.orig/init/calibrate.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/init/calibrate.c 
2006-07-04 14:41:38.000000000 +0400 +@@ -7,6 +7,7 @@ + #include <linux/sched.h> + #include <linux/delay.h> + #include <linux/init.h> ++#include <linux/module.h> + + #include <asm/timex.h> + +@@ -105,6 +106,60 @@ static unsigned long __devinit calibrate + static unsigned long __devinit calibrate_delay_direct(void) {return 0;} + #endif + ++unsigned long cycles_per_jiffy, cycles_per_clock; ++ ++static __devinit void calibrate_cycles(void) ++{ ++ unsigned long ticks; ++ cycles_t time; ++ ++ ticks = jiffies; ++ while (ticks == jiffies) ++ /* nothing */; ++ time = get_cycles(); ++ ticks = jiffies; ++ while (ticks == jiffies) ++ /* nothing */; ++ ++ time = get_cycles() - time; ++ cycles_per_jiffy = time; ++ if ((time >> 32) != 0) { ++ printk("CPU too fast! timings are incorrect\n"); ++ cycles_per_jiffy = -1; ++ } ++} ++ ++EXPORT_SYMBOL(cycles_per_jiffy); ++EXPORT_SYMBOL(cycles_per_clock); ++ ++static __devinit void calc_cycles_per_jiffy(void) ++{ ++#if defined(__i386__) ++ extern unsigned long fast_gettimeoffset_quotient; ++ unsigned long low, high; ++ ++ if (fast_gettimeoffset_quotient != 0) { ++ __asm__("divl %2" ++ :"=a" (low), "=d" (high) ++ :"r" (fast_gettimeoffset_quotient), ++ "0" (0), "1" (1000000/HZ)); ++ ++ cycles_per_jiffy = low; ++ } ++#endif ++ if (cycles_per_jiffy == 0) ++ calibrate_cycles(); ++ ++ if (cycles_per_jiffy == 0) { ++ printk(KERN_WARNING "Cycles are stuck! " ++ "Some VPS statistics will not be available."); ++ /* to prevent division by zero in cycles_to_(clocks|jiffies) */ ++ cycles_per_jiffy = 1; ++ cycles_per_clock = 1; ++ } else ++ cycles_per_clock = cycles_per_jiffy * (HZ / CLOCKS_PER_SEC); ++} ++ + /* + * This is the number of bits of precision for the loops_per_jiffy. Each + * bit takes on average 1.5/HZ seconds. 
This (like the original) is a little +@@ -170,4 +225,5 @@ void __devinit calibrate_delay(void) + loops_per_jiffy); + } + ++ calc_cycles_per_jiffy(); + } +diff -upr linux-2.6.16.orig/init/main.c linux-2.6.16-026test015/init/main.c +--- linux-2.6.16.orig/init/main.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/init/main.c 2006-07-04 14:41:39.000000000 +0400 +@@ -48,6 +48,8 @@ + #include <linux/mempolicy.h> + #include <linux/key.h> + ++#include <ub/beancounter.h> ++ + #include <asm/io.h> + #include <asm/bugs.h> + #include <asm/setup.h> +@@ -80,6 +82,7 @@ extern void sbus_init(void); + extern void sysctl_init(void); + extern void signals_init(void); + extern void buffer_init(void); ++extern void fairsched_init_late(void); + extern void pidhash_init(void); + extern void pidmap_init(void); + extern void prio_tree_init(void); +@@ -104,6 +107,24 @@ extern void tc_init(void); + enum system_states system_state; + EXPORT_SYMBOL(system_state); + ++#ifdef CONFIG_VE ++extern void init_ve_system(void); ++extern void prepare_ve0_process(struct task_struct *tsk); ++extern void prepare_ve0_proc_root(void); ++extern void prepare_ve0_sysctl(void); ++#else ++#define init_ve_system() do { } while (0) ++#define prepare_ve0_process(tsk) do { } while (0) ++#define prepare_ve0_proc_root() do { } while (0) ++#define prepare_ve0_sysctl() do { } while (0) ++#endif ++ ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++extern void prepare_ve0_loopback(void); ++#else ++#define prepare_ve0_loopback() do { } while (0) ++#endif ++ + /* + * Boot command-line arguments + */ +@@ -447,6 +468,10 @@ asmlinkage void __init start_kernel(void + * enable them + */ + lock_kernel(); ++ /* ++ * Prepare ub0 to account early allocations if any ++ */ ++ ub_init_ub0(); + page_address_init(); + printk(KERN_NOTICE); + printk(linux_banner); +@@ -459,6 +484,8 @@ asmlinkage void __init start_kernel(void + */ + smp_prepare_boot_cpu(); + ++ prepare_ve0_process(&init_task); ++ + /* + * 
Set up the scheduler prior starting any interrupts (such as the + * timer interrupt). Full topology setup happens at smp_init() +@@ -524,6 +551,7 @@ asmlinkage void __init start_kernel(void + #endif + fork_init(num_physpages); + proc_caches_init(); ++ ub_init_cache(num_physpages); + buffer_init(); + unnamed_dev_init(); + key_init(); +@@ -534,7 +562,10 @@ asmlinkage void __init start_kernel(void + /* rootfs populating might need page-writeback */ + page_writeback_init(); + #ifdef CONFIG_PROC_FS ++ prepare_ve0_proc_root(); ++ prepare_ve0_sysctl(); + proc_root_init(); ++ ub_init_proc(); + #endif + cpuset_init(); + +@@ -542,6 +573,10 @@ asmlinkage void __init start_kernel(void + + acpi_early_init(); /* before LAPIC and SMP init */ + ++#ifdef CONFIG_USER_RSS_ACCOUNTING ++ ub_init_pbc(); ++#endif ++ + /* Do the rest non-__init'ed, we're now alive */ + rest_init(); + } +@@ -603,6 +638,9 @@ static void __init do_initcalls(void) + */ + static void __init do_basic_setup(void) + { ++ prepare_ve0_loopback(); ++ init_ve_system(); ++ + /* drivers will send hotplug events */ + init_workqueues(); + usermodehelper_init(); +@@ -618,7 +656,7 @@ static void __init do_basic_setup(void) + static void do_pre_smp_initcalls(void) + { + extern int spawn_ksoftirqd(void); +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_VCPU) + extern int migration_init(void); + + migration_init(); +@@ -674,6 +712,12 @@ static int init(void * unused) + + fixup_cpu_present_map(); + smp_init(); ++ ++ /* ++ * This should be done after all cpus are known to ++ * be online. smp_init gives us confidence in it. 
++ */ ++ fairsched_init_late(); + sched_init_smp(); + + cpuset_init_smp(); +diff -upr linux-2.6.16.orig/init/version.c linux-2.6.16-026test015/init/version.c +--- linux-2.6.16.orig/init/version.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/init/version.c 2006-07-04 14:41:38.000000000 +0400 +@@ -28,6 +28,12 @@ struct new_utsname system_utsname = { + + EXPORT_SYMBOL(system_utsname); + ++struct new_utsname virt_utsname = { ++ /* we need only this field */ ++ .release = UTS_RELEASE, ++}; ++EXPORT_SYMBOL(virt_utsname); ++ + const char linux_banner[] = + "Linux version " UTS_RELEASE " (" LINUX_COMPILE_BY "@" + LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION "\n"; +diff -upr linux-2.6.16.orig/ipc/mqueue.c linux-2.6.16-026test015/ipc/mqueue.c +--- linux-2.6.16.orig/ipc/mqueue.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/ipc/mqueue.c 2006-07-04 14:41:37.000000000 +0400 +@@ -639,7 +639,8 @@ static int oflag2acc[O_ACCMODE] = { MAY_ + return ERR_PTR(-EINVAL); + } + +- if (permission(dentry->d_inode, oflag2acc[oflag & O_ACCMODE], NULL)) { ++ if (permission(dentry->d_inode, oflag2acc[oflag & O_ACCMODE], ++ NULL, NULL)) { + dput(dentry); + mntput(mqueue_mnt); + return ERR_PTR(-EACCES); +diff -upr linux-2.6.16.orig/ipc/msg.c linux-2.6.16-026test015/ipc/msg.c +--- linux-2.6.16.orig/ipc/msg.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/ipc/msg.c 2006-07-04 14:41:39.000000000 +0400 +@@ -88,6 +88,45 @@ void __init msg_init (void) + sysvipc_msg_proc_show); + } + ++#ifdef CONFIG_VE ++void __init prepare_msg(void) ++{ ++ get_ve0()->_msg_ids = &msg_ids; ++ get_ve0()->_msg_ctlmax = msg_ctlmax; ++ get_ve0()->_msg_ctlmnb = msg_ctlmnb; ++ get_ve0()->_msg_ctlmni = msg_ctlmni; ++} ++ ++#define msg_ids (*(get_exec_env()->_msg_ids)) ++#define msg_ctlmax (get_exec_env()->_msg_ctlmax) ++#define msg_ctlmnb (get_exec_env()->_msg_ctlmnb) ++#define msg_ctlmni (get_exec_env()->_msg_ctlmni) ++ ++void init_ve_ipc_msg(void) ++{ ++ 
msg_ctlmax = MSGMAX; ++ msg_ctlmnb = MSGMNB; ++ msg_ctlmni = MSGMNI; ++ ipc_init_ids(&msg_ids, MSGMNI); ++} ++ ++void cleanup_ve_ipc_msg(void) ++{ ++ int i; ++ struct msg_queue *msq; ++ ++ down(&msg_ids.sem); ++ for (i = 0; i <= msg_ids.max_id; i++) { ++ msq = msg_lock(i); ++ if (msq == NULL) ++ continue; ++ ++ freeque(msq, i); ++ } ++ up(&msg_ids.sem); ++} ++#endif ++ + static int newque (key_t key, int msgflg) + { + int id; +@@ -108,7 +147,7 @@ static int newque (key_t key, int msgflg + return retval; + } + +- id = ipc_addid(&msg_ids, &msq->q_perm, msg_ctlmni); ++ id = ipc_addid(&msg_ids, &msq->q_perm, msg_ctlmni, -1); + if(id == -1) { + security_msg_queue_free(msq); + ipc_rcu_putref(msq); +@@ -450,7 +489,7 @@ asmlinkage long sys_msgctl (int msqid, i + ipcp = &msq->q_perm; + err = -EPERM; + if (current->euid != ipcp->cuid && +- current->euid != ipcp->uid && !capable(CAP_SYS_ADMIN)) ++ current->euid != ipcp->uid && !capable(CAP_VE_SYS_ADMIN)) + /* We _could_ check for CAP_CHOWN above, but we don't */ + goto out_unlock_up; + +@@ -540,7 +579,7 @@ static inline int pipelined_send(struct + msr->r_msg = ERR_PTR(-E2BIG); + } else { + msr->r_msg = NULL; +- msq->q_lrpid = msr->r_tsk->pid; ++ msq->q_lrpid = virt_pid(msr->r_tsk); + msq->q_rtime = get_seconds(); + wake_up_process(msr->r_tsk); + smp_mb(); +@@ -622,7 +661,7 @@ asmlinkage long sys_msgsnd (int msqid, s + } + } + +- msq->q_lspid = current->tgid; ++ msq->q_lspid = virt_tgid(current); + msq->q_stime = get_seconds(); + + if(!pipelined_send(msq,msg)) { +@@ -718,7 +757,7 @@ asmlinkage long sys_msgrcv (int msqid, s + list_del(&msg->m_list); + msq->q_qnum--; + msq->q_rtime = get_seconds(); +- msq->q_lrpid = current->tgid; ++ msq->q_lrpid = virt_tgid(current); + msq->q_cbytes -= msg->m_ts; + atomic_sub(msg->m_ts,&msg_bytes); + atomic_dec(&msg_hdrs); +@@ -833,3 +872,27 @@ static int sysvipc_msg_proc_show(struct + msq->q_ctime); + } + #endif ++ ++#if defined(CONFIG_VZ_CHECKPOINT) || defined(CONFIG_VZ_CHECKPOINT_MODULE) 
++#include <linux/module.h> ++ ++int sysvipc_walk_msg(int (*func)(int i, struct msg_queue*, void *), void *arg) ++{ ++ int i; ++ int err = 0; ++ struct msg_queue * msq; ++ ++ down(&msg_ids.sem); ++ for(i = 0; i <= msg_ids.max_id; i++) { ++ if ((msq = msg_lock(i)) == NULL) ++ continue; ++ err = func(msg_buildid(i,msq->q_perm.seq), msq, arg); ++ msg_unlock(msq); ++ if (err) ++ break; ++ } ++ up(&msg_ids.sem); ++ return err; ++} ++EXPORT_SYMBOL_GPL(sysvipc_walk_msg); ++#endif +diff -upr linux-2.6.16.orig/ipc/msgutil.c linux-2.6.16-026test015/ipc/msgutil.c +--- linux-2.6.16.orig/ipc/msgutil.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/ipc/msgutil.c 2006-07-04 14:41:37.000000000 +0400 +@@ -17,6 +17,8 @@ + + #include "util.h" + ++#include <ub/ub_mem.h> ++ + struct msg_msgseg { + struct msg_msgseg* next; + /* the next part of the message follows immediately */ +@@ -36,7 +38,7 @@ struct msg_msg *load_msg(const void __us + if (alen > DATALEN_MSG) + alen = DATALEN_MSG; + +- msg = (struct msg_msg *)kmalloc(sizeof(*msg) + alen, GFP_KERNEL); ++ msg = (struct msg_msg *)ub_kmalloc(sizeof(*msg) + alen, GFP_KERNEL); + if (msg == NULL) + return ERR_PTR(-ENOMEM); + +@@ -56,7 +58,7 @@ struct msg_msg *load_msg(const void __us + alen = len; + if (alen > DATALEN_SEG) + alen = DATALEN_SEG; +- seg = (struct msg_msgseg *)kmalloc(sizeof(*seg) + alen, ++ seg = (struct msg_msgseg *)ub_kmalloc(sizeof(*seg) + alen, + GFP_KERNEL); + if (seg == NULL) { + err = -ENOMEM; +diff -upr linux-2.6.16.orig/ipc/sem.c linux-2.6.16-026test015/ipc/sem.c +--- linux-2.6.16.orig/ipc/sem.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/ipc/sem.c 2006-07-04 14:41:39.000000000 +0400 +@@ -78,6 +78,7 @@ + #include <asm/uaccess.h> + #include "util.h" + ++#include <ub/ub_mem.h> + + #define sem_lock(id) ((struct sem_array*)ipc_lock(&sem_ids,id)) + #define sem_unlock(sma) ipc_unlock(&(sma)->sem_perm) +@@ -88,7 +89,7 @@ + ipc_buildid(&sem_ids, id, seq) + static struct ipc_ids 
sem_ids; + +-static int newary (key_t, int, int); ++static int newary (key_t, int, int, int); + static void freeary (struct sem_array *sma, int id); + #ifdef CONFIG_PROC_FS + static int sysvipc_sem_proc_show(struct seq_file *s, void *it); +@@ -124,6 +125,48 @@ void __init sem_init (void) + sysvipc_sem_proc_show); + } + ++#ifdef CONFIG_VE ++void __init prepare_sem(void) ++{ ++ get_ve0()->_sem_ids = &sem_ids; ++ get_ve0()->_used_sems = used_sems; ++ get_ve0()->_sem_ctls[0] = sem_ctls[0]; ++ get_ve0()->_sem_ctls[1] = sem_ctls[1]; ++ get_ve0()->_sem_ctls[2] = sem_ctls[2]; ++ get_ve0()->_sem_ctls[3] = sem_ctls[3]; ++} ++ ++#define sem_ids (*(get_exec_env()->_sem_ids)) ++#define used_sems (get_exec_env()->_used_sems) ++#define sem_ctls (get_exec_env()->_sem_ctls) ++ ++void init_ve_ipc_sem(void) ++{ ++ used_sems = 0; ++ sem_ctls[0] = SEMMSL; ++ sem_ctls[1] = SEMMNS; ++ sem_ctls[2] = SEMOPM; ++ sem_ctls[3] = SEMMNI; ++ ipc_init_ids(&sem_ids, SEMMNI); ++} ++ ++void cleanup_ve_ipc_sem(void) ++{ ++ int i; ++ struct sem_array *sma; ++ ++ down(&sem_ids.sem); ++ for (i = 0; i <= sem_ids.max_id; i++) { ++ sma = sem_lock(i); ++ if (sma == NULL) ++ continue; ++ ++ freeary(sma, i); ++ } ++ up(&sem_ids.sem); ++} ++#endif ++ + /* + * Lockless wakeup algorithm: + * Without the check/retry algorithm a lockless wakeup is possible: +@@ -158,7 +201,7 @@ void __init sem_init (void) + */ + #define IN_WAKEUP 1 + +-static int newary (key_t key, int nsems, int semflg) ++static int newary (key_t key, int semid, int nsems, int semflg) + { + int id; + int retval; +@@ -187,7 +230,7 @@ static int newary (key_t key, int nsems, + return retval; + } + +- id = ipc_addid(&sem_ids, &sma->sem_perm, sc_semmni); ++ id = ipc_addid(&sem_ids, &sma->sem_perm, sc_semmni, semid); + if(id == -1) { + security_sem_free(sma); + ipc_rcu_putref(sma); +@@ -217,12 +260,12 @@ asmlinkage long sys_semget (key_t key, i + down(&sem_ids.sem); + + if (key == IPC_PRIVATE) { +- err = newary(key, nsems, semflg); ++ err = 
newary(key, -1, nsems, semflg); + } else if ((id = ipc_findkey(&sem_ids, key)) == -1) { /* key not used */ + if (!(semflg & IPC_CREAT)) + err = -ENOENT; + else +- err = newary(key, nsems, semflg); ++ err = newary(key, -1, nsems, semflg); + } else if (semflg & IPC_CREAT && semflg & IPC_EXCL) { + err = -EEXIST; + } else { +@@ -743,7 +786,7 @@ static int semctl_main(int semid, int se + for (un = sma->undo; un; un = un->id_next) + un->semadj[semnum] = 0; + curr->semval = val; +- curr->sempid = current->tgid; ++ curr->sempid = virt_tgid(current); + sma->sem_ctime = get_seconds(); + /* maybe some queued-up processes were waiting for this */ + update_queue(sma); +@@ -823,7 +866,7 @@ static int semctl_down(int semid, int se + ipcp = &sma->sem_perm; + + if (current->euid != ipcp->cuid && +- current->euid != ipcp->uid && !capable(CAP_SYS_ADMIN)) { ++ current->euid != ipcp->uid && !capable(CAP_VE_SYS_ADMIN)) { + err=-EPERM; + goto out_unlock; + } +@@ -944,7 +987,8 @@ static inline int get_undo_list(struct s + undo_list = current->sysvsem.undo_list; + if (!undo_list) { + size = sizeof(struct sem_undo_list); +- undo_list = (struct sem_undo_list *) kmalloc(size, GFP_KERNEL); ++ undo_list = (struct sem_undo_list *) ub_kmalloc(size, ++ GFP_KERNEL); + if (undo_list == NULL) + return -ENOMEM; + memset(undo_list, 0, size); +@@ -1008,7 +1052,8 @@ static struct sem_undo *find_undo(int se + ipc_rcu_getref(sma); + sem_unlock(sma); + +- new = (struct sem_undo *) kmalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, GFP_KERNEL); ++ new = (struct sem_undo *) ub_kmalloc(sizeof(struct sem_undo) + ++ sizeof(short)*nsems, GFP_KERNEL); + if (!new) { + ipc_lock_by_ptr(&sma->sem_perm); + ipc_rcu_putref(sma); +@@ -1066,7 +1111,7 @@ asmlinkage long sys_semtimedop(int semid + if (nsops > sc_semopm) + return -E2BIG; + if(nsops > SEMOPM_FAST) { +- sops = kmalloc(sizeof(*sops)*nsops,GFP_KERNEL); ++ sops = ub_kmalloc(sizeof(*sops)*nsops,GFP_KERNEL); + if(sops==NULL) + return -ENOMEM; + } +@@ -1150,7 
+1195,7 @@ retry_undos: + queue.sops = sops; + queue.nsops = nsops; + queue.undo = un; +- queue.pid = current->tgid; ++ queue.pid = virt_tgid(current); + queue.id = semid; + queue.alter = alter; + if (alter) +@@ -1320,7 +1365,7 @@ found: + sem->semval = 0; + if (sem->semval > SEMVMX) + sem->semval = SEMVMX; +- sem->sempid = current->tgid; ++ sem->sempid = virt_tgid(current); + } + } + sma->sem_otime = get_seconds(); +@@ -1351,3 +1396,48 @@ static int sysvipc_sem_proc_show(struct + sma->sem_ctime); + } + #endif ++ ++#if defined(CONFIG_VZ_CHECKPOINT) || defined(CONFIG_VZ_CHECKPOINT_MODULE) ++#include <linux/module.h> ++ ++int sysvipc_setup_sem(key_t key, int semid, size_t size, int semflg) ++{ ++ int err = 0; ++ struct sem_array *sma; ++ ++ down(&sem_ids.sem); ++ sma = sem_lock(semid); ++ if (!sma) { ++ err = newary(key, semid, size, semflg); ++ if (err >= 0) ++ sma = sem_lock(semid); ++ } ++ if (sma) ++ sem_unlock(sma); ++ up(&sem_ids.sem); ++ ++ return err > 0 ? 0 : err; ++} ++EXPORT_SYMBOL_GPL(sysvipc_setup_sem); ++ ++int sysvipc_walk_sem(int (*func)(int i, struct sem_array*, void *), void *arg) ++{ ++ int i; ++ int err = 0; ++ struct sem_array *sma; ++ ++ down(&sem_ids.sem); ++ for (i = 0; i <= sem_ids.max_id; i++) { ++ if ((sma = sem_lock(i)) == NULL) ++ continue; ++ err = func(sem_buildid(i,sma->sem_perm.seq), sma, arg); ++ sem_unlock(sma); ++ if (err) ++ break; ++ } ++ up(&sem_ids.sem); ++ return err; ++} ++EXPORT_SYMBOL_GPL(sysvipc_walk_sem); ++EXPORT_SYMBOL_GPL(exit_sem); ++#endif +diff -upr linux-2.6.16.orig/ipc/shm.c linux-2.6.16-026test015/ipc/shm.c +--- linux-2.6.16.orig/ipc/shm.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/ipc/shm.c 2006-07-04 14:41:39.000000000 +0400 +@@ -30,9 +30,13 @@ + #include <linux/capability.h> + #include <linux/ptrace.h> + #include <linux/seq_file.h> ++#include <linux/shmem_fs.h> + + #include <asm/uaccess.h> + ++#include <ub/beancounter.h> ++#include <ub/ub_vmpages.h> ++ + #include "util.h" + + static 
struct file_operations shm_file_operations; +@@ -46,9 +50,11 @@ static struct ipc_ids shm_ids; + #define shm_buildid(id, seq) \ + ipc_buildid(&shm_ids, id, seq) + +-static int newseg (key_t key, int shmflg, size_t size); ++static int newseg (key_t key, int shmid, int shmflg, size_t size); + static void shm_open (struct vm_area_struct *shmd); + static void shm_close (struct vm_area_struct *shmd); ++static void shm_destroy (struct shmid_kernel *shmd); ++static void do_shm_rmid(struct shmid_kernel *shp); + #ifdef CONFIG_PROC_FS + static int sysvipc_shm_proc_show(struct seq_file *s, void *it); + #endif +@@ -68,6 +74,68 @@ void __init shm_init (void) + sysvipc_shm_proc_show); + } + ++#ifdef CONFIG_VE ++void __init prepare_shm(void) ++{ ++ get_ve0()->_shm_ids = &shm_ids; ++ get_ve0()->_shm_ctlmax = shm_ctlmax; ++ get_ve0()->_shm_ctlall = shm_ctlall; ++ get_ve0()->_shm_ctlmni = shm_ctlmni; ++ get_ve0()->_shm_tot = shm_tot; ++} ++ ++#define shm_ids (*(get_exec_env()->_shm_ids)) ++#define shm_ctlmax (get_exec_env()->_shm_ctlmax) ++#define shm_ctlall (get_exec_env()->_shm_ctlall) ++#define shm_ctlmni (get_exec_env()->_shm_ctlmni) ++#define shm_total (get_exec_env()->_shm_tot) ++ ++void init_ve_ipc_shm(void) ++{ ++ shm_ctlmax = SHMMAX; ++ shm_ctlall = SHMALL; ++ shm_ctlmni = SHMMNI; ++ shm_total = 0; ++ ipc_init_ids(&shm_ids, 1); ++} ++ ++void cleanup_ve_ipc_shm(void) ++{ ++ int i; ++ struct shmid_kernel *shp; ++ ++ down(&shm_ids.sem); ++ for (i = 0; i <= shm_ids.max_id; i++) { ++ shp = shm_lock(i); ++ if (shp == NULL) ++ continue; ++ ++ do_shm_rmid(shp); ++ } ++ up(&shm_ids.sem); ++} ++#define sb_ve(sb) VE_OWNER_FSTYPE(sb->s_type) ++#define shm_total_sb(sb) (&sb_ve(sb)->_shm_tot) ++#define shm_lock_sb(id, sb) ((struct shmid_kernel *) \ ++ ipc_lock(sb_ve(sb)->_shm_ids, id)) ++#else ++/* renamed since there is a struct field named shm_tot */ ++#define shm_total shm_tot ++#define shm_total_sb(sb) (&shm_tot) ++#define shm_lock_sb(id, sb) shm_lock(id) ++#endif ++ ++static void 
do_shm_rmid(struct shmid_kernel *shp) ++{ ++ if (shp->shm_nattch){ ++ shp->shm_perm.mode |= SHM_DEST; ++ /* Do not find it any more */ ++ shp->shm_perm.key = IPC_PRIVATE; ++ shm_unlock(shp); ++ } else ++ shm_destroy (shp); ++} ++ + static inline int shm_checkid(struct shmid_kernel *s, int id) + { + if (ipc_checkid(&shm_ids,&s->shm_perm,id)) +@@ -75,25 +143,25 @@ static inline int shm_checkid(struct shm + return 0; + } + +-static inline struct shmid_kernel *shm_rmid(int id) ++static inline struct shmid_kernel *shm_rmid(struct ipc_ids *ids, int id) + { +- return (struct shmid_kernel *)ipc_rmid(&shm_ids,id); ++ return (struct shmid_kernel *)ipc_rmid(ids,id); + } + +-static inline int shm_addid(struct shmid_kernel *shp) ++static inline int shm_addid(struct shmid_kernel *shp, int reqid) + { +- return ipc_addid(&shm_ids, &shp->shm_perm, shm_ctlmni); ++ return ipc_addid(&shm_ids, &shp->shm_perm, shm_ctlmni, reqid); + } + + + +-static inline void shm_inc (int id) { ++static inline void shm_inc(int id, struct super_block *sb) { + struct shmid_kernel *shp; + +- if(!(shp = shm_lock(id))) ++ if(!(shp = shm_lock_sb(id, sb))) + BUG(); + shp->shm_atim = get_seconds(); +- shp->shm_lprid = current->tgid; ++ shp->shm_lprid = virt_tgid(current); + shp->shm_nattch++; + shm_unlock(shp); + } +@@ -101,7 +169,50 @@ static inline void shm_inc (int id) { + /* This is called by fork, once for every shm attach. 
*/ + static void shm_open (struct vm_area_struct *shmd) + { +- shm_inc (shmd->vm_file->f_dentry->d_inode->i_ino); ++ shm_inc(shmd->vm_file->f_dentry->d_inode->i_ino, ++ shmd->vm_file->f_dentry->d_inode->i_sb); ++} ++ ++static int shmem_lock(struct shmid_kernel *shp, int lock, ++ struct user_struct *user) ++{ ++ struct file *file = shp->shm_file; ++ struct inode *inode = file->f_dentry->d_inode; ++ struct shmem_inode_info *info = SHMEM_I(inode); ++ unsigned long size; ++ ++ size = shp->shm_segsz + PAGE_SIZE - 1; ++ ++#ifdef CONFIG_SHMEM ++ spin_lock(&info->lock); ++ if (lock && !(info->flags & VM_LOCKED)) { ++ if (ub_lockedshm_charge(info, size) < 0) ++ goto out_ch; ++ ++ if (!user_shm_lock(inode->i_size, user)) ++ goto out_user; ++ info->flags |= VM_LOCKED; ++ } ++ if (!lock && (info->flags & VM_LOCKED) && user) { ++ ub_lockedshm_uncharge(info, size); ++ user_shm_unlock(inode->i_size, user); ++ info->flags &= ~VM_LOCKED; ++ } ++ spin_unlock(&info->lock); ++ return 0; ++ ++out_user: ++ ub_lockedshm_uncharge(info, size); ++out_ch: ++ spin_unlock(&info->lock); ++ return -ENOMEM; ++#else ++ if (lock && ub_lockedshm_charge(info, size)) ++ return -ENOMEM; ++ if (!lock) ++ ub_lockedshm_uncharge(info, size); ++ return 0; ++#endif + } + + /* +@@ -114,15 +225,24 @@ static void shm_open (struct vm_area_str + */ + static void shm_destroy (struct shmid_kernel *shp) + { +- shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT; +- shm_rmid (shp->id); ++ int numpages, *shm_totalp; ++ struct file *f; ++ struct super_block *sb; ++ ++ f = shp->shm_file; ++ sb = f->f_dentry->d_inode->i_sb; ++ numpages = (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT; ++ shm_totalp = shm_total_sb(sb); ++ *shm_totalp -= numpages; ++ ++ shm_rmid (shp->_shm_ids, shp->id); + shm_unlock(shp); + if (!is_file_hugepages(shp->shm_file)) +- shmem_lock(shp->shm_file, 0, shp->mlock_user); ++ shmem_lock(shp, 0, shp->mlock_user); + else + user_shm_unlock(shp->shm_file->f_dentry->d_inode->i_size, + 
shp->mlock_user); +- fput (shp->shm_file); ++ fput(f); + security_shm_free(shp); + ipc_rcu_putref(shp); + } +@@ -138,12 +258,24 @@ static void shm_close (struct vm_area_st + struct file * file = shmd->vm_file; + int id = file->f_dentry->d_inode->i_ino; + struct shmid_kernel *shp; ++ struct super_block *sb; ++ struct ipc_ids *ids; ++#ifdef CONFIG_VE ++ struct ve_struct *ve; ++ ++ sb = file->f_dentry->d_inode->i_sb; ++ ve = get_ve(sb_ve(sb)); ++ ids = ve->_shm_ids; ++#else ++ sb = file->f_dentry->d_inode->i_sb; ++ ids = &shm_ids; ++#endif + +- down (&shm_ids.sem); ++ down (&ids->sem); + /* remove from the list of attaches of the shm segment */ +- if(!(shp = shm_lock(id))) ++ if(!(shp = shm_lock_sb(id, sb))) + BUG(); +- shp->shm_lprid = current->tgid; ++ shp->shm_lprid = virt_tgid(current); + shp->shm_dtim = get_seconds(); + shp->shm_nattch--; + if(shp->shm_nattch == 0 && +@@ -151,7 +283,10 @@ static void shm_close (struct vm_area_st + shm_destroy (shp); + else + shm_unlock(shp); +- up (&shm_ids.sem); ++ up(&ids->sem); ++#ifdef CONFIG_VE ++ put_ve(ve); ++#endif + } + + static int shm_mmap(struct file * file, struct vm_area_struct * vma) +@@ -161,7 +296,10 @@ static int shm_mmap(struct file * file, + ret = shmem_mmap(file, vma); + if (ret == 0) { + vma->vm_ops = &shm_vm_ops; +- shm_inc(file->f_dentry->d_inode->i_ino); ++ if (!(vma->vm_flags & VM_WRITE)) ++ vma->vm_flags &= ~VM_MAYWRITE; ++ shm_inc(file->f_dentry->d_inode->i_ino, ++ file->f_dentry->d_inode->i_sb); + } + + return ret; +@@ -184,19 +322,19 @@ static struct vm_operations_struct shm_v + #endif + }; + +-static int newseg (key_t key, int shmflg, size_t size) ++static int newseg (key_t key, int shmid, int shmflg, size_t size) + { + int error; + struct shmid_kernel *shp; + int numpages = (size + PAGE_SIZE -1) >> PAGE_SHIFT; + struct file * file; +- char name[13]; ++ char name[26]; + int id; + + if (size < SHMMIN || size > shm_ctlmax) + return -EINVAL; + +- if (shm_tot + numpages >= shm_ctlall) ++ if (shm_total + 
numpages >= shm_ctlall) + return -ENOSPC; + + shp = ipc_rcu_alloc(sizeof(*shp)); +@@ -227,7 +365,11 @@ static int newseg (key_t key, int shmflg + if ((shmflg & SHM_NORESERVE) && + sysctl_overcommit_memory != OVERCOMMIT_NEVER) + acctflag = 0; ++#ifdef CONFIG_VE ++ sprintf (name, "VE%d.SYSV%08x", get_exec_env()->veid, key); ++#else + sprintf (name, "SYSV%08x", key); ++#endif + file = shmem_file_setup(name, size, acctflag); + } + error = PTR_ERR(file); +@@ -235,17 +377,18 @@ static int newseg (key_t key, int shmflg + goto no_file; + + error = -ENOSPC; +- id = shm_addid(shp); ++ id = shm_addid(shp, shmid); + if(id == -1) + goto no_id; + +- shp->shm_cprid = current->tgid; ++ shp->shm_cprid = virt_tgid(current); + shp->shm_lprid = 0; + shp->shm_atim = shp->shm_dtim = 0; + shp->shm_ctim = get_seconds(); + shp->shm_segsz = size; + shp->shm_nattch = 0; + shp->id = shm_buildid(id,shp->shm_perm.seq); ++ shp->_shm_ids = &shm_ids; + shp->shm_file = file; + file->f_dentry->d_inode->i_ino = shp->id; + +@@ -253,7 +396,7 @@ static int newseg (key_t key, int shmflg + if (!(shmflg & SHM_HUGETLB)) + file->f_op = &shm_file_operations; + +- shm_tot += numpages; ++ shm_total += numpages; + shm_unlock(shp); + return shp->id; + +@@ -272,12 +415,12 @@ asmlinkage long sys_shmget (key_t key, s + + down(&shm_ids.sem); + if (key == IPC_PRIVATE) { +- err = newseg(key, shmflg, size); ++ err = newseg(key, -1, shmflg, size); + } else if ((id = ipc_findkey(&shm_ids, key)) == -1) { + if (!(shmflg & IPC_CREAT)) + err = -ENOENT; + else +- err = newseg(key, shmflg, size); ++ err = newseg(key, -1, shmflg, size); + } else if ((shmflg & IPC_CREAT) && (shmflg & IPC_EXCL)) { + err = -EEXIST; + } else { +@@ -470,7 +613,7 @@ asmlinkage long sys_shmctl (int shmid, i + down(&shm_ids.sem); + shm_info.used_ids = shm_ids.in_use; + shm_get_stat (&shm_info.shm_rss, &shm_info.shm_swp); +- shm_info.shm_tot = shm_tot; ++ shm_info.shm_tot = shm_total; + shm_info.swap_attempts = 0; + shm_info.swap_successes = 0; + err = 
shm_ids.max_id; +@@ -557,14 +700,14 @@ asmlinkage long sys_shmctl (int shmid, i + if(cmd==SHM_LOCK) { + struct user_struct * user = current->user; + if (!is_file_hugepages(shp->shm_file)) { +- err = shmem_lock(shp->shm_file, 1, user); ++ err = shmem_lock(shp, 1, user); + if (!err) { + shp->shm_perm.mode |= SHM_LOCKED; + shp->mlock_user = user; + } + } + } else if (!is_file_hugepages(shp->shm_file)) { +- shmem_lock(shp->shm_file, 0, shp->mlock_user); ++ shmem_lock(shp, 0, shp->mlock_user); + shp->shm_perm.mode &= ~SHM_LOCKED; + shp->mlock_user = NULL; + } +@@ -594,7 +737,7 @@ asmlinkage long sys_shmctl (int shmid, i + + if (current->euid != shp->shm_perm.uid && + current->euid != shp->shm_perm.cuid && +- !capable(CAP_SYS_ADMIN)) { ++ !capable(CAP_VE_SYS_ADMIN)) { + err=-EPERM; + goto out_unlock_up; + } +@@ -603,13 +746,7 @@ asmlinkage long sys_shmctl (int shmid, i + if (err) + goto out_unlock_up; + +- if (shp->shm_nattch){ +- shp->shm_perm.mode |= SHM_DEST; +- /* Do not find it any more */ +- shp->shm_perm.key = IPC_PRIVATE; +- shm_unlock(shp); +- } else +- shm_destroy (shp); ++ do_shm_rmid(shp); + up(&shm_ids.sem); + goto out; + } +@@ -633,7 +770,7 @@ asmlinkage long sys_shmctl (int shmid, i + err=-EPERM; + if (current->euid != shp->shm_perm.uid && + current->euid != shp->shm_perm.cuid && +- !capable(CAP_SYS_ADMIN)) { ++ !capable(CAP_VE_SYS_ADMIN)) { + goto out_unlock_up; + } + +@@ -916,3 +1053,55 @@ static int sysvipc_shm_proc_show(struct + shp->shm_ctim); + } + #endif ++ ++#if defined(CONFIG_VZ_CHECKPOINT) || defined(CONFIG_VZ_CHECKPOINT_MODULE) ++#include <linux/module.h> ++ ++struct file * sysvipc_setup_shm(key_t key, int shmid, size_t size, int shmflg) ++{ ++ struct shmid_kernel *shp; ++ struct file *file; ++ ++ down(&shm_ids.sem); ++ shp = shm_lock(shmid); ++ if (!shp) { ++ int err; ++ ++ err = newseg(key, shmid, shmflg, size); ++ file = ERR_PTR(err); ++ if (err < 0) ++ goto out; ++ shp = shm_lock(shmid); ++ } ++ file = ERR_PTR(-EINVAL); ++ if (shp) { ++ file 
= shp->shm_file; ++ get_file(file); ++ shm_unlock(shp); ++ } ++out: ++ up(&shm_ids.sem); ++ return file; ++} ++EXPORT_SYMBOL_GPL(sysvipc_setup_shm); ++ ++int sysvipc_walk_shm(int (*func)(struct shmid_kernel*, void *), void *arg) ++{ ++ int i; ++ int err = 0; ++ struct shmid_kernel* shp; ++ ++ down(&shm_ids.sem); ++ for(i = 0; i <= shm_ids.max_id; i++) { ++ if ((shp = shm_lock(i)) == NULL) ++ continue; ++ err = func(shp, arg); ++ shm_unlock(shp); ++ if (err) ++ break; ++ } ++ up(&shm_ids.sem); ++ return err; ++} ++EXPORT_SYMBOL_GPL(sysvipc_walk_shm); ++#endif +diff -upr linux-2.6.16.orig/ipc/util.c linux-2.6.16-026test015/ipc/util.c +--- linux-2.6.16.orig/ipc/util.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/ipc/util.c 2006-07-04 14:41:39.000000000 +0400 +@@ -13,6 +13,7 @@ + */ + + #include <linux/config.h> ++#include <linux/module.h> + #include <linux/mm.h> + #include <linux/shm.h> + #include <linux/init.h> +@@ -30,6 +31,8 @@ + + #include <asm/unistd.h> + ++#include <ub/ub_mem.h> ++ + #include "util.h" + + struct ipc_proc_iface { +@@ -65,7 +68,7 @@ __initcall(ipc_init); + * array itself. 
+ */ + +-void __init ipc_init_ids(struct ipc_ids* ids, int size) ++void __ve_init ipc_init_ids(struct ipc_ids* ids, int size) + { + int i; + sema_init(&ids->sem,1); +@@ -94,7 +97,21 @@ void __init ipc_init_ids(struct ipc_ids* + ids->entries->size = size; + for(i=0;i<size;i++) + ids->entries->p[i] = NULL; ++ ++ ids->owner_env = get_exec_env(); ++} ++ ++#ifdef CONFIG_VE ++static inline void ipc_free_ids(struct ipc_ids *ids) ++{ ++ if (ids == NULL) ++ return; ++ ++ if (ids->entries != &ids->nullentry) ++ ipc_rcu_putref(ids->entries); ++ kfree(ids); + } ++#endif + + #ifdef CONFIG_PROC_FS + static struct file_operations sysvipc_proc_fops; +@@ -182,8 +199,7 @@ static int grow_ary(struct ipc_ids* ids, + if(new == NULL) + return size; + new->size = newsize; +- memcpy(new->p, ids->entries->p, sizeof(struct kern_ipc_perm *)*size + +- sizeof(struct ipc_id_ary)); ++ memcpy(new->p, ids->entries->p, sizeof(struct kern_ipc_perm *)*size); + for(i=size;i<newsize;i++) { + new->p[i] = NULL; + } +@@ -213,10 +229,20 @@ static int grow_ary(struct ipc_ids* ids, + * Called with ipc_ids.sem held. 
+ */ + +-int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size) ++int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size, int reqid) + { + int id; + ++ if (reqid >= 0) { ++ id = reqid%SEQ_MULTIPLIER; ++ size = grow_ary(ids,id+1); ++ if (id >= size) ++ return -1; ++ if (ids->entries->p[id] == NULL) ++ goto found; ++ return -1; ++ } ++ + size = grow_ary(ids,size); + + /* +@@ -229,16 +255,21 @@ int ipc_addid(struct ipc_ids* ids, struc + } + return -1; + found: +- ids->in_use++; ++ if (ids->in_use++ == 0) ++ (void)get_ve(ids->owner_env); + if (id > ids->max_id) + ids->max_id = id; + + new->cuid = new->uid = current->euid; + new->gid = new->cgid = current->egid; + +- new->seq = ids->seq++; +- if(ids->seq > ids->seq_max) +- ids->seq = 0; ++ if (reqid >= 0) { ++ new->seq = reqid/SEQ_MULTIPLIER; ++ } else { ++ new->seq = ids->seq++; ++ if(ids->seq > ids->seq_max) ++ ids->seq = 0; ++ } + + spin_lock_init(&new->lock); + new->deleted = 0; +@@ -276,7 +307,8 @@ struct kern_ipc_perm* ipc_rmid(struct ip + ids->entries->p[lid] = NULL; + if(p==NULL) + BUG(); +- ids->in_use--; ++ if (--ids->in_use == 0) ++ put_ve(ids->owner_env); + + if (lid == ids->max_id) { + do { +@@ -302,9 +334,9 @@ void* ipc_alloc(int size) + { + void* out; + if(size > PAGE_SIZE) +- out = vmalloc(size); ++ out = ub_vmalloc(size); + else +- out = kmalloc(size, GFP_KERNEL); ++ out = ub_kmalloc(size, GFP_KERNEL); + return out; + } + +@@ -387,14 +419,14 @@ void* ipc_rcu_alloc(int size) + * workqueue if necessary (for vmalloc). 
+ */ + if (rcu_use_vmalloc(size)) { +- out = vmalloc(HDRLEN_VMALLOC + size); ++ out = ub_vmalloc(HDRLEN_VMALLOC + size); + if (out) { + out += HDRLEN_VMALLOC; + container_of(out, struct ipc_rcu_hdr, data)->is_vmalloc = 1; + container_of(out, struct ipc_rcu_hdr, data)->refcount = 1; + } + } else { +- out = kmalloc(HDRLEN_KMALLOC + size, GFP_KERNEL); ++ out = ub_kmalloc(HDRLEN_KMALLOC + size, GFP_KERNEL); + if (out) { + out += HDRLEN_KMALLOC; + container_of(out, struct ipc_rcu_hdr, data)->is_vmalloc = 0; +@@ -603,6 +635,71 @@ int ipc_checkid(struct ipc_ids* ids, str + return 0; + } + ++#ifdef CONFIG_VE ++void __init prepare_ipc(void) ++{ ++ prepare_msg(); ++ prepare_sem(); ++ prepare_shm(); ++} ++ ++int init_ve_ipc(struct ve_struct * envid) ++{ ++ envid->_msg_ids = kmalloc(sizeof(struct ipc_ids) + sizeof(void *), ++ GFP_KERNEL); ++ if (envid->_msg_ids == NULL) ++ goto out_nomem; ++ envid->_sem_ids = kmalloc(sizeof(struct ipc_ids) + sizeof(void *), ++ GFP_KERNEL); ++ if (envid->_sem_ids == NULL) ++ goto out_free_msg; ++ envid->_shm_ids = kmalloc(sizeof(struct ipc_ids) + sizeof(void *), ++ GFP_KERNEL); ++ if (envid->_shm_ids == NULL) ++ goto out_free_sem; ++ ++ init_ve_ipc_msg(); ++ init_ve_ipc_sem(); ++ init_ve_ipc_shm(); ++ return 0; ++ ++out_free_sem: ++ kfree(envid->_sem_ids); ++out_free_msg: ++ kfree(envid->_msg_ids); ++out_nomem: ++ return -ENOMEM; ++} ++ ++void ve_ipc_cleanup(void) ++{ ++ cleanup_ve_ipc_msg(); ++ cleanup_ve_ipc_sem(); ++ cleanup_ve_ipc_shm(); ++} ++ ++void ve_ipc_free(struct ve_struct *env) ++{ ++ ipc_free_ids(env->_msg_ids); ++ ipc_free_ids(env->_sem_ids); ++ ipc_free_ids(env->_shm_ids); ++ env->_msg_ids = NULL; ++ env->_sem_ids = NULL; ++ env->_shm_ids = NULL; ++} ++ ++void fini_ve_ipc(struct ve_struct *ptr) ++{ ++ ve_ipc_cleanup(); ++ ve_ipc_free(ptr); ++} ++ ++EXPORT_SYMBOL(init_ve_ipc); ++EXPORT_SYMBOL(ve_ipc_cleanup); ++EXPORT_SYMBOL(ve_ipc_free); ++EXPORT_SYMBOL(fini_ve_ipc); ++#endif /* CONFIG_VE */ ++ + #ifdef 
__ARCH_WANT_IPC_PARSE_VERSION + + +diff -upr linux-2.6.16.orig/ipc/util.h linux-2.6.16-026test015/ipc/util.h +--- linux-2.6.16.orig/ipc/util.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/ipc/util.h 2006-07-04 14:41:39.000000000 +0400 +@@ -15,6 +15,22 @@ void sem_init (void); + void msg_init (void); + void shm_init (void); + ++#ifdef CONFIG_VE ++void prepare_msg(void); ++void prepare_sem(void); ++void prepare_shm(void); ++void init_ve_ipc_msg(void); ++void init_ve_ipc_sem(void); ++void init_ve_ipc_shm(void); ++void cleanup_ve_ipc_msg(void); ++void cleanup_ve_ipc_sem(void); ++void cleanup_ve_ipc_shm(void); ++ ++#define __ve_init ++#else ++#define __ve_init __init ++#endif ++ + struct ipc_id_ary { + int size; + struct kern_ipc_perm *p[0]; +@@ -28,10 +44,11 @@ struct ipc_ids { + struct semaphore sem; + struct ipc_id_ary nullentry; + struct ipc_id_ary* entries; ++ struct ve_struct *owner_env; + }; + + struct seq_file; +-void __init ipc_init_ids(struct ipc_ids* ids, int size); ++void __ve_init ipc_init_ids(struct ipc_ids *ids, int size); + #ifdef CONFIG_PROC_FS + void __init ipc_init_proc_interface(const char *path, const char *header, + struct ipc_ids *ids, +@@ -42,7 +59,7 @@ void __init ipc_init_proc_interface(cons + + /* must be called with ids->sem acquired.*/ + int ipc_findkey(struct ipc_ids* ids, key_t key); +-int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size); ++int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size, int reqid); + + /* must be called with both locks acquired. 
*/ + struct kern_ipc_perm* ipc_rmid(struct ipc_ids* ids, int id); +diff -upr linux-2.6.16.orig/kernel/Kconfig.fairsched linux-2.6.16-026test015/kernel/Kconfig.fairsched +--- linux-2.6.16.orig/kernel/Kconfig.fairsched 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/Kconfig.fairsched 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,22 @@ ++config SCHED_VCPU ++ bool "VCPU scheduler support" ++ default n ++ help ++ VCPU scheduler support adds additional layer of abstraction ++ which allows to virtualize cpu notion and split physical cpus ++ and virtual cpus. This support allows to use CPU fair scheduler, ++ dynamically add/remove cpus to/from VPS and so on. ++ ++config FAIRSCHED ++ bool "Fair CPU scheduler (EXPERIMENTAL)" ++ depends on SCHED_VCPU ++ default SCHED_VCPU ++ help ++ Config option for Fair CPU scheduler (fairsched). ++ This option allows to group processes to scheduling nodes ++ which receive CPU proportional to their weight. ++ This is very important feature for process groups isolation and ++ QoS management. ++ ++ If unsure, say N. ++ +diff -upr linux-2.6.16.orig/kernel/Kconfig.openvz linux-2.6.16-026test015/kernel/Kconfig.openvz +--- linux-2.6.16.orig/kernel/Kconfig.openvz 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/Kconfig.openvz 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,61 @@ ++# Copyright (C) 2005 SWsoft ++# All rights reserved. ++# Licensing governed by "linux/COPYING.SWsoft" file. ++ ++menu "OpenVZ" ++ ++config VE ++ bool "Virtual Environment support" ++ default y ++ help ++ This option adds support of virtual Linux running on the original box ++ with fully supported virtual network driver, tty subsystem and ++ configurable access for hardware and other resources. ++ ++config VE_CALLS ++ tristate "VE calls interface" ++ depends on VE ++ default m ++ help ++ This option controls how to build vzmon code containing VE calls. 
++ By default it's build in module vzmon.o ++ ++config VE_NETDEV ++ tristate "VE networking" ++ depends on VE_CALLS ++ default m ++ help ++ This option controls whether to build VE networking code. ++ ++config VE_ETHDEV ++ tristate "Virtual ethernet device" ++ depends on VE_CALLS ++ default m ++ help ++ This option controls whether to build virtual ethernet device. ++ ++config VE_IPTABLES ++ bool "VE netfiltering" ++ depends on VE && VE_NETDEV && INET && NETFILTER ++ default y ++ help ++ This option controls whether to build VE netfiltering code. ++ ++config VZ_WDOG ++ tristate "VE watchdog module" ++ depends on VE_CALLS ++ default m ++ help ++ This option controls building of vzwdog module, which dumps ++ a lot of useful system info on console periodically. ++ ++config VZ_CHECKPOINT ++ tristate "Checkpointing & restoring Virtual Environments" ++ depends on SOFTWARE_SUSPEND && VE_CALLS ++ default m ++ help ++ This option adds two modules, "cpt" and "rst", which allow ++ to save a running Virtual Environment and restore it ++ on another host (live migration) or on the same host (checkpointing). ++ ++endmenu +diff -upr linux-2.6.16.orig/kernel/Makefile linux-2.6.16-026test015/kernel/Makefile +--- linux-2.6.16.orig/kernel/Makefile 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/kernel/Makefile 2006-07-04 14:41:39.000000000 +0400 +@@ -2,7 +2,8 @@ + # Makefile for the linux kernel. 
+ # + +-obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ ++obj-y = sched.o fairsched.o \ ++ fork.o exec_domain.o panic.o printk.o profile.o \ + exit.o itimer.o time.o softirq.o resource.o \ + sysctl.o capability.o ptrace.o timer.o user.o \ + signal.o sys.o kmod.o workqueue.o pid.o \ +@@ -10,6 +11,18 @@ obj-y = sched.o fork.o exec_domain.o + kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ + hrtimer.o + ++obj-y += ub/ ++ ++obj-$(CONFIG_VE) += ve.o ++obj-$(CONFIG_VE) += veowner.o ++obj-$(CONFIG_VE_CALLS) += vzdev.o ++obj-$(CONFIG_VZ_WDOG) += vzwdog.o ++obj-$(CONFIG_VE_CALLS) += vzmon.o ++ ++vzmon-objs = vecalls.o ++ ++obj-$(CONFIG_VZ_CHECKPOINT) += cpt/ ++ + obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o + obj-$(CONFIG_FUTEX) += futex.o + obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o +diff -upr linux-2.6.16.orig/kernel/audit.c linux-2.6.16-026test015/kernel/audit.c +--- linux-2.6.16.orig/kernel/audit.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/kernel/audit.c 2006-07-04 14:41:38.000000000 +0400 +@@ -372,6 +372,9 @@ static int audit_receive_msg(struct sk_b + uid_t loginuid; /* loginuid of sender */ + struct audit_sig_info sig_data; + ++ if (!ve_is_super(VE_OWNER_SKB(skb))) ++ return -ECONNREFUSED; ++ + err = audit_netlink_ok(NETLINK_CB(skb).eff_cap, msg_type); + if (err) + return err; +diff -upr linux-2.6.16.orig/kernel/auditsc.c linux-2.6.16-026test015/kernel/auditsc.c +--- linux-2.6.16.orig/kernel/auditsc.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/kernel/auditsc.c 2006-07-04 14:41:36.000000000 +0400 +@@ -966,11 +966,6 @@ void audit_syscall_entry(struct task_str + if (context->in_syscall) { + struct audit_context *newctx; + +-#if defined(__NR_vm86) && defined(__NR_vm86old) +- /* vm86 mode should only be entered once */ +- if (major == __NR_vm86 || major == __NR_vm86old) +- return; +-#endif + #if AUDIT_DEBUG + printk(KERN_ERR + "audit(:%d) pid=%d in syscall=%d;" +diff -upr 
linux-2.6.16.orig/kernel/capability.c linux-2.6.16-026test015/kernel/capability.c +--- linux-2.6.16.orig/kernel/capability.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/kernel/capability.c 2006-07-04 14:41:39.000000000 +0400 +@@ -24,7 +24,8 @@ EXPORT_SYMBOL(cap_bset); + * This lock protects task->cap_* for all tasks including current. + * Locking rule: acquire this prior to tasklist_lock. + */ +-static DEFINE_SPINLOCK(task_capability_lock); ++DEFINE_SPINLOCK(task_capability_lock); ++EXPORT_SYMBOL(task_capability_lock); + + /* + * For sys_getproccap() and sys_setproccap(), any of the three +@@ -67,8 +68,8 @@ asmlinkage long sys_capget(cap_user_head + spin_lock(&task_capability_lock); + read_lock(&tasklist_lock); + +- if (pid && pid != current->pid) { +- target = find_task_by_pid(pid); ++ if (pid && pid != virt_pid(current)) { ++ target = find_task_by_pid_ve(pid); + if (!target) { + ret = -ESRCH; + goto out; +@@ -100,9 +101,13 @@ static inline int cap_set_pg(int pgrp, k + int ret = -EPERM; + int found = 0; + +- do_each_task_pid(pgrp, PIDTYPE_PGID, g) { ++ pgrp = vpid_to_pid(pgrp); ++ if (pgrp < 0) ++ return ret; ++ ++ do_each_task_pid_ve(pgrp, PIDTYPE_PGID, g) { + target = g; +- while_each_thread(g, target) { ++ while_each_thread_ve(g, target) { + if (!security_capset_check(target, effective, + inheritable, + permitted)) { +@@ -113,7 +118,7 @@ static inline int cap_set_pg(int pgrp, k + } + found = 1; + } +- } while_each_task_pid(pgrp, PIDTYPE_PGID, g); ++ } while_each_task_pid_ve(pgrp, PIDTYPE_PGID, g); + + if (!found) + ret = 0; +@@ -132,7 +137,7 @@ static inline int cap_set_all(kernel_cap + int ret = -EPERM; + int found = 0; + +- do_each_thread(g, target) { ++ do_each_thread_ve(g, target) { + if (target == current || target->pid == 1) + continue; + found = 1; +@@ -141,7 +146,7 @@ static inline int cap_set_all(kernel_cap + continue; + ret = 0; + security_capset_set(target, effective, inheritable, permitted); +- } while_each_thread(g, target); ++ 
} while_each_thread_ve(g, target); + + if (!found) + ret = 0; +@@ -188,7 +193,7 @@ asmlinkage long sys_capset(cap_user_head + if (get_user(pid, &header->pid)) + return -EFAULT; + +- if (pid && pid != current->pid && !capable(CAP_SETPCAP)) ++ if (pid && pid != virt_pid(current) && !capable(CAP_SETPCAP)) + return -EPERM; + + if (copy_from_user(&effective, &data->effective, sizeof(effective)) || +@@ -199,8 +204,8 @@ asmlinkage long sys_capset(cap_user_head + spin_lock(&task_capability_lock); + read_lock(&tasklist_lock); + +- if (pid > 0 && pid != current->pid) { +- target = find_task_by_pid(pid); ++ if (pid > 0 && pid != virt_pid(current)) { ++ target = find_task_by_pid_ve(pid); + if (!target) { + ret = -ESRCH; + goto out; +diff -upr linux-2.6.16.orig/kernel/compat.c linux-2.6.16-026test015/kernel/compat.c +--- linux-2.6.16.orig/kernel/compat.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/kernel/compat.c 2006-07-04 14:41:39.000000000 +0400 +@@ -21,6 +21,8 @@ + #include <linux/syscalls.h> + #include <linux/unistd.h> + #include <linux/security.h> ++#include <linux/hrtimer.h> ++#include <linux/module.h> + + #include <asm/uaccess.h> + +@@ -38,61 +40,73 @@ int put_compat_timespec(const struct tim + __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; + } + +-static long compat_nanosleep_restart(struct restart_block *restart) ++long compat_nanosleep_restart(struct restart_block *restart) + { +- unsigned long expire = restart->arg0, now = jiffies; + struct compat_timespec __user *rmtp; ++ struct timespec tu; ++ void *rfn_save = restart->fn; ++ struct hrtimer timer; ++ ktime_t rem; + +- /* Did it expire while we handled signals? 
*/ +- if (!time_after(expire, now)) +- return 0; ++ restart->fn = do_no_restart_syscall; ++ ++ hrtimer_init(&timer, (clockid_t) restart->arg3, HRTIMER_ABS); ++ ++ timer.expires.tv64 = ((u64)restart->arg1 << 32) | (u64) restart->arg0; + +- expire = schedule_timeout_interruptible(expire - now); +- if (expire == 0) ++ set_current_state(TASK_INTERRUPTIBLE); ++ rem = schedule_hrtimer(&timer, HRTIMER_ABS); ++ ++ if (rem.tv64 <= 0) + return 0; + +- rmtp = (struct compat_timespec __user *)restart->arg1; +- if (rmtp) { +- struct compat_timespec ct; +- struct timespec t; +- +- jiffies_to_timespec(expire, &t); +- ct.tv_sec = t.tv_sec; +- ct.tv_nsec = t.tv_nsec; +- if (copy_to_user(rmtp, &ct, sizeof(ct))) +- return -EFAULT; +- } +- /* The 'restart' block is already filled in */ ++ rmtp = (struct compat_timespec __user *) restart->arg2; ++ tu = ktime_to_timespec(rem); ++ if (rmtp && put_compat_timespec(&tu, rmtp)) ++ return -EFAULT; ++ ++ restart->fn = rfn_save; ++ ++ /* The other values in restart are already filled in */ + return -ERESTART_RESTARTBLOCK; + } ++EXPORT_SYMBOL_GPL(compat_nanosleep_restart); + + asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp, + struct compat_timespec __user *rmtp) + { + struct timespec t; + struct restart_block *restart; +- unsigned long expire; ++ struct hrtimer timer; ++ ktime_t rem; + + if (get_compat_timespec(&t, rqtp)) + return -EFAULT; + +- if ((t.tv_nsec >= 1000000000L) || (t.tv_nsec < 0) || (t.tv_sec < 0)) ++ if (!timespec_valid(&t)) + return -EINVAL; + +- expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); +- expire = schedule_timeout_interruptible(expire); +- if (expire == 0) ++ hrtimer_init(&timer, CLOCK_MONOTONIC, HRTIMER_REL); ++ ++ timer.expires = timespec_to_ktime(t); ++ ++ set_current_state(TASK_INTERRUPTIBLE); ++ rem = schedule_hrtimer(&timer, HRTIMER_REL); ++ if (rem.tv64 <= 0) + return 0; + +- if (rmtp) { +- jiffies_to_timespec(expire, &t); +- if (put_compat_timespec(&t, rmtp)) +- return 
-EFAULT; +- } ++ t = ktime_to_timespec(rem); ++ ++ if (rmtp && put_compat_timespec(&t, rmtp)) ++ return -EFAULT; ++ + restart = ¤t_thread_info()->restart_block; + restart->fn = compat_nanosleep_restart; +- restart->arg0 = jiffies + expire; +- restart->arg1 = (unsigned long) rmtp; ++ restart->arg0 = timer.expires.tv64 & 0xFFFFFFFF; ++ restart->arg1 = timer.expires.tv64 >> 32; ++ restart->arg2 = (unsigned long) rmtp; ++ restart->arg3 = (unsigned long) timer.base->index; ++ + return -ERESTART_RESTARTBLOCK; + } + +diff -upr linux-2.6.16.orig/kernel/configs.c linux-2.6.16-026test015/kernel/configs.c +--- linux-2.6.16.orig/kernel/configs.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/kernel/configs.c 2006-07-04 14:41:38.000000000 +0400 +@@ -89,8 +89,7 @@ static int __init ikconfig_init(void) + struct proc_dir_entry *entry; + + /* create the current config file */ +- entry = create_proc_entry("config.gz", S_IFREG | S_IRUGO, +- &proc_root); ++ entry = create_proc_entry("config.gz", S_IFREG | S_IRUGO, NULL); + if (!entry) + return -ENOMEM; + +diff -upr linux-2.6.16.orig/kernel/cpt/Makefile linux-2.6.16-026test015/kernel/cpt/Makefile +--- linux-2.6.16.orig/kernel/cpt/Makefile 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/cpt/Makefile 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,41 @@ ++# ++# ++# kernel/cpt/Makefile ++# ++# Copyright (C) 2000-2005 SWsoft ++# All rights reserved. ++# ++# Licensing governed by "linux/COPYING.SWsoft" file. 
++ ++obj-$(CONFIG_VZ_CHECKPOINT) += vzcpt.o vzrst.o ++ ++vzcpt-objs := cpt_proc.o cpt_dump.o cpt_obj.o cpt_context.o cpt_process.o \ ++ cpt_mm.o cpt_files.o cpt_kernel.o \ ++ cpt_socket.o cpt_socket_in.o cpt_tty.o cpt_sysvipc.o cpt_net.o \ ++ cpt_conntrack.o cpt_ubc.o cpt_epoll.o ++ ++vzrst-objs := rst_proc.o rst_undump.o rst_context.o rst_process.o \ ++ rst_mm.o rst_files.o \ ++ rst_socket.o rst_socket_in.o rst_tty.o rst_sysvipc.o rst_net.o \ ++ rst_conntrack.o rst_ubc.o rst_epoll.o ++ ++ifeq ($(CONFIG_VZ_CHECKPOINT), m) ++vzrst-objs += cpt_obj.o cpt_kernel.o ++endif ++ ++ifeq ($(CONFIG_VZ_CHECKPOINT_LAZY), y) ++vzcpt-objs += cpt_pagein.o ++vzrst-objs += rst_pagein.o ++endif ++ ++ifeq ($(CONFIG_X86_64), y) ++vzcpt-objs += cpt_x8664.o ++vzrst-objs += rst_x8664.o ++ifeq ($(CONFIG_VZ_CHECKPOINT), m) ++vzrst-objs += cpt_x8664.o ++endif ++endif ++ ++ifeq ($(CONFIG_X86_32), y) ++vzrst-objs += rst_i386.o ++endif +diff -upr linux-2.6.16.orig/kernel/cpt/cpt_conntrack.c linux-2.6.16-026test015/kernel/cpt/cpt_conntrack.c +--- linux-2.6.16.orig/kernel/cpt/cpt_conntrack.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/cpt/cpt_conntrack.c 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,370 @@ ++/* ++ * ++ * kernel/cpt/cpt_conntrack.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#include <linux/config.h> ++#include <linux/version.h> ++#include <linux/module.h> ++#include <linux/init.h> ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/slab.h> ++#include <linux/file.h> ++#include <linux/mm.h> ++#include <linux/errno.h> ++#include <linux/fs.h> ++#include <linux/socket.h> ++#include <linux/netdevice.h> ++#include <linux/inetdevice.h> ++#include <linux/rtnetlink.h> ++#include <linux/unistd.h> ++#include <linux/ve.h> ++#include <linux/vzcalluser.h> ++#include <linux/cpt_image.h> ++#include <linux/icmp.h> ++#include <linux/ip.h> ++ ++#if defined(CONFIG_VE_IPTABLES) && \ ++ (defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE)) ++ ++#include <linux/netfilter.h> ++#include <linux/netfilter_ipv4/ip_conntrack.h> ++#include <linux/netfilter_ipv4/ip_nat.h> ++#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> ++#include <linux/netfilter_ipv4/ip_conntrack_helper.h> ++#include <linux/netfilter_ipv4/ip_conntrack_core.h> ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++ ++ ++/* How does it work? ++ * ++ * Network is disabled, so new conntrack entries will not appear. ++ * However, some of them can disappear because of timeouts. ++ * ++ * So, we take read_lock, collect all required information atomically, ++ * essentially, creating parallel "refcount" structures holding pointers. ++ * We delete conntrack timers as well, so the structures cannot disappear ++ * after releasing the lock. Now, after releasing lock we can dump everything ++ * safely. And on exit we restore timers to their original values. ++ * ++ * Note, this approach is not going to work in VE0. 
++ */ ++ ++struct ct_holder ++{ ++ struct ct_holder *next; ++ struct ip_conntrack_tuple_hash *cth; ++ int index; ++}; ++ ++static void encode_tuple(struct cpt_ipct_tuple *v, struct ip_conntrack_tuple *tuple) ++{ ++ v->cpt_dst = tuple->dst.ip; ++ v->cpt_dstport = tuple->dst.u.all; ++ v->cpt_protonum = tuple->dst.protonum; ++ v->cpt_dir = tuple->dst.dir; ++ ++ v->cpt_src = tuple->src.ip; ++ v->cpt_srcport = tuple->src.u.all; ++} ++ ++static int dump_one_expect(struct cpt_ip_connexpect_image *v, ++ struct ip_conntrack_expect *exp, ++ int sibling, cpt_context_t *ctx) ++{ ++ int err = 0; ++ ++ v->cpt_next = sizeof(*v); ++ v->cpt_object = CPT_OBJ_NET_CONNTRACK_EXPECT; ++ v->cpt_hdrlen = sizeof(*v); ++ v->cpt_content = CPT_CONTENT_VOID; ++ ++ encode_tuple(&v->cpt_tuple, &exp->tuple); ++ encode_tuple(&v->cpt_mask, &exp->mask); ++ v->cpt_sibling_conntrack = sibling; ++ v->cpt_flags = exp->flags; ++ v->cpt_seq = exp->id; ++ v->cpt_dir = 0; ++ v->cpt_manip_proto = 0; ++#ifdef CONFIG_IP_NF_NAT_NEEDED ++ v->cpt_manip_proto = exp->saved_proto.all; ++ v->cpt_dir = exp->dir; ++#endif ++ v->cpt_timeout = 0; ++ if (exp->master->helper->timeout) ++ v->cpt_timeout = exp->timeout.expires - jiffies; ++ return err; ++} ++ ++/* NOTE. We use one page to dump list of expectations. This may be not enough ++ * in theory. In practice there is only one expectation per conntrack record. ++ * Moreover, taking into account that _ALL_ of expecations are saved in one ++ * global list, which is looked up each incoming/outpging packet, the system ++ * would be severely dead when even one conntrack would have so much of ++ * expectations. Shortly, I am not going to repair this. 
++ */ ++ ++static int dump_expect_list(struct ip_conntrack *ct, struct ct_holder *list, ++ cpt_context_t *ctx) ++{ ++ int err = 0; ++ unsigned long pg; ++ struct cpt_ip_connexpect_image *v; ++ struct ip_conntrack_expect *exp; ++ ++ if (ct->expecting == 0) ++ return err; ++ if (ct->expecting*sizeof(struct cpt_ip_connexpect_image) > PAGE_SIZE) ++ return -ENOBUFS; ++ ++ pg = __get_free_page(GFP_KERNEL); ++ if (!pg) ++ return -ENOMEM; ++ v = (struct cpt_ip_connexpect_image *)pg; ++ ++ read_lock_bh(&ip_conntrack_lock); ++ list_for_each_entry(exp, &ve_ip_conntrack_expect_list, list) { ++ int sibling; ++ ++ if (exp->master != ct) ++ continue; ++ ++ if (ct->helper == NULL) { ++ eprintk_ctx("conntrack: no helper and non-trivial expectation\n"); ++ err = -EINVAL; ++ break; ++ } ++ ++ sibling = 0; ++#if 0 ++ /* That's all? No need to calculate sibling? */ ++ if (exp->sibling) { ++ struct ct_holder *c; ++ for (c = list; c; c = c->next) { ++ if (tuplehash_to_ctrack(c->cth) == exp->sibling) { ++ sibling = c->index; ++ break; ++ } ++ } ++ /* NOTE: exp->sibling could be not "confirmed" and, hence, ++ * out of hash table. We should just ignore such a sibling, ++ * the connection is going to be retried, the packet ++ * apparently was lost somewhere. ++ */ ++ if (sibling == 0) ++ dprintk_ctx("sibling conntrack is not found\n"); ++ } ++#endif ++ ++ /* If the expectation still does not have exp->sibling ++ * and timer is not running, it is about to die on another ++ * cpu. Skip it. 
*/ ++ if (!sibling && ++ ct->helper->timeout && ++ !timer_pending(&exp->timeout)) { ++ dprintk_ctx("conntrack: expectation: no timer\n"); ++ continue; ++ } ++ ++ err = dump_one_expect(v, exp, sibling, ctx); ++ if (err) ++ break; ++ ++ v++; ++ } ++ read_unlock_bh(&ip_conntrack_lock); ++ ++ if (err == 0 && (unsigned long)v != pg) ++ ctx->write((void*)pg, (unsigned long)v - pg, ctx); ++ ++ free_page(pg); ++ return err; ++} ++ ++static int dump_one_ct(struct ct_holder *c, struct ct_holder *list, ++ cpt_context_t *ctx) ++{ ++ struct ip_conntrack_tuple_hash *h = c->cth; ++ struct ip_conntrack *ct = tuplehash_to_ctrack(h); ++ struct cpt_ip_conntrack_image v; ++ int err = 0; ++ ++ if (sizeof(v.cpt_proto_data) != sizeof(ct->proto)) { ++ eprintk_ctx("conntrack module ct->proto version mismatch\n"); ++ return -EINVAL; ++ } ++ if (sizeof(v.cpt_help_data) != sizeof(ct->help)) { ++ eprintk_ctx("conntrack module ct->help version mismatch\n"); ++ return -EINVAL; ++ } ++ ++ cpt_open_object(NULL, ctx); ++ ++ v.cpt_next = CPT_NULL; ++ v.cpt_object = CPT_OBJ_NET_CONNTRACK; ++ v.cpt_hdrlen = sizeof(v); ++ v.cpt_content = CPT_CONTENT_ARRAY; ++ ++ read_lock_bh(&ip_conntrack_lock); ++ v.cpt_status = ct->status; ++ v.cpt_timeout = ct->timeout.expires - jiffies; ++ v.cpt_ct_helper = (ct->helper != NULL); ++ v.cpt_index = c->index; ++ v.cpt_id = ct->id; ++ v.cpt_mark = 0; ++#if defined(CONFIG_IP_NF_CONNTRACK_MARK) ++ v.cpt_mark = ct->mark; ++#endif ++ encode_tuple(&v.cpt_tuple[0], &ct->tuplehash[0].tuple); ++ encode_tuple(&v.cpt_tuple[1], &ct->tuplehash[1].tuple); ++ memcpy(&v.cpt_proto_data, &ct->proto, sizeof(v.cpt_proto_data)); ++ memcpy(&v.cpt_help_data, &ct->help, sizeof(v.cpt_help_data)); ++ ++ v.cpt_masq_index = 0; ++ v.cpt_initialized = 0; ++ v.cpt_num_manips = 0; ++ v.cpt_nat_helper = 0; ++#ifdef CONFIG_IP_NF_NAT_NEEDED ++#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \ ++ defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE) ++ v.cpt_masq_index = ct->nat.masq_index; ++#endif ++ /* "help" 
data is used by pptp, difficult to support */ ++ v.cpt_nat_seq[0].cpt_correction_pos = ct->nat.info.seq[0].correction_pos; ++ v.cpt_nat_seq[0].cpt_offset_before = ct->nat.info.seq[0].offset_before; ++ v.cpt_nat_seq[0].cpt_offset_after = ct->nat.info.seq[0].offset_after; ++ v.cpt_nat_seq[1].cpt_correction_pos = ct->nat.info.seq[1].correction_pos; ++ v.cpt_nat_seq[1].cpt_offset_before = ct->nat.info.seq[1].offset_before; ++ v.cpt_nat_seq[1].cpt_offset_after = ct->nat.info.seq[1].offset_after; ++#endif ++ read_unlock_bh(&ip_conntrack_lock); ++ ++ ctx->write(&v, sizeof(v), ctx); ++ ++ err = dump_expect_list(ct, list, ctx); ++ ++ cpt_close_object(ctx); ++ return err; ++} ++ ++int cpt_dump_ip_conntrack(cpt_context_t * ctx) ++{ ++ struct ct_holder *ct_list = NULL; ++ struct ct_holder *c, **cp; ++ int err = 0; ++ int index = 0; ++ int idx; ++ ++ if (get_exec_env()->_ip_conntrack == NULL) ++ return 0; ++ ++ for (idx = atomic_read(&(get_exec_env()->_ip_conntrack->_ip_conntrack_count)); idx >= 0; idx--) { ++ c = kmalloc(sizeof(struct ct_holder), GFP_KERNEL); ++ if (c == NULL) { ++ err = -ENOMEM; ++ goto done; ++ } ++ memset(c, 0, sizeof(struct ct_holder)); ++ c->next = ct_list; ++ ct_list = c; ++ } ++ ++ c = ct_list; ++ ++ read_lock_bh(&ip_conntrack_lock); ++ for (idx = 0; idx < ip_conntrack_htable_size; idx++) { ++ struct ip_conntrack_tuple_hash *h; ++ list_for_each_entry(h, &ve_ip_conntrack_hash[idx], list) { ++ /* Skip reply tuples, they are covered by original ++ * direction. */ ++ if (DIRECTION(h)) ++ continue; ++ ++ /* Oops, we have not enough of holders... ++ * It is impossible. */ ++ if (unlikely(c == NULL)) { ++ read_unlock_bh(&ip_conntrack_lock); ++ eprintk_ctx("unexpected conntrack appeared\n"); ++ err = -ENOMEM; ++ goto done; ++ } ++ ++ /* If timer is not running, it means that it ++ * has just been scheduled on another cpu. ++ * We should skip this conntrack, it is about to be ++ * destroyed. 
*/ ++ if (!del_timer(&tuplehash_to_ctrack(h)->timeout)) { ++ dprintk_ctx("conntrack: no timer\n"); ++ continue; ++ } ++ ++ /* Timer is deleted. refcnt is _not_ decreased. ++ * We are going to restore the timer on exit ++ * from this function. */ ++ c->cth = h; ++ c->index = ++index; ++ c = c->next; ++ } ++ } ++ read_unlock_bh(&ip_conntrack_lock); ++ ++ /* No conntracks? Good. */ ++ if (index == 0) ++ goto done; ++ ++ /* Comb the list a little. */ ++ cp = &ct_list; ++ while ((c = *cp) != NULL) { ++ /* Discard unused entries; they can appear, if some ++ * entries were timed out since we preallocated the list. ++ */ ++ if (c->cth == NULL) { ++ *cp = c->next; ++ kfree(c); ++ continue; ++ } ++ ++ /* Move conntracks attached to expectations to the beginning ++ * of the list. */ ++ if (tuplehash_to_ctrack(c->cth)->master && c != ct_list) { ++ *cp = c->next; ++ c->next = ct_list; ++ ct_list = c; ++ dprintk_ctx("conntrack: %d moved in list\n", c->index); ++ continue; ++ } ++ cp = &c->next; ++ } ++ ++ cpt_open_section(ctx, CPT_SECT_NET_CONNTRACK); ++ ++ for (c = ct_list; c; c = c->next) { ++ err = dump_one_ct(c, ct_list, ctx); ++ if (err) ++ goto done; ++ } ++ ++ cpt_close_section(ctx); ++ ++done: ++ while ((c = ct_list) != NULL) { ++ ct_list = c->next; ++ if (c->cth) { ++ /* Restore timer. refcnt is preserved. */ ++ add_timer(&tuplehash_to_ctrack(c->cth)->timeout); ++ } ++ kfree(c); ++ } ++ return err; ++} ++ ++#endif +diff -upr linux-2.6.16.orig/kernel/cpt/cpt_context.c linux-2.6.16-026test015/kernel/cpt/cpt_context.c +--- linux-2.6.16.orig/kernel/cpt/cpt_context.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/cpt/cpt_context.c 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,255 @@ ++/* ++ * ++ * kernel/cpt/cpt_context.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#include <linux/config.h> ++#include <linux/version.h> ++#include <linux/module.h> ++#include <linux/init.h> ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/slab.h> ++#include <linux/file.h> ++#include <linux/mm.h> ++#include <linux/errno.h> ++#include <linux/pagemap.h> ++ ++#include <linux/cpt_image.h> ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++ ++ ++static void file_write(const void *addr, size_t count, struct cpt_context *ctx) ++{ ++ mm_segment_t oldfs; ++ ssize_t err = -EBADF; ++ struct file *file = ctx->file; ++ ++ oldfs = get_fs(); set_fs(KERNEL_DS); ++ if (file) ++ err = file->f_op->write(file, addr, count, &file->f_pos); ++ set_fs(oldfs); ++ if (err != count && !ctx->write_error) ++ ctx->write_error = err < 0 ? err : -EIO; ++} ++ ++static void file_pwrite(void *addr, size_t count, struct cpt_context *ctx, loff_t pos) ++{ ++ mm_segment_t oldfs; ++ ssize_t err = -EBADF; ++ struct file *file = ctx->file; ++ ++ oldfs = get_fs(); set_fs(KERNEL_DS); ++ if (file) ++ err = file->f_op->write(file, addr, count, &pos); ++ set_fs(oldfs); ++ if (err != count && !ctx->write_error) ++ ctx->write_error = err < 0 ? 
err : -EIO; ++} ++ ++static void file_align(struct cpt_context *ctx) ++{ ++ struct file *file = ctx->file; ++ ++ if (file) ++ file->f_pos = CPT_ALIGN(file->f_pos); ++} ++ ++void cpt_context_init(struct cpt_context *ctx) ++{ ++ int i; ++ ++ memset(ctx, 0, sizeof(*ctx)); ++ ++ init_MUTEX(&ctx->main_sem); ++ ctx->refcount = 1; ++ ++ ctx->current_section = -1; ++ ctx->current_object = -1; ++ ctx->pagesize = PAGE_SIZE; ++ ctx->write = file_write; ++ ctx->pwrite = file_pwrite; ++ ctx->align = file_align; ++ for (i=0; i < CPT_SECT_MAX; i++) ++ ctx->sections[i] = CPT_NULL; ++#ifdef CONFIG_VZ_CHECKPOINT_LAZY ++ init_completion(&ctx->pgin_notify); ++#endif ++ cpt_object_init(ctx); ++} ++ ++int cpt_open_dumpfile(struct cpt_context *ctx) ++{ ++ ctx->tmpbuf = (char*)__get_free_page(GFP_KERNEL); ++ if (ctx->tmpbuf == NULL) ++ return -ENOMEM; ++ __cpt_release_buf(ctx); ++ return 0; ++} ++ ++int cpt_close_dumpfile(struct cpt_context *ctx) ++{ ++ if (ctx->file) { ++ fput(ctx->file); ++ ctx->file = NULL; ++ } ++ if (ctx->tmpbuf) { ++ free_page((unsigned long)ctx->tmpbuf); ++ ctx->tmpbuf = NULL; ++ } ++ if (ctx->write_error) ++ eprintk_ctx("error while writing dump file: %d\n", ctx->write_error); ++ return ctx->write_error; ++} ++ ++int cpt_major_hdr_out(struct cpt_context *ctx) ++{ ++ struct cpt_major_hdr hdr; ++ ++ if (ctx->file == NULL) ++ return 0; ++ ++ memset(&hdr, 0, sizeof(hdr)); ++ hdr.cpt_signature[0] = CPT_SIGNATURE0; ++ hdr.cpt_signature[1] = CPT_SIGNATURE1; ++ hdr.cpt_signature[2] = CPT_SIGNATURE2; ++ hdr.cpt_signature[3] = CPT_SIGNATURE3; ++ hdr.cpt_hdrlen = sizeof(hdr); ++ hdr.cpt_image_version = 1; ++#ifdef CONFIG_X86_32 ++ hdr.cpt_os_arch = CPT_OS_ARCH_I386; ++#endif ++#ifdef CONFIG_X86_64 ++ hdr.cpt_os_arch = CPT_OS_ARCH_EMT64; ++#endif ++ hdr.cpt_os_version = 0; ++ hdr.cpt_os_features = 0; ++ hdr.cpt_pagesize = PAGE_SIZE; ++ hdr.cpt_hz = HZ; ++ hdr.cpt_start_jiffies64 = ctx->virt_jiffies64; ++ hdr.cpt_start_sec = ctx->start_time.tv_sec; ++ hdr.cpt_start_nsec = 
ctx->start_time.tv_nsec; ++ hdr.cpt_cpu_caps[0] = ctx->src_cpu_flags; ++ hdr.cpt_kernel_config[0] = ctx->kernel_config_flags; ++ hdr.cpt_iptables_mask = ctx->iptables_mask; ++ ++ ctx->write(&hdr, sizeof(hdr), ctx); ++ return 0; ++} ++ ++int cpt_close_section(struct cpt_context *ctx) ++{ ++ if (ctx->file && ctx->current_section >= 0) { ++ __u64 next = ctx->file->f_pos - ctx->current_section; ++ ctx->pwrite(&next, 8, ctx, ctx->current_section); ++ ctx->current_section = -1; ++ } ++ return 0; ++} ++EXPORT_SYMBOL(cpt_close_section); ++ ++int cpt_open_section(struct cpt_context *ctx, __u32 type) ++{ ++ struct cpt_section_hdr hdr; ++ ++ if (ctx->file == NULL) ++ return 0; ++ ++ cpt_close_section(ctx); ++ ++ ctx->current_section = ctx->file->f_pos; ++ ctx->sections[type] = ctx->current_section; ++ ++ hdr.cpt_next = 0; ++ hdr.cpt_section = type; ++ hdr.cpt_hdrlen = sizeof(hdr); ++ hdr.cpt_align = 0; ++ ctx->write(&hdr, sizeof(hdr), ctx); ++ ++ return 0; ++} ++EXPORT_SYMBOL(cpt_open_section); ++ ++ ++int cpt_close_object(struct cpt_context *ctx) ++{ ++ if (ctx->file && ctx->current_object >= 0) { ++ __u64 next = ctx->file->f_pos - ctx->current_object; ++ ctx->pwrite(&next, 8, ctx, ctx->current_object); ++ ctx->current_object = -1; ++ } ++ return 0; ++} ++EXPORT_SYMBOL(cpt_close_object); ++ ++int cpt_open_object(cpt_object_t *obj, struct cpt_context *ctx) ++{ ++ if (ctx->file == NULL) ++ return 0; ++ ++ cpt_close_object(ctx); ++ ++ ctx->current_object = ctx->file->f_pos; ++ if (obj) ++ cpt_obj_setpos(obj, ctx->current_object, ctx); ++ ++ return 0; ++} ++EXPORT_SYMBOL(cpt_open_object); ++ ++int cpt_push_object(loff_t *saved, struct cpt_context *ctx) ++{ ++ if (ctx->file) { ++ *saved = ctx->current_object; ++ ctx->current_object = ctx->file->f_pos; ++ } ++ return 0; ++} ++EXPORT_SYMBOL(cpt_push_object); ++ ++int cpt_pop_object(loff_t *saved, struct cpt_context *ctx) ++{ ++ ctx->current_object = *saved; ++ return 0; ++} ++EXPORT_SYMBOL(cpt_pop_object); ++ ++int 
cpt_dump_tail(struct cpt_context *ctx) ++{ ++ struct cpt_major_tail hdr; ++ int i; ++ ++ if (ctx->file == NULL) ++ return 0; ++ ++ cpt_open_section(ctx, CPT_SECT_TRAILER); ++ memset(&hdr, 0, sizeof(hdr)); ++ hdr.cpt_next = sizeof(hdr); ++ hdr.cpt_object = CPT_OBJ_TRAILER; ++ hdr.cpt_hdrlen = sizeof(hdr); ++ hdr.cpt_content = CPT_CONTENT_VOID; ++ hdr.cpt_lazypages = 0; ++#ifdef CONFIG_VZ_CHECKPOINT_LAZY ++ hdr.cpt_lazypages = ctx->lazypages; ++#endif ++ hdr.cpt_64bit = ctx->tasks64; ++ hdr.cpt_signature[0] = CPT_SIGNATURE0; ++ hdr.cpt_signature[1] = CPT_SIGNATURE1; ++ hdr.cpt_signature[2] = CPT_SIGNATURE2; ++ hdr.cpt_signature[3] = CPT_SIGNATURE3; ++ hdr.cpt_nsect = CPT_SECT_MAX_INDEX; ++ for (i = 0; i < CPT_SECT_MAX_INDEX; i++) ++ hdr.cpt_sections[i] = ctx->sections[i]; ++ ++ ctx->write(&hdr, sizeof(hdr), ctx); ++ cpt_close_section(ctx); ++ return 0; ++} +diff -upr linux-2.6.16.orig/kernel/cpt/cpt_context.h linux-2.6.16-026test015/kernel/cpt/cpt_context.h +--- linux-2.6.16.orig/kernel/cpt/cpt_context.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/cpt/cpt_context.h 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,196 @@ ++#include <linux/fs.h> ++#include <asm/uaccess.h> ++ ++#define CPT_CTX_ERROR -1 ++#define CPT_CTX_IDLE 0 ++#define CPT_CTX_SUSPENDING 1 ++#define CPT_CTX_SUSPENDED 2 ++#define CPT_CTX_DUMPING 3 ++#define CPT_CTX_UNDUMPING 4 ++#define CPT_CTX_UNDUMPED 5 ++ ++#define CPT_TID(tsk) (tsk)->pid, virt_pid(tsk), (tsk)->comm ++#define CPT_FID "%d,%d(%s)" ++ ++ ++typedef struct cpt_context ++{ ++ struct list_head ctx_list; ++ int refcount; ++ int ctx_state; ++ int objcount; ++ int sticky; ++ struct semaphore main_sem; ++ ++ struct file *errorfile; ++ struct file *statusfile; ++ struct file *lockfile; ++ ++ int errno; ++ char *error_msg; ++ loff_t err_offset; ++ ++ struct file *file; ++ char *tmpbuf; ++ int pagesize; ++ ++ loff_t current_section; ++ loff_t current_object; ++ ++ loff_t sections[CPT_SECT_MAX]; ++ ++ __u32 errormask; ++ 
__u32 write_error; ++ ++ struct list_head object_array[CPT_OBJ_MAX]; ++ ++ void (*write)(const void *addr, size_t count, struct cpt_context *ctx); ++ void (*pwrite)(void *addr, size_t count, struct cpt_context *ctx, loff_t pos); ++ ssize_t (*read)(void *addr, size_t count, struct cpt_context *ctx); ++ ssize_t (*pread)(void *addr, size_t count, struct cpt_context *ctx, loff_t pos); ++ void (*align)(struct cpt_context *ctx); ++ int ve_id; ++ int contextid; ++ __u64 cpt_jiffies64; /* Host jiffies64 at the moment of cpt/rst, ++ * corresponging to start_time */ ++ __u64 virt_jiffies64; /* Virtual jiffies64. It is == cpt_jiffies64 when ++ * VE did not migrate. */ ++ struct timespec start_time; ++ struct timespec delta_time; ++ int image_version; ++ int lo_index; ++ int lo_index_old; ++ int venet_index; ++ int venet_index_old; ++ __u64 iptables_mask; ++ ++#define CPT_ANONVMA_HBITS (sizeof(void*) == 4 ? 10 : 9) ++#define CPT_ANONVMA_HSIZE (1<<CPT_ANONVMA_HBITS) ++ struct hlist_head *anonvmas; ++#ifdef CONFIG_VZ_CHECKPOINT_LAZY ++ struct file *pagein_file_in; ++ struct file *pagein_file_out; ++ int lazy_vm; ++ int lazypages; ++ int lazytype; ++ task_t *pgin_task; ++ unsigned long last_pagein; ++ struct pagein_desc **pgin_dir; ++ struct pgin_device *pagein_dev; ++ struct completion pgin_notify; ++ struct completion *pgind_completion; ++ struct swap_info_struct *pgin_swp; ++#endif ++ int tasks64; ++ __u32 src_cpu_flags; ++ __u32 dst_cpu_flags; ++ __u32 kernel_config_flags; ++ ++ struct filejob *filejob_queue; ++} cpt_context_t; ++ ++typedef struct { ++ int pid; ++ cpt_context_t *ctx; ++ struct completion done; ++} pagein_info_t; ++ ++int pagein_info_printf(char *buf, cpt_context_t *ctx); ++ ++int cpt_open_dumpfile(struct cpt_context *); ++int cpt_close_dumpfile(struct cpt_context *); ++int rst_open_dumpfile(struct cpt_context *); ++void rst_close_dumpfile(struct cpt_context *); ++void cpt_context_init(struct cpt_context *); ++void rst_context_init(struct cpt_context *); 
++void cpt_context_destroy(struct cpt_context *); ++ ++void rst_report_error(int err, cpt_context_t *ctx); ++ ++ ++int cpt_major_hdr_out(struct cpt_context *ctx); ++int cpt_dump_tail(struct cpt_context *ctx); ++int cpt_close_section(struct cpt_context *ctx); ++int cpt_open_section(struct cpt_context *ctx, __u32 type); ++int cpt_close_object(struct cpt_context *ctx); ++int cpt_open_object(cpt_object_t *obj, struct cpt_context *ctx); ++int cpt_push_object(loff_t *saved, struct cpt_context *ctx); ++int cpt_pop_object(loff_t *saved, struct cpt_context *ctx); ++ ++int rst_get_section(int type, struct cpt_context * ctx, loff_t *, loff_t *); ++__u8 *__rst_get_name(loff_t *pos_p, struct cpt_context *ctx); ++__u8 *rst_get_name(loff_t pos, struct cpt_context *ctx); ++void rst_put_name(__u8 *name, struct cpt_context *ctx); ++int _rst_get_object(int type, loff_t pos, void *tmp, int size, struct cpt_context *ctx); ++void * __rst_get_object(int type, loff_t pos, struct cpt_context *ctx); ++ ++#define rst_get_object(type, pos, tmp, ctx) \ ++ _rst_get_object((type), (pos), (tmp), sizeof(*(tmp)), (ctx)) ++ ++extern int debug_level; ++ ++#define cpt_printk(lvl, fmt, args...) do { \ ++ if (lvl <= debug_level) \ ++ printk(fmt, ##args); \ ++ } while (0) ++ ++#define dprintk(a...) cpt_printk(3, "CPT DBG: " a) ++#define dprintk_ctx(f, arg...) dprintk("%p,%u: " f, ctx, ctx->ve_id, ##arg) ++ ++#define wprintk(a...) cpt_printk(2, "CPT WRN: " a) ++#define wprintk_ctx(f, arg...) wprintk("%p,%u: " f, ctx, ctx->ve_id, ##arg) ++ ++#define eprintk(a...) cpt_printk(1, "CPT ERR: " a) ++#define eprintk_ctx(f, arg...) 
\ ++do { \ ++ eprintk("%p,%u :" f, ctx, ctx->ve_id, ##arg); \ ++ if (ctx->error_msg && ctx->err_offset < PAGE_SIZE) \ ++ ctx->err_offset += snprintf((char*)(ctx->error_msg + \ ++ ctx->err_offset), \ ++ PAGE_SIZE - ctx->err_offset, f, ##arg); \ ++} while(0) ++ ++#define CPT_TMPBUF_FREE 0x789adf12 ++#define CPT_TMPBUF_BUSY 0xabcd9876 ++ ++static inline void *cpt_get_buf(cpt_context_t *ctx) ++{ ++ void *buf = ctx->tmpbuf; ++ ++ BUG_ON(*(u32*)(buf + PAGE_SIZE - 4) != CPT_TMPBUF_FREE); ++ *(u32*)(buf + PAGE_SIZE - 4) = CPT_TMPBUF_BUSY; ++ return buf; ++} ++ ++static inline void __cpt_release_buf(cpt_context_t *ctx) ++{ ++ void *buf = ctx->tmpbuf; ++ ++ *(u32*)(buf + PAGE_SIZE - 4) = CPT_TMPBUF_FREE; ++} ++ ++static inline void cpt_release_buf(cpt_context_t *ctx) ++{ ++ void *buf = ctx->tmpbuf; ++ ++ BUG_ON(*(u32*)(buf + PAGE_SIZE - 4) != CPT_TMPBUF_BUSY); ++ *(u32*)(buf + PAGE_SIZE - 4) = CPT_TMPBUF_FREE; ++} ++ ++static inline void cpt_flush_error(cpt_context_t *ctx) ++{ ++ mm_segment_t oldfs; ++ ++ if (ctx->errorfile && ctx->error_msg && ctx->err_offset) { ++ if (ctx->errorfile->f_op && ctx->errorfile->f_op->write) { ++ oldfs = get_fs(); ++ set_fs(KERNEL_DS); ++ ctx->errorfile->f_op->write(ctx->errorfile, ++ ctx->error_msg, ctx->err_offset, ++ &ctx->errorfile->f_pos); ++ set_fs(oldfs); ++ } ++ ctx->error_msg[0] = 0; ++ ctx->err_offset = 0; ++ } ++} +diff -upr linux-2.6.16.orig/kernel/cpt/cpt_dump.c linux-2.6.16-026test015/kernel/cpt/cpt_dump.c +--- linux-2.6.16.orig/kernel/cpt/cpt_dump.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/cpt/cpt_dump.c 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,840 @@ ++/* ++ * ++ * kernel/cpt/cpt_dump.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#include <linux/config.h> ++#include <linux/version.h> ++#include <linux/module.h> ++#include <linux/init.h> ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/slab.h> ++#include <linux/file.h> ++#include <linux/mm.h> ++#include <linux/errno.h> ++#include <linux/pagemap.h> ++#include <linux/ptrace.h> ++#include <linux/smp_lock.h> ++#include <linux/ve.h> ++#include <linux/ve_proto.h> ++#include <linux/virtinfo.h> ++#include <ub/ub_task.h> ++#include <linux/cpt_image.h> ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++#include "cpt_dump.h" ++#include "cpt_files.h" ++#include "cpt_mm.h" ++#include "cpt_process.h" ++#include "cpt_net.h" ++#include "cpt_socket.h" ++#include "cpt_ubc.h" ++#include "cpt_kernel.h" ++ ++ ++static int vps_child_level(task_t *root, task_t *c) ++{ ++ int level = 0; ++ int veid = VE_TASK_INFO(c)->owner_env->veid; ++ ++ while (VE_TASK_INFO(c)->owner_env->veid == veid) { ++ if (c->pid != c->tgid) ++ c = c->group_leader; ++ if (c == root) ++ return level; ++ ++ c = c->real_parent; ++ level++; ++ } ++ return -1; ++} ++ ++static inline int freezable(struct task_struct * p) ++{ ++ if (p->exit_state) ++ return 0; ++ ++ switch (p->state) { ++ case EXIT_ZOMBIE: ++ case EXIT_DEAD: ++ case TASK_STOPPED: ++#if TASK_TRACED != TASK_STOPPED ++ case TASK_TRACED: ++#endif ++ return 0; ++ default: ++ return 1; ++ } ++} ++ ++/* ++ * Some comment is necessary about PF_FREEZE,PF_FROZEN,TIF_FREEZE... ++ * ++ * SWSUSP uses PF_FREEZE flag in tsk->flags raising it in context ++ * of another process. Apparently, it is unacceptable on SMP. ++ * Let's take freeze_processes() in kernel/power/process.c as an example. 
++ * Unserialized modifications tsk->flags easily ++ * (believe or not, but it happens with probability of almost 100% :-)) ++ * creates the situation when setting PF_FREEZE in freeze_processes(), ++ * which quickly spins raising PF_FREEZE of all the processes, ++ * _clears_ PF_FROZEN just set in refrigerator(), so that suspend deadlocks. ++ * ++ * So, to make things clean, we require that those flags may be modified ++ * only under tsk->sighand->siglock, which is quite natural because PF_FREEZE ++ * is just a kind of signal. ++ * ++ * It is not enough, because we are still not allowed to change tsk->flags ++ * in context of another process, we can corrupt another flags, when the process ++ * running on another cpu modifies them. So, we use TIF_FREEZE in thread flags, ++ * which can be changed atomically. ++ * ++ * PF_FROZEN also changes in context of another process, but this happens ++ * only when the process is already in refrigerator() which does not modify ++ * tsk->flags. ++ */ ++ ++static int vps_stop_tasks(struct cpt_context *ctx) ++{ ++ unsigned long start_time = jiffies; ++ int err; ++ task_t *p, *g; ++ int todo; ++ int round = 0; ++ ++ do_gettimespec(&ctx->start_time); ++ ctx->cpt_jiffies64 = get_jiffies_64(); ++ ctx->virt_jiffies64 = ctx->cpt_jiffies64 + get_exec_env()->jiffies_fixup; ++ ++ read_lock(&tasklist_lock); ++ for(;;) { ++ task_t *root; ++ todo = 0; ++ ++ root = find_task_by_pid_ve(1); ++ if (!root) { ++ read_unlock(&tasklist_lock); ++ eprintk_ctx("cannot find ve init\n"); ++ return -ESRCH; ++ } ++ ++ do_each_thread_ve(g, p) { ++ if (vps_child_level(root, p) >= 0) { ++ if (!is_virtual_pid(virt_pid(p))) { ++ eprintk_ctx("external process %d/%d(%s) inside VPS (e.g. vzctl enter or vzctl exec).\n", virt_pid(p), p->pid, p->comm); ++ todo = -1; ++ goto out; ++ } ++ if (p->vfork_done) { ++ /* Task between vfork()...exec() ++ * cannot be frozen, because parent ++ * wait in uninterruptible state. 
++ * So, we do nothing, waiting for ++ * exec(), unless: ++ */ ++ if (p->state == TASK_STOPPED || ++ p->state == TASK_TRACED) { ++ eprintk_ctx("task %d/%d(%s) is stopped while vfork(). Checkpointing is impossible.\n", virt_pid(p), p->pid, p->comm); ++ todo = -1; ++ /* It is fatal, _user_ stopped ++ * vfork()ing task, so that we ++ * cannot suspend now. ++ */ ++ } else { ++ todo = -3; ++ } ++ goto out; ++ } ++ if (p->state == TASK_TRACED ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,9) ++ && !p->stopped_state ++#endif ++ ) { ++ int ptrace_id = p->pn_state; ++ /* Debugger waits for signal. */ ++ switch (ptrace_id) { ++ case PN_STOP_TF: ++ case PN_STOP_TF_RT: ++ case PN_STOP_ENTRY: ++ case PN_STOP_FORK: ++ case PN_STOP_VFORK: ++ case PN_STOP_SIGNAL: ++ case PN_STOP_EXIT: ++ case PN_STOP_LEAVE: ++ break; ++ default: ++ eprintk_ctx("task %d/%d(%s) is stopped by debugger while %d.\n", virt_pid(p), p->pid, p->comm, ptrace_id); ++ todo = -1; ++ goto out; ++ } ++ } ++ if (p->flags & PF_NOFREEZE) ++ goto out; ++ if (p->flags & PF_FROZEN) ++ continue; ++ if (!freezable(p)) ++ continue; ++ ++ spin_lock_irq(&p->sighand->siglock); ++ set_tsk_thread_flag(p, TIF_FREEZE); ++ signal_wake_up(p, 0); ++ spin_unlock_irq(&p->sighand->siglock); ++ ++ if (round == 10) ++ wprintk_ctx("%d/%d(%s) is running\n", virt_pid(p), p->pid, p->comm); ++ ++ todo++; ++ } else { ++ if (p != current) { ++ eprintk_ctx("foreign process %d/%d(%s) inside VPS (e.g. 
vzctl enter or vzctl exec).\n", virt_pid(p), p->pid, p->comm); ++ todo = -1; ++ goto out; ++ } ++ } ++ } while_each_thread_ve(g, p); ++ ++out: ++ if (todo && ++ (time_after(jiffies, start_time + 10*HZ) || ++ signal_pending(current) || todo < 0)) { ++ do_each_thread_ve(g, p) { ++ if (vps_child_level(root, p) >= 0) { ++ spin_lock_irq(&p->sighand->siglock); ++ clear_tsk_thread_flag(p, TIF_FREEZE); ++ if (p->flags & PF_FROZEN) { ++ p->flags &= ~PF_FROZEN; ++ wake_up_process(p); ++ } ++ spin_unlock_irq(&p->sighand->siglock); ++ } ++ } while_each_thread_ve(g, p); ++ if (todo > 0) ++ todo = -2; ++ /* This is sign of failure of printk(), which is not ++ * ours. So, no prefixes. */ ++ printk(">\n"); ++ } ++ ++ read_unlock(&tasklist_lock); ++ ++ if (!todo) ++ return 0; ++ ++ if (todo == -1) { ++ eprintk_ctx("suspend is impossible now.\n"); ++ return -EAGAIN; ++ } ++ ++ if (todo == -2) { ++ eprintk_ctx("interrupted or timed out.\n"); ++ return -EINTR; ++ } ++ ++ if (time_after(jiffies, start_time + 10*HZ) || ++ signal_pending(current)) { ++ if (todo == -3) { ++ eprintk_ctx("vfork() is active, suspend is impossible now.\n"); ++ } else { ++ eprintk_ctx("suspend is impossible, reason %d\n", todo); ++ } ++ return -EAGAIN; ++ } ++ ++ if (todo < 0 || round > 0) { ++ current->state = TASK_INTERRUPTIBLE; ++ schedule_timeout(HZ/50); ++ } else { ++ yield(); ++ } ++ ++ read_lock(&tasklist_lock); ++ round++; ++ } ++ ++ read_unlock(&tasklist_lock); ++ return err; ++} ++ ++static int cpt_unlock_ve(struct cpt_context *ctx) ++{ ++ struct ve_struct *env; ++ ++ env = get_ve_by_id(ctx->ve_id); ++ if (!env) ++ return -ESRCH; ++ down_write(&env->op_sem); ++ env->is_locked = 0; ++ up_write(&env->op_sem); ++ put_ve(env); ++ return 0; ++} ++ ++int cpt_resume(struct cpt_context *ctx) ++{ ++ cpt_object_t *obj; ++ ++ cpt_unlock_sockets(ctx); ++ ++#ifdef CONFIG_VZ_CHECKPOINT_LAZY ++ if (ctx->pgin_task) { ++ wait_for_completion(&ctx->pgin_notify); ++ put_task_struct(ctx->pgin_task); ++ ctx->pgin_task = 
NULL; ++ } ++#endif ++ ++ for_each_object(obj, CPT_OBJ_TASK) { ++ task_t *tsk = obj->o_obj; ++ ++ spin_lock_irq(&tsk->sighand->siglock); ++ clear_tsk_thread_flag(tsk, TIF_FREEZE); ++ if (tsk->flags & PF_FROZEN) { ++ tsk->flags &= ~PF_FROZEN; ++ wake_up_process(tsk); ++ } else if (freezable(tsk)) { ++ eprintk_ctx("strange, %s not frozen\n", tsk->comm ); ++ } ++ spin_unlock_irq(&tsk->sighand->siglock); ++ put_task_struct(tsk); ++ } ++ ++ cpt_resume_network(ctx); ++ ++ cpt_unlock_ve(ctx); ++ ++ cpt_finish_ubc(ctx); ++ cpt_object_destroy(ctx); ++ return 0; ++} ++ ++int cpt_kill(struct cpt_context *ctx) ++{ ++ int err = 0; ++ struct ve_struct *env; ++ cpt_object_t *obj; ++ task_t *root_task = NULL; ++ long delay; ++ ++ if (!ctx->ve_id) ++ return -EINVAL; ++ ++ env = get_ve_by_id(ctx->ve_id); ++ if (!env) ++ return -ESRCH; ++ ++ /* from here cpt_kill succeeds */ ++ if (VE_TASK_INFO(current)->owner_env == env) { ++ wprintk_ctx("attempt to kill ve from inside, escaping...\n"); ++ ++ write_lock_irq(&tasklist_lock); ++ VE_TASK_INFO(current)->owner_env = get_ve0(); ++ REMOVE_VE_LINKS(current); ++ SET_VE_LINKS(current); ++ ++ atomic_inc(&get_ve0()->pcounter); ++ atomic_dec(&env->pcounter); ++ write_unlock_irq(&tasklist_lock); ++ set_exec_env(get_ve0()); ++ } ++ ++#ifdef CONFIG_VZ_CHECKPOINT_LAZY ++ if (ctx->pgin_task) { ++ wait_for_completion(&ctx->pgin_notify); ++ put_task_struct(ctx->pgin_task); ++ ctx->pgin_task = NULL; ++ } ++#endif ++ ++ cpt_kill_sockets(ctx); ++ ++ for_each_object(obj, CPT_OBJ_TASK) { ++ task_t *tsk = obj->o_obj; ++ ++ if (tsk->exit_state) { ++ put_task_struct(tsk); ++ continue; ++ } ++ ++ if (virt_pid(tsk) == 1) { ++ root_task = tsk; ++ continue; ++ } ++ ++ if (tsk->ptrace) { ++ write_lock_irq(&tasklist_lock); ++ tsk->ptrace = 0; ++ if (!list_empty(&tsk->ptrace_list)) { ++ list_del_init(&tsk->ptrace_list); ++ REMOVE_LINKS(tsk); ++ tsk->parent = tsk->real_parent; ++ SET_LINKS(tsk); ++ } ++ write_unlock_irq(&tasklist_lock); ++ } ++ ++ send_sig(SIGKILL, 
tsk, 1); ++ ++ spin_lock_irq(&tsk->sighand->siglock); ++ sigfillset(&tsk->blocked); ++ sigdelsetmask(&tsk->blocked, sigmask(SIGKILL)); ++ set_tsk_thread_flag(tsk, TIF_SIGPENDING); ++ clear_tsk_thread_flag(tsk, TIF_FREEZE); ++ if (tsk->flags & PF_FROZEN) ++ tsk->flags &= ~PF_FROZEN; ++ spin_unlock_irq(&tsk->sighand->siglock); ++ ++ wake_up_process(tsk); ++ put_task_struct(tsk); ++ } ++ ++ yield(); ++ ++ if (root_task != NULL) { ++ send_sig(SIGKILL, root_task, 1); ++ ++ spin_lock_irq(&root_task->sighand->siglock); ++ sigfillset(&root_task->blocked); ++ sigdelsetmask(&root_task->blocked, sigmask(SIGKILL)); ++ set_tsk_thread_flag(root_task, TIF_SIGPENDING); ++ clear_tsk_thread_flag(root_task, TIF_FREEZE); ++ if (root_task->flags & PF_FROZEN) ++ root_task->flags &= ~PF_FROZEN; ++ spin_unlock_irq(&root_task->sighand->siglock); ++ ++ wake_up_process(root_task); ++ put_task_struct(root_task); ++ } ++ ++ cpt_finish_ubc(ctx); ++ cpt_object_destroy(ctx); ++ ++ delay = 1; ++ while (atomic_read(&env->counter) != 1) { ++ if (signal_pending(current)) ++ break; ++ current->state = TASK_INTERRUPTIBLE; ++ delay = (delay < HZ) ? 
(delay << 1) : HZ; ++ schedule_timeout(delay); ++ } ++ put_ve(env); ++ ++ return err; ++} ++ ++static void collect_task_ubc(task_t *t, struct cpt_context *ctx) ++{ ++ struct task_beancounter *tbc; ++ ++ tbc = &(t->task_bc); ++ cpt_add_ubc(tbc->exec_ub, ctx); ++ cpt_add_ubc(tbc->task_ub, ctx); ++ cpt_add_ubc(tbc->fork_sub, ctx); ++} ++ ++static cpt_object_t * remember_task(task_t * child, cpt_object_t * head, ++ cpt_context_t * ctx) ++{ ++ cpt_object_t *cobj; ++ ++ if (freezable(child) && !(child->flags&PF_FROZEN)) { ++ eprintk_ctx("process " CPT_FID " is not frozen\n", CPT_TID(child)); ++ put_task_struct(child); ++ return NULL; ++ } ++ ++ if (lookup_cpt_object(CPT_OBJ_TASK, child, ctx)) BUG(); ++ if ((cobj = alloc_cpt_object(GFP_KERNEL, ctx)) == NULL) { ++ put_task_struct(child); ++ return NULL; ++ } ++ cobj->o_count = 1; ++ cpt_obj_setobj(cobj, child, ctx); ++ insert_cpt_object(CPT_OBJ_TASK, cobj, head, ctx); ++ collect_task_ubc(child, ctx); ++ return cobj; ++} ++ ++static int vps_collect_tasks(struct cpt_context *ctx) ++{ ++ int err = -ESRCH; ++ cpt_object_t *obj; ++ task_t *root; ++ ++ read_lock(&tasklist_lock); ++ root = find_task_by_pid_ve(1); ++ if (root) ++ get_task_struct(root); ++ read_unlock(&tasklist_lock); ++ ++ if (!root) { ++ err = -ESRCH; ++ eprintk_ctx("vps_collect_tasks: cannot find root\n"); ++ goto out; ++ } ++ ++ if ((obj = alloc_cpt_object(GFP_KERNEL, ctx)) == NULL) { ++ put_task_struct(root); ++ return -ENOMEM; ++ } ++ obj->o_count = 1; ++ cpt_obj_setobj(obj, root, ctx); ++ intern_cpt_object(CPT_OBJ_TASK, obj, ctx); ++ collect_task_ubc(root, ctx); ++ ++ /* Collect process subtree recursively */ ++ for_each_object(obj, CPT_OBJ_TASK) { ++ cpt_object_t *head = obj; ++ task_t *tsk = obj->o_obj; ++ task_t *child; ++ ++ if (freezable(tsk) && !(tsk->flags&PF_FROZEN)) { ++ eprintk_ctx("process " CPT_FID " is not frozen\n", CPT_TID(tsk)); ++ err = -EINVAL; ++ goto out; ++ } ++ ++ wait_task_inactive(tsk); ++ ++ if (tsk->pid == tsk->tgid) { ++ child = 
tsk; ++ for (;;) { ++ read_lock(&tasklist_lock); ++ child = next_thread(child); ++ if (child != tsk) ++ get_task_struct(child); ++ read_unlock(&tasklist_lock); ++ ++ if (child == tsk) ++ break; ++ ++ if (child->real_parent != tsk->real_parent) { ++ put_task_struct(child); ++ eprintk_ctx("illegal thread structure, kernel bug\n"); ++ return -EINVAL; ++ } ++ ++ if ((head = remember_task(child, head, ctx)) == NULL) ++ return -ENOMEM; ++ } ++ } ++ ++ /* About locking. VE is frozen. But lists of children ++ * may change at least for init, when entered task reparents ++ * to init and when reparented task exits. If we take care ++ * of this case, we still can unlock while scanning ++ * tasklists. ++ */ ++ read_lock(&tasklist_lock); ++ list_for_each_entry(child, &tsk->children, sibling) { ++ if (child->real_parent != tsk) ++ continue; ++ if (child->pid != child->tgid) ++ continue; ++ get_task_struct(child); ++ read_unlock(&tasklist_lock); ++ ++ if ((head = remember_task(child, head, ctx)) == NULL) ++ return -ENOMEM; ++ ++ read_lock(&tasklist_lock); ++ } ++ ++ list_for_each_entry(child, &tsk->ptrace_children, ptrace_list) { ++ if (child->real_parent != tsk) ++ continue; ++ if (child->pid != child->tgid) ++ continue; ++ get_task_struct(child); ++ read_unlock(&tasklist_lock); ++ ++ if ((head = remember_task(child, head, ctx)) == NULL) ++ return -ENOMEM; ++ ++ read_lock(&tasklist_lock); ++ } ++ read_unlock(&tasklist_lock); ++ } ++ ++ return 0; ++ ++out: ++ return err; ++} ++ ++static int cpt_collect(struct cpt_context *ctx) ++{ ++ int err; ++ ++ if ((err = cpt_collect_mm(ctx)) != 0) ++ return err; ++ ++ if ((err = cpt_collect_sysv(ctx)) != 0) ++ return err; ++ ++ if ((err = cpt_collect_files(ctx)) != 0) ++ return err; ++ ++ if ((err = cpt_collect_fs(ctx)) != 0) ++ return err; ++ ++ if ((err = cpt_collect_namespace(ctx)) != 0) ++ return err; ++ ++ if ((err = cpt_collect_signals(ctx)) != 0) ++ return err; ++ ++ return 0; ++} ++ ++static int cpt_dump_veinfo(cpt_context_t *ctx) ++{ 
++ struct cpt_veinfo_image i; ++ struct ve_struct *ve; ++ struct timespec delta; ++ ++ cpt_open_section(ctx, CPT_SECT_VEINFO); ++ cpt_open_object(NULL, ctx); ++ ++ i.cpt_next = CPT_NULL; ++ i.cpt_object = CPT_OBJ_VEINFO; ++ i.cpt_hdrlen = sizeof(i); ++ i.cpt_content = CPT_CONTENT_VOID; ++ ++ ve = get_exec_env(); ++ i.shm_ctl_all = ve->_shm_ctlall; ++ i.shm_ctl_max = ve->_shm_ctlmax; ++ i.shm_ctl_mni = ve->_shm_ctlmni; ++ ++ i.msg_ctl_max = ve->_msg_ctlmax; ++ i.msg_ctl_mni = ve->_msg_ctlmni; ++ i.msg_ctl_mnb = ve->_msg_ctlmnb; ++ ++ BUG_ON(sizeof(ve->_sem_ctls) != sizeof(i.sem_ctl_arr)); ++ i.sem_ctl_arr[0] = ve->_sem_ctls[0]; ++ i.sem_ctl_arr[1] = ve->_sem_ctls[1]; ++ i.sem_ctl_arr[2] = ve->_sem_ctls[2]; ++ i.sem_ctl_arr[3] = ve->_sem_ctls[3]; ++ ++ do_posix_clock_monotonic_gettime(&delta); ++ _set_normalized_timespec(&delta, ++ delta.tv_sec - ve->start_timespec.tv_sec, ++ delta.tv_nsec - ve->start_timespec.tv_nsec); ++ i.start_timespec_delta = cpt_timespec_export(&delta); ++ i.start_jiffies_delta = get_jiffies_64() - ve->start_jiffies; ++ ++ ctx->write(&i, sizeof(i), ctx); ++ cpt_close_object(ctx); ++ cpt_close_section(ctx); ++ return 0; ++} ++ ++static int cpt_dump_utsname(cpt_context_t *ctx) ++{ ++ int len; ++ struct cpt_object_hdr o; ++ ++ cpt_open_section(ctx, CPT_SECT_UTSNAME); ++ ++ len = strlen(ve_utsname.nodename); ++ o.cpt_next = sizeof(o) + CPT_ALIGN(len + 1); ++ o.cpt_object = CPT_OBJ_NAME; ++ o.cpt_hdrlen = sizeof(o); ++ o.cpt_content = CPT_CONTENT_NAME; ++ ++ ctx->write(&o, sizeof(o), ctx); ++ ctx->write(ve_utsname.nodename, len+1, ctx); ++ ctx->align(ctx); ++ ++ len = strlen(ve_utsname.domainname); ++ o.cpt_next = sizeof(o) + CPT_ALIGN(len + 1); ++ o.cpt_object = CPT_OBJ_NAME; ++ o.cpt_hdrlen = sizeof(o); ++ o.cpt_content = CPT_CONTENT_NAME; ++ ++ ctx->write(&o, sizeof(o), ctx); ++ ctx->write(ve_utsname.domainname, len+1, ctx); ++ ctx->align(ctx); ++ ++ cpt_close_section(ctx); ++ return 0; ++} ++ ++int cpt_dump(struct cpt_context *ctx) ++{ ++ struct 
ve_struct *oldenv, *env; ++ int err, err2 = 0; ++ ++ if (!ctx->ve_id) ++ return -EINVAL; ++ ++ env = get_ve_by_id(ctx->ve_id); ++ if (!env) ++ return -ESRCH; ++ ++ down_read(&env->op_sem); ++ err = -ESRCH; ++ if (!env->is_running) ++ goto out_noenv; ++ if (!env->is_locked) ++ goto out_noenv; ++ ++ oldenv = set_exec_env(env); ++ ++ /* Phase 2: real checkpointing */ ++ err = cpt_open_dumpfile(ctx); ++ if (err) ++ goto out; ++ ++ cpt_major_hdr_out(ctx); ++ ++ if (!err) ++ err = cpt_dump_veinfo(ctx); ++ if (!err) ++ err = cpt_dump_ubc(ctx); ++ if (!err) ++ err = cpt_dump_ifinfo(ctx); ++ if (!err) ++ err = cpt_dump_files(ctx); ++ if (!err) ++ err = cpt_dump_files_struct(ctx); ++ if (!err) ++ err = cpt_dump_fs_struct(ctx); ++ if (!err) ++ err = cpt_dump_namespace(ctx); ++ if (!err) ++ err = cpt_dump_sighand(ctx); ++ if (!err) ++ err = cpt_dump_vm(ctx); ++ if (!err) ++ err = cpt_dump_sysvsem(ctx); ++ if (!err) ++ err = cpt_dump_tasks(ctx); ++ if (!err) ++ err = cpt_dump_orphaned_sockets(ctx); ++#if defined(CONFIG_VE_IPTABLES) && \ ++ (defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE)) ++ if (!err) ++ err = cpt_dump_ip_conntrack(ctx); ++#endif ++ if (!err) ++ err = cpt_dump_utsname(ctx); ++ ++ if (!err) ++ err = cpt_dump_tail(ctx); ++ ++ err2 = cpt_close_dumpfile(ctx); ++ ++out: ++ set_exec_env(oldenv); ++out_noenv: ++ up_read(&env->op_sem); ++ put_ve(env); ++ return err ? 
: err2; ++} ++ ++int cpt_vps_suspend(struct cpt_context *ctx) ++{ ++ struct ve_struct *oldenv, *env; ++ int err = 0; ++ ++ ctx->kernel_config_flags = test_kernel_config(); ++ cpt_object_init(ctx); ++ ++ if (!ctx->ve_id) { ++ env = get_exec_env(); ++ if (env == get_ve0()) ++ return -EINVAL; ++ wprintk("undefined ve_id\n"); ++ ctx->ve_id = env->veid; ++ get_ve(env); ++ } else { ++ env = get_ve_by_id(ctx->ve_id); ++ if (!env) ++ return -ESRCH; ++ } ++ ++#ifdef CONFIG_VE_IPTABLES ++ ctx->iptables_mask = env->_iptables_modules; ++#endif ++ ++ down_write(&env->op_sem); ++ err = -ESRCH; ++ if (!env->is_running) ++ goto out_noenv; ++ ++ err = -EBUSY; ++ if (env->is_locked) ++ goto out_noenv; ++ env->is_locked = 1; ++ downgrade_write(&env->op_sem); ++ ++ oldenv = set_exec_env(env); ++ ++ /* Phase 0: find and stop all the tasks */ ++ if ((err = vps_stop_tasks(ctx)) != 0) ++ goto out; ++ ++ if ((err = cpt_suspend_network(ctx)) != 0) ++ goto out; ++ ++ /* At the moment all the state is frozen. We do not need to lock ++ * the state, which can be changed only if the tasks are running. 
++ */ ++ ++ /* Phase 1: collect task tree */ ++ if ((err = vps_collect_tasks(ctx)) != 0) ++ goto out; ++ ++ /* Phase 1': collect all the resources */ ++ if ((err = cpt_collect(ctx)) != 0) ++ goto out; ++ ++out: ++ set_exec_env(oldenv); ++ up_read(&env->op_sem); ++ put_ve(env); ++ return err; ++ ++out_noenv: ++ up_write(&env->op_sem); ++ put_ve(env); ++ return err; ++} ++ ++int cpt_vps_caps(struct cpt_context *ctx, __u32 *caps) ++{ ++ task_t *p; ++ struct ve_struct *env; ++ unsigned int flags = test_cpu_caps(); ++ ++ if (!ctx->ve_id) ++ return -EINVAL; ++ ++ env = get_ve_by_id(ctx->ve_id); ++ if (env == NULL) ++ return -ESRCH; ++ ++ *caps = flags & (1<<CPT_CPU_X86_CMOV); ++ flags &= ~((1<<CPT_CPU_X86_EMT64)|(1<<CPT_CPU_X86_IA64)); ++ ++ read_lock(&tasklist_lock); ++ for (p = __first_task_ve(env); p != NULL ; p = __next_task_ve(env, p)) { ++ if (tsk_used_math(p)) ++ *caps |= flags; ++#ifdef CONFIG_X86_64 ++ if (!(p->thread_info->flags & _TIF_IA32)) ++ *caps |= (1<<CPT_CPU_X86_EMT64); ++#endif ++ } ++ read_unlock(&tasklist_lock); ++ put_ve(env); ++ ++ return 0; ++} +diff -upr linux-2.6.16.orig/kernel/cpt/cpt_dump.h linux-2.6.16-026test015/kernel/cpt/cpt_dump.h +--- linux-2.6.16.orig/kernel/cpt/cpt_dump.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/cpt/cpt_dump.h 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,14 @@ ++int cpt_dump(struct cpt_context *cpt); ++int rst_undump(struct cpt_context *cpt); ++int cpt_suspend(struct cpt_context *cpt); ++int cpt_resume(struct cpt_context *cpt); ++int cpt_kill(struct cpt_context *cpt); ++int rst_clean(struct cpt_context *cpt); ++int rst_resume(struct cpt_context *cpt); ++int rst_kill(struct cpt_context *cpt); ++ ++int cpt_freeze_one(pid_t pid, int freeze); ++int cpt_vps_suspend(struct cpt_context *ctx); ++int vps_rst_undump(struct cpt_context *ctx); ++ ++int cpt_vps_caps(struct cpt_context *ctx, __u32 *caps); +diff -upr linux-2.6.16.orig/kernel/cpt/cpt_epoll.c 
linux-2.6.16-026test015/kernel/cpt/cpt_epoll.c +--- linux-2.6.16.orig/kernel/cpt/cpt_epoll.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/cpt/cpt_epoll.c 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,116 @@ ++/* ++ * ++ * kernel/cpt/cpt_epoll.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include <linux/config.h> ++#include <linux/version.h> ++#include <linux/module.h> ++#include <linux/init.h> ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/slab.h> ++#include <linux/file.h> ++#include <linux/mm.h> ++#include <linux/errno.h> ++#include <linux/major.h> ++#include <linux/pipe_fs_i.h> ++#include <linux/mman.h> ++#include <linux/namespace.h> ++#include <linux/mount.h> ++#include <linux/namei.h> ++#include <linux/smp_lock.h> ++#include <asm/uaccess.h> ++#include <linux/vzcalluser.h> ++#include <linux/eventpoll.h> ++#include <linux/cpt_image.h> ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++#include "cpt_mm.h" ++#include "cpt_files.h" ++#include "cpt_kernel.h" ++#include "cpt_fsmagic.h" ++#include "cpt_syscalls.h" ++ ++extern struct file_operations eventpoll_fops; ++ ++int cpt_dump_epolldev(cpt_object_t *obj, cpt_context_t *ctx) ++{ ++ int err = 0; ++ struct file *file = obj->o_obj; ++ struct eventpoll *ep; ++ struct rb_node *rbp; ++ struct cpt_epoll_image ei; ++ ++ if (file->f_op != &eventpoll_fops) { ++ eprintk_ctx("bad epoll file\n"); ++ return -EINVAL; ++ } ++ ++ ep = file->private_data; ++ ++ /* eventpoll.c does not protect open /proc/N/fd, silly. 
++ * Opener will get an invalid file with uninitialized private_data ++ */ ++ if (unlikely(ep == NULL)) { ++ eprintk_ctx("bad epoll device\n"); ++ return -EINVAL; ++ } ++ ++ cpt_open_object(NULL, ctx); ++ ++ ei.cpt_next = CPT_NULL; ++ ei.cpt_object = CPT_OBJ_EPOLL; ++ ei.cpt_hdrlen = sizeof(ei); ++ ei.cpt_content = CPT_CONTENT_ARRAY; ++ ei.cpt_file = obj->o_pos; ++ ++ ctx->write(&ei, sizeof(ei), ctx); ++ ++ down(&epsem); ++ for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { ++ loff_t saved_obj; ++ cpt_object_t *tobj; ++ struct cpt_epoll_file_image efi; ++ struct epitem *epi; ++ epi = rb_entry(rbp, struct epitem, rbn); ++ tobj = lookup_cpt_object(CPT_OBJ_FILE, epi->ffd.file, ctx); ++ if (tobj == NULL) { ++ eprintk_ctx("epoll device refers to an external file\n"); ++ err = -EBUSY; ++ break; ++ } ++ cpt_push_object(&saved_obj, ctx); ++ cpt_open_object(NULL, ctx); ++ ++ efi.cpt_next = CPT_NULL; ++ efi.cpt_object = CPT_OBJ_EPOLL_FILE; ++ efi.cpt_hdrlen = sizeof(efi); ++ efi.cpt_content = CPT_CONTENT_VOID; ++ efi.cpt_file = tobj->o_pos; ++ efi.cpt_fd = epi->ffd.fd; ++ efi.cpt_events = epi->event.events; ++ efi.cpt_data = epi->event.data; ++ efi.cpt_revents = epi->revents; ++ efi.cpt_ready = 0; ++ if (!list_empty(&epi->rdllink)) ++ efi.cpt_ready = 1; ++ ++ ctx->write(&efi, sizeof(efi), ctx); ++ cpt_close_object(ctx); ++ cpt_pop_object(&saved_obj, ctx); ++ } ++ up(&epsem); ++ ++ cpt_close_object(ctx); ++ ++ return err; ++} ++ +diff -upr linux-2.6.16.orig/kernel/cpt/cpt_files.c linux-2.6.16-026test015/kernel/cpt/cpt_files.c +--- linux-2.6.16.orig/kernel/cpt/cpt_files.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/cpt/cpt_files.c 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,1343 @@ ++/* ++ * ++ * kernel/cpt/cpt_files.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#include <linux/config.h> ++#include <linux/version.h> ++#include <linux/module.h> ++#include <linux/init.h> ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/slab.h> ++#include <linux/file.h> ++#include <linux/mm.h> ++#include <linux/errno.h> ++#include <linux/major.h> ++#include <linux/pipe_fs_i.h> ++#include <linux/mman.h> ++#include <linux/namespace.h> ++#include <linux/mount.h> ++#include <linux/namei.h> ++#include <linux/smp_lock.h> ++#include <linux/pagemap.h> ++#include <asm/uaccess.h> ++#include <linux/vzcalluser.h> ++#include <linux/ve_proto.h> ++#include <linux/cpt_image.h> ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++#include "cpt_mm.h" ++#include "cpt_files.h" ++#include "cpt_socket.h" ++#include "cpt_kernel.h" ++#include "cpt_fsmagic.h" ++#include "cpt_syscalls.h" ++ ++void cpt_printk_dentry(struct dentry *d, struct vfsmount *mnt) ++{ ++ char *path; ++ unsigned long pg = __get_free_page(GFP_KERNEL); ++ ++ if (!pg) ++ return; ++ ++ path = d_path(d, mnt, (char *)pg, PAGE_SIZE); ++ ++ if (!IS_ERR(path)) ++ printk("<%s>", path); ++ free_page(pg); ++} ++ ++int cpt_verify_overmount(char *path, struct dentry *d, struct vfsmount *mnt, ++ cpt_context_t *ctx) ++{ ++ if (path[0] == '/' && !IS_ROOT(d) && !d_unhashed(d)) { ++ struct nameidata nd; ++ if (path_lookup(path, 0, &nd)) { ++ eprintk_ctx("d_path cannot be looked up %s\n", path); ++ return -EINVAL; ++ } ++ if (nd.dentry != d || nd.mnt != mnt) { ++ eprintk_ctx("d_path is invisible %s\n", path); ++ path_release(&nd); ++ return -EINVAL; ++ } ++ path_release(&nd); ++ } ++ return 0; ++} ++ ++int cpt_dump_dentry(struct dentry *d, struct vfsmount *mnt, struct cpt_context *ctx) ++{ ++ int len; ++ char *path; ++ char *pg = cpt_get_buf(ctx); ++ ++ path = d_path(d, mnt, pg, PAGE_SIZE); ++ len = PTR_ERR(path); ++ ++ if (IS_ERR(path)) { ++ struct cpt_object_hdr o; ++ char tmp[1]; ++ /* VZ changes d_path() to return EINVAL, when path ++ * is not supposed to be visible 
inside VE. */ ++ if (len != -EINVAL) ++ eprintk_ctx("d_path err=%d\n", len); ++ else ++ len = 0; ++ ++ o.cpt_next = sizeof(o) + CPT_ALIGN(1); ++ o.cpt_object = CPT_OBJ_NAME; ++ o.cpt_hdrlen = sizeof(o); ++ o.cpt_content = CPT_CONTENT_NAME; ++ tmp[0] = 0; ++ ++ ctx->write(&o, sizeof(o), ctx); ++ ctx->write(tmp, 1, ctx); ++ ctx->align(ctx); ++ ++ __cpt_release_buf(ctx); ++ return len; ++ } else { ++ struct cpt_object_hdr o; ++ ++ len = pg + PAGE_SIZE - 1 - path; ++ o.cpt_next = sizeof(o) + CPT_ALIGN(len + 1); ++ o.cpt_object = CPT_OBJ_NAME; ++ o.cpt_hdrlen = sizeof(o); ++ o.cpt_content = CPT_CONTENT_NAME; ++ path[len] = 0; ++ ++ if (cpt_verify_overmount(path, d, mnt, ctx)) { ++ __cpt_release_buf(ctx); ++ return -EINVAL; ++ } ++ ++ ctx->write(&o, sizeof(o), ctx); ++ ctx->write(path, len+1, ctx); ++ ctx->align(ctx); ++ __cpt_release_buf(ctx); ++ } ++ return 0; ++} ++ ++int cpt_dump_string(const char *s, struct cpt_context *ctx) ++{ ++ int len; ++ struct cpt_object_hdr o; ++ ++ len = strlen(s); ++ o.cpt_next = sizeof(o) + CPT_ALIGN(len + 1); ++ o.cpt_object = CPT_OBJ_NAME; ++ o.cpt_hdrlen = sizeof(o); ++ o.cpt_content = CPT_CONTENT_NAME; ++ ++ ctx->write(&o, sizeof(o), ctx); ++ ctx->write(s, len+1, ctx); ++ ctx->align(ctx); ++ return 0; ++} ++ ++int cpt_dump_filename(struct file *file, struct cpt_context *ctx) ++{ ++ return cpt_dump_dentry(file->f_dentry, file->f_vfsmnt, ctx); ++} ++ ++int cpt_dump_inode(struct dentry *d, struct vfsmount *mnt, struct cpt_context *ctx) ++{ ++ int err; ++ struct cpt_inode_image *v = cpt_get_buf(ctx); ++ struct kstat sbuf; ++ ++ v->cpt_next = sizeof(*v); ++ v->cpt_object = CPT_OBJ_INODE; ++ v->cpt_hdrlen = sizeof(*v); ++ v->cpt_content = CPT_CONTENT_ARRAY; ++ ++ if ((err = vfs_getattr(mnt, d, &sbuf)) != 0) { ++ cpt_release_buf(ctx); ++ return err; ++ } ++ ++ v->cpt_dev = d->d_inode->i_sb->s_dev; ++ v->cpt_ino = d->d_inode->i_ino; ++ v->cpt_mode = sbuf.mode; ++ v->cpt_nlink = sbuf.nlink; ++ v->cpt_uid = sbuf.uid; ++ v->cpt_gid = sbuf.gid; 
++ v->cpt_rdev = d->d_inode->i_rdev; ++ v->cpt_size = sbuf.size; ++ v->cpt_atime = cpt_timespec_export(&sbuf.atime); ++ v->cpt_mtime = cpt_timespec_export(&sbuf.mtime); ++ v->cpt_ctime = cpt_timespec_export(&sbuf.ctime); ++ v->cpt_blksize = sbuf.blksize; ++ v->cpt_blocks = sbuf.blocks; ++ v->cpt_sb = d->d_inode->i_sb->s_magic; ++ ++ ctx->write(v, sizeof(*v), ctx); ++ cpt_release_buf(ctx); ++ return 0; ++} ++ ++int cpt_collect_files(cpt_context_t * ctx) ++{ ++ int err; ++ cpt_object_t *obj; ++ int index = 0; ++ ++ /* Collect process fd sets */ ++ for_each_object(obj, CPT_OBJ_TASK) { ++ task_t *tsk = obj->o_obj; ++ if (tsk->files && cpt_object_add(CPT_OBJ_FILES, tsk->files, ctx) == NULL) ++ return -ENOMEM; ++ } ++ ++ /* Collect files from fd sets */ ++ for_each_object(obj, CPT_OBJ_FILES) { ++ int fd; ++ struct files_struct *f = obj->o_obj; ++ ++ cpt_obj_setindex(obj, index++, ctx); ++ ++ if (obj->o_count != atomic_read(&f->count)) { ++ eprintk_ctx("files_struct is referenced outside %d %d\n", obj->o_count, atomic_read(&f->count)); ++ return -EBUSY; ++ } ++ ++ for (fd = 0; fd < f->fdt->max_fds; fd++) { ++ struct file *file = fcheck_files(f, fd); ++ if (file && cpt_object_add(CPT_OBJ_FILE, file, ctx) == NULL) ++ return -ENOMEM; ++ } ++ } ++ ++ /* Collect files queued by AF_UNIX sockets. */ ++ if ((err = cpt_collect_passedfds(ctx)) < 0) ++ return err; ++ ++ /* OK. At this point we should count all the references. 
*/ ++ for_each_object(obj, CPT_OBJ_FILE) { ++ struct file *file = obj->o_obj; ++ struct file *parent; ++ cpt_object_t *ino_obj; ++ ++ if (obj->o_count != atomic_read(&file->f_count)) { ++ eprintk_ctx("file struct is referenced outside %d %d\n", obj->o_count, atomic_read(&file->f_count)); ++ cpt_printk_dentry(file->f_dentry, file->f_vfsmnt); ++ return -EBUSY; ++ } ++ ++ switch (file->f_dentry->d_inode->i_sb->s_magic) { ++ case FSMAGIC_FUTEX: ++ case FSMAGIC_MQUEUE: ++ case FSMAGIC_BDEV: ++ eprintk_ctx("file on unsupported FS: magic %08lx\n", file->f_dentry->d_inode->i_sb->s_magic); ++ return -EBUSY; ++ } ++ ++ /* Collect inode. It is necessary mostly to resolve deleted ++ * hard links. */ ++ ino_obj = cpt_object_add(CPT_OBJ_INODE, file->f_dentry->d_inode, ctx); ++ if (ino_obj == NULL) ++ return -ENOMEM; ++ ++ parent = ino_obj->o_parent; ++ if (!parent || (!IS_ROOT(parent->f_dentry) && d_unhashed(parent->f_dentry))) ++ ino_obj->o_parent = file; ++ ++ if (S_ISCHR(file->f_dentry->d_inode->i_mode)) { ++ int maj = imajor(file->f_dentry->d_inode); ++ if (maj == PTY_MASTER_MAJOR || ++ (maj >= UNIX98_PTY_MASTER_MAJOR && ++ maj < UNIX98_PTY_MASTER_MAJOR+UNIX98_PTY_MAJOR_COUNT) || ++ maj == PTY_SLAVE_MAJOR || ++ maj == UNIX98_PTY_SLAVE_MAJOR || ++ maj == TTYAUX_MAJOR) { ++ err = cpt_collect_tty(file, ctx); ++ if (err) ++ return err; ++ } ++ } ++ ++ if (S_ISSOCK(file->f_dentry->d_inode->i_mode)) { ++ err = cpt_collect_socket(file, ctx); ++ if (err) ++ return err; ++ } ++ } ++ ++ err = cpt_index_sockets(ctx); ++ ++ return err; ++} ++ ++/* /dev/ptmx is special, all the files share one inode, but real tty backend ++ * is attached via file->private_data. 
++ */ ++ ++static inline int is_cloning_inode(struct inode *ino) ++{ ++ return S_ISCHR(ino->i_mode) && ++ ino->i_rdev == MKDEV(TTYAUX_MAJOR,2); ++} ++ ++static int dump_one_flock(struct file_lock *fl, int owner, struct cpt_context *ctx) ++{ ++ pid_t pid; ++ struct cpt_flock_image *v = cpt_get_buf(ctx); ++ ++ v->cpt_next = sizeof(*v); ++ v->cpt_object = CPT_OBJ_FLOCK; ++ v->cpt_hdrlen = sizeof(*v); ++ v->cpt_content = CPT_CONTENT_VOID; ++ ++ v->cpt_owner = owner; ++ ++ pid = fl->fl_pid; ++ if (pid && !is_virtual_pid(fl->fl_pid)) { ++ pid = _pid_type_to_vpid(PIDTYPE_TGID, fl->fl_pid); ++ if (pid == -1) { ++ if (!(fl->fl_flags&FL_FLOCK)) { ++ eprintk_ctx("posix lock from another VE?\n"); ++ cpt_release_buf(ctx); ++ return -EBUSY; ++ } ++ pid = 0; ++ } ++ } ++ ++ v->cpt_pid = pid; ++ v->cpt_start = fl->fl_start; ++ v->cpt_end = fl->fl_end; ++ v->cpt_flags = fl->fl_flags; ++ v->cpt_type = fl->fl_type; ++ ++ ctx->write(v, sizeof(*v), ctx); ++ cpt_release_buf(ctx); ++ return 0; ++} ++ ++ ++int cpt_dump_flock(struct file *file, struct cpt_context *ctx) ++{ ++ int err = 0; ++ struct file_lock *fl; ++ ++ lock_kernel(); ++ for (fl = file->f_dentry->d_inode->i_flock; ++ fl; fl = fl->fl_next) { ++ if (file != fl->fl_file) ++ continue; ++ if (fl->fl_flags & FL_LEASE) { ++ eprintk_ctx("lease lock is not supported\n"); ++ err = -EINVAL; ++ break; ++ } ++ if (fl->fl_flags & FL_POSIX) { ++ cpt_object_t *obj; ++ obj = lookup_cpt_object(CPT_OBJ_FILES, fl->fl_owner, ctx); ++ if (obj) { ++ dump_one_flock(fl, obj->o_index, ctx); ++ continue; ++ } else { ++ eprintk_ctx("unknown lock owner %p\n", fl->fl_owner); ++ err = -EINVAL; ++ } ++ } ++ if (fl->fl_flags & FL_FLOCK) { ++ dump_one_flock(fl, -1, ctx); ++ continue; ++ } ++ } ++ unlock_kernel(); ++ return err; ++} ++ ++static int __comb_pid_to_vpid(int pid) ++{ ++ int vpid = pid; ++ ++ if (pid > 0) { ++ vpid = _pid_type_to_vpid(PIDTYPE_PID, pid); ++ if (unlikely(vpid < 0)) { ++ dprintk("pid %d does not exist amymore.\n", pid); ++ return 0; 
++ } ++ } else if (pid < 0) { ++ vpid = _pid_type_to_vpid(PIDTYPE_PGID, -pid); ++ if (unlikely(vpid < 0)) { ++ dprintk("pgid %d does not exist amymore.\n", -pid); ++ return 0; ++ } ++ vpid = -vpid; ++ } ++ return vpid; ++} ++ ++static int dump_one_file(cpt_object_t *obj, struct file *file, cpt_context_t *ctx) ++{ ++ int err = 0; ++ cpt_object_t *iobj; ++ struct cpt_file_image *v = cpt_get_buf(ctx); ++ struct kstat sbuf; ++ ++ cpt_open_object(obj, ctx); ++ ++ v->cpt_next = CPT_NULL; ++ v->cpt_object = CPT_OBJ_FILE; ++ v->cpt_hdrlen = sizeof(*v); ++ v->cpt_content = CPT_CONTENT_ARRAY; ++ ++ v->cpt_flags = file->f_flags; ++ v->cpt_mode = file->f_mode; ++ v->cpt_pos = file->f_pos; ++ v->cpt_uid = file->f_uid; ++ v->cpt_gid = file->f_gid; ++ ++ vfs_getattr(file->f_vfsmnt, file->f_dentry, &sbuf); ++ ++ v->cpt_i_mode = sbuf.mode; ++ v->cpt_lflags = 0; ++ if (IS_ROOT(file->f_dentry)) ++ v->cpt_lflags |= CPT_DENTRY_ROOT; ++ else if (d_unhashed(file->f_dentry)) ++ v->cpt_lflags |= CPT_DENTRY_DELETED; ++ if (is_cloning_inode(file->f_dentry->d_inode)) ++ v->cpt_lflags |= CPT_DENTRY_CLONING; ++ if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_PROC) ++ v->cpt_lflags |= CPT_DENTRY_PROC; ++ v->cpt_inode = CPT_NULL; ++ iobj = lookup_cpt_object(CPT_OBJ_INODE, file->f_dentry->d_inode, ctx); ++ if (iobj) ++ v->cpt_inode = iobj->o_pos; ++ v->cpt_priv = CPT_NULL; ++ v->cpt_fown_fd = -1; ++ if (S_ISCHR(v->cpt_i_mode)) { ++ iobj = lookup_cpt_object(CPT_OBJ_TTY, file->private_data, ctx); ++ if (iobj) { ++ v->cpt_priv = iobj->o_pos; ++ if (file->f_flags&FASYNC) ++ v->cpt_fown_fd = cpt_tty_fasync(file, ctx); ++ } ++ } ++ if (S_ISSOCK(v->cpt_i_mode)) { ++ if (obj->o_index < 0) { ++ eprintk_ctx("BUG: no socket index\n"); ++ cpt_release_buf(ctx); ++ return -EINVAL; ++ } ++ v->cpt_priv = obj->o_index; ++ if (file->f_flags&FASYNC) ++ v->cpt_fown_fd = cpt_socket_fasync(file, ctx); ++ } ++ if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_EPOLL) { ++ v->cpt_priv = 
file->f_dentry->d_inode->i_ino; ++ v->cpt_lflags |= CPT_DENTRY_EPOLL; ++ } ++ ++ v->cpt_fown_pid = __comb_pid_to_vpid((int)file->f_owner.pid); ++ v->cpt_fown_uid = file->f_owner.uid; ++ v->cpt_fown_euid = file->f_owner.euid; ++ v->cpt_fown_signo = file->f_owner.signum; ++ ++ ctx->write(v, sizeof(*v), ctx); ++ cpt_release_buf(ctx); ++ ++ if (!S_ISSOCK(v->cpt_i_mode)) { ++ err = cpt_dump_filename(file, ctx); ++ if (err) ++ return err; ++ } ++ ++ if (file->f_dentry->d_inode->i_flock) ++ err = cpt_dump_flock(file, ctx); ++ ++ cpt_close_object(ctx); ++ ++ return err; ++} ++ ++/* About this weird function... Crappy code dealing with SYSV shared memory ++ * defines TMPFS inode and file with f_op doing only mmap. So... ++ * Maybe, this is wrong and leaks something. It is clear access to ++ * SYSV shmem via mmap is quite unusual and impossible from user space. ++ */ ++static int dump_content_shm(struct file *file, struct cpt_context *ctx) ++{ ++ struct cpt_obj_bits *v; ++ loff_t saved_pos; ++ unsigned long addr; ++ ++ addr = do_mmap_pgoff(file, 0, file->f_dentry->d_inode->i_size, ++ PROT_READ, MAP_SHARED, 0); ++ if (IS_ERR((void*)addr)) ++ return PTR_ERR((void*)addr); ++ ++ cpt_push_object(&saved_pos, ctx); ++ cpt_open_object(NULL, ctx); ++ v = cpt_get_buf(ctx); ++ v->cpt_next = CPT_NULL; ++ v->cpt_object = CPT_OBJ_BITS; ++ v->cpt_hdrlen = sizeof(*v); ++ v->cpt_content = CPT_CONTENT_DATA; ++ v->cpt_size = file->f_dentry->d_inode->i_size; ++ ctx->write(v, sizeof(*v), ctx); ++ cpt_release_buf(ctx); ++ ctx->write((void*)addr, file->f_dentry->d_inode->i_size, ctx); ++ ctx->align(ctx); ++ do_munmap(current->mm, addr, file->f_dentry->d_inode->i_size); ++ ++ cpt_close_object(ctx); ++ cpt_pop_object(&saved_pos, ctx); ++ return 0; ++} ++ ++static int data_is_zero(char *addr, int len) ++{ ++ int i; ++ unsigned long zerolong = 0; ++ ++ for (i=0; i<len/sizeof(unsigned long); i++) { ++ if (((unsigned long*)(addr))[i] != 0) ++ return 0; ++ } ++ i = len % sizeof(unsigned long); ++ if (!i) 
++ return 1; ++ return memcmp(addr + len - i, &zerolong, i) == 0; ++} ++ ++ ++static int dump_content_regular(struct file *file, struct cpt_context *ctx) ++{ ++ loff_t saved_pos; ++ loff_t pos = 0; ++ loff_t obj_opened = CPT_NULL; ++ struct cpt_page_block pgb; ++ ssize_t (*do_read)(struct file *, char __user *, size_t, loff_t *); ++ ++ if (file->f_op == NULL) ++ return -EINVAL; ++ ++ if ((do_read = file->f_op->read) == NULL) { ++ if (file->f_op->mmap == NULL) ++ return -EINVAL; ++ if (file->f_dentry->d_inode->i_sb->s_magic != FSMAGIC_TMPFS) { ++ eprintk_ctx("unreadable, but not SYSV SHM file\n"); ++ return -EINVAL; ++ } ++ ++ do_read = file->f_dentry->d_inode->i_fop->read; ++ cpt_dump_content_sysvshm(file, ctx); ++ if (!do_read) { ++ wprintk_ctx("TMPFS is not configured?\n"); ++ return dump_content_shm(file, ctx); ++ } ++ } ++ ++ if (!(file->f_mode & FMODE_READ) || ++ (file->f_flags & O_DIRECT)) { ++ file = dentry_open(dget(file->f_dentry), ++ mntget(file->f_vfsmnt), O_RDONLY); ++ } else { ++ atomic_inc(&file->f_count); ++ } ++ ++ for (;;) { ++ mm_segment_t oldfs; ++ int err; ++ ++ (void)cpt_get_buf(ctx); ++ ++ oldfs = get_fs(); set_fs(KERNEL_DS); ++ err = do_read(file, ctx->tmpbuf, PAGE_SIZE, &pos); ++ set_fs(oldfs); ++ if (err < 0) { ++ eprintk_ctx("dump_content_regular: do_read: %d", err); ++ fput(file); ++ __cpt_release_buf(ctx); ++ return err; ++ } ++ if (err == 0) { ++ __cpt_release_buf(ctx); ++ break; ++ } ++ if (data_is_zero(ctx->tmpbuf, err)) { ++ if (obj_opened != CPT_NULL) { ++ ctx->pwrite(&pgb.cpt_end, 8, ctx, obj_opened + offsetof(struct cpt_page_block, cpt_end)); ++ ctx->align(ctx); ++ cpt_close_object(ctx); ++ cpt_pop_object(&saved_pos, ctx); ++ obj_opened = CPT_NULL; ++ } ++ } else { ++ if (obj_opened == CPT_NULL) { ++ cpt_push_object(&saved_pos, ctx); ++ cpt_open_object(NULL, ctx); ++ obj_opened = ctx->file->f_pos; ++ pgb.cpt_next = CPT_NULL; ++ pgb.cpt_object = CPT_OBJ_PAGES; ++ pgb.cpt_hdrlen = sizeof(pgb); ++ pgb.cpt_content = CPT_CONTENT_DATA; 
++ pgb.cpt_start = pos - err; ++ pgb.cpt_end = pgb.cpt_start; ++ ctx->write(&pgb, sizeof(pgb), ctx); ++ } ++ ctx->write(ctx->tmpbuf, err, ctx); ++ pgb.cpt_end += err; ++ } ++ __cpt_release_buf(ctx); ++ } ++ ++ fput(file); ++ ++ if (obj_opened != CPT_NULL) { ++ ctx->pwrite(&pgb.cpt_end, 8, ctx, obj_opened + offsetof(struct cpt_page_block, cpt_end)); ++ ctx->align(ctx); ++ cpt_close_object(ctx); ++ cpt_pop_object(&saved_pos, ctx); ++ obj_opened = CPT_NULL; ++ } ++ return 0; ++} ++ ++ ++static int dump_content_chrdev(struct file *file, struct cpt_context *ctx) ++{ ++ struct inode *ino = file->f_dentry->d_inode; ++ int maj; ++ ++ maj = imajor(ino); ++ if (maj == MEM_MAJOR) { ++ /* Well, OK. */ ++ return 0; ++ } ++ if (maj == PTY_MASTER_MAJOR || ++ (maj >= UNIX98_PTY_MASTER_MAJOR && ++ maj < UNIX98_PTY_MASTER_MAJOR+UNIX98_PTY_MAJOR_COUNT) || ++ maj == PTY_SLAVE_MAJOR || ++ maj == UNIX98_PTY_SLAVE_MAJOR || ++ maj == TTYAUX_MAJOR) { ++ return cpt_dump_content_tty(file, ctx); ++ } ++ eprintk_ctx("unsupported chrdev %d/%d\n", maj, iminor(ino)); ++ return -EINVAL; ++} ++ ++static int dump_content_blkdev(struct file *file, struct cpt_context *ctx) ++{ ++ struct inode *ino = file->f_dentry->d_inode; ++ ++ /* We are not going to transfer them. 
*/ ++ eprintk_ctx("unsupported blkdev %d/%d\n", imajor(ino), iminor(ino)); ++ return -EINVAL; ++} ++ ++static int dump_content_fifo(struct file *file, struct cpt_context *ctx) ++{ ++ struct inode *ino = file->f_dentry->d_inode; ++ cpt_object_t *obj; ++ loff_t saved_pos; ++ int readers; ++ int writers; ++ int anon = 0; ++ ++ mutex_lock(PIPE_MUTEX(*ino)); ++ readers = PIPE_READERS(*ino); ++ writers = PIPE_WRITERS(*ino); ++ for_each_object(obj, CPT_OBJ_FILE) { ++ struct file *file1 = obj->o_obj; ++ if (file1->f_dentry->d_inode == ino) { ++ if (file1->f_mode & FMODE_READ) ++ readers--; ++ if (file1->f_mode & FMODE_WRITE) ++ writers--; ++ } ++ } ++ mutex_unlock(PIPE_MUTEX(*ino)); ++ if (readers || writers) { ++ struct dentry *dr = file->f_dentry->d_sb->s_root; ++ if (dr->d_name.len == 7 && memcmp(dr->d_name.name,"pipefs:",7) == 0) ++ anon = 1; ++ ++ if (anon) { ++ eprintk_ctx("pipe has %d/%d external readers/writers\n", readers, writers); ++ return -EBUSY; ++ } ++ /* If fifo has external readers/writers, we are in troubles. ++ * If the buffer is not empty, we must move its content. ++ * But if the fifo is owned by a service, we cannot do ++ * this. See? ++ * ++ * For now we assume, that if fifo is opened by another ++ * process, we do not own it and, hence, migrate without ++ * data. ++ */ ++ return 0; ++ } ++ ++ /* OK, we must save fifo state. No semaphores required. 
*/ ++ ++ if (ino->i_pipe->nrbufs) { ++ struct cpt_obj_bits *v = cpt_get_buf(ctx); ++ struct pipe_inode_info *info; ++ int count, buf, nrbufs; ++ ++ mutex_lock(PIPE_MUTEX(*ino)); ++ info = ino->i_pipe; ++ count = 0; ++ buf = info->curbuf; ++ nrbufs = info->nrbufs; ++ while (--nrbufs >= 0) { ++ if (!info->bufs[buf].ops->can_merge) { ++ mutex_unlock(PIPE_MUTEX(*ino)); ++ eprintk_ctx("unknown format of pipe buffer\n"); ++ return -EINVAL; ++ } ++ count += info->bufs[buf].len; ++ buf = (buf+1) & (PIPE_BUFFERS-1); ++ } ++ ++ if (!count) { ++ mutex_unlock(PIPE_MUTEX(*ino)); ++ return 0; ++ } ++ ++ cpt_push_object(&saved_pos, ctx); ++ cpt_open_object(NULL, ctx); ++ v->cpt_next = CPT_NULL; ++ v->cpt_object = CPT_OBJ_BITS; ++ v->cpt_hdrlen = sizeof(*v); ++ v->cpt_content = CPT_CONTENT_DATA; ++ v->cpt_size = count; ++ ctx->write(v, sizeof(*v), ctx); ++ cpt_release_buf(ctx); ++ ++ count = 0; ++ buf = info->curbuf; ++ nrbufs = info->nrbufs; ++ while (--nrbufs >= 0) { ++ struct pipe_buffer *b = info->bufs + buf; ++ void * addr = b->ops->map(file, info, b); ++ ctx->write(addr + b->offset, b->len, ctx); ++ b->ops->unmap(info, b); ++ buf = (buf+1) & (PIPE_BUFFERS-1); ++ } ++ ++ mutex_unlock(PIPE_MUTEX(*ino)); ++ ++ ctx->align(ctx); ++ cpt_close_object(ctx); ++ cpt_pop_object(&saved_pos, ctx); ++ } ++ ++ return 0; ++} ++ ++static int dump_content_socket(struct file *file, struct cpt_context *ctx) ++{ ++ return 0; ++} ++ ++static int dump_one_inode(struct file *file, struct dentry *d, ++ struct vfsmount *mnt, struct cpt_context *ctx) ++{ ++ int err = 0; ++ struct inode *ino = d->d_inode; ++ cpt_object_t *iobj; ++ int dump_it = 0; ++ ++ iobj = lookup_cpt_object(CPT_OBJ_INODE, ino, ctx); ++ if (!iobj) ++ return -EINVAL; ++ ++ if (iobj->o_pos >= 0) ++ return 0; ++ ++ if (!IS_ROOT(d) && d_unhashed(d)) ++ dump_it = 1; ++ if (!S_ISREG(ino->i_mode) && !S_ISDIR(ino->i_mode)) { ++ /* One more bug in epoll: invalid inode mode. ++ * What a load of crap... 
++ */ ++ if (ino->i_sb->s_magic == FSMAGIC_EPOLL && ++ (ino->i_mode & S_IFMT) == 0) ++ return 0; ++ dump_it = 1; ++ } ++ ++ if (!dump_it) ++ return 0; ++ ++ cpt_open_object(iobj, ctx); ++ cpt_dump_inode(d, mnt, ctx); ++ ++ if (!IS_ROOT(d) && d_unhashed(d)) { ++ struct file *parent; ++ parent = iobj->o_parent; ++ if (!parent || ++ (!IS_ROOT(parent->f_dentry) && d_unhashed(parent->f_dentry))) { ++ /* Inode is not deleted, but it does not ++ * have references from inside checkpointed ++ * process group. We have options: ++ * A. Fail, abort checkpointing ++ * B. Proceed. File will be cloned. ++ * A is correct, B is more complicated */ ++ /* Just as a hint where to create deleted file */ ++ if (ino->i_nlink != 0) { ++ eprintk_ctx("deleted reference to existing inode, checkpointing is impossible\n"); ++ return -EBUSY; ++ } ++ } else { ++ /* Refer to _another_ file name. */ ++ err = cpt_dump_filename(parent, ctx); ++ if (err) ++ return err; ++ if (S_ISREG(ino->i_mode) || S_ISDIR(ino->i_mode)) ++ dump_it = 0; ++ } ++ } ++ if (dump_it) { ++ if (S_ISREG(ino->i_mode)) { ++ if ((err = dump_content_regular(file, ctx)) != 0) { ++ eprintk_ctx("dump_content_regular "); ++ cpt_printk_dentry(d, mnt); ++ } ++ } else if (S_ISDIR(ino->i_mode)) { ++ /* We cannot do anything. The directory should be ++ * empty, so it is not a big deal. 
++ */ ++ } else if (S_ISCHR(ino->i_mode)) { ++ err = dump_content_chrdev(file, ctx); ++ } else if (S_ISBLK(ino->i_mode)) { ++ err = dump_content_blkdev(file, ctx); ++ } else if (S_ISFIFO(ino->i_mode)) { ++ err = dump_content_fifo(file, ctx); ++ } else if (S_ISSOCK(ino->i_mode)) { ++ err = dump_content_socket(file, ctx); ++ } else { ++ eprintk_ctx("unknown inode mode %o\n", ino->i_mode & S_IFMT); ++ err = -EINVAL; ++ } ++ } ++ cpt_close_object(ctx); ++ ++ return err; ++} ++ ++int cpt_dump_files(struct cpt_context *ctx) ++{ ++ int epoll_nr; ++ cpt_object_t *obj; ++ ++ cpt_open_section(ctx, CPT_SECT_TTY); ++ for_each_object(obj, CPT_OBJ_TTY) { ++ int err; ++ ++ if ((err = cpt_dump_tty(obj, ctx)) != 0) ++ return err; ++ } ++ cpt_close_section(ctx); ++ ++ cpt_open_section(ctx, CPT_SECT_INODE); ++ for_each_object(obj, CPT_OBJ_FILE) { ++ struct file *file = obj->o_obj; ++ int err; ++ ++ if ((err = dump_one_inode(file, file->f_dentry, ++ file->f_vfsmnt, ctx)) != 0) ++ return err; ++ } ++ for_each_object(obj, CPT_OBJ_FS) { ++ struct fs_struct *fs = obj->o_obj; ++ int err; ++ ++ if (fs->root && ++ (err = dump_one_inode(NULL, fs->root, fs->rootmnt, ctx)) != 0) ++ return err; ++ if (fs->pwd && ++ (err = dump_one_inode(NULL, fs->pwd, fs->pwdmnt, ctx)) != 0) ++ return err; ++ if (fs->altroot && ++ (err = dump_one_inode(NULL, fs->altroot, fs->altrootmnt, ctx)) != 0) ++ return err; ++ } ++ cpt_close_section(ctx); ++ ++ epoll_nr = 0; ++ cpt_open_section(ctx, CPT_SECT_FILES); ++ for_each_object(obj, CPT_OBJ_FILE) { ++ struct file *file = obj->o_obj; ++ int err; ++ ++ if ((err = dump_one_file(obj, file, ctx)) != 0) ++ return err; ++ if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_EPOLL) ++ epoll_nr++; ++ } ++ cpt_close_section(ctx); ++ ++ if (epoll_nr) { ++ cpt_open_section(ctx, CPT_SECT_EPOLL); ++ for_each_object(obj, CPT_OBJ_FILE) { ++ struct file *file = obj->o_obj; ++ if (file->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_EPOLL) { ++ int err; ++ if ((err = 
cpt_dump_epolldev(obj, ctx)) != 0) ++ return err; ++ } ++ } ++ cpt_close_section(ctx); ++ } ++ ++ cpt_open_section(ctx, CPT_SECT_SOCKET); ++ for_each_object(obj, CPT_OBJ_SOCKET) { ++ int err; ++ ++ if ((err = cpt_dump_socket(obj, obj->o_obj, obj->o_index, -1, ctx)) != 0) ++ return err; ++ } ++ cpt_close_section(ctx); ++ ++ return 0; ++} ++ ++static int dump_filedesc(int fd, struct file *file, ++ struct files_struct *f, struct cpt_context *ctx) ++{ ++ struct cpt_fd_image *v = cpt_get_buf(ctx); ++ cpt_object_t *obj; ++ ++ cpt_open_object(NULL, ctx); ++ ++ v->cpt_next = CPT_NULL; ++ v->cpt_object = CPT_OBJ_FILEDESC; ++ v->cpt_hdrlen = sizeof(*v); ++ v->cpt_content = CPT_CONTENT_VOID; ++ ++ v->cpt_fd = fd; ++ obj = lookup_cpt_object(CPT_OBJ_FILE, file, ctx); ++ if (!obj) BUG(); ++ v->cpt_file = obj->o_pos; ++ v->cpt_flags = 0; ++ if (FD_ISSET(fd, f->fdt->close_on_exec)) ++ v->cpt_flags = CPT_FD_FLAG_CLOSEEXEC; ++ ++ ctx->write(v, sizeof(*v), ctx); ++ cpt_release_buf(ctx); ++ cpt_close_object(ctx); ++ ++ return 0; ++} ++ ++static int dump_one_file_struct(cpt_object_t *obj, struct cpt_context *ctx) ++{ ++ struct files_struct *f = obj->o_obj; ++ struct cpt_files_struct_image *v = cpt_get_buf(ctx); ++ int fd; ++ loff_t saved_obj; ++ ++ cpt_open_object(obj, ctx); ++ ++ v->cpt_next = CPT_NULL; ++ v->cpt_object = CPT_OBJ_FILES; ++ v->cpt_hdrlen = sizeof(*v); ++ v->cpt_content = CPT_CONTENT_ARRAY; ++ ++ v->cpt_index = obj->o_index; ++ v->cpt_max_fds = f->fdt->max_fds; ++ v->cpt_next_fd = f->fdt->next_fd; ++ ++ ctx->write(v, sizeof(*v), ctx); ++ cpt_release_buf(ctx); ++ ++ cpt_push_object(&saved_obj, ctx); ++ for (fd = 0; fd < f->fdt->max_fds; fd++) { ++ struct file *file = fcheck_files(f, fd); ++ if (file) ++ dump_filedesc(fd, file, f, ctx); ++ } ++ cpt_pop_object(&saved_obj, ctx); ++ ++ cpt_close_object(ctx); ++ ++ return 0; ++} ++ ++int cpt_dump_files_struct(struct cpt_context *ctx) ++{ ++ cpt_object_t *obj; ++ ++ cpt_open_section(ctx, CPT_SECT_FILES_STRUCT); ++ ++ 
for_each_object(obj, CPT_OBJ_FILES) { ++ int err; ++ ++ if ((err = dump_one_file_struct(obj, ctx)) != 0) ++ return err; ++ } ++ ++ cpt_close_section(ctx); ++ return 0; ++} ++ ++int cpt_collect_fs(cpt_context_t * ctx) ++{ ++ cpt_object_t *obj; ++ ++ for_each_object(obj, CPT_OBJ_TASK) { ++ task_t *tsk = obj->o_obj; ++ if (tsk->fs) { ++ if (cpt_object_add(CPT_OBJ_FS, tsk->fs, ctx) == NULL) ++ return -ENOMEM; ++ if (tsk->fs->pwd && ++ cpt_object_add(CPT_OBJ_INODE, tsk->fs->pwd->d_inode, ctx) == NULL) ++ return -ENOMEM; ++ if (tsk->fs->root && ++ cpt_object_add(CPT_OBJ_INODE, tsk->fs->root->d_inode, ctx) == NULL) ++ return -ENOMEM; ++ if (tsk->fs->altroot && ++ cpt_object_add(CPT_OBJ_INODE, tsk->fs->altroot->d_inode, ctx) == NULL) ++ return -ENOMEM; ++ } ++ } ++ return 0; ++} ++ ++static int cpt_dump_dir(struct dentry *d, struct vfsmount *mnt, struct cpt_context *ctx) ++{ ++ struct file file; ++ ++ memset(&file, 0, sizeof(file)); ++ ++ file.f_dentry = d; ++ file.f_vfsmnt = mnt; ++ file.f_mode = FMODE_READ|FMODE_PREAD|FMODE_LSEEK; ++ return dump_one_file(NULL, &file, ctx); ++} ++ ++static int dump_one_fs(cpt_object_t *obj, struct cpt_context *ctx) ++{ ++ struct fs_struct *fs = obj->o_obj; ++ struct cpt_fs_struct_image *v = cpt_get_buf(ctx); ++ loff_t saved_obj; ++ int err; ++ ++ cpt_open_object(obj, ctx); ++ ++ v->cpt_next = CPT_NULL; ++ v->cpt_object = CPT_OBJ_FS; ++ v->cpt_hdrlen = sizeof(*v); ++ v->cpt_content = CPT_CONTENT_ARRAY; ++ ++ v->cpt_umask = fs->umask; ++ ++ ctx->write(v, sizeof(*v), ctx); ++ cpt_release_buf(ctx); ++ ++ cpt_push_object(&saved_obj, ctx); ++ err = cpt_dump_dir(fs->root, fs->rootmnt, ctx); ++ if (!err) ++ err = cpt_dump_dir(fs->pwd, fs->pwdmnt, ctx); ++ if (!err && fs->altroot) ++ err = cpt_dump_dir(fs->altroot, fs->altrootmnt, ctx); ++ ++ cpt_pop_object(&saved_obj, ctx); ++ ++ cpt_close_object(ctx); ++ ++ return err; ++} ++ ++int cpt_dump_fs_struct(struct cpt_context *ctx) ++{ ++ cpt_object_t *obj; ++ ++ cpt_open_section(ctx, CPT_SECT_FS); ++ 
++ for_each_object(obj, CPT_OBJ_FS) { ++ int err; ++ ++ if ((err = dump_one_fs(obj, ctx)) != 0) ++ return err; ++ } ++ ++ cpt_close_section(ctx); ++ return 0; ++} ++ ++static int check_one_namespace(cpt_object_t *obj, struct cpt_context *ctx) ++{ ++ int err = 0; ++ struct namespace *n = obj->o_obj; ++ struct list_head *p; ++ char *path_buf, *path; ++ ++ path_buf = (char *) __get_free_page(GFP_KERNEL); ++ if (!path_buf) ++ return -ENOMEM; ++ ++ down_read(&namespace_sem); ++ list_for_each(p, &n->list) { ++ struct vfsmount *mnt = list_entry(p, struct vfsmount, mnt_list); ++ ++ path = d_path(mnt->mnt_root, mnt, path_buf, PAGE_SIZE); ++ if (IS_ERR(path)) ++ continue; ++ ++ if ( ++ strcmp(mnt->mnt_sb->s_type->name, "rootfs") != 0 && ++ strcmp(mnt->mnt_sb->s_type->name, "ext3") != 0 && ++ strcmp(mnt->mnt_sb->s_type->name, "simfs") != 0 && ++ strcmp(mnt->mnt_sb->s_type->name, "tmpfs") != 0 && ++ strcmp(mnt->mnt_sb->s_type->name, "devpts") != 0 && ++ strcmp(mnt->mnt_sb->s_type->name, "proc") != 0 && ++ strcmp(mnt->mnt_sb->s_type->name, "sysfs") != 0) { ++ eprintk_ctx("unsupported fs type %s\n", mnt->mnt_sb->s_type->name); ++ err = -EINVAL; ++ break; ++ } ++ } ++ up_read(&namespace_sem); ++ ++ free_page((unsigned long) path_buf); ++ ++ return err; ++} ++ ++int cpt_collect_namespace(cpt_context_t * ctx) ++{ ++ cpt_object_t *obj; ++ ++ for_each_object(obj, CPT_OBJ_TASK) { ++ task_t *tsk = obj->o_obj; ++ if (tsk->namespace && cpt_object_add(CPT_OBJ_NAMESPACE, tsk->namespace, ctx) == NULL) ++ return -ENOMEM; ++ } ++ ++ for_each_object(obj, CPT_OBJ_NAMESPACE) { ++ int err; ++ if ((err = check_one_namespace(obj, ctx)) != 0) ++ return err; ++ } ++ ++ return 0; ++} ++ ++struct args_t ++{ ++ int* pfd; ++ char* path; ++}; ++ ++static int dumptmpfs(void *arg) ++{ ++ int i; ++ struct args_t *args = arg; ++ int *pfd = args->pfd; ++ char *path = args->path; ++ char *argv[] = { "tar", "-c", "-S", "--numeric-owner", path, NULL }; ++ ++ i = real_env_create(VEID(get_exec_env()), 
VE_ENTER|VE_SKIPLOCK, 2, NULL, 0); ++ if (i < 0) { ++ eprintk("cannot enter ve to dump tmpfs\n"); ++ module_put(THIS_MODULE); ++ return 1; ++ } ++ ++ if (pfd[1] != 1) ++ sc_dup2(pfd[1], 1); ++ ++ for (i=0; i<current->files->fdt->max_fds; i++) { ++ if (i != 1) ++ sc_close(i); ++ } ++ ++ module_put(THIS_MODULE); ++ ++ set_fs(KERNEL_DS); ++ i = sc_execve("/bin/tar", argv, NULL); ++ eprintk("failed to exec /bin/tar: %d\n", i); ++ return -1; ++} ++ ++static int cpt_dump_tmpfs(char *path, struct cpt_context *ctx) ++{ ++ int err; ++ int pid; ++ int pfd[2]; ++ struct file *f; ++ struct cpt_object_hdr v; ++ char buf[16]; ++ int n; ++ loff_t saved_obj; ++ struct args_t args; ++ ++ err = sc_pipe(pfd); ++ if (err < 0) ++ return err; ++ args.pfd = pfd; ++ args.path = path; ++ err = pid = local_kernel_thread(dumptmpfs, (void*)&args, SIGCHLD, 0); ++ if (err < 0) ++ goto out; ++ f = fget(pfd[0]); ++ sc_close(pfd[1]); ++ sc_close(pfd[0]); ++ ++ cpt_push_object(&saved_obj, ctx); ++ cpt_open_object(NULL, ctx); ++ v.cpt_next = CPT_NULL; ++ v.cpt_object = CPT_OBJ_NAME; ++ v.cpt_hdrlen = sizeof(v); ++ v.cpt_content = CPT_CONTENT_NAME; ++ ++ ctx->write(&v, sizeof(v), ctx); ++ ++ do { ++ mm_segment_t oldfs; ++ ++ oldfs = get_fs(); set_fs(KERNEL_DS); ++ n = f->f_op->read(f, buf, sizeof(buf), &f->f_pos); ++ set_fs(oldfs); ++ if (n > 0) ++ ctx->write(buf, n, ctx); ++ } while (n > 0); ++ ++ fput(f); ++ ++ if ((err = sc_waitx(pid, 0)) < 0) ++ eprintk_ctx("wait4: %d\n", err); ++ ++ buf[0] = 0; ++ ctx->write(buf, 1, ctx); ++ ctx->align(ctx); ++ cpt_close_object(ctx); ++ cpt_pop_object(&saved_obj, ctx); ++ return n; ++ ++out: ++ if (pfd[1] >= 0) ++ sc_close(pfd[1]); ++ if (pfd[0] >= 0) ++ sc_close(pfd[0]); ++ return err; ++} ++ ++static int dump_vfsmount(struct vfsmount *mnt, struct cpt_context *ctx) ++{ ++ int err = 0; ++ struct cpt_vfsmount_image v; ++ loff_t saved_obj; ++ char *path_buf, *path; ++ ++ path_buf = (char *) __get_free_page(GFP_KERNEL); ++ if (!path_buf) ++ return -ENOMEM; ++ ++ 
path = d_path(mnt->mnt_root, mnt, path_buf, PAGE_SIZE); ++ if (IS_ERR(path)) { ++ free_page((unsigned long) path_buf); ++ return PTR_ERR(path) == -EINVAL ? 0 : PTR_ERR(path); ++ } ++ ++ cpt_open_object(NULL, ctx); ++ ++ v.cpt_next = -1; ++ v.cpt_object = CPT_OBJ_VFSMOUNT; ++ v.cpt_hdrlen = sizeof(v); ++ v.cpt_content = CPT_CONTENT_ARRAY; ++ ++ v.cpt_mntflags = mnt->mnt_flags; ++ v.cpt_flags = mnt->mnt_sb->s_flags; ++ ++ ctx->write(&v, sizeof(v), ctx); ++ ++ cpt_push_object(&saved_obj, ctx); ++ cpt_dump_string(mnt->mnt_devname ? : "none", ctx); ++ cpt_dump_string(path, ctx); ++ cpt_dump_string(mnt->mnt_sb->s_type->name, ctx); ++#if 0 ++ /* This is an evident crap. Ask Savochkin, he might know this. ++ * Goal is to get some path to mount --bind to. ++ */ ++ cpt_dump_dentry(mnt->mnt_root, mnt->mnt_parent, ctx); ++#else ++ /* For now we just bail, when some FS is mounted not at root. */ ++ if (mnt->mnt_root != mnt->mnt_sb->s_root) { ++ eprintk_ctx("mount --bind prevents checkpointing\n"); ++ err = -EINVAL; ++ } ++#endif ++ ++ if (strcmp(mnt->mnt_sb->s_type->name, "tmpfs") == 0) { ++ cpt_dump_tmpfs(path, ctx); ++ } ++ ++ cpt_pop_object(&saved_obj, ctx); ++ ++ cpt_close_object(ctx); ++ ++ free_page((unsigned long) path_buf); ++ ++ return err; ++} ++ ++static int dump_one_namespace(cpt_object_t *obj, struct cpt_context *ctx) ++{ ++ struct namespace *n = obj->o_obj; ++ struct cpt_object_hdr v; ++ struct list_head *p; ++ loff_t saved_obj; ++ int err = 0; ++ ++ cpt_open_object(obj, ctx); ++ ++ v.cpt_next = -1; ++ v.cpt_object = CPT_OBJ_NAMESPACE; ++ v.cpt_hdrlen = sizeof(v); ++ v.cpt_content = CPT_CONTENT_ARRAY; ++ ++ ctx->write(&v, sizeof(v), ctx); ++ ++ cpt_push_object(&saved_obj, ctx); ++ ++ down_read(&namespace_sem); ++ list_for_each(p, &n->list) { ++ err = dump_vfsmount(list_entry(p, struct vfsmount, mnt_list), ctx); ++ if (err) ++ break; ++ } ++ up_read(&namespace_sem); ++ ++ cpt_pop_object(&saved_obj, ctx); ++ ++ cpt_close_object(ctx); ++ ++ return err; ++} ++ ++int 
cpt_dump_namespace(struct cpt_context *ctx) ++{ ++ cpt_object_t *obj; ++ ++ cpt_open_section(ctx, CPT_SECT_NAMESPACE); ++ ++ for_each_object(obj, CPT_OBJ_NAMESPACE) { ++ int err; ++ ++ if ((err = dump_one_namespace(obj, ctx)) != 0) ++ return err; ++ } ++ ++ cpt_close_section(ctx); ++ return 0; ++} +diff -upr linux-2.6.16.orig/kernel/cpt/cpt_files.h linux-2.6.16-026test015/kernel/cpt/cpt_files.h +--- linux-2.6.16.orig/kernel/cpt/cpt_files.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/cpt/cpt_files.h 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,46 @@ ++int cpt_collect_files(cpt_context_t *); ++int cpt_collect_fs(cpt_context_t *); ++int cpt_collect_namespace(cpt_context_t *); ++int cpt_collect_sysvsem_undo(cpt_context_t *); ++int cpt_collect_tty(struct file *, cpt_context_t *); ++int cpt_dump_files(struct cpt_context *ctx); ++int cpt_dump_files_struct(struct cpt_context *ctx); ++int cpt_dump_fs_struct(struct cpt_context *ctx); ++int cpt_dump_content_sysvshm(struct file *file, struct cpt_context *ctx); ++int cpt_dump_content_tty(struct file *file, struct cpt_context *ctx); ++int cpt_dump_tty(cpt_object_t *, struct cpt_context *ctx); ++struct file * rst_sysv_shm(loff_t pos, struct cpt_context *ctx); ++struct file * rst_open_tty(struct cpt_file_image *fi, struct cpt_inode_image *ii, unsigned flags, struct cpt_context *ctx); ++__u32 cpt_tty_fasync(struct file *file, struct cpt_context *ctx); ++ ++int rst_posix_locks(struct cpt_context *ctx); ++ ++struct file *rst_file(loff_t pos, int fd, struct cpt_context *ctx); ++int rst_files_complete(struct cpt_task_image *ti, struct cpt_context *ctx); ++__u32 rst_files_flag(struct cpt_task_image *ti, struct cpt_context *ctx); ++int rst_fs_complete(struct cpt_task_image *ti, struct cpt_context *ctx); ++int rst_restore_fs(struct cpt_context *ctx); ++ ++int cpt_collect_sysv(cpt_context_t *); ++int cpt_dump_sysvsem(struct cpt_context *ctx); ++int rst_sysv_ipc(struct cpt_context *ctx); ++int 
rst_semundo_complete(struct cpt_task_image *ti, struct cpt_context *ctx); ++__u32 rst_semundo_flag(struct cpt_task_image *ti, struct cpt_context *ctx); ++ ++int cpt_dump_namespace(struct cpt_context *ctx); ++int rst_root_namespace(struct cpt_context *ctx); ++ ++int rst_stray_files(struct cpt_context *ctx); ++int rst_tty_jobcontrol(struct cpt_context *ctx); ++ ++void rst_flush_filejobs(struct cpt_context *); ++int rst_do_filejobs(struct cpt_context *); ++ ++int rst_eventpoll(struct cpt_context *); ++struct file *cpt_open_epolldev(struct cpt_file_image *fi, ++ unsigned flags, ++ struct cpt_context *ctx); ++int cpt_dump_epolldev(cpt_object_t *obj, struct cpt_context *); ++ ++int cpt_verify_overmount(char *path, struct dentry *d, struct vfsmount *mnt, ++ cpt_context_t *ctx); +diff -upr linux-2.6.16.orig/kernel/cpt/cpt_fsmagic.h linux-2.6.16-026test015/kernel/cpt/cpt_fsmagic.h +--- linux-2.6.16.orig/kernel/cpt/cpt_fsmagic.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/cpt/cpt_fsmagic.h 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,15 @@ ++/* Collected from kernel sources. */ ++ ++#define FSMAGIC_TMPFS 0x01021994 ++#define FSMAGIC_PIPEFS 0x50495045 ++#define FSMAGIC_SOCKFS 0x534F434B ++#define FSMAGIC_PFMFS 0xa0b4d889 ++#define FSMAGIC_BDEV 0x62646576 ++#define FSMAGIC_EPOLL 0x03111965 ++#define FSMAGIC_FUTEX 0x0BAD1DEA ++#define FSMAGIC_MQUEUE 0x19800202 ++#define FSMAGIC_PROC 0x9fa0 ++#define FSMAGIC_DEVPTS 0x1CD1 ++#define FSMAGIC_AUTOFS 0x0187 ++#define FSMAGIC_EXT2 0xEF53 ++#define FSMAGIC_REISER 0x52654973 +diff -upr linux-2.6.16.orig/kernel/cpt/cpt_kernel.c linux-2.6.16-026test015/kernel/cpt/cpt_kernel.c +--- linux-2.6.16.orig/kernel/cpt/cpt_kernel.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/cpt/cpt_kernel.c 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,134 @@ ++/* ++ * ++ * kernel/cpt/cpt_kernel.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. 
++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#define __KERNEL_SYSCALLS__ 1 ++ ++#include <linux/version.h> ++#include <linux/module.h> ++#include <linux/sched.h> ++#include <linux/mm.h> ++#include <linux/kernel.h> ++#include <asm/cpufeature.h> ++#include <linux/cpt_image.h> ++ ++#include "cpt_kernel.h" ++#include "cpt_syscalls.h" ++ ++int debug_level = 1; ++ ++#ifndef CONFIG_X86_64 ++ ++extern void local_kernel_thread_helper(void); ++__asm__(".section .text\n" ++ ".align 4\n" ++ "local_kernel_thread_helper:\n\t" ++ "movl %edx,%eax\n\t" ++ "pushl %edx\n\t" ++ "call *%ebx\n\t" ++ "pushl %eax\n\t" ++ "pushl $0\n\t" ++ "call complete_and_exit\n" ++ ".previous"); ++ ++/* ++ * Create a kernel thread ++ */ ++int asm_kernel_thread(int (*fn)(void *), void * arg, unsigned long flags, pid_t pid) ++{ ++ struct pt_regs regs; ++ ++ memset(®s, 0, sizeof(regs)); ++ ++ regs.ebx = (unsigned long) fn; ++ regs.edx = (unsigned long) arg; ++ ++ regs.xds = __USER_DS; ++ regs.xes = __USER_DS; ++ regs.orig_eax = -1; ++ regs.eip = (unsigned long) local_kernel_thread_helper; ++ regs.xcs = __KERNEL_CS; ++ regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; ++ ++ /* Ok, create the new process.. 
*/ ++ return do_fork_pid(flags | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL, pid); ++} ++#endif ++ ++int local_kernel_thread(int (*fn)(void *), void * arg, unsigned long flags, pid_t pid) ++{ ++ pid_t ret; ++ ++ if (!try_module_get(THIS_MODULE)) ++ return -EBUSY; ++ ret = asm_kernel_thread(fn, arg, flags, pid); ++ if (ret < 0) ++ module_put(THIS_MODULE); ++ return ret; ++} ++ ++#ifdef __i386__ ++int __execve(const char *file, char **argv, char **envp) ++{ ++ long res; ++ __asm__ volatile ("int $0x80" ++ : "=a" (res) ++ : "0" (__NR_execve),"b" ((long)(file)),"c" ((long)(argv)), ++ "d" ((long)(envp)) : "memory"); ++ return (int)res; ++} ++#endif ++ ++int sc_execve(char *cmd, char **argv, char **env) ++{ ++ int ret; ++#ifndef __i386__ ++ ret = execve(cmd, argv, env); ++#else ++ ret = __execve(cmd, argv, env); ++#endif ++ return ret; ++} ++ ++unsigned int test_cpu_caps() ++{ ++ unsigned int flags = 0; ++ if (boot_cpu_has(X86_FEATURE_CMOV)) ++ flags |= 1 << CPT_CPU_X86_CMOV; ++ if (cpu_has_fxsr) ++ flags |= 1 << CPT_CPU_X86_FXSR; ++ if (cpu_has_xmm) ++ flags |= 1 << CPT_CPU_X86_SSE; ++#ifndef CONFIG_X86_64 ++ if (cpu_has_xmm2) ++#endif ++ flags |= 1 << CPT_CPU_X86_SSE2; ++ if (cpu_has_mmx) ++ flags |= 1 << CPT_CPU_X86_MMX; ++ if (boot_cpu_has(X86_FEATURE_3DNOW)) ++ flags |= 1 << CPT_CPU_X86_3DNOW; ++ if (boot_cpu_has(X86_FEATURE_3DNOWEXT)) ++ flags |= 1 << CPT_CPU_X86_3DNOW2; ++ if (boot_cpu_has(X86_FEATURE_SEP)) ++ flags |= 1 << CPT_CPU_X86_SEP; ++#ifdef CONFIG_X86_64 ++ flags |= 1 << CPT_CPU_X86_EMT64; ++#endif ++ return flags; ++} ++ ++unsigned int test_kernel_config() ++{ ++ unsigned int flags = 0; ++#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) ++ flags |= 1 << CPT_KERNEL_CONFIG_PAE; ++#endif ++ return flags; ++} +diff -upr linux-2.6.16.orig/kernel/cpt/cpt_kernel.h linux-2.6.16-026test015/kernel/cpt/cpt_kernel.h +--- linux-2.6.16.orig/kernel/cpt/cpt_kernel.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/cpt/cpt_kernel.h 2006-07-04 
14:41:39.000000000 +0400 +@@ -0,0 +1,74 @@ ++/* Interface to kernel vars which we had to _add_. */ ++ ++asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); ++ ++#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,9) ++#define TASK_TRACED TASK_STOPPED ++#define unix_peer(sk) ((sk)->sk_pair) ++#define page_mapcount(pg) ((pg)->mapcount) ++#else ++#define unix_peer(sk) (unix_sk(sk)->peer) ++#endif ++ ++#ifdef CONFIG_X86_64 ++#define cpu_has_fxsr 1 ++#endif ++ ++static inline void do_gettimespec(struct timespec *ts) ++{ ++ struct timeval tv; ++ do_gettimeofday(&tv); ++ ts->tv_sec = tv.tv_sec; ++ ts->tv_nsec = tv.tv_usec*1000; ++} ++ ++int local_kernel_thread(int (*fn)(void *), ++ void * arg, ++ unsigned long flags, ++ pid_t pid); ++int asm_kernel_thread(int (*fn)(void *), ++ void * arg, ++ unsigned long flags, ++ pid_t pid); ++ ++unsigned int test_cpu_caps(void); ++unsigned int test_kernel_config(void); ++ ++#define test_one_flag(src, dst, flag, message, ret) \ ++if (src & (1 << flag)) \ ++ if (!(dst & (1 << flag))) { \ ++ wprintk("Destination cpu does not have " message "\n"); \ ++ ret = 1; \ ++ } ++ ++static inline void ++_set_normalized_timespec(struct timespec *ts, time_t sec, long nsec) ++{ ++ while (nsec >= NSEC_PER_SEC) { ++ nsec -= NSEC_PER_SEC; ++ ++sec; ++ } ++ while (nsec < 0) { ++ nsec += NSEC_PER_SEC; ++ --sec; ++ } ++ ts->tv_sec = sec; ++ ts->tv_nsec = nsec; ++} ++ ++static inline struct timespec ++_ns_to_timespec(const nsec_t nsec) ++{ ++ struct timespec ts; ++ ++ if (!nsec) ++ return (struct timespec) {0, 0}; ++ ++ ts.tv_sec = div_long_long_rem_signed(nsec, NSEC_PER_SEC, &ts.tv_nsec); ++ if (unlikely(nsec < 0)) ++ _set_normalized_timespec(&ts, ts.tv_sec, ts.tv_nsec); ++ ++ return ts; ++} +diff -upr linux-2.6.16.orig/kernel/cpt/cpt_mm.c linux-2.6.16-026test015/kernel/cpt/cpt_mm.c +--- linux-2.6.16.orig/kernel/cpt/cpt_mm.c 2006-07-04 14:41:41.000000000 +0400 ++++ 
linux-2.6.16-026test015/kernel/cpt/cpt_mm.c 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,823 @@ ++/* ++ * ++ * kernel/cpt/cpt_mm.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include <linux/config.h> ++#include <linux/version.h> ++#include <linux/module.h> ++#include <linux/init.h> ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/slab.h> ++#include <linux/file.h> ++#include <linux/mm.h> ++#include <linux/hugetlb.h> ++#include <linux/errno.h> ++#include <linux/ve.h> ++#include <linux/pagemap.h> ++#include <linux/rmap.h> ++#include <asm/ldt.h> ++#include <asm/mmu.h> ++#include <linux/cpt_image.h> ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++#include "cpt_mm.h" ++#include "cpt_kernel.h" ++#include "cpt_fsmagic.h" ++#ifdef CONFIG_VZ_CHECKPOINT_LAZY ++#include "cpt_pagein.h" ++#endif ++#include "cpt_ubc.h" ++ ++static int collect_one_aio_ctx(struct mm_struct *mm, struct kioctx *aio_ctx, ++ cpt_context_t *ctx) ++{ ++ if (!list_empty(&aio_ctx->run_list)) { ++ /* This is impossible at least with kernel 2.6.8.1 or 2.6.16 */ ++ eprintk_ctx("run list is not empty, cannot suspend AIO\n"); ++ return -EBUSY; ++ } ++ ++ /* Wait for pending IOCBs. Linux AIO is mostly _fake_. ++ * It is actually synchronous, except for direct IO and ++ * some funny raw USB things, which cannot happen inside VE. ++ * However, we do this for future. ++ * ++ * Later note: in 2.6.16 we may allow O_DIRECT, so that ++ * it is not meaningless code. 
++ */ ++ wait_for_all_aios(aio_ctx); ++ ++ if (!list_empty(&aio_ctx->run_list) || ++ !list_empty(&aio_ctx->active_reqs) || ++ aio_ctx->reqs_active) { ++ eprintk_ctx("were not able to suspend AIO\n"); ++ return -EBUSY; ++ } ++ ++ return 0; ++} ++ ++static int collect_one_mm(struct mm_struct *mm, cpt_context_t * ctx) ++{ ++ struct vm_area_struct *vma; ++ ++ for (vma = mm->mmap; vma; vma = vma->vm_next) { ++ if (vma->vm_file) { ++ if (cpt_object_add(CPT_OBJ_FILE, vma->vm_file, ctx) == NULL) ++ return -ENOMEM; ++ } ++ } ++ if (cpt_add_ubc(mm->mm_ub, ctx) == NULL) ++ return -ENOMEM; ++ ++ if (mm->ioctx_list) { ++ struct kioctx *aio_ctx; ++ int err; ++ ++ for (aio_ctx = mm->ioctx_list; aio_ctx; aio_ctx = aio_ctx->next) ++ if ((err = collect_one_aio_ctx(mm, aio_ctx, ctx)) != 0) ++ return err; ++ } ++ ++ return 0; ++} ++ ++int cpt_collect_mm(cpt_context_t * ctx) ++{ ++ cpt_object_t *obj; ++ int err; ++ int index; ++ ++ for_each_object(obj, CPT_OBJ_TASK) { ++ task_t *tsk = obj->o_obj; ++ if (tsk->mm && cpt_object_add(CPT_OBJ_MM, tsk->mm, ctx) == NULL) ++ return -ENOMEM; ++ } ++ ++ index = 1; ++ for_each_object(obj, CPT_OBJ_MM) { ++ struct mm_struct *mm = obj->o_obj; ++ if (obj->o_count != atomic_read(&mm->mm_users)) { ++ eprintk_ctx("mm_struct is referenced outside %d %d\n", obj->o_count, atomic_read(&mm->mm_users)); ++ return -EBUSY; ++ } ++ cpt_obj_setindex(obj, index++, ctx); ++ ++ if ((err = collect_one_mm(mm, ctx)) != 0) ++ return err; ++ } ++ ++ return 0; ++} ++ ++static int zcnt, scnt, scnt0, ucnt; ++ ++/* Function where_is_anon_page() returns address of an anonymous page in mm ++ * of already dumped process. This happens f.e. after fork(). We do not use ++ * this right now, just keep statistics, it is difficult to restore such state, ++ * but the most direct use is to save space in dumped image. 
*/ ++ ++ ++static inline unsigned long ++vma_address0(struct page *page, struct vm_area_struct *vma) ++{ ++ pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); ++ unsigned long address; ++ ++ address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); ++ if (unlikely(address < vma->vm_start || address >= vma->vm_end)) ++ address |= 1; ++ return address; ++} ++ ++static int really_this_one(struct vm_area_struct *vma, unsigned long address, ++ struct page *page) ++{ ++ struct mm_struct *mm = vma->vm_mm; ++ pgd_t *pgd; ++ pud_t *pud; ++ pmd_t *pmd; ++ pte_t *pte; ++ spinlock_t *ptl; ++ int result; ++ ++ pgd = pgd_offset(mm, address); ++ if (unlikely(!pgd_present(*pgd))) ++ return 0; ++ ++ pud = pud_offset(pgd, address); ++ if (!pud_present(*pud)) ++ return 0; ++ ++ pmd = pmd_offset(pud, address); ++ if (unlikely(!pmd_present(*pmd))) ++ return 0; ++ ++ result = 0; ++ pte = pte_offset_map(pmd, address); ++ if (!pte_present(*pte)) { ++ pte_unmap(pte); ++ return 0; ++ } ++ ++ ptl = pte_lockptr(mm, pmd); ++ if (!spin_trylock(ptl)) { ++ pte_unmap(pte); ++ return 0; ++ } ++ if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) ++ result = 1; ++ pte_unmap_unlock(pte, ptl); ++ return result; ++} ++ ++static loff_t where_is_anon_page(cpt_object_t *mmobj, unsigned long mapaddr, ++ struct page *page, cpt_context_t * ctx) ++{ ++ loff_t mmptr = CPT_NULL; ++ struct anon_vma *anon_vma; ++ struct vm_area_struct *vma; ++ int idx = mmobj->o_index; ++ ++ if (!PageAnon(page)) ++ return CPT_NULL; ++ ++ anon_vma = page_lock_anon_vma(page); ++ if (!anon_vma) ++ return CPT_NULL; ++ ++ list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { ++ unsigned long addr = vma_address0(page, vma); ++ cpt_object_t *obj; ++ ++ /* We do not try to support mremapped regions (addr != mapaddr), ++ * only mmaps directly inherited via fork(). 
++ * With this limitation we may check self-consistency of ++ * vmas (vm_start, vm_pgoff, anon_vma) before ++ * doing __copy_page_range() in rst_mm. ++ */ ++ if (mmobj->o_obj != vma->vm_mm && addr == mapaddr) { ++ obj = lookup_cpt_object(CPT_OBJ_MM, vma->vm_mm, ctx); ++ if (obj && obj->o_pos != CPT_NULL && obj->o_index < idx) { ++ if (really_this_one(vma, addr, page)) { ++ mmptr = obj->o_pos; ++ idx = obj->o_index; ++ } ++ } ++ } ++ } ++ spin_unlock(&anon_vma->lock); ++ ++ return mmptr; ++} ++ ++struct page_area ++{ ++ int type; ++ unsigned long start; ++ unsigned long end; ++ pgoff_t pgoff; ++ loff_t mm; ++}; ++ ++struct page_desc ++{ ++ int type; ++ pgoff_t index; ++ loff_t mm; ++ int shared; ++}; ++ ++enum { ++ PD_ABSENT, ++ PD_COPY, ++ PD_ZERO, ++ PD_CLONE, ++ PD_FUNKEY, ++ PD_LAZY ++}; ++ ++/* 0: page can be obtained from backstore, or still not mapped anonymous page, ++ or something else, which does not require copy. ++ 1: page requires copy ++ 2: page requires copy but its content is zero. Quite useless. ++ 3: wp page is shared after fork(). It is to be COWed when modified. ++ 4: page is something unsupported... We copy it right now. 
++ */ ++ ++ ++ ++static void page_get_desc(cpt_object_t *mmobj, ++ struct vm_area_struct *vma, unsigned long addr, ++ struct page_desc *pdesc, cpt_context_t * ctx) ++{ ++ struct mm_struct *mm = vma->vm_mm; ++ pgd_t *pgd; ++ pud_t *pud; ++ pmd_t *pmd; ++ pte_t *ptep, pte; ++ spinlock_t *ptl; ++ struct page *pg; ++ pgoff_t linear_index = (addr - vma->vm_start)/PAGE_SIZE + vma->vm_pgoff; ++ ++ pdesc->index = linear_index; ++ pdesc->shared = 0; ++ ++ if (vma->vm_flags & VM_IO) { ++ pdesc->type = PD_ABSENT; ++ return; ++ } ++ ++ pgd = pgd_offset(mm, addr); ++ if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) ++ goto out_absent; ++ pud = pud_offset(pgd, addr); ++ if (pud_none(*pud) || unlikely(pud_bad(*pud))) ++ goto out_absent; ++ pmd = pmd_offset(pud, addr); ++ if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) ++ goto out_absent; ++ if (pmd_huge(*pmd)) { ++ eprintk_ctx("page_huge\n"); ++ goto out_unsupported; ++ } ++ ++ ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); ++ if (!ptep) ++ goto out_absent; ++ ++ pte = *ptep; ++ if (pte_none(pte)) ++ goto out_absent_unmap; ++ ++ if (!pte_present(pte)) { ++ if (pte_file(pte)) { ++ pdesc->index = pte_to_pgoff(pte); ++ goto out_absent_unmap; ++ } ++ if (vma->vm_flags & VM_SHARED) { ++ /* It is impossible: shared mappings cannot be in swap */ ++ eprintk_ctx("shared mapping is not present: %08lx@%Ld\n", addr, mmobj->o_pos); ++ goto out_unsupported_unmap; ++ } ++ /* Otherwise it is in swap. 
*/ ++ goto out_lazy_unmap; ++ } else if ((pg = vm_normal_page(vma, addr, pte)) != NULL) { ++ ++ if (pg->mapping && !PageAnon(pg)) { ++ if (vma->vm_file == NULL) { ++ eprintk_ctx("pg->mapping!=NULL for fileless vma: %08lx\n", addr); ++ goto out_unsupported_unmap; ++ } ++ if (vma->vm_file->f_mapping != pg->mapping) { ++ eprintk_ctx("pg->mapping!=f_mapping: %08lx %p %p %Ld\n", addr, vma->vm_file->f_mapping, pg->mapping, mmobj->o_pos); ++ goto out_unsupported_unmap; ++ } ++ pdesc->index = (pg->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT)); ++ /* Page is in backstore. For us it is like ++ * it is not present. ++ */ ++ goto out_absent_unmap; ++ } ++ ++ if (PageReserved(pg)) { ++ /* Special case: ZERO_PAGE is used, when an ++ * anonymous page is accessed but not written. */ ++ if (pg == ZERO_PAGE(addr)) { ++ if (pte_write(pte)) { ++ eprintk_ctx("not funny already, writable ZERO_PAGE\n"); ++ goto out_unsupported_unmap; ++ } ++ zcnt++; ++ goto out_absent_unmap; ++ } ++ eprintk_ctx("reserved page %lu at %08lx@%Ld\n", pg->index, addr, mmobj->o_pos); ++ goto out_unsupported_unmap; ++ } ++ ++ if (pg == ZERO_PAGE(addr)) { ++ wprintk_ctx("that's how it works now\n"); ++ } ++ ++ if (!pg->mapping) { ++ eprintk_ctx("page without mapping at %08lx@%Ld\n", addr, mmobj->o_pos); ++ goto out_unsupported_unmap; ++ } ++ ++ if (pg->mapping && page_mapcount(pg) > 1) { ++ pdesc->shared = 1; ++ pdesc->mm = where_is_anon_page(mmobj, addr, pg, ctx); ++ if (pdesc->mm != CPT_NULL) { ++ scnt0++; ++ goto out_clone_unmap; ++ } else { ++ scnt++; ++ } ++ } ++ ++ if (!pte_young(pte)) ++ goto out_lazy_unmap; ++ } ++ pte_unmap_unlock(ptep, ptl); ++ pdesc->type = PD_COPY; ++ return; ++ ++out_lazy_unmap: ++ pte_unmap_unlock(ptep, ptl); ++ pdesc->type = PD_LAZY; ++ return; ++ ++out_absent_unmap: ++ pte_unmap_unlock(ptep, ptl); ++out_absent: ++ pdesc->type = PD_ABSENT; ++ return; ++ ++out_clone_unmap: ++ pte_unmap_unlock(ptep, ptl); ++ pdesc->type = PD_CLONE; ++ return; ++ ++out_unsupported_unmap: ++ 
pte_unmap_unlock(ptep, ptl); ++out_unsupported: ++ ucnt++; ++ pdesc->type = PD_FUNKEY; ++ return; ++} ++ ++/* ATTN: We give "current" to get_user_pages(). This is wrong, but get_user_pages() ++ * does not really need this thing. It just stores some page fault stats there. ++ * ++ * BUG: some archs (f.e. sparc64, but not Intel*) require flush cache pages ++ * before accessing vma. ++ */ ++void dump_pages(struct vm_area_struct *vma, unsigned long start, ++ unsigned long end, struct cpt_context *ctx) ++{ ++#define MAX_PAGE_BATCH 16 ++ struct page *pg[MAX_PAGE_BATCH]; ++ int npages = (end - start)/PAGE_SIZE; ++ int count = 0; ++ ++ while (count < npages) { ++ int copy = npages - count; ++ int n; ++ ++ if (copy > MAX_PAGE_BATCH) ++ copy = MAX_PAGE_BATCH; ++ n = get_user_pages(current, vma->vm_mm, start, copy, ++ 0, 1, pg, NULL); ++ if (n == copy) { ++ int i; ++ for (i=0; i<n; i++) { ++ char *maddr = kmap(pg[i]); ++ ctx->write(maddr, PAGE_SIZE, ctx); ++ kunmap(pg[i]); ++ } ++ } else { ++ eprintk_ctx("get_user_pages fault"); ++ for ( ; n > 0; n--) ++ page_cache_release(pg[n-1]); ++ return; ++ } ++ start += n*PAGE_SIZE; ++ count += n; ++ for ( ; n > 0; n--) ++ page_cache_release(pg[n-1]); ++ } ++ return; ++} ++ ++int dump_page_block(struct vm_area_struct *vma, struct cpt_page_block *pgb, ++ int copy, ++ struct cpt_context *ctx) ++{ ++ loff_t saved_object; ++ ++ cpt_push_object(&saved_object, ctx); ++ ++ pgb->cpt_object = (copy != PD_LAZY) ? CPT_OBJ_PAGES : CPT_OBJ_LAZYPAGES; ++ pgb->cpt_hdrlen = sizeof(*pgb); ++ pgb->cpt_content = (copy == PD_COPY || copy == PD_LAZY) ? 
CPT_CONTENT_DATA : CPT_CONTENT_VOID; ++ ++ ctx->write(pgb, sizeof(*pgb), ctx); ++ if (copy == PD_COPY || copy == PD_LAZY) ++ dump_pages(vma, pgb->cpt_start, pgb->cpt_end, ctx); ++ cpt_close_object(ctx); ++ cpt_pop_object(&saved_object, ctx); ++ return 0; ++} ++ ++int dump_remappage_block(struct vm_area_struct *vma, struct page_area *pa, ++ struct cpt_context *ctx) ++{ ++ struct cpt_remappage_block pgb; ++ loff_t saved_object; ++ ++ cpt_push_object(&saved_object, ctx); ++ ++ pgb.cpt_object = CPT_OBJ_REMAPPAGES; ++ pgb.cpt_hdrlen = sizeof(pgb); ++ pgb.cpt_content = CPT_CONTENT_VOID; ++ pgb.cpt_start = pa->start; ++ pgb.cpt_end = pa->end; ++ pgb.cpt_pgoff = pa->pgoff - (pa->end-pa->start)/PAGE_SIZE + 1; ++ ++ ctx->write(&pgb, sizeof(pgb), ctx); ++ cpt_close_object(ctx); ++ cpt_pop_object(&saved_object, ctx); ++ return 0; ++} ++ ++int dump_copypage_block(struct vm_area_struct *vma, struct page_area *pa, ++ struct cpt_context *ctx) ++{ ++ struct cpt_copypage_block pgb; ++ loff_t saved_object; ++ ++ cpt_push_object(&saved_object, ctx); ++ ++ pgb.cpt_object = CPT_OBJ_COPYPAGES; ++ pgb.cpt_hdrlen = sizeof(pgb); ++ pgb.cpt_content = CPT_CONTENT_VOID; ++ pgb.cpt_start = pa->start; ++ pgb.cpt_end = pa->end; ++ pgb.cpt_source = pa->mm; ++ ++ ctx->write(&pgb, sizeof(pgb), ctx); ++ cpt_close_object(ctx); ++ cpt_pop_object(&saved_object, ctx); ++ return 0; ++} ++ ++int dump_lazypage_block(struct vm_area_struct *vma, struct page_area *pa, ++ cpt_context_t *ctx) ++{ ++ struct cpt_lazypage_block pgb; ++ loff_t saved_object; ++ ++ cpt_push_object(&saved_object, ctx); ++ ++ pgb.cpt_object = CPT_OBJ_LAZYPAGES; ++ pgb.cpt_hdrlen = sizeof(pgb); ++ pgb.cpt_content = CPT_CONTENT_VOID; ++ pgb.cpt_start = pa->start; ++ pgb.cpt_end = pa->end; ++#ifdef CONFIG_VZ_CHECKPOINT_LAZY ++ pgb.cpt_index = cpt_alloc_pgin_index(vma, pa->start, ++ (pa->end-pa->start)/PAGE_SIZE, ctx); ++#endif ++ ctx->write(&pgb, sizeof(pgb), ctx); ++ cpt_close_object(ctx); ++ cpt_pop_object(&saved_object, ctx); ++ return 
0; ++} ++ ++static int can_expand(struct page_area *pa, struct page_desc *pd) ++{ ++ if (pa->start == pa->end) ++ return 1; ++ if (pa->type != pd->type) ++ return 0; ++ if (pa->type == PD_ABSENT) ++ return pd->index == pa->pgoff + 1; ++ if (pa->type == PD_CLONE) ++ return pd->mm == pa->mm; ++ return 1; ++} ++ ++static int dump_one_vma(cpt_object_t *mmobj, ++ struct vm_area_struct *vma, struct cpt_context *ctx) ++{ ++ struct cpt_vma_image *v = cpt_get_buf(ctx); ++ unsigned long addr; ++ loff_t saved_object; ++ struct cpt_page_block pgb; ++ struct page_area pa; ++ int cloned_pages = 0; ++ ++ cpt_push_object(&saved_object, ctx); ++ ++ v->cpt_object = CPT_OBJ_VMA; ++ v->cpt_hdrlen = sizeof(*v); ++ v->cpt_content = CPT_CONTENT_ARRAY; ++ ++ v->cpt_start = vma->vm_start; ++ v->cpt_end = vma->vm_end; ++ v->cpt_flags = vma->vm_flags; ++ if (vma->vm_flags&VM_HUGETLB) { ++ eprintk_ctx("huge TLB VMAs are still not supported\n"); ++ cpt_release_buf(ctx); ++ return -EINVAL; ++ } ++ v->cpt_pgprot = vma->vm_page_prot.pgprot; ++ v->cpt_pgoff = vma->vm_pgoff; ++ v->cpt_file = CPT_NULL; ++ v->cpt_type = CPT_VMA_TYPE_0; ++ v->cpt_anonvma = 0; ++ ++ /* We have to remember what VMAs are bound to one anon_vma. ++ * So, we store an identifier of group of VMAs. It is handy ++ * to use absolute address of anon_vma as this identifier. 
*/ ++ v->cpt_anonvmaid = (unsigned long)vma->anon_vma; ++ ++ if (vma->vm_file) { ++ struct file *filp; ++ cpt_object_t *obj = lookup_cpt_object(CPT_OBJ_FILE, vma->vm_file, ctx); ++ if (obj == NULL) BUG(); ++ filp = obj->o_obj; ++ if (filp->f_op && ++ filp->f_op->read == NULL && ++ filp->f_dentry->d_inode->i_sb->s_magic == FSMAGIC_TMPFS) ++ v->cpt_type = CPT_VMA_TYPE_SHM; ++ v->cpt_file = obj->o_pos; ++ } ++ ++ ctx->write(v, sizeof(*v), ctx); ++ cpt_release_buf(ctx); ++ ++ pa.type = PD_ABSENT; ++ pa.pgoff = vma->vm_pgoff; ++ pa.mm = CPT_NULL; ++ pa.start = vma->vm_start; ++ pa.end = vma->vm_start; ++ ++ for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) { ++ struct page_desc pd; ++ ++ page_get_desc(mmobj, vma, addr, &pd, ctx); ++ cloned_pages += pd.shared; ++ ++ if (pd.type == PD_FUNKEY) { ++ eprintk_ctx("dump_one_vma: funkey page\n"); ++ return -EINVAL; ++ } ++ ++#ifdef CONFIG_VZ_CHECKPOINT_LAZY ++ if (pd.type == PD_LAZY && ++ (ctx->lazy_vm == 0 || (vma->vm_flags&VM_LOCKED))) ++ pd.type = PD_COPY; ++#else ++ if (pd.type == PD_LAZY) ++ pd.type = PD_COPY; ++#endif ++ ++ if (!can_expand(&pa, &pd)) { ++ if (pa.type == PD_COPY || ++ pa.type == PD_ZERO) { ++ pgb.cpt_start = pa.start; ++ pgb.cpt_end = pa.end; ++ dump_page_block(vma, &pgb, pa.type, ctx); ++ } else if (pa.type == PD_CLONE) { ++ dump_copypage_block(vma, &pa, ctx); ++ cloned_pages++; ++ } else if (pa.type == PD_LAZY) { ++ dump_lazypage_block(vma, &pa, ctx); ++ } else if (pa.type == PD_ABSENT && ++ pa.pgoff != (pa.end - vma->vm_start)/PAGE_SIZE + vma->vm_pgoff - 1) { ++ dump_remappage_block(vma, &pa, ctx); ++ } ++ pa.start = addr; ++ } ++ pa.type = pd.type; ++ pa.end = addr + PAGE_SIZE; ++ pa.pgoff = pd.index; ++ pa.mm = pd.mm; ++ } ++ ++ if (pa.end > pa.start) { ++ if (pa.type == PD_COPY || ++ pa.type == PD_ZERO) { ++ pgb.cpt_start = pa.start; ++ pgb.cpt_end = pa.end; ++ dump_page_block(vma, &pgb, pa.type, ctx); ++ } else if (pa.type == PD_CLONE) { ++ dump_copypage_block(vma, &pa, ctx); ++ 
cloned_pages++; ++ } else if (pa.type == PD_LAZY) { ++ dump_lazypage_block(vma, &pa, ctx); ++ } else if (pa.type == PD_ABSENT && ++ pa.pgoff != (pa.end - vma->vm_start)/PAGE_SIZE + vma->vm_pgoff - 1) { ++ dump_remappage_block(vma, &pa, ctx); ++ } ++ } ++ ++ if (cloned_pages) { ++ __u32 anonvma = 1; ++ loff_t anonpos = ctx->current_object + offsetof(struct cpt_vma_image, cpt_anonvma); ++ ctx->pwrite(&anonvma, 4, ctx, anonpos); ++ } ++ ++ cpt_close_object(ctx); ++ ++ cpt_pop_object(&saved_object, ctx); ++ ++ return 0; ++} ++ ++static int dump_one_aio_ctx(struct mm_struct *mm, struct kioctx *aio_ctx, ++ cpt_context_t *ctx) ++{ ++ loff_t saved_object; ++ struct cpt_aio_ctx_image aimg; ++ ++ if (!list_empty(&aio_ctx->run_list) || ++ !list_empty(&aio_ctx->active_reqs) || ++ aio_ctx->reqs_active) { ++ eprintk_ctx("AIO is active after suspend\n"); ++ return -EBUSY; ++ } ++ ++ cpt_push_object(&saved_object, ctx); ++ ++ aimg.cpt_next = CPT_ALIGN(sizeof(aimg)); ++ aimg.cpt_object = CPT_OBJ_AIO_CONTEXT; ++ aimg.cpt_hdrlen = sizeof(aimg); ++ aimg.cpt_content = CPT_CONTENT_ARRAY; ++ ++ aimg.cpt_max_reqs = aio_ctx->max_reqs; ++ aimg.cpt_ring_pages = aio_ctx->ring_info.nr_pages; ++ aimg.cpt_nr = aio_ctx->ring_info.nr; ++ aimg.cpt_tail = aio_ctx->ring_info.tail; ++ aimg.cpt_mmap_base = aio_ctx->ring_info.mmap_base; ++ ++ ctx->write(&aimg, sizeof(aimg), ctx); ++ ++ cpt_pop_object(&saved_object, ctx); ++ return 0; ++} ++ ++static int dump_one_mm(cpt_object_t *obj, struct cpt_context *ctx) ++{ ++ struct mm_struct *mm = obj->o_obj; ++ struct vm_area_struct *vma; ++ struct cpt_mm_image *v = cpt_get_buf(ctx); ++ ++ cpt_open_object(obj, ctx); ++ ++ v->cpt_next = -1; ++ v->cpt_object = CPT_OBJ_MM; ++ v->cpt_hdrlen = sizeof(*v); ++ v->cpt_content = CPT_CONTENT_ARRAY; ++ ++ v->cpt_start_code = mm->start_code; ++ v->cpt_end_code = mm->end_code; ++ v->cpt_start_data = mm->start_data; ++ v->cpt_end_data = mm->end_data; ++ v->cpt_start_brk = mm->start_brk; ++ v->cpt_brk = mm->brk; ++ 
v->cpt_start_stack = mm->start_stack; ++ v->cpt_start_arg = mm->arg_start; ++ v->cpt_end_arg = mm->arg_end; ++ v->cpt_start_env = mm->env_start; ++ v->cpt_end_env = mm->env_end; ++ v->cpt_def_flags = mm->def_flags; ++ v->cpt_mmub = cpt_lookup_ubc(mm->mm_ub, ctx); ++ v->cpt_dumpable = mm->dumpable; ++ v->cpt_vps_dumpable = mm->vps_dumpable; ++ v->cpt_used_hugetlb = 0; /* not used */ ++ ++ ctx->write(v, sizeof(*v), ctx); ++ cpt_release_buf(ctx); ++ ++ if (mm->context.size) { ++ loff_t saved_object; ++ struct cpt_obj_bits b; ++ int size; ++ ++ dprintk_ctx("nontrivial LDT\n"); ++ ++ cpt_push_object(&saved_object, ctx); ++ ++ cpt_open_object(NULL, ctx); ++ b.cpt_next = CPT_NULL; ++ b.cpt_object = CPT_OBJ_BITS; ++ b.cpt_hdrlen = sizeof(b); ++ b.cpt_content = CPT_CONTENT_MM_CONTEXT; ++ b.cpt_size = mm->context.size*LDT_ENTRY_SIZE; ++ ++ ctx->write(&b, sizeof(b), ctx); ++ ++ size = mm->context.size*LDT_ENTRY_SIZE; ++ ++#if defined(CONFIG_X86_64) || LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,15) ++ ctx->write(mm->context.ldt, size, ctx); ++#else ++ for (i = 0; i < size; i += PAGE_SIZE) { ++ int nr = i / PAGE_SIZE, bytes; ++ char *kaddr = kmap(mm->context.ldt_pages[nr]); ++ ++ bytes = size - i; ++ if (bytes > PAGE_SIZE) ++ bytes = PAGE_SIZE; ++ ctx->write(kaddr, bytes, ctx); ++ kunmap(mm->context.ldt_pages[nr]); ++ } ++#endif ++ ++ cpt_close_object(ctx); ++ cpt_pop_object(&saved_object, ctx); ++ } ++ ++ for (vma = mm->mmap; vma; vma = vma->vm_next) { ++ int err; ++ ++#ifdef CONFIG_X86_64 ++ if (vma->vm_start == 0xFFFFE000 && ++ vma->vm_end == 0xFFFFF000) ++ continue; ++#endif ++ ++ if ((err = dump_one_vma(obj, vma, ctx)) != 0) ++ return err; ++ } ++ ++ if (mm->ioctx_list) { ++ struct kioctx *aio_ctx; ++ int err; ++ ++ for (aio_ctx = mm->ioctx_list; aio_ctx; aio_ctx = aio_ctx->next) ++ if ((err = dump_one_aio_ctx(mm, aio_ctx, ctx)) != 0) ++ return err; ++ } ++ ++ cpt_close_object(ctx); ++ ++ return 0; ++} ++ ++int cpt_dump_vm(struct cpt_context *ctx) ++{ ++ cpt_object_t *obj; 
++ ++ scnt = scnt0 = zcnt = 0; ++ ++ cpt_open_section(ctx, CPT_SECT_MM); ++ ++ for_each_object(obj, CPT_OBJ_MM) { ++ int err; ++ ++ if ((err = dump_one_mm(obj, ctx)) != 0) ++ return err; ++ } ++ ++ cpt_close_section(ctx); ++ ++ if (scnt) ++ dprintk_ctx("cpt_dump_vm: %d shared private anon pages\n", scnt); ++ if (scnt0) ++ dprintk_ctx("cpt_dump_vm: %d anon pages are cloned\n", scnt0); ++ if (zcnt) ++ dprintk_ctx("cpt_dump_vm: %d silly pages canceled\n", zcnt); ++ return 0; ++} +diff -upr linux-2.6.16.orig/kernel/cpt/cpt_mm.h linux-2.6.16-026test015/kernel/cpt/cpt_mm.h +--- linux-2.6.16.orig/kernel/cpt/cpt_mm.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/cpt/cpt_mm.h 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,16 @@ ++int cpt_collect_mm(cpt_context_t *); ++ ++int cpt_dump_vm(struct cpt_context *ctx); ++ ++__u32 rst_mm_flag(struct cpt_task_image *ti, struct cpt_context *ctx); ++int rst_mm_basic(cpt_object_t *obj, struct cpt_task_image *ti, struct cpt_context *ctx); ++int rst_mm_complete(struct cpt_task_image *ti, struct cpt_context *ctx); ++ ++int cpt_mm_prepare(unsigned long veid); ++ ++int cpt_free_pgin_dir(struct cpt_context *); ++int cpt_start_pagein(struct cpt_context *); ++int rst_setup_pagein(struct cpt_context *); ++int rst_complete_pagein(struct cpt_context *, int); ++int rst_pageind(struct cpt_context *); ++int rst_swapoff(struct cpt_context *); +diff -upr linux-2.6.16.orig/kernel/cpt/cpt_net.c linux-2.6.16-026test015/kernel/cpt/cpt_net.c +--- linux-2.6.16.orig/kernel/cpt/cpt_net.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/cpt/cpt_net.c 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,428 @@ ++/* ++ * ++ * kernel/cpt/cpt_net.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#include <linux/config.h> ++#include <linux/version.h> ++#include <linux/module.h> ++#include <linux/init.h> ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/slab.h> ++#include <linux/file.h> ++#include <linux/mm.h> ++#include <linux/errno.h> ++#include <linux/fs.h> ++#include <linux/socket.h> ++#include <linux/netdevice.h> ++#include <linux/inetdevice.h> ++#include <net/addrconf.h> ++#include <linux/rtnetlink.h> ++#include <linux/ve.h> ++#include <linux/ve_proto.h> ++#include <linux/vzcalluser.h> ++#include <linux/cpt_image.h> ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++#include "cpt_kernel.h" ++#include "cpt_syscalls.h" ++ ++int cpt_dump_link(struct cpt_context * ctx) ++{ ++ struct net_device *dev; ++ ++ cpt_open_section(ctx, CPT_SECT_NET_DEVICE); ++ for (dev = dev_base; dev; dev = dev->next) { ++ struct cpt_netdev_image v; ++ ++ cpt_open_object(NULL, ctx); ++ ++ v.cpt_next = CPT_NULL; ++ v.cpt_object = CPT_OBJ_NET_DEVICE; ++ v.cpt_hdrlen = sizeof(v); ++ v.cpt_content = CPT_CONTENT_VOID; ++ ++ v.cpt_index = dev->ifindex; ++ v.cpt_flags = dev->flags; ++ memcpy(v.cpt_name, dev->name, IFNAMSIZ); ++ ctx->write(&v, sizeof(v), ctx); ++ cpt_close_object(ctx); ++ ++ if (strcmp(dev->name, "lo") != 0 && ++ strcmp(dev->name, "venet0") != 0) { ++ eprintk_ctx("unsupported netdevice %s\n", dev->name); ++ cpt_close_section(ctx); ++ return -EBUSY; ++ } ++ } ++ cpt_close_section(ctx); ++ return 0; ++} ++ ++int cpt_suspend_network(struct cpt_context *ctx) ++{ ++ get_exec_env()->disable_net = 1; ++ synchronize_net(); ++ return 0; ++} ++ ++int cpt_resume_network(struct cpt_context *ctx) ++{ ++ struct ve_struct *env; ++ env = get_ve_by_id(ctx->ve_id); ++ if (!env) ++ return -ESRCH; ++ env->disable_net = 0; ++ put_ve(env); ++ return 0; ++} ++ ++int cpt_dump_ifaddr(struct cpt_context * ctx) ++{ ++ struct net_device *dev; ++ ++ cpt_open_section(ctx, CPT_SECT_NET_IFADDR); ++ for (dev = dev_base; dev; dev = dev->next) { ++ struct 
in_device *idev = in_dev_get(dev); ++ struct in_ifaddr *ifa; ++ ++ if (!idev) ++ continue; ++ ++ for (ifa = idev->ifa_list; ifa; ifa = ifa->ifa_next) { ++ struct cpt_ifaddr_image v; ++ cpt_open_object(NULL, ctx); ++ ++ v.cpt_next = CPT_NULL; ++ v.cpt_object = CPT_OBJ_NET_IFADDR; ++ v.cpt_hdrlen = sizeof(v); ++ v.cpt_content = CPT_CONTENT_VOID; ++ ++ v.cpt_index = dev->ifindex; ++ v.cpt_family = AF_INET; ++ v.cpt_masklen = ifa->ifa_prefixlen; ++ v.cpt_flags = ifa->ifa_flags; ++ v.cpt_scope = ifa->ifa_scope; ++ memset(&v.cpt_address, 0, sizeof(v.cpt_address)); ++ memset(&v.cpt_peer, 0, sizeof(v.cpt_peer)); ++ memset(&v.cpt_broadcast, 0, sizeof(v.cpt_broadcast)); ++ v.cpt_address[0] = ifa->ifa_local; ++ v.cpt_peer[0] = ifa->ifa_address; ++ v.cpt_broadcast[0] = ifa->ifa_broadcast; ++ memcpy(v.cpt_label, ifa->ifa_label, IFNAMSIZ); ++ ctx->write(&v, sizeof(v), ctx); ++ cpt_close_object(ctx); ++ } ++ in_dev_put(idev); ++ } ++#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) ++ for (dev = dev_base; dev; dev = dev->next) { ++ struct inet6_dev *idev = in6_dev_get(dev); ++ struct inet6_ifaddr *ifa; ++ ++ if (!idev) ++ continue; ++ ++ for (ifa = idev->addr_list; ifa; ifa = ifa->if_next) { ++ struct cpt_ifaddr_image v; ++ ++ if (dev == &loopback_dev && ++ ifa->prefix_len == 128 && ++ ifa->addr.s6_addr32[0] == 0 && ++ ifa->addr.s6_addr32[1] == 0 && ++ ifa->addr.s6_addr32[2] == 0 && ++ ifa->addr.s6_addr32[3] == htonl(1)) ++ continue; ++ ++ cpt_open_object(NULL, ctx); ++ ++ v.cpt_next = CPT_NULL; ++ v.cpt_object = CPT_OBJ_NET_IFADDR; ++ v.cpt_hdrlen = sizeof(v); ++ v.cpt_content = CPT_CONTENT_VOID; ++ ++ v.cpt_index = dev->ifindex; ++ v.cpt_family = AF_INET6; ++ v.cpt_masklen = ifa->prefix_len; ++ v.cpt_flags = ifa->flags; ++ v.cpt_scope = ifa->scope; ++ memcpy(&v.cpt_address, &ifa->addr, 16); ++ memcpy(&v.cpt_peer, &ifa->addr, 16); ++ memset(&v.cpt_broadcast, 0, sizeof(v.cpt_broadcast)); ++ memcpy(v.cpt_label, dev->name, IFNAMSIZ); ++ ctx->write(&v, sizeof(v), ctx); ++ 
cpt_close_object(ctx); ++ } ++ in6_dev_put(idev); ++ } ++#endif ++ cpt_close_section(ctx); ++ return 0; ++} ++ ++static int cpt_dump_route(struct cpt_context * ctx) ++{ ++ int err; ++ struct socket *sock; ++ struct msghdr msg; ++ struct iovec iov; ++ struct { ++ struct nlmsghdr nlh; ++ struct rtgenmsg g; ++ } req; ++ struct sockaddr_nl nladdr; ++ struct cpt_object_hdr v; ++ mm_segment_t oldfs; ++ char *pg; ++ ++ err = sock_create_kern(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE, &sock); ++ if (err) ++ return err; ++ ++ memset(&nladdr, 0, sizeof(nladdr)); ++ nladdr.nl_family = AF_NETLINK; ++ ++ req.nlh.nlmsg_len = sizeof(req); ++ req.nlh.nlmsg_type = RTM_GETROUTE; ++ req.nlh.nlmsg_flags = NLM_F_ROOT|NLM_F_MATCH|NLM_F_REQUEST; ++ req.nlh.nlmsg_pid = 0; ++ req.g.rtgen_family = AF_INET; ++ ++ iov.iov_base=&req; ++ iov.iov_len=sizeof(req); ++ msg.msg_name=&nladdr; ++ msg.msg_namelen=sizeof(nladdr); ++ msg.msg_iov=&iov; ++ msg.msg_iovlen=1; ++ msg.msg_control=NULL; ++ msg.msg_controllen=0; ++ msg.msg_flags=MSG_DONTWAIT; ++ ++ oldfs = get_fs(); set_fs(KERNEL_DS); ++ err = sock_sendmsg(sock, &msg, sizeof(req)); ++ set_fs(oldfs); ++ ++ if (err < 0) ++ goto out_sock; ++ ++ pg = (char*)__get_free_page(GFP_KERNEL); ++ if (pg == NULL) { ++ err = -ENOMEM; ++ goto out_sock; ++ } ++ ++ cpt_open_section(ctx, CPT_SECT_NET_ROUTE); ++ cpt_open_object(NULL, ctx); ++ v.cpt_next = CPT_NULL; ++ v.cpt_object = CPT_OBJ_NET_ROUTE; ++ v.cpt_hdrlen = sizeof(v); ++ v.cpt_content = CPT_CONTENT_NLMARRAY; ++ ++ ctx->write(&v, sizeof(v), ctx); ++ ++#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) ++restart: ++#endif ++ for (;;) { ++ struct nlmsghdr *h; ++ ++ iov.iov_base = pg; ++ iov.iov_len = PAGE_SIZE; ++ ++ oldfs = get_fs(); set_fs(KERNEL_DS); ++ err = sock_recvmsg(sock, &msg, PAGE_SIZE, MSG_DONTWAIT); ++ set_fs(oldfs); ++ ++ if (err < 0) ++ goto out_sock_pg; ++ if (msg.msg_flags & MSG_TRUNC) { ++ err = -ENOBUFS; ++ goto out_sock_pg; ++ } ++ ++ h = (struct nlmsghdr*)pg; ++ while (NLMSG_OK(h, 
err)) { ++ if (h->nlmsg_type == NLMSG_DONE) { ++ err = 0; ++ goto done; ++ } ++ if (h->nlmsg_type == NLMSG_ERROR) { ++ struct nlmsgerr *errm = (struct nlmsgerr*)NLMSG_DATA(h); ++ err = errm->error; ++ eprintk_ctx("NLMSG error: %d\n", errm->error); ++ goto done; ++ } ++ if (h->nlmsg_type != RTM_NEWROUTE) { ++ eprintk_ctx("NLMSG: %d\n", h->nlmsg_type); ++ err = -EINVAL; ++ goto done; ++ } ++ ctx->write(h, NLMSG_ALIGN(h->nlmsg_len), ctx); ++ h = NLMSG_NEXT(h, err); ++ } ++ if (err) { ++ eprintk_ctx("!!!Remnant of size %d %d %d\n", err, h->nlmsg_len, h->nlmsg_type); ++ err = -EINVAL; ++ break; ++ } ++ } ++done: ++#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) ++ if (!err && req.g.rtgen_family == AF_INET) { ++ req.g.rtgen_family = AF_INET6; ++ iov.iov_base=&req; ++ iov.iov_len=sizeof(req); ++ msg.msg_name=&nladdr; ++ msg.msg_namelen=sizeof(nladdr); ++ msg.msg_iov=&iov; ++ msg.msg_iovlen=1; ++ msg.msg_control=NULL; ++ msg.msg_controllen=0; ++ msg.msg_flags=MSG_DONTWAIT; ++ ++ oldfs = get_fs(); set_fs(KERNEL_DS); ++ err = sock_sendmsg(sock, &msg, sizeof(req)); ++ set_fs(oldfs); ++ ++ if (err > 0) ++ goto restart; ++ } ++#endif ++ cpt_close_object(ctx); ++ cpt_close_section(ctx); ++ ++out_sock_pg: ++ free_page((unsigned long)pg); ++out_sock: ++ sock_release(sock); ++ return err; ++} ++ ++static int dumpfn(void *arg) ++{ ++ int i; ++ int *pfd = arg; ++ char *argv[] = { "iptables-save", "-c", NULL }; ++ ++ i = real_env_create(VEID(get_exec_env()), VE_ENTER|VE_SKIPLOCK, 2, NULL, 0); ++ if (i < 0) { ++ eprintk("cannot enter ve to dump iptables\n"); ++ module_put(THIS_MODULE); ++ return 1; ++ } ++ ++ if (pfd[1] != 1) ++ sc_dup2(pfd[1], 1); ++ ++ for (i=0; i<current->files->fdt->max_fds; i++) { ++ if (i != 1) ++ sc_close(i); ++ } ++ ++ module_put(THIS_MODULE); ++ ++ set_fs(KERNEL_DS); ++ i = sc_execve("/sbin/iptables-save", argv, NULL); ++ eprintk("failed to exec /sbin/iptables-save: %d\n", i); ++ return -1; ++} ++ ++ ++static int cpt_dump_iptables(struct cpt_context * 
ctx) ++{ ++ int err; ++ int pid; ++ int pfd[2]; ++ struct file *f; ++ struct cpt_object_hdr v; ++ char buf[16]; ++ loff_t pos; ++ int n; ++ ++ err = sc_pipe(pfd); ++ if (err < 0) { ++ eprintk_ctx("sc_pipe: %d\n", err); ++ return err; ++ } ++ err = pid = local_kernel_thread(dumpfn, (void*)pfd, SIGCHLD, 0); ++ if (err < 0) { ++ eprintk_ctx("local_kernel_thread: %d\n", err); ++ goto out; ++ } ++ f = fget(pfd[0]); ++ sc_close(pfd[1]); ++ sc_close(pfd[0]); ++ ++ cpt_open_section(ctx, CPT_SECT_NET_IPTABLES); ++ ++ cpt_open_object(NULL, ctx); ++ v.cpt_next = CPT_NULL; ++ v.cpt_object = CPT_OBJ_NAME; ++ v.cpt_hdrlen = sizeof(v); ++ v.cpt_content = CPT_CONTENT_NAME; ++ ++ ctx->write(&v, sizeof(v), ctx); ++ ++ pos = ctx->file->f_pos; ++ do { ++ mm_segment_t oldfs; ++ ++ oldfs = get_fs(); set_fs(KERNEL_DS); ++ n = f->f_op->read(f, buf, sizeof(buf), &f->f_pos); ++ set_fs(oldfs); ++ if (n > 0) ++ ctx->write(buf, n, ctx); ++ } while (n > 0); ++ ++ if (n < 0) ++ eprintk_ctx("read: %d\n", n); ++ ++ fput(f); ++ ++ if ((err = sc_waitx(pid, 0)) < 0) ++ eprintk_ctx("wait4: %d\n", err); ++ ++ if (ctx->file->f_pos != pos) { ++ buf[0] = 0; ++ ctx->write(buf, 1, ctx); ++ ctx->align(ctx); ++ cpt_close_object(ctx); ++ cpt_close_section(ctx); ++ } else { ++ pos = ctx->current_section; ++ cpt_close_object(ctx); ++ cpt_close_section(ctx); ++ ctx->sections[CPT_SECT_NET_IPTABLES] = CPT_NULL; ++ ctx->file->f_pos = pos; ++ } ++ return n; ++ ++out: ++ if (pfd[1] >= 0) ++ sc_close(pfd[1]); ++ if (pfd[0] >= 0) ++ sc_close(pfd[0]); ++ return err; ++} ++ ++int cpt_dump_ifinfo(struct cpt_context * ctx) ++{ ++ int err; ++ ++ err = cpt_dump_link(ctx); ++ if (!err) ++ err = cpt_dump_ifaddr(ctx); ++ if (!err) ++ err = cpt_dump_route(ctx); ++ if (!err) ++ err = cpt_dump_iptables(ctx); ++ return err; ++} +diff -upr linux-2.6.16.orig/kernel/cpt/cpt_net.h linux-2.6.16-026test015/kernel/cpt/cpt_net.h +--- linux-2.6.16.orig/kernel/cpt/cpt_net.h 2006-07-04 14:41:41.000000000 +0400 ++++ 
linux-2.6.16-026test015/kernel/cpt/cpt_net.h 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,7 @@ ++int cpt_dump_ifinfo(struct cpt_context *ctx); ++int rst_restore_net(struct cpt_context *ctx); ++int cpt_suspend_network(struct cpt_context *ctx); ++int cpt_resume_network(struct cpt_context *ctx); ++int rst_resume_network(struct cpt_context *ctx); ++int cpt_dump_ip_conntrack(struct cpt_context *ctx); ++int rst_restore_ip_conntrack(struct cpt_context * ctx); +diff -upr linux-2.6.16.orig/kernel/cpt/cpt_obj.c linux-2.6.16-026test015/kernel/cpt/cpt_obj.c +--- linux-2.6.16.orig/kernel/cpt/cpt_obj.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/cpt/cpt_obj.c 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,172 @@ ++/* ++ * ++ * kernel/cpt/cpt_obj.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include <linux/config.h> ++#include <linux/version.h> ++#include <linux/module.h> ++#include <linux/init.h> ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/slab.h> ++#include <linux/file.h> ++#include <linux/mm.h> ++#include <linux/errno.h> ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++ ++cpt_object_t *alloc_cpt_object(int gfp, struct cpt_context *ctx) ++{ ++ cpt_object_t *obj; ++ ++ obj = kmalloc(sizeof(cpt_object_t), gfp); ++ if (obj) { ++ INIT_LIST_HEAD(&obj->o_list); ++ INIT_LIST_HEAD(&obj->o_hash); ++ INIT_LIST_HEAD(&obj->o_alist); ++ obj->o_count = 1; ++ obj->o_pos = CPT_NULL; ++ obj->o_lock = 0; ++ obj->o_parent = NULL; ++ obj->o_index = CPT_NOINDEX; ++ obj->o_obj = NULL; ++ obj->o_image = NULL; ++ ctx->objcount++; ++ } ++ return obj; ++} ++// //EXPORT_SYMBOL(alloc_cpt_object); ++ ++void free_cpt_object(cpt_object_t *obj, cpt_context_t *ctx) ++{ ++ list_del(&obj->o_alist); ++ kfree(obj); ++ ctx->objcount--; ++} ++ ++void intern_cpt_object(enum _cpt_object_type type, cpt_object_t *obj, cpt_context_t *ctx) ++{ ++ 
list_add_tail(&obj->o_list, &ctx->object_array[type]); ++} ++// //EXPORT_SYMBOL(intern_cpt_object); ++ ++void insert_cpt_object(enum _cpt_object_type type, cpt_object_t *obj, ++ cpt_object_t *head, cpt_context_t *ctx) ++{ ++ list_add(&obj->o_list, &head->o_list); ++} ++// //EXPORT_SYMBOL(insert_cpt_object); ++ ++cpt_object_t * __cpt_object_add(enum _cpt_object_type type, void *p, ++ unsigned gfp_mask, cpt_context_t *ctx) ++{ ++ cpt_object_t *obj; ++ ++ obj = lookup_cpt_object(type, p, ctx); ++ ++ if (obj) { ++ obj->o_count++; ++ return obj; ++ } ++ ++ if ((obj = alloc_cpt_object(gfp_mask, ctx)) != NULL) { ++ if (p) ++ cpt_obj_setobj(obj, p, ctx); ++ intern_cpt_object(type, obj, ctx); ++ return obj; ++ } ++ return NULL; ++} ++// //EXPORT_SYMBOL(__cpt_object_add); ++ ++cpt_object_t * cpt_object_add(enum _cpt_object_type type, void *p, cpt_context_t *ctx) ++{ ++ return __cpt_object_add(type, p, GFP_KERNEL, ctx); ++} ++// //EXPORT_SYMBOL(cpt_object_add); ++ ++cpt_object_t * cpt_object_get(enum _cpt_object_type type, void *p, cpt_context_t *ctx) ++{ ++ cpt_object_t *obj; ++ ++ obj = lookup_cpt_object(type, p, ctx); ++ ++ if (obj) ++ obj->o_count++; ++ ++ return obj; ++} ++// //EXPORT_SYMBOL(cpt_object_get); ++ ++int cpt_object_init(cpt_context_t *ctx) ++{ ++ int i; ++ ++ for (i=0; i<CPT_OBJ_MAX; i++) { ++ INIT_LIST_HEAD(&ctx->object_array[i]); ++ } ++ return 0; ++} ++ ++int cpt_object_destroy(cpt_context_t *ctx) ++{ ++ int i; ++ ++ for (i=0; i<CPT_OBJ_MAX; i++) { ++ while (!list_empty(&ctx->object_array[i])) { ++ struct list_head *head = ctx->object_array[i].next; ++ cpt_object_t *obj = list_entry(head, cpt_object_t, o_list); ++ list_del(head); ++ if (obj->o_image) ++ kfree(obj->o_image); ++ free_cpt_object(obj, ctx); ++ } ++ } ++ if (ctx->objcount != 0) ++ eprintk_ctx("BUG: ctx->objcount=%d\n", ctx->objcount); ++ return 0; ++} ++ ++cpt_object_t *lookup_cpt_object(enum _cpt_object_type type, void *p, struct cpt_context *ctx) ++{ ++ cpt_object_t *obj; ++ ++ 
for_each_object(obj, type) { ++ if (obj->o_obj == p) ++ return obj; ++ } ++ return NULL; ++} ++// //EXPORT_SYMBOL(lookup_cpt_object); ++ ++cpt_object_t *lookup_cpt_obj_bypos(enum _cpt_object_type type, loff_t pos, struct cpt_context *ctx) ++{ ++ cpt_object_t *obj; ++ ++ for_each_object(obj, type) { ++ if (obj->o_pos == pos) ++ return obj; ++ } ++ return NULL; ++} ++// //EXPORT_SYMBOL(lookup_cpt_obj_bypos); ++ ++cpt_object_t *lookup_cpt_obj_byindex(enum _cpt_object_type type, __u32 index, struct cpt_context *ctx) ++{ ++ cpt_object_t *obj; ++ ++ for_each_object(obj, type) { ++ if (obj->o_index == index) ++ return obj; ++ } ++ return NULL; ++} ++// //EXPORT_SYMBOL(lookup_cpt_obj_byindex); +diff -upr linux-2.6.16.orig/kernel/cpt/cpt_obj.h linux-2.6.16-026test015/kernel/cpt/cpt_obj.h +--- linux-2.6.16.orig/kernel/cpt/cpt_obj.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/cpt/cpt_obj.h 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,62 @@ ++#ifndef __CPT_OBJ_H_ ++#define __CPT_OBJ_H_ 1 ++ ++#include <linux/list.h> ++#include <linux/cpt_image.h> ++ ++typedef struct _cpt_object ++{ ++ struct list_head o_list; ++ struct list_head o_hash; ++ int o_count; ++ int o_index; ++ int o_lock; ++ loff_t o_pos; ++ loff_t o_ppos; ++ void *o_obj; ++ void *o_image; ++ void *o_parent; ++ struct list_head o_alist; ++} cpt_object_t; ++ ++struct cpt_context; ++ ++#define for_each_object(obj, type) list_for_each_entry(obj, &ctx->object_array[type], o_list) ++ ++ ++extern cpt_object_t *alloc_cpt_object(int gfp, struct cpt_context *ctx); ++extern void free_cpt_object(cpt_object_t *obj, struct cpt_context *ctx); ++ ++cpt_object_t *lookup_cpt_object(enum _cpt_object_type type, void *p, struct cpt_context *ctx); ++cpt_object_t *lookup_cpt_obj_bypos(enum _cpt_object_type type, loff_t pos, struct cpt_context *ctx); ++cpt_object_t *lookup_cpt_obj_byindex(enum _cpt_object_type type, __u32 index, struct cpt_context *ctx); ++ ++static inline void cpt_obj_setpos(cpt_object_t 
*cpt, loff_t pos, struct cpt_context *ctx) ++{ ++ cpt->o_pos = pos; ++ /* Add to pos hash table */ ++} ++ ++static inline void cpt_obj_setobj(cpt_object_t *cpt, void *ptr, struct cpt_context *ctx) ++{ ++ cpt->o_obj = ptr; ++ /* Add to hash table */ ++} ++ ++static inline void cpt_obj_setindex(cpt_object_t *cpt, __u32 index, struct cpt_context *ctx) ++{ ++ cpt->o_index = index; ++ /* Add to index hash table */ ++} ++ ++ ++extern void intern_cpt_object(enum _cpt_object_type type, cpt_object_t *obj, struct cpt_context *ctx); ++extern void insert_cpt_object(enum _cpt_object_type type, cpt_object_t *obj, cpt_object_t *head, struct cpt_context *ctx); ++extern cpt_object_t *cpt_object_add(enum _cpt_object_type type, void *p, struct cpt_context *ctx); ++extern cpt_object_t *__cpt_object_add(enum _cpt_object_type type, void *p, unsigned int gfp_mask, struct cpt_context *ctx); ++extern cpt_object_t *cpt_object_get(enum _cpt_object_type type, void *p, struct cpt_context *ctx); ++ ++extern int cpt_object_init(struct cpt_context *ctx); ++extern int cpt_object_destroy(struct cpt_context *ctx); ++ ++#endif /* __CPT_OBJ_H_ */ +diff -upr linux-2.6.16.orig/kernel/cpt/cpt_proc.c linux-2.6.16-026test015/kernel/cpt/cpt_proc.c +--- linux-2.6.16.orig/kernel/cpt/cpt_proc.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/cpt/cpt_proc.c 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,575 @@ ++/* ++ * ++ * kernel/cpt/cpt_proc.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#include <linux/config.h> ++#include <linux/version.h> ++#include <linux/module.h> ++#include <linux/init.h> ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/slab.h> ++#include <linux/file.h> ++#include <linux/errno.h> ++#include <linux/mm.h> ++#include <linux/list.h> ++#include <linux/proc_fs.h> ++#include <linux/smp_lock.h> ++#include <asm/uaccess.h> ++#include <linux/cpt_ioctl.h> ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++#include "cpt_dump.h" ++#include "cpt_mm.h" ++#include "cpt_kernel.h" ++ ++MODULE_AUTHOR("Alexey Kuznetsov <alexey@sw.ru>"); ++MODULE_LICENSE("GPL"); ++ ++/* List of contexts and lock protecting the list */ ++static struct list_head cpt_context_list; ++static spinlock_t cpt_context_lock; ++ ++static int proc_read(char *buffer, char **start, off_t offset, ++ int length, int *eof, void *data) ++{ ++ off_t pos = 0; ++ off_t begin = 0; ++ int len = 0; ++ cpt_context_t *ctx; ++ ++ len += sprintf(buffer, "Ctx Id VE State\n"); ++ ++ spin_lock(&cpt_context_lock); ++ ++ list_for_each_entry(ctx, &cpt_context_list, ctx_list) { ++ len += sprintf(buffer+len,"%p %08x %-8u %d", ++ ctx, ++ ctx->contextid, ++ ctx->ve_id, ++ ctx->ctx_state ++ ); ++ ++ buffer[len++] = '\n'; ++ ++ pos = begin+len; ++ if (pos < offset) { ++ len = 0; ++ begin = pos; ++ } ++ if (pos > offset+length) ++ goto done; ++ } ++ *eof = 1; ++ ++done: ++ spin_unlock(&cpt_context_lock); ++ *start = buffer + (offset - begin); ++ len -= (offset - begin); ++ if(len > length) ++ len = length; ++ if(len < 0) ++ len = 0; ++ return len; ++} ++ ++void cpt_context_release(cpt_context_t *ctx) ++{ ++ list_del(&ctx->ctx_list); ++ spin_unlock(&cpt_context_lock); ++ ++ if (ctx->ctx_state > 0) ++ cpt_resume(ctx); ++ ctx->ctx_state = CPT_CTX_ERROR; ++ ++#ifdef CONFIG_VZ_CHECKPOINT_LAZY ++ if (ctx->pgin_task) ++ put_task_struct(ctx->pgin_task); ++ if (ctx->pgin_dir) ++ cpt_free_pgin_dir(ctx); ++ if (ctx->pagein_file_out) ++ fput(ctx->pagein_file_out); ++ if 
(ctx->pagein_file_in) ++ fput(ctx->pagein_file_in); ++#endif ++ if (ctx->objcount) ++ eprintk_ctx("%d objects leaked\n", ctx->objcount); ++ if (ctx->file) ++ fput(ctx->file); ++ cpt_flush_error(ctx); ++ if (ctx->errorfile) { ++ fput(ctx->errorfile); ++ ctx->errorfile = NULL; ++ } ++ if (ctx->error_msg) { ++ free_page((unsigned long)ctx->error_msg); ++ ctx->error_msg = NULL; ++ } ++ if (ctx->statusfile) ++ fput(ctx->statusfile); ++ if (ctx->lockfile) ++ fput(ctx->lockfile); ++ kfree(ctx); ++ ++ spin_lock(&cpt_context_lock); ++} ++ ++static void __cpt_context_put(cpt_context_t *ctx) ++{ ++ if (!--ctx->refcount) ++ cpt_context_release(ctx); ++} ++ ++static void cpt_context_put(cpt_context_t *ctx) ++{ ++ spin_lock(&cpt_context_lock); ++ __cpt_context_put(ctx); ++ spin_unlock(&cpt_context_lock); ++} ++ ++cpt_context_t * cpt_context_open(void) ++{ ++ cpt_context_t *ctx; ++ ++ if ((ctx = kmalloc(sizeof(*ctx), GFP_KERNEL)) != NULL) { ++ cpt_context_init(ctx); ++ spin_lock(&cpt_context_lock); ++ list_add_tail(&ctx->ctx_list, &cpt_context_list); ++ spin_unlock(&cpt_context_lock); ++ ctx->error_msg = (char*)__get_free_page(GFP_KERNEL); ++ if (ctx->error_msg != NULL) ++ ctx->error_msg[0] = 0; ++ } ++ return ctx; ++} ++ ++static cpt_context_t * cpt_context_lookup(unsigned int contextid) ++{ ++ cpt_context_t *ctx; ++ ++ spin_lock(&cpt_context_lock); ++ list_for_each_entry(ctx, &cpt_context_list, ctx_list) { ++ if (ctx->contextid == contextid) { ++ ctx->refcount++; ++ spin_unlock(&cpt_context_lock); ++ return ctx; ++ } ++ } ++ spin_unlock(&cpt_context_lock); ++ return NULL; ++} ++ ++int cpt_context_lookup_veid(unsigned int veid) ++{ ++ cpt_context_t *ctx; ++ ++ spin_lock(&cpt_context_lock); ++ list_for_each_entry(ctx, &cpt_context_list, ctx_list) { ++ if (ctx->ve_id == veid && ctx->ctx_state > 0) { ++ spin_unlock(&cpt_context_lock); ++ return 1; ++ } ++ } ++ spin_unlock(&cpt_context_lock); ++ return 0; ++} ++ ++static int cpt_ioctl(struct inode * inode, struct file * file, 
unsigned int cmd, unsigned long arg) ++{ ++ int err = 0; ++ cpt_context_t *ctx; ++ struct file *dfile = NULL; ++ ++ unlock_kernel(); ++ ++ if (cmd == CPT_VMPREP) { ++#ifdef CONFIG_VZ_CHECKPOINT_LAZY ++ err = cpt_mm_prepare(arg); ++#else ++ err = -EINVAL; ++#endif ++ goto out_lock; ++ } ++ ++ if (cmd == CPT_TEST_CAPS) { ++ unsigned int src_flags, dst_flags = arg; ++ ++ err = 0; ++ src_flags = test_cpu_caps(); ++ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_CMOV, "cmov", err); ++ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_FXSR, "fxsr", err); ++ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SSE, "sse", err); ++ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SSE2, "sse2", err); ++ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_MMX, "mmx", err); ++ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_3DNOW, "3dnow", err); ++ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_3DNOW2, "3dnowext", err); ++ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SEP, "sysenter", err); ++ goto out_lock; ++ } ++ ++ if (cmd == CPT_JOIN_CONTEXT || cmd == CPT_PUT_CONTEXT) { ++ cpt_context_t *old_ctx; ++ ++ ctx = NULL; ++ if (cmd == CPT_JOIN_CONTEXT) { ++ err = -ENOENT; ++ ctx = cpt_context_lookup(arg); ++ if (!ctx) ++ goto out_lock; ++ } ++ ++ spin_lock(&cpt_context_lock); ++ old_ctx = (cpt_context_t*)file->private_data; ++ file->private_data = ctx; ++ ++ if (old_ctx) { ++ if (cmd == CPT_PUT_CONTEXT && old_ctx->sticky) { ++ old_ctx->sticky = 0; ++ old_ctx->refcount--; ++ } ++ __cpt_context_put(old_ctx); ++ } ++ spin_unlock(&cpt_context_lock); ++ err = 0; ++ goto out_lock; ++ } ++ ++ spin_lock(&cpt_context_lock); ++ ctx = (cpt_context_t*)file->private_data; ++ if (ctx) ++ ctx->refcount++; ++ spin_unlock(&cpt_context_lock); ++ ++ if (!ctx) { ++ cpt_context_t *old_ctx; ++ ++ err = -ENOMEM; ++ ctx = cpt_context_open(); ++ if (!ctx) ++ goto out_lock; ++ ++ spin_lock(&cpt_context_lock); ++ old_ctx = (cpt_context_t*)file->private_data; ++ if (!old_ctx) { ++ ctx->refcount++; ++ 
file->private_data = ctx; ++ } else { ++ old_ctx->refcount++; ++ } ++ if (old_ctx) { ++ __cpt_context_put(ctx); ++ ctx = old_ctx; ++ } ++ spin_unlock(&cpt_context_lock); ++ } ++ ++ if (cmd == CPT_GET_CONTEXT) { ++ unsigned int contextid = (unsigned int)arg; ++ ++ if (ctx->contextid && ctx->contextid != contextid) { ++ err = -EINVAL; ++ goto out_nosem; ++ } ++ if (!ctx->contextid) { ++ cpt_context_t *c1 = cpt_context_lookup(contextid); ++ if (c1) { ++ cpt_context_put(c1); ++ err = -EEXIST; ++ goto out_nosem; ++ } ++ ctx->contextid = contextid; ++ } ++ spin_lock(&cpt_context_lock); ++ if (!ctx->sticky) { ++ ctx->sticky = 1; ++ ctx->refcount++; ++ } ++ spin_unlock(&cpt_context_lock); ++ goto out_nosem; ++ } ++ ++ down(&ctx->main_sem); ++ ++ err = -EBUSY; ++ if (ctx->ctx_state < 0) ++ goto out; ++ ++ err = 0; ++ switch (cmd) { ++ case CPT_SET_DUMPFD: ++ if (ctx->ctx_state == CPT_CTX_DUMPING) { ++ err = -EBUSY; ++ break; ++ } ++ if (arg >= 0) { ++ dfile = fget(arg); ++ if (IS_ERR(dfile)) { ++ err = PTR_ERR(dfile); ++ break; ++ } ++ if (dfile->f_op == NULL || ++ dfile->f_op->write == NULL) { ++ fput(dfile); ++ err = -EBADF; ++ break; ++ } ++ } ++ if (ctx->file) ++ fput(ctx->file); ++ ctx->file = dfile; ++ break; ++ case CPT_SET_ERRORFD: ++ if (arg >= 0) { ++ dfile = fget(arg); ++ if (IS_ERR(dfile)) { ++ err = PTR_ERR(dfile); ++ break; ++ } ++ } ++ if (ctx->errorfile) ++ fput(ctx->errorfile); ++ ctx->errorfile = dfile; ++ break; ++#ifdef CONFIG_VZ_CHECKPOINT_LAZY ++ case CPT_SET_PAGEINFDIN: ++ if (arg >= 0) { ++ dfile = fget(arg); ++ if (IS_ERR(dfile)) { ++ err = PTR_ERR(dfile); ++ break; ++ } ++ } ++ if (ctx->pagein_file_in) ++ fput(ctx->pagein_file_in); ++ ctx->pagein_file_in = dfile; ++ break; ++ case CPT_SET_PAGEINFDOUT: ++ if (arg >= 0) { ++ dfile = fget(arg); ++ if (IS_ERR(dfile)) { ++ err = PTR_ERR(dfile); ++ break; ++ } ++ } ++ if (ctx->pagein_file_out) ++ fput(ctx->pagein_file_out); ++ ctx->pagein_file_out = dfile; ++ break; ++ case CPT_SET_LAZY: ++ ctx->lazy_vm 
= arg; ++ break; ++ case CPT_PAGEIND: ++ err = cpt_start_pagein(ctx); ++ break; ++#endif ++ case CPT_SET_VEID: ++ if (ctx->ctx_state > 0) { ++ err = -EBUSY; ++ break; ++ } ++ ctx->ve_id = arg; ++ break; ++ case CPT_SET_CPU_FLAGS: ++ if (ctx->ctx_state > 0) { ++ err = -EBUSY; ++ break; ++ } ++ ctx->dst_cpu_flags = arg; ++ ctx->src_cpu_flags = test_cpu_caps(); ++ break; ++ case CPT_SUSPEND: ++ if (cpt_context_lookup_veid(ctx->ve_id) || ++ ctx->ctx_state > 0) { ++ err = -EBUSY; ++ break; ++ } ++ ctx->ctx_state = CPT_CTX_SUSPENDING; ++ err = cpt_vps_suspend(ctx); ++ if (err) { ++ if (cpt_resume(ctx) == 0) ++ ctx->ctx_state = CPT_CTX_IDLE; ++ } else { ++ ctx->ctx_state = CPT_CTX_SUSPENDED; ++ } ++ break; ++ case CPT_DUMP: ++ if (!ctx->ctx_state) { ++ err = -ENOENT; ++ break; ++ } ++ err = cpt_dump(ctx); ++ break; ++ case CPT_RESUME: ++ if (ctx->ctx_state == CPT_CTX_IDLE) { ++ err = -ENOENT; ++ break; ++ } ++ err = cpt_resume(ctx); ++ if (!err) ++ ctx->ctx_state = CPT_CTX_IDLE; ++ break; ++ case CPT_KILL: ++ if (ctx->ctx_state == CPT_CTX_IDLE) { ++ err = -ENOENT; ++ break; ++ } ++ err = cpt_kill(ctx); ++ if (!err) ++ ctx->ctx_state = CPT_CTX_IDLE; ++ break; ++ case CPT_TEST_VECAPS: ++ { ++ __u32 dst_flags = arg; ++ __u32 src_flags; ++ ++ err = cpt_vps_caps(ctx, &src_flags); ++ if (err) ++ break; ++ ++ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_CMOV, "cmov", err); ++ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_FXSR, "fxsr", err); ++ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SSE, "sse", err); ++ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SSE2, "sse2", err); ++ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_MMX, "mmx", err); ++ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_3DNOW, "3dnow", err); ++ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_3DNOW2, "3dnowext", err); ++ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_SEP, "sysenter", err); ++ test_one_flag(src_flags, dst_flags, CPT_CPU_X86_EMT64, "emt64", err); ++ test_one_flag(src_flags, 
dst_flags, CPT_CPU_X86_IA64, "ia64", err); ++ break; ++ } ++ default: ++ err = -EINVAL; ++ break; ++ } ++ ++out: ++ cpt_flush_error(ctx); ++ up(&ctx->main_sem); ++out_nosem: ++ cpt_context_put(ctx); ++out_lock: ++ lock_kernel(); ++ return err; ++} ++ ++static int cpt_open(struct inode *inode, struct file *file) ++{ ++ if (!try_module_get(THIS_MODULE)) ++ return -EBUSY; ++ ++ return 0; ++} ++ ++static int cpt_release(struct inode * inode, struct file * file) ++{ ++ cpt_context_t *ctx; ++ ++ spin_lock(&cpt_context_lock); ++ ctx = (cpt_context_t*)file->private_data; ++ file->private_data = NULL; ++ ++ if (ctx) ++ __cpt_context_put(ctx); ++ spin_unlock(&cpt_context_lock); ++ ++ module_put(THIS_MODULE); ++ return 0; ++} ++ ++ ++static struct file_operations cpt_fops = { ++ .owner = THIS_MODULE, ++ .open = cpt_open, ++ .release = cpt_release, ++ .ioctl = cpt_ioctl, ++}; ++ ++static struct proc_dir_entry *proc_ent; ++ ++static struct ctl_table_header *ctl_header; ++ ++static ctl_table debug_table[] = { ++ { ++ .ctl_name = 9475, ++ .procname = "cpt", ++ .data = &debug_level, ++ .maxlen = sizeof(debug_level), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { .ctl_name = 0 } ++}; ++static ctl_table root_table[] = { ++ { ++ .ctl_name = CTL_DEBUG, ++ .procname = "debug", ++ .mode = 0555, ++ .child = debug_table, ++ }, ++ { .ctl_name = 0 } ++}; ++ ++static int __init init_cpt(void) ++{ ++ int err; ++ ++ err = -ENOMEM; ++ ctl_header = register_sysctl_table(root_table, 0); ++ if (!ctl_header) ++ goto err_mon; ++ ++ spin_lock_init(&cpt_context_lock); ++ INIT_LIST_HEAD(&cpt_context_list); ++ ++ err = -EINVAL; ++ proc_ent = create_proc_entry("cpt", 0600, NULL); ++ if (!proc_ent) ++ goto err_out; ++ ++ cpt_fops.read = proc_ent->proc_fops->read; ++ cpt_fops.write = proc_ent->proc_fops->write; ++ cpt_fops.llseek = proc_ent->proc_fops->llseek; ++ proc_ent->proc_fops = &cpt_fops; ++ ++ proc_ent->read_proc = proc_read; ++ proc_ent->data = NULL; ++ proc_ent->owner = 
THIS_MODULE; ++ return 0; ++ ++err_out: ++ unregister_sysctl_table(ctl_header); ++err_mon: ++ return err; ++} ++module_init(init_cpt); ++ ++static void __exit exit_cpt(void) ++{ ++ remove_proc_entry("cpt", NULL); ++ unregister_sysctl_table(ctl_header); ++ ++ spin_lock(&cpt_context_lock); ++ while (!list_empty(&cpt_context_list)) { ++ cpt_context_t *ctx; ++ ctx = list_entry(cpt_context_list.next, cpt_context_t, ctx_list); ++ ++ if (!ctx->sticky) ++ ctx->refcount++; ++ ctx->sticky = 0; ++ ++ BUG_ON(ctx->refcount != 1); ++ ++ __cpt_context_put(ctx); ++ } ++ spin_unlock(&cpt_context_lock); ++} ++module_exit(exit_cpt); +diff -upr linux-2.6.16.orig/kernel/cpt/cpt_process.c linux-2.6.16-026test015/kernel/cpt/cpt_process.c +--- linux-2.6.16.orig/kernel/cpt/cpt_process.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/cpt/cpt_process.c 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,986 @@ ++/* ++ * ++ * kernel/cpt/cpt_process.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#include <linux/config.h> ++#include <linux/version.h> ++#include <linux/module.h> ++#include <linux/init.h> ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/slab.h> ++#include <linux/file.h> ++#include <linux/mm.h> ++#include <linux/errno.h> ++#include <linux/compat.h> ++#include <linux/cpt_image.h> ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++#include "cpt_ubc.h" ++#include "cpt_process.h" ++#include "cpt_kernel.h" ++ ++#ifdef CONFIG_X86_32 ++#undef task_pt_regs ++#define task_pt_regs(t) ((struct pt_regs *)((t)->thread.esp0) - 1) ++#endif ++ ++static u32 encode_segment(u32 segreg) ++{ ++ segreg &= 0xFFFF; ++ ++ if (segreg == 0) ++ return CPT_SEG_ZERO; ++ if ((segreg & 3) != 3) { ++ wprintk("Invalid RPL of a segment reg %x\n", segreg); ++ return CPT_SEG_ZERO; ++ } ++ ++ /* LDT descriptor, it is just an index to LDT array */ ++ if (segreg & 4) ++ return CPT_SEG_LDT + (segreg >> 3); ++ ++ /* TLS descriptor. */ ++ if ((segreg >> 3) >= GDT_ENTRY_TLS_MIN && ++ (segreg >> 3) <= GDT_ENTRY_TLS_MAX) ++ return CPT_SEG_TLS1 + ((segreg>>3) - GDT_ENTRY_TLS_MIN); ++ ++ /* One of standard desriptors */ ++#ifdef CONFIG_X86_64 ++ if (segreg == __USER32_DS) ++ return CPT_SEG_USER32_DS; ++ if (segreg == __USER32_CS) ++ return CPT_SEG_USER32_CS; ++ if (segreg == __USER_DS) ++ return CPT_SEG_USER64_DS; ++ if (segreg == __USER_CS) ++ return CPT_SEG_USER64_CS; ++#else ++ if (segreg == __USER_DS) ++ return CPT_SEG_USER32_DS; ++ if (segreg == __USER_CS) ++ return CPT_SEG_USER32_CS; ++#endif ++ wprintk("Invalid segment reg %x\n", segreg); ++ return CPT_SEG_ZERO; ++} ++ ++#ifdef CONFIG_X86_64 ++static void xlate_ptregs_64_to_32(struct cpt_x86_regs *d, struct pt_regs *s, task_t *tsk) ++{ ++ d->cpt_ebp = s->rbp; ++ d->cpt_ebx = s->rbx; ++ d->cpt_eax = s->rax; ++ d->cpt_ecx = s->rcx; ++ d->cpt_edx = s->rdx; ++ d->cpt_esi = s->rsi; ++ d->cpt_edi = s->rdi; ++ d->cpt_orig_eax = s->orig_rax; ++ d->cpt_eip = s->rip; ++ d->cpt_xcs = 
encode_segment(s->cs); ++ d->cpt_eflags = s->eflags; ++ d->cpt_esp = s->rsp; ++ d->cpt_xss = encode_segment(s->ss); ++ d->cpt_xds = encode_segment(tsk->thread.ds); ++ d->cpt_xes = encode_segment(tsk->thread.es); ++} ++ ++static int dump_registers(task_t *tsk, struct cpt_context *ctx) ++{ ++ cpt_open_object(NULL, ctx); ++ ++ if (tsk->thread_info->flags&_TIF_IA32) { ++ struct cpt_x86_regs ri; ++ ri.cpt_next = sizeof(ri); ++ ri.cpt_object = CPT_OBJ_X86_REGS; ++ ri.cpt_hdrlen = sizeof(ri); ++ ri.cpt_content = CPT_CONTENT_VOID; ++ ++ ri.cpt_debugreg[0] = tsk->thread.debugreg0; ++ ri.cpt_debugreg[1] = tsk->thread.debugreg1; ++ ri.cpt_debugreg[2] = tsk->thread.debugreg2; ++ ri.cpt_debugreg[3] = tsk->thread.debugreg3; ++ ri.cpt_debugreg[4] = 0; ++ ri.cpt_debugreg[5] = 0; ++ ri.cpt_debugreg[6] = tsk->thread.debugreg6; ++ ri.cpt_debugreg[7] = tsk->thread.debugreg7; ++ ri.cpt_fs = encode_segment(tsk->thread.fsindex); ++ ri.cpt_gs = encode_segment(tsk->thread.gsindex); ++ ++ xlate_ptregs_64_to_32(&ri, task_pt_regs(tsk), tsk); ++ ++ ctx->write(&ri, sizeof(ri), ctx); ++ } else { ++ struct cpt_x86_64_regs ri; ++ ri.cpt_next = sizeof(ri); ++ ri.cpt_object = CPT_OBJ_X86_64_REGS; ++ ri.cpt_hdrlen = sizeof(ri); ++ ri.cpt_content = CPT_CONTENT_VOID; ++ ++ ri.cpt_fsbase = tsk->thread.fs; ++ ri.cpt_gsbase = tsk->thread.gs; ++ ri.cpt_fsindex = encode_segment(tsk->thread.fsindex); ++ ri.cpt_gsindex = encode_segment(tsk->thread.gsindex); ++ ri.cpt_ds = encode_segment(tsk->thread.ds); ++ ri.cpt_es = encode_segment(tsk->thread.es); ++ ri.cpt_debugreg[0] = tsk->thread.debugreg0; ++ ri.cpt_debugreg[1] = tsk->thread.debugreg1; ++ ri.cpt_debugreg[2] = tsk->thread.debugreg2; ++ ri.cpt_debugreg[3] = tsk->thread.debugreg3; ++ ri.cpt_debugreg[4] = 0; ++ ri.cpt_debugreg[5] = 0; ++ ri.cpt_debugreg[6] = tsk->thread.debugreg6; ++ ri.cpt_debugreg[7] = tsk->thread.debugreg7; ++ ++ memcpy(&ri.cpt_r15, task_pt_regs(tsk), sizeof(struct pt_regs)); ++ ++ ri.cpt_cs = encode_segment(task_pt_regs(tsk)->cs); ++ 
ri.cpt_ss = encode_segment(task_pt_regs(tsk)->ss); ++ ++ ctx->write(&ri, sizeof(ri), ctx); ++ ++#if 0 ++ if (ri.cpt_rip >= VSYSCALL_START && ri.cpt_rip < VSYSCALL_END) { ++ eprintk_ctx(CPT_FID "cannot be checkpointied while vsyscall, try later\n", CPT_TID(tsk)); ++ return -EAGAIN; ++ } ++#endif ++ } ++ cpt_close_object(ctx); ++ ++ return 0; ++} ++ ++#else ++ ++static int dump_registers(task_t *tsk, struct cpt_context *ctx) ++{ ++ struct cpt_x86_regs ri; ++ ++ cpt_open_object(NULL, ctx); ++ ++ ri.cpt_next = sizeof(ri); ++ ri.cpt_object = CPT_OBJ_X86_REGS; ++ ri.cpt_hdrlen = sizeof(ri); ++ ri.cpt_content = CPT_CONTENT_VOID; ++ ++ ri.cpt_debugreg[0] = tsk->thread.debugreg[0]; ++ ri.cpt_debugreg[1] = tsk->thread.debugreg[1]; ++ ri.cpt_debugreg[2] = tsk->thread.debugreg[2]; ++ ri.cpt_debugreg[3] = tsk->thread.debugreg[3]; ++ ri.cpt_debugreg[4] = tsk->thread.debugreg[4]; ++ ri.cpt_debugreg[5] = tsk->thread.debugreg[5]; ++ ri.cpt_debugreg[6] = tsk->thread.debugreg[6]; ++ ri.cpt_debugreg[7] = tsk->thread.debugreg[7]; ++ ri.cpt_fs = encode_segment(tsk->thread.fs); ++ ri.cpt_gs = encode_segment(tsk->thread.gs); ++ ++ memcpy(&ri.cpt_ebx, task_pt_regs(tsk), sizeof(struct pt_regs)); ++ ++ ri.cpt_xcs = encode_segment(task_pt_regs(tsk)->xcs); ++ ri.cpt_xss = encode_segment(task_pt_regs(tsk)->xss); ++ ri.cpt_xds = encode_segment(task_pt_regs(tsk)->xds); ++ ri.cpt_xes = encode_segment(task_pt_regs(tsk)->xes); ++ ++ ctx->write(&ri, sizeof(ri), ctx); ++ cpt_close_object(ctx); ++ ++ return 0; ++} ++#endif ++ ++static int dump_kstack(task_t *tsk, struct cpt_context *ctx) ++{ ++ struct cpt_obj_bits hdr; ++ unsigned long size; ++ void *start; ++ ++ cpt_open_object(NULL, ctx); ++ ++#ifdef CONFIG_X86_64 ++ size = tsk->thread.rsp0 - tsk->thread.rsp; ++ start = (void*)tsk->thread.rsp; ++#else ++ size = tsk->thread.esp0 - tsk->thread.esp; ++ start = (void*)tsk->thread.esp; ++#endif ++ ++ hdr.cpt_next = sizeof(hdr) + CPT_ALIGN(size); ++ hdr.cpt_object = CPT_OBJ_BITS; ++ hdr.cpt_hdrlen = 
sizeof(hdr); ++ hdr.cpt_content = CPT_CONTENT_STACK; ++ hdr.cpt_size = size; ++ ++ ctx->write(&hdr, sizeof(hdr), ctx); ++ ctx->write(start, size, ctx); ++ ctx->align(ctx); ++ cpt_close_object(ctx); ++ return 0; ++} ++ ++/* Formats of i387_fxsave_struct are the same for x86_64 ++ * and i386. Plain luck. */ ++ ++static int dump_fpustate(task_t *tsk, struct cpt_context *ctx) ++{ ++ struct cpt_obj_bits hdr; ++ unsigned long size; ++ int type; ++ ++ cpt_open_object(NULL, ctx); ++ ++ type = CPT_CONTENT_X86_FPUSTATE; ++ size = sizeof(struct i387_fxsave_struct); ++#ifndef CONFIG_X86_64 ++ if (!cpu_has_fxsr) { ++ size = sizeof(struct i387_fsave_struct); ++ type = CPT_CONTENT_X86_FPUSTATE_OLD; ++ } ++#endif ++ ++ hdr.cpt_next = sizeof(hdr) + CPT_ALIGN(size); ++ hdr.cpt_object = CPT_OBJ_BITS; ++ hdr.cpt_hdrlen = sizeof(hdr); ++ hdr.cpt_content = type; ++ hdr.cpt_size = size; ++ ++ ctx->write(&hdr, sizeof(hdr), ctx); ++ ctx->write(&tsk->thread.i387, size, ctx); ++ ctx->align(ctx); ++ cpt_close_object(ctx); ++ return 0; ++} ++ ++static int encode_siginfo(struct cpt_siginfo_image *si, siginfo_t *info) ++{ ++ si->cpt_signo = info->si_signo; ++ si->cpt_errno = info->si_errno; ++ si->cpt_code = info->si_code; ++ ++ switch(si->cpt_code & __SI_MASK) { ++ case __SI_TIMER: ++ si->cpt_pid = info->si_tid; ++ si->cpt_uid = info->si_overrun; ++ si->cpt_sigval = cpt_ptr_export(info->_sifields._timer._sigval.sival_ptr); ++ si->cpt_utime = info->si_sys_private; ++ break; ++ case __SI_POLL: ++ si->cpt_pid = info->si_band; ++ si->cpt_uid = info->si_fd; ++ break; ++ case __SI_FAULT: ++ si->cpt_sigval = cpt_ptr_export(info->si_addr); ++#ifdef __ARCH_SI_TRAPNO ++ si->cpt_pid = info->si_trapno; ++#endif ++ break; ++ case __SI_CHLD: ++ si->cpt_pid = is_virtual_pid(info->si_pid) ? 
info->si_pid : pid_type_to_vpid(PIDTYPE_PID, info->si_pid); ++ si->cpt_uid = info->si_uid; ++ si->cpt_sigval = info->si_status; ++ si->cpt_stime = info->si_stime; ++ si->cpt_utime = info->si_utime; ++ break; ++ case __SI_KILL: ++ case __SI_RT: ++ case __SI_MESGQ: ++ default: ++ si->cpt_pid = is_virtual_pid(info->si_pid) ? info->si_pid : pid_type_to_vpid(PIDTYPE_TGID, info->si_pid); ++ si->cpt_uid = info->si_uid; ++ si->cpt_sigval = cpt_ptr_export(info->si_ptr); ++ break; ++ } ++ return 0; ++} ++ ++static int dump_sigqueue(struct sigpending *list, struct cpt_context *ctx) ++{ ++ struct sigqueue *q; ++ loff_t saved_obj; ++ ++ if (list_empty(&list->list)) ++ return 0; ++ ++ cpt_push_object(&saved_obj, ctx); ++ list_for_each_entry(q, &list->list, list) { ++ struct cpt_siginfo_image si; ++ ++ si.cpt_next = sizeof(si); ++ si.cpt_object = CPT_OBJ_SIGINFO; ++ si.cpt_hdrlen = sizeof(si); ++ si.cpt_content = CPT_CONTENT_VOID; ++ ++ si.cpt_qflags = q->flags; ++ si.cpt_user = q->user->uid; ++ ++ if (encode_siginfo(&si, &q->info)) ++ return -EINVAL; ++ ++ ctx->write(&si, sizeof(si), ctx); ++ } ++ cpt_pop_object(&saved_obj, ctx); ++ return 0; ++} ++ ++ ++ ++static int dump_one_signal_struct(cpt_object_t *obj, struct cpt_context *ctx) ++{ ++ struct signal_struct *sig = obj->o_obj; ++ struct cpt_signal_image *v = cpt_get_buf(ctx); ++ task_t *tsk; ++ int i; ++ ++ cpt_open_object(obj, ctx); ++ ++ v->cpt_next = CPT_NULL; ++ v->cpt_object = CPT_OBJ_SIGNAL_STRUCT; ++ v->cpt_hdrlen = sizeof(*v); ++ v->cpt_content = CPT_CONTENT_ARRAY; ++ ++ if (sig->pgrp <= 0) { ++ eprintk_ctx("bad pgid\n"); ++ cpt_release_buf(ctx); ++ return -EINVAL; ++ } ++ v->cpt_pgrp_type = CPT_PGRP_NORMAL; ++ read_lock(&tasklist_lock); ++ tsk = find_task_by_pid_type_ve(PIDTYPE_PID, sig->pgrp); ++ if (tsk == NULL) ++ v->cpt_pgrp_type = CPT_PGRP_ORPHAN; ++ read_unlock(&tasklist_lock); ++ v->cpt_pgrp = pid_type_to_vpid(PIDTYPE_PGID, sig->pgrp); ++ ++ v->cpt_old_pgrp = 0; ++ if (sig->tty_old_pgrp < 0) { ++ 
eprintk_ctx("bad tty_old_pgrp\n"); ++ cpt_release_buf(ctx); ++ return -EINVAL; ++ } ++ if (sig->tty_old_pgrp > 0) { ++ v->cpt_old_pgrp_type = CPT_PGRP_NORMAL; ++ read_lock(&tasklist_lock); ++ tsk = find_task_by_pid_type_ve(PIDTYPE_PID, sig->tty_old_pgrp); ++ if (tsk == NULL) { ++ v->cpt_old_pgrp_type = CPT_PGRP_ORPHAN; ++ tsk = find_task_by_pid_type_ve(PIDTYPE_PGID, sig->tty_old_pgrp); ++ } ++ read_unlock(&tasklist_lock); ++ if (tsk == NULL) { ++ eprintk_ctx("tty_old_pgrp does not exist anymore\n"); ++ cpt_release_buf(ctx); ++ return -EINVAL; ++ } ++ v->cpt_old_pgrp = _pid_type_to_vpid(PIDTYPE_PGID, sig->tty_old_pgrp); ++ if ((int)v->cpt_old_pgrp < 0) { ++ dprintk_ctx("stray tty_old_pgrp %d\n", sig->tty_old_pgrp); ++ v->cpt_old_pgrp = -1; ++ v->cpt_old_pgrp_type = CPT_PGRP_STRAY; ++ } ++ } ++ ++ if (sig->session <= 0) { ++ eprintk_ctx("bad session\n"); ++ cpt_release_buf(ctx); ++ return -EINVAL; ++ } ++ v->cpt_session_type = CPT_PGRP_NORMAL; ++ read_lock(&tasklist_lock); ++ tsk = find_task_by_pid_type_ve(PIDTYPE_PID, sig->session); ++ if (tsk == NULL) ++ v->cpt_session_type = CPT_PGRP_ORPHAN; ++ read_unlock(&tasklist_lock); ++ v->cpt_session = pid_type_to_vpid(PIDTYPE_SID, sig->session); ++ ++ v->cpt_leader = sig->leader; ++ v->cpt_ctty = CPT_NULL; ++ if (sig->tty) { ++ cpt_object_t *cobj = lookup_cpt_object(CPT_OBJ_TTY, sig->tty, ctx); ++ if (cobj) ++ v->cpt_ctty = cobj->o_pos; ++ else { ++ eprintk_ctx("controlling tty is not found\n"); ++ cpt_release_buf(ctx); ++ return -EINVAL; ++ } ++ } ++ memcpy(&v->cpt_sigpending, &sig->shared_pending.signal, 8); ++ ++ v->cpt_curr_target = 0; ++ if (sig->curr_target) ++ v->cpt_curr_target = virt_pid(sig->curr_target); ++ v->cpt_group_exit = ((sig->flags & SIGNAL_GROUP_EXIT) != 0); ++ v->cpt_group_exit_code = sig->group_exit_code; ++ v->cpt_group_exit_task = 0; ++ if (sig->group_exit_task) ++ v->cpt_group_exit_task = virt_pid(sig->group_exit_task); ++ v->cpt_notify_count = sig->notify_count; ++ v->cpt_group_stop_count = 
sig->group_stop_count; ++ ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,8) ++ v->cpt_utime = sig->utime; ++ v->cpt_stime = sig->stime; ++ v->cpt_cutime = sig->cutime; ++ v->cpt_cstime = sig->cstime; ++ v->cpt_nvcsw = sig->nvcsw; ++ v->cpt_nivcsw = sig->nivcsw; ++ v->cpt_cnvcsw = sig->cnvcsw; ++ v->cpt_cnivcsw = sig->cnivcsw; ++ v->cpt_min_flt = sig->min_flt; ++ v->cpt_maj_flt = sig->maj_flt; ++ v->cpt_cmin_flt = sig->cmin_flt; ++ v->cpt_cmaj_flt = sig->cmaj_flt; ++ ++ if (RLIM_NLIMITS > CPT_RLIM_NLIMITS) ++ __asm__("undefined\n"); ++ ++ for (i=0; i<CPT_RLIM_NLIMITS; i++) { ++ if (i < RLIM_NLIMITS) { ++ v->cpt_rlim_cur[i] = sig->rlim[i].rlim_cur; ++ v->cpt_rlim_max[i] = sig->rlim[i].rlim_max; ++ } else { ++ v->cpt_rlim_cur[i] = CPT_NULL; ++ v->cpt_rlim_max[i] = CPT_NULL; ++ } ++ } ++#endif ++ ++ ctx->write(v, sizeof(*v), ctx); ++ cpt_release_buf(ctx); ++ ++ dump_sigqueue(&sig->shared_pending, ctx); ++ ++ cpt_close_object(ctx); ++ return 0; ++} ++ ++ ++static int dump_one_process(cpt_object_t *obj, struct cpt_context *ctx) ++{ ++ task_t *tsk = obj->o_obj; ++ int last_thread; ++ struct cpt_task_image *v = cpt_get_buf(ctx); ++ cpt_object_t *tobj; ++ cpt_object_t *tg_obj; ++ loff_t saved_obj; ++ int i; ++ int err; ++ struct timespec delta; ++ ++ cpt_open_object(obj, ctx); ++ ++ v->cpt_signal = CPT_NULL; ++ tg_obj = lookup_cpt_object(CPT_OBJ_SIGNAL_STRUCT, tsk->signal, ctx); ++ if (!tg_obj) BUG(); ++ ++ v->cpt_next = CPT_NULL; ++ v->cpt_object = CPT_OBJ_TASK; ++ v->cpt_hdrlen = sizeof(*v); ++ v->cpt_content = CPT_CONTENT_ARRAY; ++ ++ v->cpt_state = tsk->state; ++ if (tsk->state == EXIT_ZOMBIE) { ++ eprintk_ctx("invalid zombie state on" CPT_FID "\n", CPT_TID(tsk)); ++ cpt_release_buf(ctx); ++ return -EINVAL; ++ } else if (tsk->state == EXIT_DEAD) { ++ if (tsk->exit_state != EXIT_DEAD && ++ tsk->exit_state != EXIT_ZOMBIE) { ++ eprintk_ctx("invalid exit_state %ld on" CPT_FID "\n", tsk->exit_state, CPT_TID(tsk)); ++ cpt_release_buf(ctx); ++ return -EINVAL; ++ } ++ } ++ if 
(tsk->exit_state) { ++ v->cpt_state = tsk->exit_state; ++ if (tsk->state != EXIT_DEAD) { ++ eprintk_ctx("invalid tsk->state %ld/%ld on" CPT_FID "\n", ++ tsk->state, tsk->exit_state, CPT_TID(tsk)); ++ cpt_release_buf(ctx); ++ return -EINVAL; ++ } ++ } ++ v->cpt_flags = tsk->flags&~PF_FROZEN; ++ v->cpt_ptrace = tsk->ptrace; ++ v->cpt_prio = tsk->prio; ++ v->cpt_exit_code = tsk->exit_code; ++ v->cpt_exit_signal = tsk->exit_signal; ++ v->cpt_pdeath_signal = tsk->pdeath_signal; ++ v->cpt_static_prio = tsk->static_prio; ++ v->cpt_rt_priority = tsk->rt_priority; ++ v->cpt_policy = tsk->policy; ++ if (v->cpt_policy != SCHED_NORMAL) { ++ eprintk_ctx("scheduler policy is not supported %d/%d(%s)\n", virt_pid(tsk), tsk->pid, tsk->comm); ++ cpt_release_buf(ctx); ++ return -EINVAL; ++ } ++ ++ v->cpt_mm = CPT_NULL; ++ if (tsk->mm) { ++ tobj = lookup_cpt_object(CPT_OBJ_MM, tsk->mm, ctx); ++ if (!tobj) BUG(); ++ v->cpt_mm = tobj->o_pos; ++ } ++ v->cpt_files = CPT_NULL; ++ if (tsk->files) { ++ tobj = lookup_cpt_object(CPT_OBJ_FILES, tsk->files, ctx); ++ if (!tobj) BUG(); ++ v->cpt_files = tobj->o_pos; ++ } ++ v->cpt_fs = CPT_NULL; ++ if (tsk->fs) { ++ tobj = lookup_cpt_object(CPT_OBJ_FS, tsk->fs, ctx); ++ if (!tobj) BUG(); ++ v->cpt_fs = tobj->o_pos; ++ } ++ v->cpt_namespace = CPT_NULL; ++ if (tsk->namespace) { ++ tobj = lookup_cpt_object(CPT_OBJ_NAMESPACE, tsk->namespace, ctx); ++ if (!tobj) BUG(); ++ v->cpt_namespace = tobj->o_pos; ++ ++ if (tsk->namespace != current->namespace) ++ eprintk_ctx("namespaces are not supported: process %d/%d(%s)\n", virt_pid(tsk), tsk->pid, tsk->comm); ++ } ++ v->cpt_sysvsem_undo = CPT_NULL; ++ if (tsk->sysvsem.undo_list && !tsk->exit_state) { ++ tobj = lookup_cpt_object(CPT_OBJ_SYSVSEM_UNDO, tsk->sysvsem.undo_list, ctx); ++ if (!tobj) BUG(); ++ v->cpt_sysvsem_undo = tobj->o_pos; ++ } ++ v->cpt_sighand = CPT_NULL; ++ if (tsk->sighand) { ++ tobj = lookup_cpt_object(CPT_OBJ_SIGHAND_STRUCT, tsk->sighand, ctx); ++ if (!tobj) BUG(); ++ v->cpt_sighand = 
tobj->o_pos; ++ } ++ v->cpt_sigblocked = cpt_sigset_export(&tsk->blocked); ++ v->cpt_sigrblocked = cpt_sigset_export(&tsk->real_blocked); ++ v->cpt_sigsuspend_blocked = cpt_sigset_export(&tsk->saved_sigmask); ++ ++ v->cpt_pid = virt_pid(tsk); ++ v->cpt_tgid = virt_tgid(tsk); ++ v->cpt_ppid = 0; ++ if (tsk->parent) { ++ if (tsk->parent != tsk->real_parent && ++ !lookup_cpt_object(CPT_OBJ_TASK, tsk->parent, ctx)) { ++ eprintk_ctx("task %d/%d(%s) is ptraced from ve0\n", tsk->pid, virt_pid(tsk), tsk->comm); ++ cpt_release_buf(ctx); ++ return -EBUSY; ++ } ++ v->cpt_ppid = virt_pid(tsk->parent); ++ } ++ v->cpt_rppid = tsk->real_parent ? virt_pid(tsk->real_parent) : 0; ++ v->cpt_pgrp = virt_pgid(tsk); ++ v->cpt_session = virt_sid(tsk); ++ v->cpt_old_pgrp = 0; ++ if (tsk->signal->tty_old_pgrp) ++ v->cpt_old_pgrp = _pid_type_to_vpid(PIDTYPE_PGID, tsk->signal->tty_old_pgrp); ++ v->cpt_leader = tsk->group_leader ? virt_pid(tsk->group_leader) : 0; ++ v->cpt_set_tid = (unsigned long)tsk->set_child_tid; ++ v->cpt_clear_tid = (unsigned long)tsk->clear_child_tid; ++ memcpy(v->cpt_comm, tsk->comm, 16); ++ v->cpt_user = tsk->user->uid; ++ v->cpt_uid = tsk->uid; ++ v->cpt_euid = tsk->euid; ++ v->cpt_suid = tsk->suid; ++ v->cpt_fsuid = tsk->fsuid; ++ v->cpt_gid = tsk->gid; ++ v->cpt_egid = tsk->egid; ++ v->cpt_sgid = tsk->sgid; ++ v->cpt_fsgid = tsk->fsgid; ++ v->cpt_ngids = 0; ++ if (tsk->group_info && tsk->group_info->ngroups != 0) { ++ int i = tsk->group_info->ngroups; ++ if (i > 32) { ++ /* Shame... I did a simplified version and _forgot_ ++ * about this. Later, later. 
*/ ++ eprintk_ctx("too many of groups " CPT_FID "\n", CPT_TID(tsk)); ++ return -EINVAL; ++ } ++ v->cpt_ngids = i; ++ for (i--; i>=0; i--) ++ v->cpt_gids[i] = tsk->group_info->small_block[i]; ++ } ++ memcpy(&v->cpt_ecap, &tsk->cap_effective, 8); ++ memcpy(&v->cpt_icap, &tsk->cap_inheritable, 8); ++ memcpy(&v->cpt_pcap, &tsk->cap_permitted, 8); ++ v->cpt_keepcap = tsk->keep_capabilities; ++ ++ v->cpt_did_exec = tsk->did_exec; ++ v->cpt_exec_domain = -1; ++ v->cpt_thrflags = tsk->thread_info->flags & ~(1<<TIF_FREEZE); ++ v->cpt_64bit = 0; ++#ifdef CONFIG_X86_64 ++ /* Clear x86_64 specific flags */ ++ v->cpt_thrflags &= ~(_TIF_FORK|_TIF_ABI_PENDING|_TIF_IA32); ++ if (!(tsk->thread_info->flags & _TIF_IA32)) { ++ ctx->tasks64++; ++ v->cpt_64bit = 1; ++ } ++#endif ++ v->cpt_thrstatus = tsk->thread_info->status; ++ v->cpt_addr_limit = -1; ++ ++ v->cpt_personality = tsk->personality; ++ ++ for (i=0; i<GDT_ENTRY_TLS_ENTRIES; i++) { ++ if (i>=3) { ++ eprintk_ctx("too many tls descs\n"); ++ cpt_release_buf(ctx); ++ return -EINVAL; ++ } ++#ifndef CONFIG_X86_64 ++ v->cpt_tls[i] = (((u64)tsk->thread.tls_array[i].b)<<32) + tsk->thread.tls_array[i].a; ++#else ++ v->cpt_tls[i] = tsk->thread.tls_array[i]; ++#endif ++ } ++ ++ v->cpt_restart.fn = CPT_RBL_0; ++ if (tsk->thread_info->restart_block.fn != current->thread_info->restart_block.fn) { ++ if (tsk->thread_info->restart_block.fn != nanosleep_restart ++#ifdef CONFIG_X86_64 ++ && tsk->thread_info->restart_block.fn != compat_nanosleep_restart ++#endif ++ ) { ++ eprintk_ctx("unknown restart block %p\n", tsk->thread_info->restart_block.fn); ++ cpt_release_buf(ctx); ++ return -EINVAL; ++ } ++ v->cpt_restart.fn = CPT_RBL_NANOSLEEP; ++#ifdef CONFIG_X86_64 ++ if (tsk->thread_info->restart_block.fn == compat_nanosleep_restart) ++ v->cpt_restart.fn = CPT_RBL_COMPAT_NANOSLEEP; ++#endif ++ v->cpt_restart.arg0 = tsk->thread_info->restart_block.arg0; ++ v->cpt_restart.arg1 = tsk->thread_info->restart_block.arg1; ++ v->cpt_restart.arg2 = 
tsk->thread_info->restart_block.arg2; ++ v->cpt_restart.arg3 = tsk->thread_info->restart_block.arg3; ++ if (debug_level > 2) { ++ ktime_t e, e1; ++ struct timespec now; ++ ++ do_posix_clock_monotonic_gettime(&now); ++ e = timespec_to_ktime(now); ++ e1.tv64 = ((u64)tsk->thread_info->restart_block.arg1 << 32) | (u64) tsk->thread_info->restart_block.arg0; ++ e = ktime_sub(e1, e); ++ dprintk("cpt " CPT_FID " RBL %ld/%ld %Ld\n", CPT_TID(tsk), ++ tsk->thread_info->restart_block.arg1, ++ tsk->thread_info->restart_block.arg0, e.tv64); ++ } ++ } ++ ++ v->cpt_it_real_incr = 0; ++ v->cpt_it_prof_incr = 0; ++ v->cpt_it_virt_incr = 0; ++ v->cpt_it_real_value = 0; ++ v->cpt_it_prof_value = 0; ++ v->cpt_it_virt_value = 0; ++ if (thread_group_leader(tsk) && tsk->exit_state == 0) { ++ ktime_t rem; ++ ++ v->cpt_it_real_incr = ktime_to_ns(tsk->signal->it_real_incr); ++ v->cpt_it_prof_incr = tsk->signal->it_prof_incr; ++ v->cpt_it_virt_incr = tsk->signal->it_virt_incr; ++ ++ rem = hrtimer_get_remaining(&tsk->signal->real_timer); ++ ++ if (hrtimer_active(&tsk->signal->real_timer)) { ++ if (rem.tv64 <= 0) ++ rem.tv64 = NSEC_PER_USEC; ++ v->cpt_it_real_value = ktime_to_ns(rem); ++ dprintk("cpt itimer " CPT_FID " %Lu\n", CPT_TID(tsk), v->cpt_it_real_value); ++ } ++ v->cpt_it_prof_value = tsk->signal->it_prof_expires; ++ v->cpt_it_virt_value = tsk->signal->it_virt_expires; ++ } ++ v->cpt_used_math = (tsk_used_math(tsk) != 0); ++ ++ if (tsk->notifier) { ++ eprintk_ctx("task notifier is in use: process %d/%d(%s)\n", virt_pid(tsk), tsk->pid, tsk->comm); ++ cpt_release_buf(ctx); ++ return -EINVAL; ++ } ++ ++ v->cpt_utime = tsk->utime; ++ v->cpt_stime = tsk->stime; ++ delta = tsk->start_time; ++ _set_normalized_timespec(&delta, ++ delta.tv_sec - get_exec_env()->init_entry->start_time.tv_sec, ++ delta.tv_nsec - get_exec_env()->init_entry->start_time.tv_nsec); ++ v->cpt_starttime = cpt_timespec_export(&delta); ++ v->cpt_nvcsw = tsk->nvcsw; ++ v->cpt_nivcsw = tsk->nivcsw; ++ v->cpt_min_flt = 
tsk->min_flt; ++ v->cpt_maj_flt = tsk->maj_flt; ++ ++#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,8) ++ v->cpt_cutime = tsk->cutime; ++ v->cpt_cstime = tsk->cstime; ++ v->cpt_cnvcsw = tsk->cnvcsw; ++ v->cpt_cnivcsw = tsk->cnivcsw; ++ v->cpt_cmin_flt = tsk->cmin_flt; ++ v->cpt_cmaj_flt = tsk->cmaj_flt; ++ ++ if (RLIM_NLIMITS > CPT_RLIM_NLIMITS) ++ __asm__("undefined\n"); ++ ++ for (i=0; i<CPT_RLIM_NLIMITS; i++) { ++ if (i < RLIM_NLIMITS) { ++ v->cpt_rlim_cur[i] = tsk->rlim[i].rlim_cur; ++ v->cpt_rlim_max[i] = tsk->rlim[i].rlim_max; ++ } else { ++ v->cpt_rlim_cur[i] = CPT_NULL; ++ v->cpt_rlim_max[i] = CPT_NULL; ++ } ++ } ++#else ++ v->cpt_cutime = tsk->signal->cutime; ++ v->cpt_cstime = tsk->signal->cstime; ++ v->cpt_cnvcsw = tsk->signal->cnvcsw; ++ v->cpt_cnivcsw = tsk->signal->cnivcsw; ++ v->cpt_cmin_flt = tsk->signal->cmin_flt; ++ v->cpt_cmaj_flt = tsk->signal->cmaj_flt; ++ ++ if (RLIM_NLIMITS > CPT_RLIM_NLIMITS) ++ __asm__("undefined\n"); ++ ++ for (i=0; i<CPT_RLIM_NLIMITS; i++) { ++ if (i < RLIM_NLIMITS) { ++ v->cpt_rlim_cur[i] = tsk->signal->rlim[i].rlim_cur; ++ v->cpt_rlim_max[i] = tsk->signal->rlim[i].rlim_max; ++ } else { ++ v->cpt_rlim_cur[i] = CPT_NULL; ++ v->cpt_rlim_max[i] = CPT_NULL; ++ } ++ } ++#endif ++ ++ if (tsk->mm) ++ v->cpt_mm_ub = cpt_lookup_ubc(tsk->mm->mm_ub, ctx); ++ else ++ v->cpt_mm_ub = CPT_NULL; ++ v->cpt_task_ub = cpt_lookup_ubc(tsk->task_bc.task_ub, ctx); ++ v->cpt_exec_ub = cpt_lookup_ubc(tsk->task_bc.exec_ub, ctx); ++ v->cpt_fork_sub = cpt_lookup_ubc(tsk->task_bc.fork_sub, ctx); ++ ++ v->cpt_ptrace_message = tsk->ptrace_message; ++ v->cpt_pn_state = tsk->pn_state; ++ v->cpt_stopped_state = tsk->stopped_state; ++ v->cpt_sigsuspend_state = 0; ++ ++#ifndef CONFIG_X86_64 ++ if (tsk->thread.vm86_info) { ++ eprintk_ctx("vm86 task is running\n"); ++ cpt_release_buf(ctx); ++ return -EBUSY; ++ } ++#endif ++ ++ v->cpt_sigpending = cpt_sigset_export(&tsk->pending.signal); ++ ++ ctx->write(v, sizeof(*v), ctx); ++ cpt_release_buf(ctx); ++ ++ 
cpt_push_object(&saved_obj, ctx); ++ dump_kstack(tsk, ctx); ++ cpt_pop_object(&saved_obj, ctx); ++ ++ cpt_push_object(&saved_obj, ctx); ++ err = dump_registers(tsk, ctx); ++ cpt_pop_object(&saved_obj, ctx); ++ if (err) ++ return err; ++ ++ if (tsk_used_math(tsk)) { ++ cpt_push_object(&saved_obj, ctx); ++ dump_fpustate(tsk, ctx); ++ cpt_pop_object(&saved_obj, ctx); ++ } ++ ++ if (tsk->last_siginfo) { ++ struct cpt_siginfo_image si; ++ cpt_push_object(&saved_obj, ctx); ++ ++ si.cpt_next = sizeof(si); ++ si.cpt_object = CPT_OBJ_LASTSIGINFO; ++ si.cpt_hdrlen = sizeof(si); ++ si.cpt_content = CPT_CONTENT_VOID; ++ ++ if (encode_siginfo(&si, tsk->last_siginfo)) ++ return -EINVAL; ++ ++ ctx->write(&si, sizeof(si), ctx); ++ cpt_pop_object(&saved_obj, ctx); ++ } ++ ++ if (tsk->sas_ss_size) { ++ struct cpt_sigaltstack_image si; ++ cpt_push_object(&saved_obj, ctx); ++ ++ si.cpt_next = sizeof(si); ++ si.cpt_object = CPT_OBJ_SIGALTSTACK; ++ si.cpt_hdrlen = sizeof(si); ++ si.cpt_content = CPT_CONTENT_VOID; ++ ++ si.cpt_stack = tsk->sas_ss_sp; ++ si.cpt_stacksize = tsk->sas_ss_size; ++ ++ ctx->write(&si, sizeof(si), ctx); ++ cpt_pop_object(&saved_obj, ctx); ++ } ++ ++ dump_sigqueue(&tsk->pending, ctx); ++ ++ last_thread = 1; ++ read_lock(&tasklist_lock); ++ do { ++ task_t * next = next_thread(tsk); ++ if (next != tsk && !thread_group_leader(next)) ++ last_thread = 0; ++ } while (0); ++ read_unlock(&tasklist_lock); ++ ++ if (last_thread) { ++ task_t *prev_tsk; ++ int err; ++ loff_t pos = ctx->file->f_pos; ++ ++ cpt_push_object(&saved_obj, ctx); ++ err = dump_one_signal_struct(tg_obj, ctx); ++ cpt_pop_object(&saved_obj, ctx); ++ if (err) ++ return err; ++ ++ prev_tsk = tsk; ++ for (;;) { ++ if (prev_tsk->tgid == tsk->tgid) { ++ loff_t tg_pos; ++ ++ tg_pos = obj->o_pos + offsetof(struct cpt_task_image, cpt_signal); ++ ctx->pwrite(&pos, sizeof(pos), ctx, tg_pos); ++ if (thread_group_leader(prev_tsk)) ++ break; ++ } ++ ++ if (obj->o_list.prev == &ctx->object_array[CPT_OBJ_TASK]) { ++ 
eprintk_ctx("bug: thread group leader is lost\n"); ++ return -EINVAL; ++ } ++ ++ obj = list_entry(obj->o_list.prev, cpt_object_t, o_list); ++ prev_tsk = obj->o_obj; ++ } ++ } ++ ++ cpt_close_object(ctx); ++ return 0; ++} ++ ++int cpt_dump_tasks(struct cpt_context *ctx) ++{ ++ cpt_object_t *obj; ++ ++ cpt_open_section(ctx, CPT_SECT_TASKS); ++ ++ for_each_object(obj, CPT_OBJ_TASK) { ++ int err; ++ ++ if ((err = dump_one_process(obj, ctx)) != 0) ++ return err; ++ } ++ ++ cpt_close_section(ctx); ++ return 0; ++} ++ ++int cpt_collect_signals(cpt_context_t *ctx) ++{ ++ cpt_object_t *obj; ++ ++ /* Collect process fd sets */ ++ for_each_object(obj, CPT_OBJ_TASK) { ++ task_t *tsk = obj->o_obj; ++ if (tsk->signal && !list_empty(&tsk->signal->posix_timers)) { ++ eprintk_ctx("task %d/%d(%s) uses posix timers\n", tsk->pid, virt_pid(tsk), tsk->comm); ++ return -EBUSY; ++ } ++ if (tsk->signal && cpt_object_add(CPT_OBJ_SIGNAL_STRUCT, tsk->signal, ctx) == NULL) ++ return -ENOMEM; ++ if (tsk->sighand && cpt_object_add(CPT_OBJ_SIGHAND_STRUCT, tsk->sighand, ctx) == NULL) ++ return -ENOMEM; ++ } ++ return 0; ++} ++ ++ ++static int dump_one_sighand_struct(cpt_object_t *obj, struct cpt_context *ctx) ++{ ++ struct sighand_struct *sig = obj->o_obj; ++ struct cpt_sighand_image *v = cpt_get_buf(ctx); ++ int i; ++ ++ cpt_open_object(obj, ctx); ++ ++ v->cpt_next = CPT_NULL; ++ v->cpt_object = CPT_OBJ_SIGHAND_STRUCT; ++ v->cpt_hdrlen = sizeof(*v); ++ v->cpt_content = CPT_CONTENT_ARRAY; ++ ++ ctx->write(v, sizeof(*v), ctx); ++ cpt_release_buf(ctx); ++ ++ for (i=0; i< _NSIG; i++) { ++ if (sig->action[i].sa.sa_handler != SIG_DFL) { ++ loff_t saved_obj; ++ struct cpt_sighandler_image *o = cpt_get_buf(ctx); ++ ++ cpt_push_object(&saved_obj, ctx); ++ cpt_open_object(NULL, ctx); ++ ++ o->cpt_next = CPT_NULL; ++ o->cpt_object = CPT_OBJ_SIGHANDLER; ++ o->cpt_hdrlen = sizeof(*o); ++ o->cpt_content = CPT_CONTENT_VOID; ++ ++ o->cpt_signo = i; ++ o->cpt_handler = (unsigned long)sig->action[i].sa.sa_handler; 
++ o->cpt_restorer = (unsigned long)sig->action[i].sa.sa_restorer; ++ o->cpt_flags = sig->action[i].sa.sa_flags; ++ memcpy(&o->cpt_mask, &sig->action[i].sa.sa_mask, 8); ++ ctx->write(o, sizeof(*o), ctx); ++ cpt_release_buf(ctx); ++ cpt_close_object(ctx); ++ cpt_pop_object(&saved_obj, ctx); ++ } ++ } ++ ++ cpt_close_object(ctx); ++ return 0; ++} ++ ++int cpt_dump_sighand(struct cpt_context *ctx) ++{ ++ cpt_object_t *obj; ++ ++ cpt_open_section(ctx, CPT_SECT_SIGHAND_STRUCT); ++ ++ for_each_object(obj, CPT_OBJ_SIGHAND_STRUCT) { ++ int err; ++ ++ if ((err = dump_one_sighand_struct(obj, ctx)) != 0) ++ return err; ++ } ++ ++ cpt_close_section(ctx); ++ return 0; ++} +diff -upr linux-2.6.16.orig/kernel/cpt/cpt_process.h linux-2.6.16-026test015/kernel/cpt/cpt_process.h +--- linux-2.6.16.orig/kernel/cpt/cpt_process.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/cpt/cpt_process.h 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,10 @@ ++int cpt_collect_signals(cpt_context_t *); ++int cpt_dump_signal(struct cpt_context *); ++int cpt_dump_sighand(struct cpt_context *); ++int cpt_dump_tasks(struct cpt_context *); ++ ++int rst_signal_complete(struct cpt_task_image *ti, struct cpt_context *ctx); ++__u32 rst_signal_flag(struct cpt_task_image *ti, struct cpt_context *ctx); ++ ++int rst_restore_process(struct cpt_context *ctx); ++int rst_process_linkage(struct cpt_context *ctx); +diff -upr linux-2.6.16.orig/kernel/cpt/cpt_socket.c linux-2.6.16-026test015/kernel/cpt/cpt_socket.c +--- linux-2.6.16.orig/kernel/cpt/cpt_socket.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/cpt/cpt_socket.c 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,779 @@ ++/* ++ * ++ * kernel/cpt/cpt_socket.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#include <linux/config.h> ++#include <linux/version.h> ++#include <linux/module.h> ++#include <linux/init.h> ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/slab.h> ++#include <linux/file.h> ++#include <linux/mm.h> ++#include <linux/errno.h> ++#include <linux/fs.h> ++#include <linux/socket.h> ++#include <linux/un.h> ++#include <linux/tcp.h> ++#include <net/sock.h> ++#include <net/scm.h> ++#include <net/af_unix.h> ++#include <net/tcp.h> ++#include <net/netlink_sock.h> ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++#include "cpt_mm.h" ++#include "cpt_socket.h" ++#include "cpt_files.h" ++#include "cpt_kernel.h" ++ ++static int dump_rqueue(int owner, struct sock *sk, struct cpt_context *ctx); ++ ++ ++/* Sockets are quite different of another kinds of files. ++ * There is one simplification: only one struct file can refer to a socket, ++ * so we could store information about socket directly in section FILES as ++ * a description of a file and append f.e. array of not-yet-accepted ++ * connections of listening socket as array of auxiliary data. ++ * ++ * Complications are: ++ * 1. TCP sockets can be orphans. We have to relocate orphans as well, ++ * so we have to create special section for orphans. ++ * 2. AF_UNIX sockets are distinguished objects: set of links between ++ * AF_UNIX sockets is quite arbitrary. ++ * A. Each socket can refers to many of files due to FD passing. ++ * B. Each socket except for connected ones can have in queue skbs ++ * sent by any of sockets. ++ * ++ * 2A is relatively easy: after our tasks are frozen we make an additional ++ * recursive pass throgh set of collected files and get referenced to ++ * FD passed files. After end of recursion, all the files are treated ++ * in the same way. All they will be stored in section FILES. ++ * ++ * 2B. We have to resolve all those references at some point. ++ * It is the place where pipe-like approach to image fails. 
++ * ++ * All this makes socket checkpointing quite chumbersome. ++ * Right now we collect all the sockets and assign some numeric index value ++ * to each of them. The socket section is separate and put after section FILES, ++ * so section FILES refers to sockets by index, section SOCKET refers to FILES ++ * as usual by position in image. All the refs inside socket section are ++ * by index. When restoring we read socket section, create objects to hold ++ * mappings index <-> pos. At the second pass we open sockets (simultaneosly ++ * with their pairs) and create FILE objects. ++ */ ++ ++ ++/* ====== FD passing ====== */ ++ ++/* Almost nobody does FD passing via AF_UNIX sockets, nevertheless we ++ * have to implement this. A problem is that in general case we receive ++ * skbs from an unknown context, so new files can arrive to checkpointed ++ * set of processes even after they are stopped. Well, we are going just ++ * to ignore unknown fds while doing real checkpointing. It is fair because ++ * links outside checkpointed set are going to fail anyway. ++ * ++ * ATTN: the procedure is recursive. We linearize the recursion adding ++ * newly found files to the end of file list, so they will be analyzed ++ * in the same loop. ++ */ ++ ++static int collect_one_passedfd(struct file *file, cpt_context_t * ctx) ++{ ++ struct inode *inode = file->f_dentry->d_inode; ++ struct socket *sock; ++ struct sock *sk; ++ struct sk_buff *skb; ++ ++ if (!S_ISSOCK(inode->i_mode)) ++ return -ENOTSOCK; ++ ++ sock = &container_of(inode, struct socket_alloc, vfs_inode)->socket; ++ ++ if (sock->ops->family != AF_UNIX) ++ return 0; ++ ++ sk = sock->sk; ++ ++ /* Subtle locking issue. skbs cannot be removed while ++ * we are scanning, because all the processes are stopped. ++ * They still can be added to tail of queue. Locking while ++ * we dereference skb->next is enough to resolve this. ++ * See above about collision with skbs added after we started ++ * checkpointing. 
++ */ ++ ++ skb = skb_peek(&sk->sk_receive_queue); ++ while (skb && skb != (struct sk_buff*)&sk->sk_receive_queue) { ++ if (UNIXCB(skb).fp && skb->sk && ++ (!sock_flag(skb->sk, SOCK_DEAD) || unix_peer(sk) == skb->sk)) { ++ struct scm_fp_list *fpl = UNIXCB(skb).fp; ++ int i; ++ ++ for (i = fpl->count-1; i >= 0; i--) { ++ if (cpt_object_add(CPT_OBJ_FILE, fpl->fp[i], ctx) == NULL) ++ return -ENOMEM; ++ } ++ } ++ ++ spin_lock_irq(&sk->sk_receive_queue.lock); ++ skb = skb->next; ++ spin_unlock_irq(&sk->sk_receive_queue.lock); ++ } ++ ++ return 0; ++} ++ ++int cpt_collect_passedfds(cpt_context_t * ctx) ++{ ++ cpt_object_t *obj; ++ ++ for_each_object(obj, CPT_OBJ_FILE) { ++ struct file *file = obj->o_obj; ++ ++ if (S_ISSOCK(file->f_dentry->d_inode->i_mode)) { ++ int err; ++ ++ if ((err = collect_one_passedfd(file, ctx)) < 0) ++ return err; ++ } ++ } ++ ++ return 0; ++} ++ ++/* ====== End of FD passing ====== */ ++ ++/* Must be called under bh_lock_sock() */ ++ ++void clear_backlog(struct sock *sk) ++{ ++ struct sk_buff *skb = sk->sk_backlog.head; ++ ++ sk->sk_backlog.head = sk->sk_backlog.tail = NULL; ++ while (skb) { ++ struct sk_buff *next = skb->next; ++ ++ skb->next = NULL; ++ kfree_skb(skb); ++ skb = next; ++ } ++} ++ ++void release_sock_nobacklog(struct sock *sk) ++{ ++ spin_lock_bh(&(sk->sk_lock.slock)); ++ clear_backlog(sk); ++ sk->sk_lock.owner = NULL; ++ if (waitqueue_active(&(sk->sk_lock.wq))) ++ wake_up(&(sk->sk_lock.wq)); ++ spin_unlock_bh(&(sk->sk_lock.slock)); ++} ++ ++int cpt_dump_skb(int type, int owner, struct sk_buff *skb, ++ struct cpt_context *ctx) ++{ ++ struct cpt_skb_image *v = cpt_get_buf(ctx); ++ loff_t saved_obj; ++ struct timeval tmptv; ++ ++ cpt_push_object(&saved_obj, ctx); ++ cpt_open_object(NULL, ctx); ++ ++ v->cpt_next = CPT_NULL; ++ v->cpt_object = CPT_OBJ_SKB; ++ v->cpt_hdrlen = sizeof(*v); ++ v->cpt_content = CPT_CONTENT_ARRAY; ++ ++ v->cpt_owner = owner; ++ v->cpt_queue = type; ++ skb_get_timestamp(skb, &tmptv); ++ v->cpt_stamp = 
cpt_timeval_export(&tmptv); ++ v->cpt_hspace = skb->data - skb->head; ++ v->cpt_tspace = skb->end - skb->tail; ++ v->cpt_h = skb->h.raw - skb->head; ++ v->cpt_nh = skb->nh.raw - skb->head; ++ v->cpt_mac = skb->mac.raw - skb->head; ++ if (sizeof(skb->cb) < sizeof(v->cpt_cb)) BUG(); ++ memcpy(v->cpt_cb, skb->cb, sizeof(v->cpt_cb)); ++ if (sizeof(skb->cb) > sizeof(v->cpt_cb)) { ++ int i; ++ for (i=sizeof(v->cpt_cb); i<sizeof(skb->cb); i++) { ++ if (skb->cb[i]) { ++ wprintk_ctx("dirty skb cb"); ++ break; ++ } ++ } ++ } ++ v->cpt_len = skb->len; ++ v->cpt_mac_len = skb->mac_len; ++ v->cpt_csum = skb->csum; ++ v->cpt_local_df = skb->local_df; ++ v->cpt_pkt_type = skb->pkt_type; ++ v->cpt_ip_summed = skb->ip_summed; ++ v->cpt_priority = skb->priority; ++ v->cpt_protocol = skb->protocol; ++ v->cpt_security = 0; ++ v->cpt_tso_segs = skb_shinfo(skb)->tso_segs; ++ v->cpt_tso_size = skb_shinfo(skb)->tso_size; ++ if (skb_shinfo(skb)->ufo_size) { ++ eprintk_ctx("skb ufo is not supported\n"); ++ return -EINVAL; ++ } ++ ++ ctx->write(v, sizeof(*v), ctx); ++ cpt_release_buf(ctx); ++ ++ if (skb->len + (skb->data - skb->head) > 0) { ++ struct cpt_obj_bits ob; ++ loff_t saved_obj2; ++ ++ cpt_push_object(&saved_obj2, ctx); ++ cpt_open_object(NULL, ctx); ++ ob.cpt_next = CPT_NULL; ++ ob.cpt_object = CPT_OBJ_BITS; ++ ob.cpt_hdrlen = sizeof(ob); ++ ob.cpt_content = CPT_CONTENT_DATA; ++ ob.cpt_size = skb->len + v->cpt_hspace; ++ ++ ctx->write(&ob, sizeof(ob), ctx); ++ ++ ctx->write(skb->head, (skb->data-skb->head) + (skb->len-skb->data_len), ctx); ++ if (skb->data_len) { ++ int offset = skb->len - skb->data_len; ++ while (offset < skb->len) { ++ int copy = skb->len - offset; ++ if (copy > PAGE_SIZE) ++ copy = PAGE_SIZE; ++ (void)cpt_get_buf(ctx); ++ if (skb_copy_bits(skb, offset, ctx->tmpbuf, copy)) ++ BUG(); ++ ctx->write(ctx->tmpbuf, copy, ctx); ++ __cpt_release_buf(ctx); ++ offset += copy; ++ } ++ } ++ ++ ctx->align(ctx); ++ cpt_close_object(ctx); ++ cpt_pop_object(&saved_obj2, ctx); ++ 
} ++ ++ if (skb->sk && skb->sk->sk_family == AF_UNIX) { ++ struct scm_fp_list *fpl = UNIXCB(skb).fp; ++ ++ if (fpl) { ++ int i; ++ ++ for (i = 0; i < fpl->count; i++) { ++ struct cpt_fd_image v; ++ cpt_object_t *obj; ++ loff_t saved_obj2; ++ ++ obj = lookup_cpt_object(CPT_OBJ_FILE, fpl->fp[i], ctx); ++ ++ if (!obj) { ++ eprintk_ctx("lost passed FD\n"); ++ return -EINVAL; ++ } ++ ++ cpt_push_object(&saved_obj2, ctx); ++ cpt_open_object(NULL, ctx); ++ v.cpt_next = CPT_NULL; ++ v.cpt_object = CPT_OBJ_FILEDESC; ++ v.cpt_hdrlen = sizeof(v); ++ v.cpt_content = CPT_CONTENT_VOID; ++ ++ v.cpt_fd = i; ++ v.cpt_file = obj->o_pos; ++ v.cpt_flags = 0; ++ ctx->write(&v, sizeof(v), ctx); ++ cpt_close_object(ctx); ++ cpt_pop_object(&saved_obj2, ctx); ++ } ++ } ++ } ++ ++ cpt_close_object(ctx); ++ cpt_pop_object(&saved_obj, ctx); ++ return 0; ++} ++ ++static int dump_rqueue(int idx, struct sock *sk, struct cpt_context *ctx) ++{ ++ struct sk_buff *skb; ++ struct sock *sk_cache = NULL; ++ ++ skb = skb_peek(&sk->sk_receive_queue); ++ while (skb && skb != (struct sk_buff*)&sk->sk_receive_queue) { ++ int err; ++ ++ if (sk->sk_family == AF_UNIX) { ++ cpt_object_t *obj; ++ if (skb->sk != sk_cache) { ++ idx = -1; ++ sk_cache = NULL; ++ obj = lookup_cpt_object(CPT_OBJ_SOCKET, skb->sk, ctx); ++ if (obj) { ++ idx = obj->o_index; ++ sk_cache = skb->sk; ++ } else if (unix_peer(sk) != skb->sk) ++ goto next_skb; ++ } ++ } ++ ++ err = cpt_dump_skb(CPT_SKB_RQ, idx, skb, ctx); ++ if (err) ++ return err; ++ ++next_skb: ++ spin_lock_irq(&sk->sk_receive_queue.lock); ++ skb = skb->next; ++ spin_unlock_irq(&sk->sk_receive_queue.lock); ++ } ++ return 0; ++} ++ ++static int dump_wqueue(int idx, struct sock *sk, struct cpt_context *ctx) ++{ ++ struct sk_buff *skb; ++ ++ skb = skb_peek(&sk->sk_write_queue); ++ while (skb && skb != (struct sk_buff*)&sk->sk_write_queue) { ++ int err = cpt_dump_skb(CPT_SKB_WQ, idx, skb, ctx); ++ if (err) ++ return err; ++ ++ spin_lock_irq(&sk->sk_write_queue.lock); ++ skb = 
skb->next; ++ spin_unlock_irq(&sk->sk_write_queue.lock); ++ } ++ return 0; ++} ++ ++void cpt_dump_sock_attr(struct sock *sk, cpt_context_t *ctx) ++{ ++ loff_t saved_obj; ++ if (sk->sk_filter) { ++ struct cpt_obj_bits v; ++ ++ cpt_push_object(&saved_obj, ctx); ++ cpt_open_object(NULL, ctx); ++ ++ v.cpt_next = CPT_NULL; ++ v.cpt_object = CPT_OBJ_SKFILTER; ++ v.cpt_hdrlen = sizeof(v); ++ v.cpt_content = CPT_CONTENT_DATA; ++ v.cpt_size = sk->sk_filter->len*sizeof(struct sock_filter); ++ ++ ctx->write(&v, sizeof(v), ctx); ++ ctx->write(sk->sk_filter->insns, v.cpt_size, ctx); ++ cpt_close_object(ctx); ++ cpt_pop_object(&saved_obj, ctx); ++ } ++ if (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) { ++ cpt_push_object(&saved_obj, ctx); ++ cpt_dump_mcfilter(sk, ctx); ++ cpt_pop_object(&saved_obj, ctx); ++ } ++} ++ ++/* Dump socket content */ ++ ++int cpt_dump_socket(cpt_object_t *obj, struct sock *sk, int index, int parent, struct cpt_context *ctx) ++{ ++ struct cpt_sock_image *v = cpt_get_buf(ctx); ++ struct socket *sock; ++ ++ cpt_open_object(obj, ctx); ++ ++ v->cpt_next = CPT_NULL; ++ v->cpt_object = CPT_OBJ_SOCKET; ++ v->cpt_hdrlen = sizeof(*v); ++ v->cpt_content = CPT_CONTENT_ARRAY; ++ ++ v->cpt_file = CPT_NULL; ++ sock = sk->sk_socket; ++ if (sock && sock->file) { ++ cpt_object_t *tobj; ++ tobj = lookup_cpt_object(CPT_OBJ_FILE, sock->file, ctx); ++ if (tobj) ++ v->cpt_file = tobj->o_pos; ++ } ++ v->cpt_index = index; ++ v->cpt_parent = parent; ++ ++ if (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) { ++ if (sock && !obj->o_lock) { ++ lock_sock(sk); ++ obj->o_lock = 1; ++ } ++ } ++ ++ /* Some bits stored in inode */ ++ v->cpt_ssflags = sock ? sock->flags : 0; ++ v->cpt_sstate = sock ? sock->state : 0; ++ v->cpt_passcred = sock ? 
test_bit(SOCK_PASSCRED, &sock->flags) : 0; ++ ++ /* Common data */ ++ v->cpt_family = sk->sk_family; ++ v->cpt_type = sk->sk_type; ++ v->cpt_state = sk->sk_state; ++ v->cpt_reuse = sk->sk_reuse; ++ v->cpt_zapped = sock_flag(sk, SOCK_ZAPPED); ++ v->cpt_shutdown = sk->sk_shutdown; ++ v->cpt_userlocks = sk->sk_userlocks; ++ v->cpt_no_check = sk->sk_no_check; ++ v->cpt_zapped = sock_flag(sk, SOCK_DBG); ++ v->cpt_rcvtstamp = sock_flag(sk, SOCK_RCVTSTAMP); ++ v->cpt_localroute = sock_flag(sk, SOCK_LOCALROUTE); ++ v->cpt_protocol = sk->sk_protocol; ++ v->cpt_err = sk->sk_err; ++ v->cpt_err_soft = sk->sk_err_soft; ++ v->cpt_max_ack_backlog = sk->sk_max_ack_backlog; ++ v->cpt_priority = sk->sk_priority; ++ v->cpt_rcvlowat = sk->sk_rcvlowat; ++ v->cpt_rcvtimeo = CPT_NULL; ++ if (sk->sk_rcvtimeo != MAX_SCHEDULE_TIMEOUT) ++ v->cpt_rcvtimeo = sk->sk_rcvtimeo > INT_MAX ? INT_MAX : sk->sk_rcvtimeo; ++ v->cpt_sndtimeo = CPT_NULL; ++ if (sk->sk_sndtimeo != MAX_SCHEDULE_TIMEOUT) ++ v->cpt_sndtimeo = sk->sk_sndtimeo > INT_MAX ? INT_MAX : sk->sk_sndtimeo; ++ v->cpt_rcvbuf = sk->sk_rcvbuf; ++ v->cpt_sndbuf = sk->sk_sndbuf; ++ v->cpt_bound_dev_if = sk->sk_bound_dev_if; ++ v->cpt_flags = sk->sk_flags; ++ v->cpt_lingertime = CPT_NULL; ++ if (sk->sk_lingertime != MAX_SCHEDULE_TIMEOUT) ++ v->cpt_lingertime = sk->sk_lingertime > INT_MAX ? 
INT_MAX : sk->sk_lingertime; ++ v->cpt_peer_pid = sk->sk_peercred.pid; ++ v->cpt_peer_uid = sk->sk_peercred.uid; ++ v->cpt_peer_gid = sk->sk_peercred.gid; ++ v->cpt_stamp = cpt_timeval_export(&sk->sk_stamp); ++ ++ v->cpt_peer = -1; ++ v->cpt_socketpair = 0; ++ v->cpt_deleted = 0; ++ ++ v->cpt_laddrlen = 0; ++ if (sock) { ++ int alen = sizeof(v->cpt_laddr); ++ int err = sock->ops->getname(sock, (struct sockaddr*)&v->cpt_laddr, &alen, 0); ++ if (err) { ++ cpt_release_buf(ctx); ++ return err; ++ } ++ v->cpt_laddrlen = alen; ++ } ++ v->cpt_raddrlen = 0; ++ if (sock) { ++ int alen = sizeof(v->cpt_raddr); ++ int err = sock->ops->getname(sock, (struct sockaddr*)&v->cpt_raddr, &alen, 2); ++ if (!err) ++ v->cpt_raddrlen = alen; ++ } ++ ++ if (sk->sk_family == AF_UNIX) { ++ if (unix_sk(sk)->dentry) { ++ struct dentry *d = unix_sk(sk)->dentry; ++ v->cpt_deleted = !IS_ROOT(d) && d_unhashed(d); ++ if (!v->cpt_deleted) { ++ int err = 0; ++ char *path; ++ unsigned long pg = __get_free_page(GFP_KERNEL); ++ ++ if (!pg) { ++ cpt_release_buf(ctx); ++ return -ENOMEM; ++ } ++ ++ path = d_path(d, unix_sk(sk)->mnt, (char *)pg, PAGE_SIZE); ++ ++ if (!IS_ERR(path)) { ++ int len = strlen(path); ++ if (len < 126) { ++ strcpy(((char*)v->cpt_laddr)+2, path); ++ v->cpt_laddrlen = len + 2; ++ } else { ++ wprintk_ctx("af_unix path is too long: %s (%s)\n", path, ((char*)v->cpt_laddr)+2); ++ } ++ err = cpt_verify_overmount(path, d, unix_sk(sk)->mnt, ctx); ++ } else { ++ eprintk_ctx("cannot get path of an af_unix socket\n"); ++ err = PTR_ERR(path); ++ } ++ free_page(pg); ++ if (err) { ++ cpt_release_buf(ctx); ++ return err; ++ } ++ } ++ } ++ ++ /* If the socket is connected, find its peer. If peer is not ++ * in our table, the socket is connected to external process ++ * and we consider it disconnected. 
++ */ ++ if (unix_peer(sk)) { ++ cpt_object_t *pobj; ++ pobj = lookup_cpt_object(CPT_OBJ_SOCKET, unix_peer(sk), ctx); ++ if (pobj) ++ v->cpt_peer = pobj->o_index; ++ else ++ v->cpt_shutdown = SHUTDOWN_MASK; ++ ++ if (unix_peer(unix_peer(sk)) == sk) ++ v->cpt_socketpair = 1; ++ } ++ ++ /* If the socket shares address with another socket it is ++ * child of some listening socket. Find and record it. */ ++ if (unix_sk(sk)->addr && ++ atomic_read(&unix_sk(sk)->addr->refcnt) > 1 && ++ sk->sk_state != TCP_LISTEN) { ++ cpt_object_t *pobj; ++ for_each_object(pobj, CPT_OBJ_SOCKET) { ++ struct sock *psk = pobj->o_obj; ++ if (psk->sk_family == AF_UNIX && ++ psk->sk_state == TCP_LISTEN && ++ unix_sk(psk)->addr == unix_sk(sk)->addr) { ++ v->cpt_parent = pobj->o_index; ++ break; ++ } ++ } ++ } ++ } ++ ++ if (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) ++ cpt_dump_socket_in(v, sk, ctx); ++ ++ ctx->write(v, sizeof(*v), ctx); ++ cpt_release_buf(ctx); ++ ++ cpt_dump_sock_attr(sk, ctx); ++ ++ dump_rqueue(index, sk, ctx); ++ if (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) { ++ dump_wqueue(index, sk, ctx); ++ cpt_dump_ofo_queue(index, sk, ctx); ++ } ++ ++ if ((sk->sk_family == AF_INET || sk->sk_family == AF_INET6) ++ && sk->sk_state == TCP_LISTEN) ++ cpt_dump_synwait_queue(sk, index, ctx); ++ ++ cpt_close_object(ctx); ++ ++ if ((sk->sk_family == AF_INET || sk->sk_family == AF_INET6) ++ && sk->sk_state == TCP_LISTEN) ++ cpt_dump_accept_queue(sk, index, ctx); ++ ++ return 0; ++} ++ ++int cpt_dump_orphaned_sockets(struct cpt_context *ctx) ++{ ++ int i; ++ ++ cpt_open_section(ctx, CPT_SECT_ORPHANS); ++ ++ for (i = 0; i < tcp_hashinfo.ehash_size; i++) { ++ struct sock *sk; ++ struct hlist_node *node; ++ ++retry: ++ read_lock_bh(&tcp_hashinfo.ehash[i].lock); ++ sk_for_each(sk, node, &tcp_hashinfo.ehash[i].chain) { ++ ++ if (VE_OWNER_SK(sk) != get_exec_env()) ++ continue; ++ if (sk->sk_socket) ++ continue; ++ if (!sock_flag(sk, SOCK_DEAD)) ++ continue; ++ if 
(lookup_cpt_object(CPT_OBJ_SOCKET, sk, ctx)) ++ continue; ++ sock_hold(sk); ++ read_unlock_bh(&tcp_hashinfo.ehash[i].lock); ++ ++ local_bh_disable(); ++ bh_lock_sock(sk); ++ if (sock_owned_by_user(sk)) ++ eprintk_ctx("BUG: sk locked by whom?\n"); ++ sk->sk_lock.owner = (void *)1; ++ bh_unlock_sock(sk); ++ local_bh_enable(); ++ ++ cpt_dump_socket(NULL, sk, -1, -1, ctx); ++ ++ local_bh_disable(); ++ bh_lock_sock(sk); ++ sk->sk_lock.owner = NULL; ++ clear_backlog(sk); ++ tcp_done(sk); ++ bh_unlock_sock(sk); ++ local_bh_enable(); ++ sock_put(sk); ++ ++ goto retry; ++ } ++ read_unlock_bh(&tcp_hashinfo.ehash[i].lock); ++ } ++ cpt_close_section(ctx); ++ return 0; ++} ++ ++static int can_dump(struct sock *sk, cpt_context_t *ctx) ++{ ++ switch (sk->sk_family) { ++ case AF_NETLINK: ++ if (((struct netlink_sock *)sk)->cb) { ++ eprintk_ctx("netlink socket has active callback\n"); ++ return 0; ++ } ++ break; ++ } ++ return 1; ++} ++ ++/* We are not going to block suspend when we have external AF_UNIX connections. ++ * But we cannot stop feed of new packets/connections to our environment ++ * from outside. Taking into account that it is intrincically unreliable, ++ * we collect some amount of data, but when checkpointing/restoring we ++ * are going to drop everything, which does not make sense: skbs sent ++ * by outside processes, connections from outside etc. etc. ++ */ ++ ++/* The first pass. 
When we see socket referenced by a file, we just ++ * add it to socket table */ ++int cpt_collect_socket(struct file *file, cpt_context_t * ctx) ++{ ++ cpt_object_t *obj; ++ struct socket *sock; ++ struct sock *sk; ++ ++ if (!S_ISSOCK(file->f_dentry->d_inode->i_mode)) ++ return -ENOTSOCK; ++ sock = &container_of(file->f_dentry->d_inode, struct socket_alloc, vfs_inode)->socket; ++ sk = sock->sk; ++ if (!can_dump(sk, ctx)) ++ return -EBUSY; ++ if ((obj = cpt_object_add(CPT_OBJ_SOCKET, sk, ctx)) == NULL) ++ return -ENOMEM; ++ obj->o_parent = file; ++ ++ return 0; ++} ++ ++/* ++ * We should end with table containing: ++ * * all sockets opened by our processes in the table. ++ * * all the sockets queued in listening queues on _our_ listening sockets, ++ * which are connected to our opened sockets. ++ */ ++ ++static int collect_one_unix_listening_sock(cpt_object_t *obj, cpt_context_t * ctx) ++{ ++ struct sock *sk = obj->o_obj; ++ cpt_object_t *cobj; ++ struct sk_buff *skb; ++ ++ skb = skb_peek(&sk->sk_receive_queue); ++ while (skb && skb != (struct sk_buff*)&sk->sk_receive_queue) { ++ struct sock *lsk = skb->sk; ++ if (unix_peer(lsk) && ++ lookup_cpt_object(CPT_OBJ_SOCKET, unix_peer(lsk), ctx)) { ++ if ((cobj = cpt_object_add(CPT_OBJ_SOCKET, lsk, ctx)) == NULL) ++ return -ENOMEM; ++ cobj->o_parent = obj->o_parent; ++ } ++ spin_lock_irq(&sk->sk_receive_queue.lock); ++ skb = skb->next; ++ spin_unlock_irq(&sk->sk_receive_queue.lock); ++ } ++ ++ return 0; ++} ++ ++int cpt_index_sockets(cpt_context_t * ctx) ++{ ++ cpt_object_t *obj; ++ unsigned long index = 0; ++ ++ /* Collect not-yet-accepted children of listening sockets. */ ++ for_each_object(obj, CPT_OBJ_SOCKET) { ++ struct sock *sk = obj->o_obj; ++ ++ if (sk->sk_state != TCP_LISTEN) ++ continue; ++ ++ if (sk->sk_family == AF_UNIX) ++ collect_one_unix_listening_sock(obj, ctx); ++ } ++ ++ /* Assign indices to all the sockets. 
*/ ++ for_each_object(obj, CPT_OBJ_SOCKET) { ++ struct sock *sk = obj->o_obj; ++ cpt_obj_setindex(obj, index++, ctx); ++ ++ if (sk->sk_socket && sk->sk_socket->file) { ++ cpt_object_t *tobj; ++ tobj = lookup_cpt_object(CPT_OBJ_FILE, sk->sk_socket->file, ctx); ++ if (tobj) ++ cpt_obj_setindex(tobj, obj->o_index, ctx); ++ } ++ } ++ ++ return 0; ++} ++ ++void cpt_unlock_sockets(cpt_context_t * ctx) ++{ ++ cpt_object_t *obj; ++ ++ for_each_object(obj, CPT_OBJ_SOCKET) { ++ struct sock *sk = obj->o_obj; ++ if (sk && obj->o_lock) { ++ if (sk->sk_socket) ++ release_sock(sk); ++ } ++ } ++} ++ ++void cpt_kill_sockets(cpt_context_t * ctx) ++{ ++ cpt_object_t *obj; ++ ++ for_each_object(obj, CPT_OBJ_SOCKET) { ++ struct sock *sk = obj->o_obj; ++ if (sk && obj->o_lock) { ++ cpt_kill_socket(sk, ctx); ++ if (sk->sk_socket) ++ release_sock_nobacklog(sk); ++ } ++ } ++} ++ ++__u32 cpt_socket_fasync(struct file *file, struct cpt_context *ctx) ++{ ++ struct fasync_struct *fa; ++ struct inode *inode = file->f_dentry->d_inode; ++ struct socket *sock; ++ ++ sock = &container_of(inode, struct socket_alloc, vfs_inode)->socket; ++ ++ for (fa = sock->fasync_list; fa; fa = fa->fa_next) { ++ if (fa->fa_file == file) ++ return fa->fa_fd; ++ } ++ return -1; ++} +diff -upr linux-2.6.16.orig/kernel/cpt/cpt_socket.h linux-2.6.16-026test015/kernel/cpt/cpt_socket.h +--- linux-2.6.16.orig/kernel/cpt/cpt_socket.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/cpt/cpt_socket.h 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,33 @@ ++struct sock; ++ ++int cpt_collect_passedfds(cpt_context_t *); ++int cpt_index_sockets(cpt_context_t *); ++int cpt_collect_socket(struct file *, cpt_context_t *); ++int cpt_dump_socket(cpt_object_t *obj, struct sock *sk, int index, int parent, struct cpt_context *ctx); ++int cpt_dump_accept_queue(struct sock *sk, int index, struct cpt_context *ctx); ++int cpt_dump_synwait_queue(struct sock *sk, int index, struct cpt_context *ctx); ++int 
rst_sockets(struct cpt_context *ctx); ++int rst_sockets_complete(struct cpt_context *ctx); ++int cpt_dump_orphaned_sockets(struct cpt_context *ctx); ++ ++int rst_sock_attr(loff_t *pos_p, struct sock *sk, cpt_context_t *ctx); ++struct sk_buff * rst_skb(loff_t *pos_p, __u32 *owner, __u32 *queue, struct cpt_context *ctx); ++ ++void cpt_unlock_sockets(cpt_context_t *); ++void cpt_kill_sockets(cpt_context_t *); ++ ++ ++int cpt_kill_socket(struct sock *, cpt_context_t *); ++int cpt_dump_socket_in(struct cpt_sock_image *, struct sock *, struct cpt_context*); ++int rst_socket_in(struct cpt_sock_image *si, loff_t pos, struct sock *, struct cpt_context *ctx); ++__u32 cpt_socket_fasync(struct file *file, struct cpt_context *ctx); ++int cpt_attach_accept(struct sock *lsk, struct sock *sk, cpt_context_t *); ++int rst_restore_synwait_queue(struct sock *sk, struct cpt_sock_image *si, loff_t pos, struct cpt_context *ctx); ++int cpt_dump_ofo_queue(int idx, struct sock *sk, struct cpt_context *ctx); ++int cpt_dump_skb(int type, int owner, struct sk_buff *skb, struct cpt_context *ctx); ++int cpt_dump_mcfilter(struct sock *sk, struct cpt_context *ctx); ++ ++int rst_sk_mcfilter_in(struct sock *sk, struct cpt_sockmc_image *v, ++ loff_t pos, cpt_context_t *ctx); ++int rst_sk_mcfilter_in6(struct sock *sk, struct cpt_sockmc_image *v, ++ loff_t pos, cpt_context_t *ctx); +diff -upr linux-2.6.16.orig/kernel/cpt/cpt_socket_in.c linux-2.6.16-026test015/kernel/cpt/cpt_socket_in.c +--- linux-2.6.16.orig/kernel/cpt/cpt_socket_in.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/cpt/cpt_socket_in.c 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,443 @@ ++/* ++ * ++ * kernel/cpt/cpt_socket_in.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#include <linux/config.h> ++#include <linux/version.h> ++#include <linux/module.h> ++#include <linux/init.h> ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/slab.h> ++#include <linux/file.h> ++#include <linux/mm.h> ++#include <linux/errno.h> ++#include <linux/fs.h> ++#include <linux/socket.h> ++#include <linux/tcp.h> ++#include <net/sock.h> ++#include <net/tcp.h> ++#include <linux/igmp.h> ++#include <linux/ipv6.h> ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++#include "cpt_mm.h" ++#include "cpt_socket.h" ++#include "cpt_kernel.h" ++ ++static inline __u32 jiffies_export(unsigned long tmo) ++{ ++ __s32 delta = (long)(tmo - jiffies); ++ return delta; ++} ++ ++static inline __u32 tcp_jiffies_export(__u32 tmo) ++{ ++ __s32 delta = tmo - tcp_time_stamp; ++ return delta; ++} ++ ++int cpt_dump_ofo_queue(int idx, struct sock *sk, struct cpt_context *ctx) ++{ ++ struct sk_buff *skb; ++ struct tcp_sock *tp; ++ ++ if (sk->sk_type != SOCK_STREAM || sk->sk_protocol != IPPROTO_TCP) ++ return 0; ++ ++ tp = tcp_sk(sk); ++ ++ skb = skb_peek(&tp->out_of_order_queue); ++ while (skb && skb != (struct sk_buff*)&tp->out_of_order_queue) { ++ int err; ++ ++ err = cpt_dump_skb(CPT_SKB_OFOQ, idx, skb, ctx); ++ if (err) ++ return err; ++ ++ spin_lock_irq(&tp->out_of_order_queue.lock); ++ skb = skb->next; ++ spin_unlock_irq(&tp->out_of_order_queue.lock); ++ } ++ return 0; ++} ++ ++static int cpt_dump_socket_tcp(struct cpt_sock_image *si, struct sock *sk, ++ struct cpt_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ si->cpt_pred_flags = tp->pred_flags; ++ si->cpt_rcv_nxt = tp->rcv_nxt; ++ si->cpt_snd_nxt = tp->snd_nxt; ++ si->cpt_snd_una = tp->snd_una; ++ si->cpt_snd_sml = tp->snd_sml; ++ si->cpt_rcv_tstamp = tcp_jiffies_export(tp->rcv_tstamp); ++ si->cpt_lsndtime = tcp_jiffies_export(tp->lsndtime); ++ si->cpt_tcp_header_len = tp->tcp_header_len; ++ si->cpt_ack_pending = inet_csk(sk)->icsk_ack.pending; ++ si->cpt_quick = 
inet_csk(sk)->icsk_ack.quick; ++ si->cpt_pingpong = inet_csk(sk)->icsk_ack.pingpong; ++ si->cpt_blocked = inet_csk(sk)->icsk_ack.blocked; ++ si->cpt_ato = inet_csk(sk)->icsk_ack.ato; ++ si->cpt_ack_timeout = jiffies_export(inet_csk(sk)->icsk_ack.timeout); ++ si->cpt_lrcvtime = tcp_jiffies_export(inet_csk(sk)->icsk_ack.lrcvtime); ++ si->cpt_last_seg_size = inet_csk(sk)->icsk_ack.last_seg_size; ++ si->cpt_rcv_mss = inet_csk(sk)->icsk_ack.rcv_mss; ++ si->cpt_snd_wl1 = tp->snd_wl1; ++ si->cpt_snd_wnd = tp->snd_wnd; ++ si->cpt_max_window = tp->max_window; ++ si->cpt_pmtu_cookie = inet_csk(sk)->icsk_pmtu_cookie; ++ si->cpt_mss_cache = tp->mss_cache; ++ si->cpt_mss_cache_std = tp->mss_cache; /* FIXMW was tp->mss_cache_std */ ++ si->cpt_mss_clamp = tp->rx_opt.mss_clamp; ++ si->cpt_ext_header_len = inet_csk(sk)->icsk_ext_hdr_len; ++ si->cpt_ext2_header_len = 0; ++ si->cpt_ca_state = inet_csk(sk)->icsk_ca_state; ++ si->cpt_retransmits = inet_csk(sk)->icsk_retransmits; ++ si->cpt_reordering = tp->reordering; ++ si->cpt_frto_counter = tp->frto_counter; ++ si->cpt_frto_highmark = tp->frto_highmark; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9) ++ // // si->cpt_adv_cong = tp->adv_cong; ++#endif ++ si->cpt_defer_accept = inet_csk(sk)->icsk_accept_queue.rskq_defer_accept; ++ si->cpt_backoff = inet_csk(sk)->icsk_backoff; ++ si->cpt_srtt = tp->srtt; ++ si->cpt_mdev = tp->mdev; ++ si->cpt_mdev_max = tp->mdev_max; ++ si->cpt_rttvar = tp->rttvar; ++ si->cpt_rtt_seq = tp->rtt_seq; ++ si->cpt_rto = inet_csk(sk)->icsk_rto; ++ si->cpt_packets_out = tp->packets_out; ++ si->cpt_left_out = tp->left_out; ++ si->cpt_retrans_out = tp->retrans_out; ++ si->cpt_lost_out = tp->lost_out; ++ si->cpt_sacked_out = tp->sacked_out; ++ si->cpt_fackets_out = tp->fackets_out; ++ si->cpt_snd_ssthresh = tp->snd_ssthresh; ++ si->cpt_snd_cwnd = tp->snd_cwnd; ++ si->cpt_snd_cwnd_cnt = tp->snd_cwnd_cnt; ++ si->cpt_snd_cwnd_clamp = tp->snd_cwnd_clamp; ++ si->cpt_snd_cwnd_used = tp->snd_cwnd_used; ++ 
si->cpt_snd_cwnd_stamp = tcp_jiffies_export(tp->snd_cwnd_stamp); ++ si->cpt_timeout = jiffies_export(inet_csk(sk)->icsk_timeout); ++ si->cpt_ka_timeout = 0; ++ si->cpt_rcv_wnd = tp->rcv_wnd; ++ si->cpt_rcv_wup = tp->rcv_wup; ++ si->cpt_write_seq = tp->write_seq; ++ si->cpt_pushed_seq = tp->pushed_seq; ++ si->cpt_copied_seq = tp->copied_seq; ++ si->cpt_tstamp_ok = tp->rx_opt.tstamp_ok; ++ si->cpt_wscale_ok = tp->rx_opt.wscale_ok; ++ si->cpt_sack_ok = tp->rx_opt.sack_ok; ++ si->cpt_saw_tstamp = tp->rx_opt.saw_tstamp; ++ si->cpt_snd_wscale = tp->rx_opt.snd_wscale; ++ si->cpt_rcv_wscale = tp->rx_opt.rcv_wscale; ++ si->cpt_nonagle = tp->nonagle; ++ si->cpt_keepalive_probes = tp->keepalive_probes; ++ si->cpt_rcv_tsval = tp->rx_opt.rcv_tsval; ++ si->cpt_rcv_tsecr = tp->rx_opt.rcv_tsecr; ++ si->cpt_ts_recent = tp->rx_opt.ts_recent; ++ si->cpt_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; ++ si->cpt_user_mss = tp->rx_opt.user_mss; ++ si->cpt_dsack = tp->rx_opt.dsack; ++ si->cpt_eff_sacks = tp->rx_opt.eff_sacks; ++ si->cpt_sack_array[0] = tp->duplicate_sack[0].start_seq; ++ si->cpt_sack_array[1] = tp->duplicate_sack[0].end_seq; ++ si->cpt_sack_array[2] = tp->selective_acks[0].start_seq; ++ si->cpt_sack_array[3] = tp->selective_acks[0].end_seq; ++ si->cpt_sack_array[4] = tp->selective_acks[1].start_seq; ++ si->cpt_sack_array[5] = tp->selective_acks[1].end_seq; ++ si->cpt_sack_array[6] = tp->selective_acks[2].start_seq; ++ si->cpt_sack_array[7] = tp->selective_acks[2].end_seq; ++ si->cpt_sack_array[8] = tp->selective_acks[3].start_seq; ++ si->cpt_sack_array[9] = tp->selective_acks[3].end_seq; ++ si->cpt_window_clamp = tp->window_clamp; ++ si->cpt_rcv_ssthresh = tp->rcv_ssthresh; ++ si->cpt_probes_out = inet_csk(sk)->icsk_probes_out; ++ si->cpt_num_sacks = tp->rx_opt.num_sacks; ++ si->cpt_advmss = tp->advmss; ++ si->cpt_syn_retries = inet_csk(sk)->icsk_syn_retries; ++ si->cpt_ecn_flags = tp->ecn_flags; ++ si->cpt_prior_ssthresh = tp->prior_ssthresh; ++ si->cpt_high_seq = 
tp->high_seq; ++ si->cpt_retrans_stamp = tp->retrans_stamp; ++ si->cpt_undo_marker = tp->undo_marker; ++ si->cpt_undo_retrans = tp->undo_retrans; ++ si->cpt_urg_seq = tp->urg_seq; ++ si->cpt_urg_data = tp->urg_data; ++ si->cpt_pending = inet_csk(sk)->icsk_pending; ++ si->cpt_urg_mode = tp->urg_mode; ++ si->cpt_snd_up = tp->snd_up; ++ si->cpt_keepalive_time = tp->keepalive_time; ++ si->cpt_keepalive_intvl = tp->keepalive_intvl; ++ si->cpt_linger2 = tp->linger2; ++ ++ if (sk->sk_state != TCP_LISTEN && ++ sk->sk_state != TCP_CLOSE && ++ sock_flag(sk, SOCK_KEEPOPEN)) { ++ si->cpt_ka_timeout = jiffies_export(sk->sk_timer.expires); ++ } ++ ++#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) ++ { ++ extern struct inet_connection_sock_af_ops ipv6_mapped; ++ if (sk->sk_family == AF_INET6 && ++ inet_csk(sk)->icsk_af_ops == &ipv6_mapped) ++ si->cpt_mapped = 1; ++ } ++#endif ++ ++ return 0; ++} ++ ++ ++int cpt_dump_socket_in(struct cpt_sock_image *si, struct sock *sk, ++ struct cpt_context *ctx) ++{ ++ struct inet_sock *inet = inet_sk(sk); ++ struct ipv6_pinfo *np = inet6_sk(sk); ++ ++ if (sk->sk_family == AF_INET) { ++ struct sockaddr_in *sin = ((struct sockaddr_in*)si->cpt_laddr); ++ sin->sin_family = AF_INET; ++ sin->sin_port = inet->sport; ++ sin->sin_addr.s_addr = inet->rcv_saddr; ++ si->cpt_laddrlen = sizeof(*sin); ++ } else if (sk->sk_family == AF_INET6) { ++ struct sockaddr_in6 *sin6 = ((struct sockaddr_in6*)si->cpt_laddr); ++ sin6->sin6_family = AF_INET6; ++ sin6->sin6_port = inet->sport; ++ memcpy(&sin6->sin6_addr, &np->rcv_saddr, 16); ++ si->cpt_laddrlen = sizeof(*sin6); ++ } ++ if (!inet->num) ++ si->cpt_laddrlen = 0; ++ ++ si->cpt_daddr = inet->daddr; ++ si->cpt_dport = inet->dport; ++ si->cpt_saddr = inet->saddr; ++ si->cpt_rcv_saddr = inet->rcv_saddr; ++ si->cpt_sport = inet->sport; ++ si->cpt_uc_ttl = inet->uc_ttl; ++ si->cpt_tos = inet->tos; ++ si->cpt_cmsg_flags = inet->cmsg_flags; ++ si->cpt_mc_index = inet->mc_index; ++ si->cpt_mc_addr = 
inet->mc_addr; ++ si->cpt_hdrincl = inet->hdrincl; ++ si->cpt_mc_ttl = inet->mc_ttl; ++ si->cpt_mc_loop = inet->mc_loop; ++ si->cpt_pmtudisc = inet->pmtudisc; ++ si->cpt_recverr = inet->recverr; ++ si->cpt_freebind = inet->freebind; ++ si->cpt_idcounter = inet->id; ++ ++ si->cpt_cork_flags = inet->cork.flags; ++ si->cpt_cork_fragsize = 0; ++ si->cpt_cork_length = inet->cork.length; ++ si->cpt_cork_addr = inet->cork.addr; ++ si->cpt_cork_saddr = inet->cork.fl.fl4_src; ++ si->cpt_cork_daddr = inet->cork.fl.fl4_dst; ++ si->cpt_cork_oif = inet->cork.fl.oif; ++ if (inet->cork.rt) { ++ si->cpt_cork_fragsize = inet->cork.fragsize; ++ si->cpt_cork_saddr = inet->cork.rt->fl.fl4_src; ++ si->cpt_cork_daddr = inet->cork.rt->fl.fl4_dst; ++ si->cpt_cork_oif = inet->cork.rt->fl.oif; ++ } ++ ++ if (sk->sk_type == SOCK_DGRAM && sk->sk_protocol == IPPROTO_UDP) { ++ struct udp_sock *up = udp_sk(sk); ++ si->cpt_udp_pending = up->pending; ++ si->cpt_udp_corkflag = up->corkflag; ++ si->cpt_udp_encap = up->encap_type; ++ si->cpt_udp_len = up->len; ++ } ++ ++ if (sk->sk_family == AF_INET6) { ++ memcpy(si->cpt_saddr6, &np->saddr, 16); ++ memcpy(si->cpt_rcv_saddr6, &np->rcv_saddr, 16); ++ memcpy(si->cpt_daddr6, &np->daddr, 16); ++ si->cpt_flow_label6 = np->flow_label; ++ si->cpt_frag_size6 = np->frag_size; ++ si->cpt_hop_limit6 = np->hop_limit; ++ si->cpt_mcast_hops6 = np->mcast_hops; ++ si->cpt_mcast_oif6 = np->mcast_oif; ++ si->cpt_rxopt6 = np->rxopt.all; ++ si->cpt_mc_loop6 = np->mc_loop; ++ si->cpt_recverr6 = np->recverr; ++ si->cpt_sndflow6 = np->sndflow; ++ si->cpt_pmtudisc6 = np->pmtudisc; ++ si->cpt_ipv6only6 = np->ipv6only; ++ si->cpt_mapped = 0; ++ } ++ ++ if (sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP) ++ cpt_dump_socket_tcp(si, sk, ctx); ++ ++ return 0; ++} ++ ++int cpt_dump_accept_queue(struct sock *sk, int index, struct cpt_context *ctx) ++{ ++ struct request_sock *req; ++ ++ for (req=inet_csk(sk)->icsk_accept_queue.rskq_accept_head; req; req=req->dl_next) ++ 
cpt_dump_socket(NULL, req->sk, -1, index, ctx); ++ return 0; ++} ++ ++ ++static int dump_openreq(struct request_sock *req, struct sock *sk, int index, ++ struct cpt_context *ctx) ++{ ++ struct cpt_openreq_image *v = cpt_get_buf(ctx); ++ ++ cpt_open_object(NULL, ctx); ++ ++ v->cpt_next = CPT_NULL; ++ v->cpt_object = CPT_OBJ_OPENREQ; ++ v->cpt_hdrlen = sizeof(*v); ++ v->cpt_content = CPT_CONTENT_VOID; ++ ++ v->cpt_rcv_isn = tcp_rsk(req)->rcv_isn; ++ v->cpt_snt_isn = tcp_rsk(req)->snt_isn; ++ v->cpt_rmt_port = inet_rsk(req)->rmt_port; ++ v->cpt_mss = req->mss; ++ // // v->cpt_family = (req->class == &or_ipv4 ? AF_INET : AF_INET6); ++ v->cpt_retrans = req->retrans; ++ v->cpt_snd_wscale = inet_rsk(req)->snd_wscale; ++ v->cpt_rcv_wscale = inet_rsk(req)->rcv_wscale; ++ v->cpt_tstamp_ok = inet_rsk(req)->tstamp_ok; ++ v->cpt_sack_ok = inet_rsk(req)->sack_ok; ++ v->cpt_wscale_ok = inet_rsk(req)->wscale_ok; ++ v->cpt_ecn_ok = inet_rsk(req)->ecn_ok; ++ v->cpt_acked = inet_rsk(req)->acked; ++ v->cpt_window_clamp = req->window_clamp; ++ v->cpt_rcv_wnd = req->rcv_wnd; ++ v->cpt_ts_recent = req->ts_recent; ++ v->cpt_expires = jiffies_export(req->expires); ++ ++ if (v->cpt_family == AF_INET) { ++ memcpy(v->cpt_loc_addr, &inet_rsk(req)->loc_addr, 4); ++ memcpy(v->cpt_rmt_addr, &inet_rsk(req)->rmt_addr, 4); ++ } else { ++#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) ++ memcpy(v->cpt_loc_addr, &inet6_rsk(req)->loc_addr, 16); ++ memcpy(v->cpt_rmt_addr, &inet6_rsk(req)->rmt_addr, 16); ++ v->cpt_iif = inet6_rsk(req)->iif; ++#endif ++ } ++ ++ ctx->write(v, sizeof(*v), ctx); ++ cpt_release_buf(ctx); ++ ++ cpt_close_object(ctx); ++ return 0; ++} ++ ++int cpt_dump_synwait_queue(struct sock *sk, int index, struct cpt_context *ctx) ++{ ++ struct listen_sock *lopt = inet_csk(sk)->icsk_accept_queue.listen_opt; ++ struct request_sock *req; ++ int i; ++ ++ for (i=0; i<TCP_SYNQ_HSIZE; i++) { ++ for (req=lopt->syn_table[i]; req; req=req->dl_next) { ++ loff_t saved_obj; ++ 
cpt_push_object(&saved_obj, ctx); ++ dump_openreq(req, sk, index, ctx); ++ cpt_pop_object(&saved_obj, ctx); ++ } ++ } ++ return 0; ++} ++ ++ ++int cpt_kill_socket(struct sock *sk, cpt_context_t * ctx) ++{ ++ if (sk->sk_state != TCP_CLOSE && ++ (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) && ++ sk->sk_protocol == IPPROTO_TCP) { ++ if (sk->sk_state != TCP_LISTEN) ++ tcp_set_state(sk, TCP_CLOSE); ++ else ++ sk->sk_prot->disconnect(sk, 0); ++ } ++ return 0; ++} ++ ++int cpt_dump_mcfilter(struct sock *sk, cpt_context_t *ctx) ++{ ++ struct inet_sock *inet = inet_sk(sk); ++ struct ip_mc_socklist *iml; ++ ++ for (iml = inet->mc_list; iml; iml = iml->next) { ++ struct cpt_sockmc_image smi; ++ int scnt = 0; ++ int i; ++ ++ if (iml->sflist) ++ scnt = iml->sflist->sl_count*16; ++ ++ smi.cpt_next = sizeof(smi) + scnt; ++ smi.cpt_object = CPT_OBJ_SOCK_MCADDR; ++ smi.cpt_hdrlen = sizeof(smi); ++ smi.cpt_content = CPT_CONTENT_DATA; ++ ++ smi.cpt_family = AF_INET; ++ smi.cpt_mode = iml->sfmode; ++ smi.cpt_ifindex = iml->multi.imr_ifindex; ++ memset(&smi.cpt_mcaddr, 0, sizeof(smi.cpt_mcaddr)); ++ smi.cpt_mcaddr[0] = iml->multi.imr_multiaddr.s_addr; ++ ++ ctx->write(&smi, sizeof(smi), ctx); ++ ++ for (i = 0; i < scnt; i++) { ++ u32 addr[4]; ++ memset(&addr, 0, sizeof(addr)); ++ addr[0] = iml->sflist->sl_addr[i]; ++ ctx->write(&addr, sizeof(addr), ctx); ++ } ++ } ++ ++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) ++ if (sk->sk_family == AF_INET6) { ++ struct ipv6_mc_socklist *mcl; ++ struct ipv6_pinfo *np = inet6_sk(sk); ++ ++ for (mcl = np->ipv6_mc_list; mcl; mcl = mcl->next) { ++ struct cpt_sockmc_image smi; ++ int scnt = 0; ++ int i; ++ ++ if (mcl->sflist) ++ scnt = mcl->sflist->sl_count*16; ++ ++ smi.cpt_next = sizeof(smi) + scnt; ++ smi.cpt_object = CPT_OBJ_SOCK_MCADDR; ++ smi.cpt_hdrlen = sizeof(smi); ++ smi.cpt_content = CPT_CONTENT_DATA; ++ ++ smi.cpt_family = AF_INET6; ++ smi.cpt_mode = mcl->sfmode; ++ smi.cpt_ifindex = mcl->ifindex; ++ 
memcpy(&smi.cpt_mcaddr, &mcl->addr, sizeof(smi.cpt_mcaddr)); ++ ++ ctx->write(&smi, sizeof(smi), ctx); ++ for (i = 0; i < scnt; i++) ++ ctx->write(&mcl->sflist->sl_addr[i], 16, ctx); ++ } ++ } ++#endif ++ return 0; ++} +diff -upr linux-2.6.16.orig/kernel/cpt/cpt_syscalls.h linux-2.6.16-026test015/kernel/cpt/cpt_syscalls.h +--- linux-2.6.16.orig/kernel/cpt/cpt_syscalls.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/cpt/cpt_syscalls.h 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,95 @@ ++#include <linux/unistd.h> ++#include <linux/syscalls.h> ++#include <asm/uaccess.h> ++ ++#define WRAP(c, args) return sys_##c args ++#define WRAP2(c, args) int err; mm_segment_t oldfs; \ ++ oldfs = get_fs(); set_fs(KERNEL_DS); \ ++ err = sys_##c args ;\ ++ set_fs(oldfs); \ ++ return err ++ ++static inline int sc_close(int fd) ++{ ++ WRAP(close, (fd)); ++} ++ ++static inline int sc_dup2(int fd1, int fd2) ++{ ++ WRAP(dup2, (fd1, fd2)); ++} ++ ++static inline int sc_unlink(char *name) ++{ ++ WRAP2(unlink, (name)); ++} ++ ++static inline int sc_pipe(int *pfd) ++{ ++ return do_pipe(pfd); ++} ++ ++static inline int sc_mknod(char *name, int mode, int dev) ++{ ++ WRAP2(mknod, (name, mode, dev)); ++} ++ ++static inline int sc_chmod(char *name, int mode) ++{ ++ WRAP2(mkdir, (name, mode)); ++} ++ ++static inline int sc_chown(char *name, int uid, int gid) ++{ ++ WRAP2(chown, (name, uid, gid)); ++} ++ ++static inline int sc_mkdir(char *name, int mode) ++{ ++ WRAP2(mkdir, (name, mode)); ++} ++ ++static inline int sc_rmdir(char *name) ++{ ++ WRAP2(rmdir, (name)); ++} ++ ++static inline int sc_mount(char *mntdev, char *mntpnt, char *type, unsigned long flags) ++{ ++ WRAP2(mount, (mntdev ? 
: "none", mntpnt, type, flags, NULL)); ++} ++ ++static inline int sc_mprotect(unsigned long start, size_t len, ++ unsigned long prot) ++{ ++ WRAP(mprotect, (start, len, prot)); ++} ++ ++static inline int sc_mlock(unsigned long start, size_t len) ++{ ++ WRAP(mlock, (start, len)); ++} ++ ++static inline int sc_munlock(unsigned long start, size_t len) ++{ ++ WRAP(munlock, (start, len)); ++} ++ ++static inline int sc_remap_file_pages(unsigned long start, size_t len, ++ unsigned long prot, unsigned long pgoff, ++ unsigned long flags) ++{ ++ WRAP(remap_file_pages, (start, len, prot, pgoff, flags)); ++} ++ ++static inline int sc_waitx(int pid, int opt) ++{ ++ WRAP(wait4, (pid, NULL, opt, NULL)); ++} ++ ++static inline int sc_flock(int fd, int flags) ++{ ++ WRAP(flock, (fd, flags)); ++} ++ ++extern int sc_execve(char *cms, char **argv, char **env); +diff -upr linux-2.6.16.orig/kernel/cpt/cpt_sysvipc.c linux-2.6.16-026test015/kernel/cpt/cpt_sysvipc.c +--- linux-2.6.16.orig/kernel/cpt/cpt_sysvipc.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/cpt/cpt_sysvipc.c 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,317 @@ ++/* ++ * ++ * kernel/cpt/cpt_sysvipc.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#include <linux/config.h> ++#include <linux/version.h> ++#include <linux/module.h> ++#include <linux/init.h> ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/slab.h> ++#include <linux/file.h> ++#include <linux/mm.h> ++#include <linux/errno.h> ++#include <linux/major.h> ++#include <linux/pipe_fs_i.h> ++#include <linux/mman.h> ++#include <linux/shm.h> ++#include <linux/sem.h> ++#include <linux/msg.h> ++#include <asm/uaccess.h> ++#include <linux/cpt_image.h> ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++#include "cpt_kernel.h" ++ ++struct _warg { ++ struct file *file; ++ struct cpt_sysvshm_image *v; ++}; ++ ++static int dump_one_shm(struct shmid_kernel *shp, void *arg) ++{ ++ struct _warg *warg = arg; ++ struct cpt_sysvshm_image *v = (struct cpt_sysvshm_image *)warg->v; ++ ++ if (shp->shm_file != warg->file) ++ return 0; ++ ++ v->cpt_key = shp->shm_perm.key; ++ v->cpt_uid = shp->shm_perm.uid; ++ v->cpt_gid = shp->shm_perm.gid; ++ v->cpt_cuid = shp->shm_perm.cuid; ++ v->cpt_cgid = shp->shm_perm.cgid; ++ v->cpt_mode = shp->shm_perm.mode; ++ v->cpt_seq = shp->shm_perm.seq; ++ ++ v->cpt_id = shp->id; ++ v->cpt_segsz = shp->shm_segsz; ++ v->cpt_atime = shp->shm_atim; ++ v->cpt_ctime = shp->shm_ctim; ++ v->cpt_dtime = shp->shm_dtim; ++ v->cpt_creator = shp->shm_cprid; ++ v->cpt_last = shp->shm_lprid; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9) ++ v->cpt_mlockuser = shp->mlock_user ? 
shp->mlock_user->uid : -1; ++#else ++ v->cpt_mlockuser = -1; ++#endif ++ return 1; ++} ++ ++int cpt_dump_content_sysvshm(struct file *file, struct cpt_context *ctx) ++{ ++ struct cpt_sysvshm_image *v = cpt_get_buf(ctx); ++ struct _warg warg; ++ ++ v->cpt_next = sizeof(*v); ++ v->cpt_object = CPT_OBJ_SYSV_SHM; ++ v->cpt_hdrlen = sizeof(*v); ++ v->cpt_content = CPT_CONTENT_VOID; ++ ++ warg.file = file; ++ warg.v = v; ++ if (sysvipc_walk_shm(dump_one_shm, &warg) == 0) { ++ cpt_release_buf(ctx); ++ return -ESRCH; ++ } ++ ++ ctx->write(v, sizeof(*v), ctx); ++ cpt_release_buf(ctx); ++ return 0; ++} ++ ++ ++int match_sem(int id, struct sem_array *sema, void *arg) ++{ ++ if (id != (unsigned long)arg) ++ return 0; ++ return sema->sem_nsems + 1; ++} ++ ++static int get_sem_nsem(int id, cpt_context_t *ctx) ++{ ++ int res; ++ res = sysvipc_walk_sem(match_sem, (void*)(unsigned long)id); ++ if (res > 0) ++ return res - 1; ++ eprintk_ctx("get_sem_nsem: SYSV semaphore %d not found\n", id); ++ return -ESRCH; ++} ++ ++static int dump_one_semundo(struct sem_undo *su, struct cpt_context *ctx) ++{ ++ struct cpt_sysvsem_undo_image v; ++ loff_t saved_obj; ++ ++ cpt_open_object(NULL, ctx); ++ ++ v.cpt_next = CPT_NULL; ++ v.cpt_object = CPT_OBJ_SYSVSEM_UNDO_REC; ++ v.cpt_hdrlen = sizeof(v); ++ v.cpt_content = CPT_CONTENT_SEMUNDO; ++ v.cpt_id = su->semid; ++ v.cpt_nsem = get_sem_nsem(su->semid, ctx); ++ if ((int)v.cpt_nsem < 0) ++ return -ESRCH; ++ ++ ctx->write(&v, sizeof(v), ctx); ++ ++ cpt_push_object(&saved_obj, ctx); ++ ctx->write(su->semadj, v.cpt_nsem*sizeof(short), ctx); ++ cpt_pop_object(&saved_obj, ctx); ++ ++ cpt_close_object(ctx); ++ return 0; ++} ++ ++struct sem_warg { ++ int last_id; ++ struct cpt_sysvsem_image *v; ++}; ++ ++static int dump_one_sem(int id, struct sem_array *sma, void *arg) ++{ ++ struct sem_warg * warg = (struct sem_warg *)arg; ++ struct cpt_sysvsem_image *v = warg->v; ++ int i; ++ ++ if (warg->last_id != -1) { ++ if ((id % IPCMNI) <= warg->last_id) ++ return 
0; ++ } ++ ++ v->cpt_next = sizeof(*v); ++ v->cpt_object = CPT_OBJ_SYSV_SEM; ++ v->cpt_hdrlen = sizeof(*v); ++ v->cpt_content = CPT_CONTENT_SEMARRAY; ++ ++ v->cpt_key = sma->sem_perm.key; ++ v->cpt_uid = sma->sem_perm.uid; ++ v->cpt_gid = sma->sem_perm.gid; ++ v->cpt_cuid = sma->sem_perm.cuid; ++ v->cpt_cgid = sma->sem_perm.cgid; ++ v->cpt_mode = sma->sem_perm.mode; ++ v->cpt_seq = sma->sem_perm.seq; ++ ++ v->cpt_id = id; ++ v->cpt_ctime = sma->sem_ctime; ++ v->cpt_otime = sma->sem_otime; ++ ++ for (i=0; i<sma->sem_nsems; i++) { ++ struct { ++ __u32 semval; ++ __u32 sempid; ++ } *s = (void*)v + v->cpt_next; ++ if (v->cpt_next >= PAGE_SIZE - sizeof(*s)) ++ return -EINVAL; ++ s->semval = sma->sem_base[i].semval; ++ s->sempid = sma->sem_base[i].sempid; ++ v->cpt_next += sizeof(*s); ++ } ++ ++ warg->last_id = id % IPCMNI; ++ return 1; ++} ++ ++ ++int cpt_dump_sysvsem(struct cpt_context *ctx) ++{ ++ cpt_object_t *obj; ++ struct sem_warg warg; ++ ++ /* Dumping semaphores is quite tricky because we cannot ++ * write to dump file under lock inside sysvipc_walk_sem(). 
++ */ ++ cpt_open_section(ctx, CPT_SECT_SYSV_SEM); ++ warg.last_id = -1; ++ warg.v = cpt_get_buf(ctx); ++ for (;;) { ++ if (sysvipc_walk_sem(dump_one_sem, &warg) <= 0) ++ break; ++ ctx->write(warg.v, warg.v->cpt_next, ctx); ++ } ++ cpt_release_buf(ctx); ++ cpt_close_section(ctx); ++ ++ cpt_open_section(ctx, CPT_SECT_SYSVSEM_UNDO); ++ for_each_object(obj, CPT_OBJ_SYSVSEM_UNDO) { ++ struct sem_undo_list *semu = obj->o_obj; ++ struct sem_undo *su; ++ struct cpt_object_hdr v; ++ loff_t saved_obj; ++ ++ cpt_open_object(obj, ctx); ++ ++ v.cpt_next = CPT_NULL; ++ v.cpt_object = CPT_OBJ_SYSVSEM_UNDO; ++ v.cpt_hdrlen = sizeof(v); ++ v.cpt_content = CPT_CONTENT_ARRAY; ++ ++ ctx->write(&v, sizeof(v), ctx); ++ ++ cpt_push_object(&saved_obj, ctx); ++ for (su = semu->proc_list; su; su = su->proc_next) { ++ if (su->semid != -1) { ++ int err; ++ err = dump_one_semundo(su, ctx); ++ if (err < 0) ++ return err; ++ } ++ } ++ cpt_pop_object(&saved_obj, ctx); ++ ++ cpt_close_object(ctx); ++ } ++ cpt_close_section(ctx); ++ return 0; ++} ++ ++static int collect_one_msg(int id, struct msg_queue *msq, void *arg) ++{ ++ int *retp = arg; ++ (*retp)++; ++ return 0; ++} ++ ++int cpt_collect_sysvmsg(cpt_context_t * ctx) ++{ ++ int ret = 0; ++ sysvipc_walk_msg(collect_one_msg, &ret); ++ if (ret) { ++ eprintk_ctx("SYSV msgqueues are not supported, found %d\n", ret); ++ return -EBUSY; ++ } ++ return 0; ++} ++ ++static int cpt_collect_sysvsem_undo(cpt_context_t *ctx) ++{ ++ cpt_object_t *obj; ++ ++ for_each_object(obj, CPT_OBJ_TASK) { ++ task_t *tsk = obj->o_obj; ++ if (tsk->exit_state) { ++ /* ipc/sem.c forgets to clear tsk->sysvsem.undo_list ++ * on exit. Grrr... 
*/ ++ continue; ++ } ++ if (tsk->sysvsem.undo_list && ++ cpt_object_add(CPT_OBJ_SYSVSEM_UNDO, tsk->sysvsem.undo_list, ctx) == NULL) ++ return -ENOMEM; ++ } ++ ++ for_each_object(obj, CPT_OBJ_SYSVSEM_UNDO) { ++ struct sem_undo_list *semu = obj->o_obj; ++ ++ if (atomic_read(&semu->refcnt) != obj->o_count) { ++ eprintk_ctx("sem_undo_list is referenced outside %d %d\n", obj->o_count, atomic_read(&semu->refcnt)); ++ return -EBUSY; ++ } ++ } ++ return 0; ++} ++ ++static int collect_one_shm(struct shmid_kernel *shp, void *arg) ++{ ++ cpt_context_t *ctx = arg; ++ ++ if (__cpt_object_add(CPT_OBJ_FILE, shp->shm_file, GFP_ATOMIC, ctx) == NULL) ++ return -ENOMEM; ++ return 0; ++} ++ ++int cpt_collect_sysvshm(cpt_context_t * ctx) ++{ ++ int err; ++ ++ err = sysvipc_walk_shm(collect_one_shm, ctx); ++ ++ return err < 0 ? err : 0; ++} ++ ++int cpt_collect_sysv(cpt_context_t * ctx) ++{ ++ int err; ++ ++ err = cpt_collect_sysvsem_undo(ctx); ++ if (err) ++ return err; ++ err = cpt_collect_sysvmsg(ctx); ++ if (err) ++ return err; ++ err = cpt_collect_sysvshm(ctx); ++ if (err) ++ return err; ++ ++ return 0; ++} +diff -upr linux-2.6.16.orig/kernel/cpt/cpt_tty.c linux-2.6.16-026test015/kernel/cpt/cpt_tty.c +--- linux-2.6.16.orig/kernel/cpt/cpt_tty.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/cpt/cpt_tty.c 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,216 @@ ++/* ++ * ++ * kernel/cpt/cpt_tty.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#include <linux/config.h> ++#include <linux/version.h> ++#include <linux/module.h> ++#include <linux/init.h> ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/slab.h> ++#include <linux/file.h> ++#include <linux/mm.h> ++#include <linux/errno.h> ++#include <linux/major.h> ++#include <linux/tty.h> ++#include <asm/uaccess.h> ++#include <linux/cpt_image.h> ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++ ++/* We must support at least N_TTY. */ ++ ++int cpt_dump_content_tty(struct file *file, struct cpt_context *ctx) ++{ ++ struct tty_struct *tty = file->private_data; ++ cpt_object_t *obj; ++ struct cpt_obj_ref o; ++ loff_t saved_pos; ++ ++ obj = lookup_cpt_object(CPT_OBJ_TTY, tty, ctx); ++ if (!obj) ++ return -EINVAL; ++ ++ cpt_push_object(&saved_pos, ctx); ++ ++ o.cpt_next = sizeof(o); ++ o.cpt_object = CPT_OBJ_REF; ++ o.cpt_hdrlen = sizeof(o); ++ o.cpt_content = CPT_CONTENT_VOID; ++ o.cpt_pos = obj->o_pos; ++ ctx->write(&o, sizeof(o), ctx); ++ ++ cpt_pop_object(&saved_pos, ctx); ++ ++ return 0; ++} ++ ++int cpt_collect_tty(struct file *file, cpt_context_t * ctx) ++{ ++ struct tty_struct *tty = file->private_data; ++ ++ if (tty) { ++ if (cpt_object_add(CPT_OBJ_TTY, tty, ctx) == NULL) ++ return -ENOMEM; ++ if (tty->link) { ++ cpt_object_t *obj; ++ ++ obj = cpt_object_add(CPT_OBJ_TTY, tty->link, ctx); ++ if (obj == NULL) ++ return -ENOMEM; ++ /* Undo o_count, tty->link is not a reference */ ++ obj->o_count--; ++ } ++ } ++ return 0; ++} ++ ++int cpt_dump_tty(cpt_object_t *obj, struct cpt_context *ctx) ++{ ++ struct tty_struct *tty = obj->o_obj; ++ struct cpt_tty_image *v; ++ ++ if (tty->link) { ++ if (lookup_cpt_object(CPT_OBJ_TTY, tty->link, ctx) == NULL) { ++ eprintk_ctx("orphan pty %s %d\n", tty->name, tty->driver->subtype == PTY_TYPE_SLAVE); ++ return -EINVAL; ++ } ++ if (tty->link->link != tty) { ++ eprintk_ctx("bad pty pair\n"); ++ return -EINVAL; ++ } ++ if (tty->driver->type == TTY_DRIVER_TYPE_PTY && ++ 
tty->driver->subtype == PTY_TYPE_SLAVE && ++ tty->link->count) ++ obj->o_count++; ++ } ++ if (obj->o_count != tty->count) { ++ eprintk_ctx("tty %s is referenced outside %d %d\n", tty->name, obj->o_count, tty->count); ++ return -EBUSY; ++ } ++ ++ cpt_open_object(obj, ctx); ++ ++ v = cpt_get_buf(ctx); ++ v->cpt_next = -1; ++ v->cpt_object = CPT_OBJ_TTY; ++ v->cpt_hdrlen = sizeof(*v); ++ v->cpt_content = CPT_CONTENT_ARRAY; ++ ++ v->cpt_index = tty->index; ++ v->cpt_link = -1; ++ if (tty->link) ++ v->cpt_link = tty->link->index; ++ v->cpt_drv_type = tty->driver->type; ++ v->cpt_drv_subtype = tty->driver->subtype; ++ v->cpt_drv_flags = tty->driver->flags; ++ v->cpt_packet = tty->packet; ++ v->cpt_stopped = tty->stopped; ++ v->cpt_hw_stopped = tty->hw_stopped; ++ v->cpt_flow_stopped = tty->flow_stopped; ++ v->cpt_flags = tty->flags; ++ v->cpt_ctrl_status = tty->ctrl_status; ++ v->cpt_canon_data = tty->canon_data; ++ v->cpt_canon_head = tty->canon_head - tty->read_tail; ++ v->cpt_canon_column = tty->canon_column; ++ v->cpt_column = tty->column; ++ v->cpt_erasing = tty->erasing; ++ v->cpt_lnext = tty->lnext; ++ v->cpt_icanon = tty->icanon; ++ v->cpt_raw = tty->raw; ++ v->cpt_real_raw = tty->real_raw; ++ v->cpt_closing = tty->closing; ++ v->cpt_minimum_to_wake = tty->minimum_to_wake; ++ v->cpt_pgrp = 0; ++ if (tty->pgrp > 0) { ++ v->cpt_pgrp = _pid_type_to_vpid(PIDTYPE_PGID, tty->pgrp); ++ if ((int)v->cpt_pgrp < 0) { ++ dprintk_ctx("cannot map tty->pgrp %d -> %d\n", tty->pgrp, (int)v->cpt_pgrp); ++ v->cpt_pgrp = -1; ++ } ++ } ++ v->cpt_session = 0; ++ if (tty->session > 0) { ++ v->cpt_session = _pid_type_to_vpid(PIDTYPE_SID, tty->session); ++ if ((int)v->cpt_session < 0) { ++ eprintk_ctx("cannot map tty->session %d -> %d\n", tty->session, (int)v->cpt_session); ++ cpt_release_buf(ctx); ++ return -EINVAL; ++ } ++ } ++ memcpy(v->cpt_name, tty->name, 64); ++ v->cpt_ws_row = tty->winsize.ws_row; ++ v->cpt_ws_col = tty->winsize.ws_col; ++ v->cpt_ws_prow = tty->winsize.ws_ypixel; 
++ v->cpt_ws_pcol = tty->winsize.ws_xpixel; ++ if (tty->termios == NULL) { ++ eprintk_ctx("NULL termios"); ++ cpt_release_buf(ctx); ++ return -EINVAL; ++ } ++ v->cpt_c_line = tty->termios->c_line; ++ v->cpt_c_iflag = tty->termios->c_iflag; ++ v->cpt_c_oflag = tty->termios->c_oflag; ++ v->cpt_c_cflag = tty->termios->c_cflag; ++ v->cpt_c_lflag = tty->termios->c_lflag; ++ memcpy(v->cpt_c_cc, tty->termios->c_cc, NCCS); ++ if (NCCS < 32) ++ memset(v->cpt_c_cc + NCCS, 255, 32 - NCCS); ++ memcpy(v->cpt_read_flags, tty->read_flags, sizeof(v->cpt_read_flags)); ++ ++ ctx->write(v, sizeof(*v), ctx); ++ cpt_release_buf(ctx); ++ ++ if (tty->read_buf && tty->read_cnt) { ++ struct cpt_obj_bits *v = cpt_get_buf(ctx); ++ loff_t saved_pos; ++ ++ cpt_push_object(&saved_pos, ctx); ++ cpt_open_object(NULL, ctx); ++ v->cpt_next = CPT_NULL; ++ v->cpt_object = CPT_OBJ_BITS; ++ v->cpt_hdrlen = sizeof(*v); ++ v->cpt_content = CPT_CONTENT_DATA; ++ v->cpt_size = tty->read_cnt; ++ ctx->write(v, sizeof(*v), ctx); ++ cpt_release_buf(ctx); ++ ++ if (tty->read_cnt) { ++ int n = min(tty->read_cnt, N_TTY_BUF_SIZE - tty->read_tail); ++ ctx->write(tty->read_buf + tty->read_tail, n, ctx); ++ if (tty->read_cnt > n) ++ ctx->write(tty->read_buf, tty->read_cnt-n, ctx); ++ ctx->align(ctx); ++ } ++ ++ cpt_close_object(ctx); ++ cpt_pop_object(&saved_pos, ctx); ++ } ++ ++ cpt_close_object(ctx); ++ ++ return 0; ++} ++ ++__u32 cpt_tty_fasync(struct file *file, struct cpt_context *ctx) ++{ ++ struct tty_struct * tty; ++ struct fasync_struct *fa; ++ ++ tty = (struct tty_struct *)file->private_data; ++ ++ for (fa = tty->fasync; fa; fa = fa->fa_next) { ++ if (fa->fa_file == file) ++ return fa->fa_fd; ++ } ++ return -1; ++} +diff -upr linux-2.6.16.orig/kernel/cpt/cpt_ubc.c linux-2.6.16-026test015/kernel/cpt/cpt_ubc.c +--- linux-2.6.16.orig/kernel/cpt/cpt_ubc.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/cpt/cpt_ubc.c 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,132 @@ ++/* ++ * ++ * 
kernel/cpt/cpt_ubc.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include <linux/types.h> ++#include <ub/beancounter.h> ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++ ++cpt_object_t *cpt_add_ubc(struct user_beancounter *bc, struct cpt_context *ctx) ++{ ++ cpt_object_t *obj; ++ ++ obj = cpt_object_add(CPT_OBJ_UBC, bc, ctx); ++ if (obj != NULL) { ++ if (obj->o_count == 1) ++ get_beancounter(bc); ++ if (bc->parent != NULL && obj->o_parent == NULL) ++ obj->o_parent = cpt_add_ubc(bc->parent, ctx); ++ } ++ return obj; ++} ++ ++__u64 cpt_lookup_ubc(struct user_beancounter *bc, struct cpt_context *ctx) ++{ ++ cpt_object_t *obj; ++ ++ obj = lookup_cpt_object(CPT_OBJ_UBC, bc, ctx); ++ if (obj == NULL) { ++ char buf[48]; ++ print_ub_uid(bc, buf, sizeof(buf)); ++ printk(KERN_ERR "CPT: unknown ub %s (%p)\n", buf, bc); ++ dump_stack(); ++ return CPT_NULL; ++ } ++ return obj->o_pos; ++} ++ ++static void dump_one_bc_parm(__u64 *dmp, struct ubparm *prm, int held) ++{ ++ dmp[0] = (prm->barrier < UB_MAXVALUE ? prm->barrier : CPT_NULL); ++ dmp[1] = (prm->limit < UB_MAXVALUE ? prm->limit : CPT_NULL); ++ dmp[2] = (held ? prm->held : CPT_NULL); ++ dmp[3] = prm->maxheld; ++ dmp[4] = prm->minheld; ++ dmp[5] = prm->failcnt; ++} ++ ++static int dump_one_bc(cpt_object_t *obj, struct cpt_context *ctx) ++{ ++ struct user_beancounter *bc; ++ struct cpt_beancounter_image *v; ++ int i; ++ ++ bc = obj->o_obj; ++ v = cpt_get_buf(ctx); ++ ++ v->cpt_next = CPT_NULL; ++ v->cpt_object = CPT_OBJ_UBC; ++ v->cpt_hdrlen = sizeof(*v); ++ v->cpt_content = CPT_CONTENT_VOID; ++ ++ if (obj->o_parent != NULL) ++ v->cpt_parent = ((cpt_object_t *)obj->o_parent)->o_pos; ++ else ++ v->cpt_parent = CPT_NULL; ++ v->cpt_id = (obj->o_parent != NULL) ? 
bc->ub_uid : 0; ++ for (i = 0; i < UB_RESOURCES; i++) ++ dump_one_bc_parm(v->cpt_parms, bc->ub_parms, 0); ++ for (i = 0; i < UB_RESOURCES; i++) ++ dump_one_bc_parm(v->cpt_parms + UB_RESOURCES * 6, ++ bc->ub_store, 1); ++ memset(v->cpt_parms + UB_RESOURCES * 12, 0, ++ sizeof(v->cpt_parms) ++ - UB_RESOURCES * 12 * sizeof(v->cpt_parms[0])); ++ ++ cpt_open_object(obj, ctx); ++ ctx->write(v, sizeof(*v), ctx); ++ cpt_close_object(ctx); ++ ++ cpt_release_buf(ctx); ++ return 0; ++} ++ ++int cpt_dump_ubc(struct cpt_context *ctx) ++{ ++ cpt_object_t *obj; ++ int skipped; ++ int top; ++ ++ cpt_open_section(ctx, CPT_SECT_UBC); ++ ++ do { ++ skipped = 0; ++ top = 0; ++ for_each_object(obj, CPT_OBJ_UBC) { ++ if (obj->o_parent == NULL) ++ top++; ++ if (obj->o_pos != CPT_NULL) ++ continue; ++ if (obj->o_parent != NULL && ++ ((cpt_object_t *)obj->o_parent)->o_pos == CPT_NULL) ++ skipped++; ++ else ++ dump_one_bc(obj, ctx); ++ } ++ } while (skipped && (top < 2)); ++ ++ cpt_close_section(ctx); ++ if (top > 1) { ++ eprintk_ctx("More than one top level ub exist"); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++void cpt_finish_ubc(struct cpt_context *ctx) ++{ ++ cpt_object_t *obj; ++ ++ for_each_object(obj, CPT_OBJ_UBC) ++ put_beancounter(obj->o_obj); ++} +diff -upr linux-2.6.16.orig/kernel/cpt/cpt_ubc.h linux-2.6.16-026test015/kernel/cpt/cpt_ubc.h +--- linux-2.6.16.orig/kernel/cpt/cpt_ubc.h 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/cpt/cpt_ubc.h 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,9 @@ ++cpt_object_t *cpt_add_ubc(struct user_beancounter *bc, struct cpt_context *ctx); ++__u64 cpt_lookup_ubc(struct user_beancounter *bc, struct cpt_context *ctx); ++int cpt_dump_ubc(struct cpt_context *ctx); ++ ++struct user_beancounter *rst_lookup_ubc(__u64 pos, struct cpt_context *ctx); ++int rst_undump_ubc(struct cpt_context *ctx); ++ ++void cpt_finish_ubc(struct cpt_context *ctx); ++void rst_finish_ubc(struct cpt_context *ctx); +diff -upr 
linux-2.6.16.orig/kernel/cpt/cpt_x8664.S linux-2.6.16-026test015/kernel/cpt/cpt_x8664.S +--- linux-2.6.16.orig/kernel/cpt/cpt_x8664.S 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/cpt/cpt_x8664.S 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,61 @@ ++#define ASSEMBLY 1 ++#include <linux/config.h> ++#include <linux/linkage.h> ++#include <asm/segment.h> ++#include <asm/smp.h> ++#include <asm/cache.h> ++#include <asm/errno.h> ++#include <asm/dwarf2.h> ++#include <asm/calling.h> ++#include <asm/msr.h> ++#include <asm/unistd.h> ++#include <asm/thread_info.h> ++#include <asm/hw_irq.h> ++#include <asm/errno.h> ++ ++ .code64 ++ ++ .macro FAKE_STACK_FRAME child_rip ++ /* push in order ss, rsp, eflags, cs, rip */ ++ xorq %rax, %rax ++ pushq %rax /* ss */ ++ pushq %rax /* rsp */ ++ pushq $(1<<9) /* eflags - interrupts on */ ++ pushq $__KERNEL_CS /* cs */ ++ pushq \child_rip /* rip */ ++ pushq %rax /* orig rax */ ++ .endm ++ ++ .macro UNFAKE_STACK_FRAME ++ addq $8*6, %rsp ++ .endm ++ ++ENTRY(asm_kernel_thread) ++ FAKE_STACK_FRAME $child_rip ++ SAVE_ALL ++ ++ # rdi: flags, rsi: usp, rdx: will be &pt_regs ++ movq %rdx,%rdi ++ orq $0x00800000,%rdi ++ movq $-1, %rsi ++ movq %rsp, %rdx ++ ++ xorl %r8d,%r8d ++ xorl %r9d,%r9d ++ pushq %rcx ++ call do_fork_pid ++ addq $8, %rsp ++ /* call do_fork */ ++ movq %rax,RAX(%rsp) ++ xorl %edi,%edi ++ RESTORE_ALL ++ UNFAKE_STACK_FRAME ++ ret ++ ++child_rip: ++ movq %rdi, %rax ++ movq %rsi, %rdi ++ call *%rax ++ xorq %rdi, %rdi ++ xorq %rsi, %rsi ++ call complete_and_exit +diff -upr linux-2.6.16.orig/kernel/cpt/rst_conntrack.c linux-2.6.16-026test015/kernel/cpt/rst_conntrack.c +--- linux-2.6.16.orig/kernel/cpt/rst_conntrack.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/cpt/rst_conntrack.c 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,294 @@ ++/* ++ * ++ * kernel/cpt/rst_conntrack.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. 
++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include <linux/config.h> ++#include <linux/version.h> ++#include <linux/module.h> ++#include <linux/init.h> ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/slab.h> ++#include <linux/file.h> ++#include <linux/mm.h> ++#include <linux/errno.h> ++#include <linux/fs.h> ++#include <linux/socket.h> ++#include <linux/netdevice.h> ++#include <linux/inetdevice.h> ++#include <linux/rtnetlink.h> ++#include <linux/unistd.h> ++#include <linux/ve.h> ++#include <linux/vzcalluser.h> ++#include <linux/cpt_image.h> ++#include <linux/icmp.h> ++#include <linux/ip.h> ++ ++#if defined(CONFIG_VE_IPTABLES) && \ ++ (defined(CONFIG_IP_NF_CONNTRACK) || defined(CONFIG_IP_NF_CONNTRACK_MODULE)) ++ ++#include <linux/netfilter.h> ++#include <linux/netfilter_ipv4/ip_conntrack.h> ++#include <linux/netfilter_ipv4/ip_nat.h> ++#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> ++#include <linux/netfilter_ipv4/ip_conntrack_helper.h> ++#include <linux/netfilter_ipv4/ip_conntrack_core.h> ++#include <linux/netfilter_ipv4/ip_nat_helper.h> ++#include <linux/netfilter_ipv4/ip_nat_core.h> ++ ++#define ASSERT_READ_LOCK(x) do { } while (0) ++#define ASSERT_WRITE_LOCK(x) do { } while (0) ++ ++#include <linux/netfilter_ipv4/listhelp.h> ++ ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++ ++struct ct_holder ++{ ++ struct ct_holder *next; ++ struct ip_conntrack *ct; ++ int index; ++}; ++ ++static void decode_tuple(struct cpt_ipct_tuple *v, struct ip_conntrack_tuple *tuple, int dir) ++{ ++ tuple->dst.ip = v->cpt_dst; ++ tuple->dst.u.all = v->cpt_dstport; ++ tuple->dst.protonum = v->cpt_protonum; ++ tuple->dst.dir = v->cpt_dir; ++ if (dir != tuple->dst.dir) ++ wprintk("dir != tuple->dst.dir\n"); ++ ++ tuple->src.ip = v->cpt_src; ++ tuple->src.u.all = v->cpt_srcport; ++} ++ ++ ++static int undump_expect_list(struct ip_conntrack *ct, ++ struct cpt_ip_conntrack_image *ci, ++ loff_t pos, struct ct_holder 
*ct_list, ++ cpt_context_t *ctx) ++{ ++ loff_t end; ++ int err; ++ ++ end = pos + ci->cpt_next; ++ pos += ci->cpt_hdrlen; ++ while (pos < end) { ++ struct cpt_ip_connexpect_image v; ++ struct ip_conntrack_expect *exp; ++ struct ip_conntrack *sibling; ++ ++ err = rst_get_object(CPT_OBJ_NET_CONNTRACK_EXPECT, pos, &v, ctx); ++ if (err) ++ return err; ++ ++ sibling = NULL; ++ if (v.cpt_sibling_conntrack) { ++ struct ct_holder *c; ++ ++ for (c = ct_list; c; c = c->next) { ++ if (c->index == v.cpt_sibling_conntrack) { ++ sibling = c->ct; ++ break; ++ } ++ } ++ if (!sibling) { ++ eprintk_ctx("lost sibling of expectation\n"); ++ return -EINVAL; ++ } ++ } ++ ++ write_lock_bh(&ip_conntrack_lock); ++ ++ /* It is possible. Helper module could be just unregistered, ++ * if expectation were on the list, it would be destroyed. */ ++ if (ct->helper == NULL) { ++ write_unlock_bh(&ip_conntrack_lock); ++ dprintk_ctx("conntrack: no helper and non-trivial expectation\n"); ++ continue; ++ } ++ ++ exp = ip_conntrack_expect_alloc(NULL); ++ if (exp == NULL) { ++ write_unlock_bh(&ip_conntrack_lock); ++ return -ENOMEM; ++ } ++ ++ if (ct->helper->timeout && !del_timer(&exp->timeout)) { ++ /* Dying already. We can do nothing. 
*/ ++ write_unlock_bh(&ip_conntrack_lock); ++ dprintk_ctx("conntrack expectation is dying\n"); ++ continue; ++ } ++ ++ decode_tuple(&v.cpt_tuple, &exp->tuple, 0); ++ decode_tuple(&v.cpt_mask, &exp->mask, 0); ++ ++ exp->master = ct; ++ nf_conntrack_get(&ct->ct_general); ++ ip_conntrack_expect_insert(exp); ++#if 0 ++ if (sibling) { ++ exp->sibling = sibling; ++ sibling->master = exp; ++ LIST_DELETE(&ve_ip_conntrack_expect_list, exp); ++ ct->expecting--; ++ nf_conntrack_get(&master_ct(sibling)->infos[0]); ++ } else ++#endif ++ if (ct->helper->timeout) { ++ exp->timeout.expires = jiffies + v.cpt_timeout; ++ add_timer(&exp->timeout); ++ } ++ write_unlock_bh(&ip_conntrack_lock); ++ ++ pos += v.cpt_next; ++ } ++ return 0; ++} ++ ++static int undump_one_ct(struct cpt_ip_conntrack_image *ci, loff_t pos, ++ struct ct_holder **ct_list, cpt_context_t *ctx) ++{ ++ int err = 0; ++ struct ip_conntrack *conntrack; ++ struct ct_holder *c; ++ struct ip_conntrack_tuple orig, repl; ++ ++ c = kmalloc(sizeof(struct ct_holder), GFP_KERNEL); ++ if (c == NULL) ++ return -ENOMEM; ++ ++ decode_tuple(&ci->cpt_tuple[0], &orig, 0); ++ decode_tuple(&ci->cpt_tuple[1], &repl, 1); ++ ++ conntrack = ip_conntrack_alloc(&orig, &repl, get_exec_env()->_ip_conntrack->ub); ++ if (!conntrack || IS_ERR(conntrack)) { ++ kfree(c); ++ return -ENOMEM; ++ } ++ ++ c->ct = conntrack; ++ c->next = *ct_list; ++ *ct_list = c; ++ c->index = ci->cpt_index; ++ ++ decode_tuple(&ci->cpt_tuple[0], &conntrack->tuplehash[0].tuple, 0); ++ decode_tuple(&ci->cpt_tuple[1], &conntrack->tuplehash[1].tuple, 1); ++ ++ conntrack->status = ci->cpt_status; ++ ++ memcpy(&conntrack->proto, ci->cpt_proto_data, sizeof(conntrack->proto)); ++ memcpy(&conntrack->help, ci->cpt_help_data, sizeof(conntrack->help)); ++ ++#ifdef CONFIG_IP_NF_NAT_NEEDED ++#if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \ ++ defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE) ++ conntrack->nat.masq_index = ci->cpt_masq_index; ++#endif ++ if (ci->cpt_initialized) { ++ 
conntrack->nat.info.seq[0].correction_pos = ci->cpt_nat_seq[0].cpt_correction_pos; ++ conntrack->nat.info.seq[0].offset_before = ci->cpt_nat_seq[0].cpt_offset_before; ++ conntrack->nat.info.seq[0].offset_after = ci->cpt_nat_seq[0].cpt_offset_after; ++ conntrack->nat.info.seq[1].correction_pos = ci->cpt_nat_seq[1].cpt_correction_pos; ++ conntrack->nat.info.seq[1].offset_before = ci->cpt_nat_seq[1].cpt_offset_before; ++ conntrack->nat.info.seq[1].offset_after = ci->cpt_nat_seq[1].cpt_offset_after; ++ } ++ if (conntrack->status & IPS_NAT_DONE_MASK) ++ ip_nat_hash_conntrack(conntrack); ++#endif ++ ++ write_lock_bh(&ip_conntrack_lock); ++ ++ if (ci->cpt_ct_helper) { ++ conntrack->helper = ip_conntrack_helper_find_get(&conntrack->tuplehash[1].tuple); ++ if (conntrack->helper == NULL) { ++ eprintk_ctx("conntrack: cannot find helper, some module is not loaded\n"); ++ err = -EINVAL; ++ } ++ } ++ ++ ip_conntrack_hash_insert(conntrack); ++ conntrack->timeout.expires = jiffies + ci->cpt_timeout; ++ ++ write_unlock_bh(&ip_conntrack_lock); ++ ++ if (err == 0 && ci->cpt_next > ci->cpt_hdrlen) ++ err = undump_expect_list(conntrack, ci, pos, *ct_list, ctx); ++ ++ return err; ++} ++ ++int rst_restore_ip_conntrack(struct cpt_context * ctx) ++{ ++ int err = 0; ++ loff_t sec = ctx->sections[CPT_SECT_NET_CONNTRACK]; ++ loff_t endsec; ++ struct cpt_section_hdr h; ++ struct cpt_ip_conntrack_image ci; ++ struct ct_holder *c; ++ struct ct_holder *ct_list = NULL; ++ ++ if (sec == CPT_NULL) ++ return 0; ++ ++ if (sizeof(ci.cpt_proto_data) != sizeof(union ip_conntrack_proto)) { ++ eprintk_ctx("conntrack module ct->proto version mismatch\n"); ++ return -EINVAL; ++ } ++ if (sizeof(ci.cpt_help_data) != sizeof(union ip_conntrack_help)) { ++ eprintk_ctx("conntrack module ct->help version mismatch\n"); ++ return -EINVAL; ++ } ++ ++ err = ctx->pread(&h, sizeof(h), ctx, sec); ++ if (err) ++ return err; ++ if (h.cpt_section != CPT_SECT_NET_CONNTRACK || h.cpt_hdrlen < sizeof(h)) ++ return -EINVAL; ++ ++ 
endsec = sec + h.cpt_next; ++ sec += h.cpt_hdrlen; ++ while (sec < endsec) { ++ err = rst_get_object(CPT_OBJ_NET_CONNTRACK, sec, &ci, ctx); ++ if (err) ++ break; ++ err = undump_one_ct(&ci, sec, &ct_list, ctx); ++ if (err) ++ break; ++ sec += ci.cpt_next; ++ } ++ ++ while ((c = ct_list) != NULL) { ++ ct_list = c->next; ++ if (c->ct) ++ add_timer(&c->ct->timeout); ++ kfree(c); ++ } ++ ++ return err; ++} ++ ++#else ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++ ++int rst_restore_ip_conntrack(struct cpt_context * ctx) ++{ ++ if (ctx->sections[CPT_SECT_NET_CONNTRACK] != CPT_NULL) ++ return -EINVAL; ++ return 0; ++} ++ ++#endif +diff -upr linux-2.6.16.orig/kernel/cpt/rst_context.c linux-2.6.16-026test015/kernel/cpt/rst_context.c +--- linux-2.6.16.orig/kernel/cpt/rst_context.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/cpt/rst_context.c 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,315 @@ ++/* ++ * ++ * kernel/cpt/rst_context.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include <linux/config.h> ++#include <linux/version.h> ++#include <linux/module.h> ++#include <linux/init.h> ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/slab.h> ++#include <linux/file.h> ++#include <linux/mm.h> ++#include <linux/errno.h> ++#include <linux/pagemap.h> ++#include <linux/cpt_image.h> ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++ ++static ssize_t file_read(void *addr, size_t count, struct cpt_context *ctx) ++{ ++ mm_segment_t oldfs; ++ ssize_t err = -EBADF; ++ struct file *file = ctx->file; ++ ++ oldfs = get_fs(); set_fs(KERNEL_DS); ++ if (file) ++ err = file->f_op->read(file, addr, count, &file->f_pos); ++ set_fs(oldfs); ++ if (err != count) ++ return err >= 0 ? 
-EIO : err; ++ return 0; ++} ++ ++static ssize_t file_pread(void *addr, size_t count, struct cpt_context *ctx, loff_t pos) ++{ ++ mm_segment_t oldfs; ++ ssize_t err = -EBADF; ++ struct file *file = ctx->file; ++ ++ oldfs = get_fs(); set_fs(KERNEL_DS); ++ if (file) ++ err = file->f_op->read(file, addr, count, &pos); ++ set_fs(oldfs); ++ if (err != count) ++ return err >= 0 ? -EIO : err; ++ return 0; ++} ++ ++static void file_align(struct cpt_context *ctx) ++{ ++ struct file *file = ctx->file; ++ ++ if (file) ++ file->f_pos = CPT_ALIGN(file->f_pos); ++} ++ ++int rst_get_section(int type, struct cpt_context *ctx, loff_t *start, loff_t *end) ++{ ++ struct cpt_section_hdr hdr; ++ int err; ++ loff_t pos; ++ ++ pos = ctx->sections[type]; ++ *start = *end = pos; ++ ++ if (pos != CPT_NULL) { ++ if ((err = ctx->pread(&hdr, sizeof(hdr), ctx, pos)) != 0) ++ return err; ++ if (hdr.cpt_section != type || hdr.cpt_hdrlen < sizeof(hdr)) ++ return -EINVAL; ++ *start = pos + hdr.cpt_hdrlen; ++ *end = pos + hdr.cpt_next; ++ } ++ return 0; ++} ++EXPORT_SYMBOL(rst_get_section); ++ ++void rst_context_init(struct cpt_context *ctx) ++{ ++ int i; ++ ++ memset(ctx, 0, sizeof(*ctx)); ++ ++ init_MUTEX(&ctx->main_sem); ++ ctx->refcount = 1; ++ ++ ctx->current_section = -1; ++ ctx->current_object = -1; ++ ctx->pagesize = PAGE_SIZE; ++ ctx->read = file_read; ++ ctx->pread = file_pread; ++ ctx->align = file_align; ++ for (i=0; i < CPT_SECT_MAX; i++) ++ ctx->sections[i] = CPT_NULL; ++#ifdef CONFIG_VZ_CHECKPOINT_LAZY ++ init_completion(&ctx->pgin_notify); ++#endif ++ cpt_object_init(ctx); ++} ++ ++static int parse_sections(loff_t start, loff_t end, cpt_context_t *ctx) ++{ ++ struct cpt_section_hdr h; ++ ++ while (start < end) { ++ int err; ++ ++ err = ctx->pread(&h, sizeof(h), ctx, start); ++ if (err) ++ return err; ++ if (h.cpt_hdrlen < sizeof(h) || ++ h.cpt_next < h.cpt_hdrlen || ++ start + h.cpt_next > end) ++ return -EINVAL; ++ if (h.cpt_section >= CPT_SECT_MAX) ++ return -EINVAL; ++ 
ctx->sections[h.cpt_section] = start; ++ start += h.cpt_next; ++ } ++ return 0; ++} ++ ++int rst_open_dumpfile(struct cpt_context *ctx) ++{ ++ int err; ++ struct cpt_major_tail *v; ++ struct cpt_major_hdr h; ++ unsigned long size; ++ ++ err = -EBADF; ++ if (!ctx->file) ++ goto err_out; ++ ++ err = -ENOMEM; ++ ctx->tmpbuf = (char*)__get_free_page(GFP_KERNEL); ++ if (ctx->tmpbuf == NULL) ++ goto err_out; ++ __cpt_release_buf(ctx); ++ ++ size = ctx->file->f_dentry->d_inode->i_size; ++ ++ if (size & 7) { ++ err = -EINVAL; ++ goto err_out; ++ } ++ if (size < sizeof(struct cpt_major_hdr) + ++ sizeof(struct cpt_major_tail)) { ++ err = -EINVAL; ++ goto err_out; ++ } ++ err = ctx->pread(&h, sizeof(h), ctx, 0); ++ if (err) { ++ eprintk_ctx("too short image 1 %d\n", err); ++ goto err_out; ++ } ++ if (h.cpt_signature[0] != CPT_SIGNATURE0 || ++ h.cpt_signature[1] != CPT_SIGNATURE1 || ++ h.cpt_signature[2] != CPT_SIGNATURE2 || ++ h.cpt_signature[3] != CPT_SIGNATURE3) { ++ err = -EINVAL; ++ goto err_out; ++ } ++ if (h.cpt_hz != HZ) { ++ err = -EINVAL; ++ eprintk_ctx("HZ mismatch: %d != %d\n", h.cpt_hz, HZ); ++ goto err_out; ++ } ++ ctx->virt_jiffies64 = h.cpt_start_jiffies64; ++ ctx->start_time.tv_sec = h.cpt_start_sec; ++ ctx->start_time.tv_nsec = h.cpt_start_nsec; ++ ctx->kernel_config_flags = h.cpt_kernel_config[0]; ++ ctx->iptables_mask = h.cpt_iptables_mask; ++ ctx->image_version = h.cpt_image_version; ++ ++ v = cpt_get_buf(ctx); ++ err = ctx->pread(v, sizeof(*v), ctx, size - sizeof(*v)); ++ if (err) { ++ eprintk_ctx("too short image 2 %d\n", err); ++ cpt_release_buf(ctx); ++ goto err_out; ++ } ++ if (v->cpt_signature[0] != CPT_SIGNATURE0 || ++ v->cpt_signature[1] != CPT_SIGNATURE1 || ++ v->cpt_signature[2] != CPT_SIGNATURE2 || ++ v->cpt_signature[3] != CPT_SIGNATURE3 || ++ v->cpt_nsect != CPT_SECT_MAX_INDEX) { ++ err = -EINVAL; ++ cpt_release_buf(ctx); ++ goto err_out; ++ } ++ if ((err = parse_sections(h.cpt_hdrlen, size - sizeof(*v) - sizeof(struct cpt_section_hdr), ctx)) 
< 0) { ++ cpt_release_buf(ctx); ++ goto err_out; ++ } ++#ifdef CONFIG_VZ_CHECKPOINT_LAZY ++ ctx->lazypages = v->cpt_lazypages; ++#endif ++ ctx->tasks64 = v->cpt_64bit; ++ cpt_release_buf(ctx); ++ return 0; ++ ++err_out: ++ if (ctx->tmpbuf) { ++ free_page((unsigned long)ctx->tmpbuf); ++ ctx->tmpbuf = NULL; ++ } ++ return err; ++} ++ ++void rst_close_dumpfile(struct cpt_context *ctx) ++{ ++ if (ctx->file) { ++ fput(ctx->file); ++ ctx->file = NULL; ++ } ++ if (ctx->tmpbuf) { ++ free_page((unsigned long)ctx->tmpbuf); ++ ctx->tmpbuf = NULL; ++ } ++} ++ ++int _rst_get_object(int type, loff_t pos, void *tmp, int size, struct cpt_context *ctx) ++{ ++ int err; ++ struct cpt_object_hdr *hdr = tmp; ++ err = ctx->pread(hdr, sizeof(struct cpt_object_hdr), ctx, pos); ++ if (err) ++ return err; ++ if (type > 0 && type != hdr->cpt_object) ++ return -EINVAL; ++ if (hdr->cpt_hdrlen > hdr->cpt_next) ++ return -EINVAL; ++ if (hdr->cpt_hdrlen < sizeof(struct cpt_object_hdr)) ++ return -EINVAL; ++ if (size < sizeof(*hdr)) ++ return -EINVAL; ++ if (size > hdr->cpt_hdrlen) ++ size = hdr->cpt_hdrlen; ++ if (size > sizeof(*hdr)) ++ err = ctx->pread(hdr+1, size - sizeof(*hdr), ++ ctx, pos + sizeof(*hdr)); ++ return err; ++} ++EXPORT_SYMBOL(_rst_get_object); ++ ++void * __rst_get_object(int type, loff_t pos, struct cpt_context *ctx) ++{ ++ int err; ++ void *tmp; ++ struct cpt_object_hdr hdr; ++ err = ctx->pread(&hdr, sizeof(hdr), ctx, pos); ++ if (err) ++ return NULL; ++ if (type > 0 && type != hdr.cpt_object) ++ return NULL; ++ if (hdr.cpt_hdrlen > hdr.cpt_next) ++ return NULL; ++ if (hdr.cpt_hdrlen < sizeof(struct cpt_object_hdr)) ++ return NULL; ++ tmp = kmalloc(hdr.cpt_hdrlen, GFP_KERNEL); ++ if (!tmp) ++ return NULL; ++ err = ctx->pread(tmp, hdr.cpt_hdrlen, ctx, pos); ++ if (!err) ++ return tmp; ++ kfree(tmp); ++ return NULL; ++} ++EXPORT_SYMBOL(__rst_get_object); ++ ++__u8 *__rst_get_name(loff_t *pos_p, struct cpt_context *ctx) ++{ ++ int err; ++ struct cpt_object_hdr hdr; ++ __u8 
*name; ++ ++ err = rst_get_object(CPT_OBJ_NAME, *pos_p, &hdr, ctx); ++ if (err) ++ return NULL; ++ if (hdr.cpt_next - hdr.cpt_hdrlen > PAGE_SIZE) ++ return NULL; ++ name = (void*)__get_free_page(GFP_KERNEL); ++ if (!name) ++ return NULL; ++ err = ctx->pread(name, hdr.cpt_next - hdr.cpt_hdrlen, ++ ctx, *pos_p + hdr.cpt_hdrlen); ++ if (err) { ++ free_page((unsigned long)name); ++ return NULL; ++ } ++ *pos_p += hdr.cpt_next; ++ return name; ++} ++ ++__u8 *rst_get_name(loff_t pos, struct cpt_context *ctx) ++{ ++ return __rst_get_name(&pos, ctx); ++} ++ ++void rst_put_name(__u8 *name, struct cpt_context *ctx) ++{ ++ unsigned long addr = (unsigned long)name; ++ ++ if (addr) ++ free_page(addr&~(PAGE_SIZE-1)); ++} +diff -upr linux-2.6.16.orig/kernel/cpt/rst_epoll.c linux-2.6.16-026test015/kernel/cpt/rst_epoll.c +--- linux-2.6.16.orig/kernel/cpt/rst_epoll.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/cpt/rst_epoll.c 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,173 @@ ++/* ++ * ++ * kernel/cpt/rst_epoll.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#include <linux/config.h> ++#include <linux/version.h> ++#include <linux/module.h> ++#include <linux/init.h> ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/slab.h> ++#include <linux/file.h> ++#include <linux/mm.h> ++#include <linux/errno.h> ++#include <linux/major.h> ++#include <linux/pipe_fs_i.h> ++#include <linux/mman.h> ++#include <linux/namespace.h> ++#include <linux/mount.h> ++#include <linux/namei.h> ++#include <linux/smp_lock.h> ++#include <asm/uaccess.h> ++#include <linux/vzcalluser.h> ++#include <linux/eventpoll.h> ++#include <linux/cpt_image.h> ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++#include "cpt_mm.h" ++#include "cpt_files.h" ++#include "cpt_kernel.h" ++#include "cpt_fsmagic.h" ++#include "cpt_syscalls.h" ++ ++/* Those funcations are static in fs/eventpoll.c */ ++extern struct file_operations eventpoll_fops; ++extern int ep_insert(struct eventpoll *ep, struct epoll_event *event, ++ struct file *tfile, int fd); ++extern struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd); ++extern void ep_release_epitem(struct epitem *epi); ++ ++ ++struct file *cpt_open_epolldev(struct cpt_file_image *fi, ++ unsigned flags, ++ struct cpt_context *ctx) ++{ ++ struct file *file; ++ int efd; ++ ++ /* Argument "size" is ignored, use just 1 */ ++ efd = sys_epoll_create(1); ++ if (efd < 0) ++ return ERR_PTR(efd); ++ ++ file = fget(efd); ++ sys_close(efd); ++ return file; ++} ++ ++static int restore_one_epoll(cpt_object_t *obj, ++ loff_t pos, ++ struct cpt_epoll_image *ebuf, ++ cpt_context_t *ctx) ++{ ++ int err = 0; ++ loff_t endpos; ++ struct file *file = obj->o_obj; ++ struct eventpoll *ep; ++ ++ if (file->f_op != &eventpoll_fops) { ++ eprintk_ctx("bad epoll file\n"); ++ return -EINVAL; ++ } ++ ++ ep = file->private_data; ++ ++ if (unlikely(ep == NULL)) { ++ eprintk_ctx("bad epoll device\n"); ++ return -EINVAL; ++ } ++ ++ endpos = pos + ebuf->cpt_next; ++ pos += ebuf->cpt_hdrlen; ++ while (pos < 
endpos) { ++ struct cpt_epoll_file_image efi; ++ struct epoll_event epds; ++ ++ cpt_object_t *tobj; ++ ++ err = rst_get_object(CPT_OBJ_EPOLL_FILE, pos, &efi, ctx); ++ if (err) ++ return err; ++ tobj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, efi.cpt_file, ctx); ++ if (!tobj) { ++ eprintk_ctx("epoll file not found\n"); ++ return -EINVAL; ++ } ++ epds.events = efi.cpt_events; ++ epds.data = efi.cpt_data; ++ down_write(&ep->sem); ++ err = ep_insert(ep, &epds, tobj->o_obj, efi.cpt_fd); ++ if (!err) { ++ struct epitem *epi; ++ epi = ep_find(ep, tobj->o_obj, efi.cpt_fd); ++ if (epi) { ++ epi->revents = efi.cpt_revents; ++ if (efi.cpt_ready) { ++ unsigned long flags; ++ write_lock_irqsave(&ep->lock, flags); ++ if (list_empty(&epi->rdllink)) ++ list_add_tail(&epi->rdllink, &ep->rdllist); ++ write_unlock_irqrestore(&ep->lock, flags); ++ } ++ ep_release_epitem(epi); ++ } ++ } ++ up_write(&ep->sem); ++ if (err) ++ break; ++ pos += efi.cpt_next; ++ } ++ return err; ++} ++ ++int rst_eventpoll(cpt_context_t *ctx) ++{ ++ int err; ++ loff_t sec = ctx->sections[CPT_SECT_EPOLL]; ++ loff_t endsec; ++ struct cpt_section_hdr h; ++ ++ if (sec == CPT_NULL) ++ return 0; ++ ++ err = ctx->pread(&h, sizeof(h), ctx, sec); ++ if (err) ++ return err; ++ if (h.cpt_section != CPT_SECT_EPOLL || h.cpt_hdrlen < sizeof(h)) ++ return -EINVAL; ++ ++ endsec = sec + h.cpt_next; ++ sec += h.cpt_hdrlen; ++ while (sec < endsec) { ++ cpt_object_t *obj; ++ struct cpt_epoll_image *ebuf = cpt_get_buf(ctx); ++ err = rst_get_object(CPT_OBJ_EPOLL, sec, ebuf, ctx); ++ if (err) { ++ cpt_release_buf(ctx); ++ return err; ++ } ++ obj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, ebuf->cpt_file, ctx); ++ if (obj == NULL) { ++ eprintk_ctx("cannot find epoll file object\n"); ++ cpt_release_buf(ctx); ++ return -EINVAL; ++ } ++ err = restore_one_epoll(obj, sec, ebuf, ctx); ++ cpt_release_buf(ctx); ++ if (err) ++ return err; ++ sec += ebuf->cpt_next; ++ } ++ ++ return 0; ++ ++} +diff -upr linux-2.6.16.orig/kernel/cpt/rst_files.c 
linux-2.6.16-026test015/kernel/cpt/rst_files.c +--- linux-2.6.16.orig/kernel/cpt/rst_files.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/cpt/rst_files.c 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,1453 @@ ++/* ++ * ++ * kernel/cpt/rst_files.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include <linux/config.h> ++#include <linux/version.h> ++#include <linux/module.h> ++#include <linux/init.h> ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/slab.h> ++#include <linux/file.h> ++#include <linux/mm.h> ++#include <linux/errno.h> ++#include <linux/major.h> ++#include <linux/pipe_fs_i.h> ++#include <linux/mman.h> ++#include <linux/mount.h> ++#include <linux/tty.h> ++#include <linux/namei.h> ++#include <linux/vmalloc.h> ++#include <linux/smp_lock.h> ++#include <linux/vmalloc.h> ++#include <linux/pagemap.h> ++#include <asm/uaccess.h> ++#include <ub/ub_mem.h> ++#include <linux/cpt_image.h> ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++#include "cpt_mm.h" ++#include "cpt_files.h" ++#include "cpt_kernel.h" ++#include "cpt_fsmagic.h" ++ ++#include "cpt_syscalls.h" ++ ++ ++struct filejob { ++ struct filejob *next; ++ int pid; ++ loff_t fdi; ++}; ++ ++static int rst_filejob_queue(loff_t pos, cpt_context_t *ctx) ++{ ++ struct filejob *j; ++ ++ j = kmalloc(sizeof(*j), GFP_KERNEL); ++ if (j == NULL) ++ return -ENOMEM; ++ j->pid = current->pid; ++ j->fdi = pos; ++ j->next = ctx->filejob_queue; ++ ctx->filejob_queue = j; ++ return 0; ++} ++ ++static void _anon_pipe_buf_release(struct pipe_inode_info *info, struct pipe_buffer *buf) ++{ ++ struct page *page = buf->page; ++ ++ if (info->tmp_page) { ++ __free_page(page); ++ } else { ++ info->tmp_page = page; ++ } ++ module_put(THIS_MODULE); ++} ++ ++static void *_anon_pipe_buf_map(struct file *file, struct pipe_inode_info *info, struct pipe_buffer *buf) ++{ ++ return 
kmap(buf->page); ++} ++ ++static void _anon_pipe_buf_unmap(struct pipe_inode_info *info, struct pipe_buffer *buf) ++{ ++ kunmap(buf->page); ++} ++ ++static struct pipe_buf_operations _anon_pipe_buf_ops = { ++ .can_merge = 1, ++ .map = _anon_pipe_buf_map, ++ .unmap = _anon_pipe_buf_unmap, ++ .release = _anon_pipe_buf_release, ++}; ++ ++/* Sorta ugly... Multiple readers/writers of named pipe rewrite buffer ++ * many times. We need to mark it in CPT_OBJ_INODE table in some way. ++ */ ++static int fixup_pipe_data(struct file *file, struct cpt_file_image *fi, ++ struct cpt_context *ctx) ++{ ++ struct inode *ino = file->f_dentry->d_inode; ++ struct cpt_inode_image ii; ++ struct cpt_obj_bits b; ++ struct pipe_inode_info *info; ++ int err; ++ int count; ++ ++ if (!S_ISFIFO(ino->i_mode)) { ++ eprintk_ctx("fixup_pipe_data: not a pipe %Ld\n", fi->cpt_inode); ++ return -EINVAL; ++ } ++ if (fi->cpt_inode == CPT_NULL) ++ return 0; ++ ++ err = rst_get_object(CPT_OBJ_INODE, fi->cpt_inode, &ii, ctx); ++ if (err) ++ return err; ++ ++ if (ii.cpt_next <= ii.cpt_hdrlen) ++ return 0; ++ ++ err = rst_get_object(CPT_OBJ_BITS, fi->cpt_inode + ii.cpt_hdrlen, &b, ctx); ++ if (err) ++ return err; ++ ++ if (b.cpt_size == 0) ++ return 0; ++ ++ mutex_lock(PIPE_MUTEX(*ino)); ++ info = ino->i_pipe; ++ if (info->nrbufs) { ++ mutex_unlock(PIPE_MUTEX(*ino)); ++ eprintk("pipe buffer is restored already\n"); ++ return -EINVAL; ++ } ++ info->curbuf = 0; ++ count = 0; ++ while (count < b.cpt_size) { ++ struct pipe_buffer *buf = info->bufs + info->nrbufs; ++ void * addr; ++ int chars; ++ ++ chars = b.cpt_size - count; ++ if (chars > PAGE_SIZE) ++ chars = PAGE_SIZE; ++ if (!try_module_get(THIS_MODULE)) { ++ err = -EBUSY; ++ break; ++ } ++ ++ buf->page = alloc_page(GFP_HIGHUSER); ++ if (buf->page == NULL) { ++ err = -ENOMEM; ++ break; ++ } ++ buf->ops = &_anon_pipe_buf_ops; ++ buf->offset = 0; ++ buf->len = chars; ++ info->nrbufs++; ++ addr = kmap(buf->page); ++ err = ctx->pread(addr, chars, ctx, ++ 
fi->cpt_inode + ii.cpt_hdrlen + b.cpt_hdrlen + count); ++ if (err) ++ break; ++ count += chars; ++ } ++ mutex_unlock(PIPE_MUTEX(*ino)); ++ ++ return err; ++} ++ ++static int make_flags(struct cpt_file_image *fi) ++{ ++ int flags = O_NOFOLLOW; ++ switch (fi->cpt_mode&(FMODE_READ|FMODE_WRITE)) { ++ case FMODE_READ|FMODE_WRITE: ++ flags |= O_RDWR; break; ++ case FMODE_WRITE: ++ flags |= O_WRONLY; break; ++ case FMODE_READ: ++ flags |= O_RDONLY; break; ++ default: break; ++ } ++ flags |= fi->cpt_flags&~(O_ACCMODE|O_CREAT|O_TRUNC|O_EXCL|FASYNC); ++ flags |= O_NONBLOCK|O_NOCTTY; ++ return flags; ++} ++ ++static struct file *open_pipe(char *name, ++ struct cpt_file_image *fi, ++ unsigned flags, ++ struct cpt_context *ctx) ++{ ++ int err; ++ cpt_object_t *obj; ++ struct cpt_inode_image ii; ++ struct file *rf, *wf; ++ ++ err = rst_get_object(CPT_OBJ_INODE, fi->cpt_inode, &ii, ctx); ++ if (err) ++ return ERR_PTR(err); ++ ++ if (ii.cpt_sb == FSMAGIC_PIPEFS) { ++ int pfd[2]; ++ ++ if ((err = sc_pipe(pfd)) < 0) ++ return ERR_PTR(err); ++ ++ rf = fcheck(pfd[0]); ++ wf = fcheck(pfd[1]); ++ get_file(rf); ++ get_file(wf); ++ sc_close(pfd[0]); ++ sc_close(pfd[1]); ++ ++ if (fi->cpt_mode&FMODE_READ) { ++ struct file *tf; ++ tf = wf; wf = rf; rf = tf; ++ } ++ } else { ++ if (fi->cpt_mode&FMODE_READ) { ++ rf = filp_open(name, flags, 0); ++ if (IS_ERR(rf)) { ++ dprintk_ctx("filp_open\n"); ++ return rf; ++ } ++ dprintk_ctx(CPT_FID "open RDONLY fifo ino %Ld %p %x\n", CPT_TID(current), fi->cpt_inode, rf, rf->f_dentry->d_inode->i_mode); ++ return rf; ++ } ++ ++ dprintk_ctx(CPT_FID "open WRONLY fifo ino %Ld\n", CPT_TID(current), fi->cpt_inode); ++ ++ rf = filp_open(name, O_RDWR|O_NONBLOCK, 0); ++ if (IS_ERR(rf)) ++ return rf; ++ wf = dentry_open(dget(rf->f_dentry), ++ mntget(rf->f_vfsmnt), flags); ++ } ++ ++ /* Add pipe inode to obj table. 
*/ ++ obj = cpt_object_add(CPT_OBJ_INODE, wf->f_dentry->d_inode, ctx); ++ if (obj == NULL) { ++ fput(rf); fput(wf); ++ return ERR_PTR(-ENOMEM); ++ } ++ cpt_obj_setpos(obj, fi->cpt_inode, ctx); ++ obj->o_parent = rf; ++ ++ /* Add another side of pipe to obj table, it will not be used ++ * (o_pos = PT_NULL), another processes opeining pipe will find ++ * inode and open it with dentry_open(). */ ++ obj = cpt_object_add(CPT_OBJ_FILE, rf, ctx); ++ if (obj == NULL) { ++ fput(wf); ++ return ERR_PTR(-ENOMEM); ++ } ++ return wf; ++} ++ ++static struct file *open_special(struct cpt_file_image *fi, ++ unsigned flags, ++ int deleted, ++ struct cpt_context *ctx) ++{ ++ struct cpt_inode_image *ii; ++ struct file *file; ++ ++ /* Directories and named pipes are not special actually */ ++ if (S_ISDIR(fi->cpt_i_mode) || S_ISFIFO(fi->cpt_i_mode)) ++ return NULL; ++ ++ /* No support for block devices at the moment. */ ++ if (S_ISBLK(fi->cpt_i_mode)) ++ return ERR_PTR(-EINVAL); ++ ++ if (S_ISSOCK(fi->cpt_i_mode)) { ++ eprintk_ctx("bug: socket is not open\n"); ++ return ERR_PTR(-EINVAL); ++ } ++ ++ /* Support only (some) character devices at the moment. */ ++ if (!S_ISCHR(fi->cpt_i_mode)) ++ return ERR_PTR(-EINVAL); ++ ++ ii = __rst_get_object(CPT_OBJ_INODE, fi->cpt_inode, ctx); ++ if (ii == NULL) ++ return ERR_PTR(-ENOMEM); ++ ++ /* Do not worry about this right now. /dev/null,zero,*random are here. ++ * To prohibit at least /dev/mem? 
++ */ ++ if (MAJOR(ii->cpt_rdev) == MEM_MAJOR) { ++ kfree(ii); ++ return NULL; ++ } ++ ++ file = rst_open_tty(fi, ii, flags, ctx); ++ kfree(ii); ++ return file; ++} ++ ++static int restore_posix_lock(struct file *file, struct cpt_flock_image *fli, cpt_context_t *ctx) ++{ ++ struct file_lock lock; ++ cpt_object_t *obj; ++ ++ memset(&lock, 0, sizeof(lock)); ++ lock.fl_type = fli->cpt_type; ++ lock.fl_flags = fli->cpt_flags & ~FL_SLEEP; ++ lock.fl_start = fli->cpt_start; ++ lock.fl_end = fli->cpt_end; ++ obj = lookup_cpt_obj_byindex(CPT_OBJ_FILES, fli->cpt_owner, ctx); ++ if (!obj) { ++ eprintk_ctx("unknown lock owner %d\n", (int)fli->cpt_owner); ++ return -EINVAL; ++ } ++ lock.fl_owner = obj->o_obj; ++ lock.fl_pid = vpid_to_pid(fli->cpt_pid); ++ if (lock.fl_pid < 0) { ++ eprintk_ctx("unknown lock pid %d\n", lock.fl_pid); ++ return -EINVAL; ++ } ++ lock.fl_file = file; ++ ++ if (lock.fl_owner == NULL) ++ eprintk_ctx("no lock owner\n"); ++ return posix_lock_file(file, &lock); ++} ++ ++static int restore_flock(struct file *file, struct cpt_flock_image *fli, ++ cpt_context_t *ctx) ++{ ++ int cmd, err, fd; ++ fd = get_unused_fd(); ++ if (fd < 0) { ++ eprintk_ctx("BSD flock cannot be restored\n"); ++ return fd; ++ } ++ get_file(file); ++ fd_install(fd, file); ++ if (fli->cpt_type == F_RDLCK) { ++ cmd = LOCK_SH; ++ } else if (fli->cpt_type == F_WRLCK) { ++ cmd = LOCK_EX; ++ } else { ++ eprintk_ctx("flock flavor is unknown: %u\n", fli->cpt_type); ++ sc_close(fd); ++ return -EINVAL; ++ } ++ ++ err = sc_flock(fd, LOCK_NB | cmd); ++ sc_close(fd); ++ return err; ++} ++ ++ ++static int fixup_posix_locks(struct file *file, ++ struct cpt_file_image *fi, ++ loff_t pos, struct cpt_context *ctx) ++{ ++ int err; ++ loff_t end; ++ struct cpt_flock_image fli; ++ ++ end = pos + fi->cpt_next; ++ pos += fi->cpt_hdrlen; ++ while (pos < end) { ++ err = rst_get_object(-1, pos, &fli, ctx); ++ if (err) ++ return err; ++ if (fli.cpt_object == CPT_OBJ_FLOCK && ++ (fli.cpt_flags&FL_POSIX)) { ++ err 
= restore_posix_lock(file, &fli, ctx); ++ if (err) ++ return err; ++ dprintk_ctx("posix lock restored\n"); ++ } ++ pos += fli.cpt_next; ++ } ++ return 0; ++} ++ ++int rst_posix_locks(struct cpt_context *ctx) ++{ ++ int err; ++ cpt_object_t *obj; ++ ++ for_each_object(obj, CPT_OBJ_FILE) { ++ struct file *file = obj->o_obj; ++ struct cpt_file_image fi; ++ ++ if (obj->o_pos == CPT_NULL) ++ continue; ++ ++ err = rst_get_object(CPT_OBJ_FILE, obj->o_pos, &fi, ctx); ++ if (err < 0) ++ return err; ++ if (fi.cpt_next > fi.cpt_hdrlen) ++ fixup_posix_locks(file, &fi, obj->o_pos, ctx); ++ } ++ return 0; ++} ++ ++static int fixup_flocks(struct file *file, ++ struct cpt_file_image *fi, ++ loff_t pos, struct cpt_context *ctx) ++{ ++ int err; ++ loff_t end; ++ struct cpt_flock_image fli; ++ ++ end = pos + fi->cpt_next; ++ pos += fi->cpt_hdrlen; ++ while (pos < end) { ++ err = rst_get_object(-1, pos, &fli, ctx); ++ if (err) ++ return err; ++ if (fli.cpt_object == CPT_OBJ_FLOCK && ++ (fli.cpt_flags&FL_FLOCK)) { ++ err = restore_flock(file, &fli, ctx); ++ if (err) ++ return err; ++ dprintk_ctx("bsd lock restored\n"); ++ } ++ pos += fli.cpt_next; ++ } ++ return 0; ++} ++ ++ ++static int fixup_reg_data(struct file *file, loff_t pos, loff_t end, ++ struct cpt_context *ctx) ++{ ++ int err; ++ struct cpt_page_block pgb; ++ ssize_t (*do_write)(struct file *, const char __user *, size_t, loff_t *ppos); ++ ++ do_write = file->f_op->write; ++ if (do_write == NULL) { ++ eprintk_ctx("no write method. 
Cannot restore contents of the file.\n"); ++ return -EINVAL; ++ } ++ ++ atomic_inc(&file->f_count); ++ ++ while (pos < end) { ++ loff_t opos; ++ loff_t ipos; ++ int count; ++ ++ err = rst_get_object(CPT_OBJ_PAGES, pos, &pgb, ctx); ++ if (err) ++ goto out; ++ dprintk_ctx("restoring file data block: %08x-%08x\n", ++ (__u32)pgb.cpt_start, (__u32)pgb.cpt_end); ++ ipos = pos + pgb.cpt_hdrlen; ++ opos = pgb.cpt_start; ++ count = pgb.cpt_end-pgb.cpt_start; ++ while (count > 0) { ++ mm_segment_t oldfs; ++ int copy = count; ++ ++ if (copy > PAGE_SIZE) ++ copy = PAGE_SIZE; ++ (void)cpt_get_buf(ctx); ++ oldfs = get_fs(); set_fs(KERNEL_DS); ++ err = ctx->pread(ctx->tmpbuf, copy, ctx, ipos); ++ set_fs(oldfs); ++ if (err) { ++ __cpt_release_buf(ctx); ++ goto out; ++ } ++ if (!(file->f_mode & FMODE_WRITE) || ++ (file->f_flags&O_DIRECT)) { ++ fput(file); ++ file = dentry_open(dget(file->f_dentry), ++ mntget(file->f_vfsmnt), O_WRONLY); ++ if (IS_ERR(file)) { ++ __cpt_release_buf(ctx); ++ return PTR_ERR(file); ++ } ++ } ++ oldfs = get_fs(); set_fs(KERNEL_DS); ++ ipos += copy; ++ err = do_write(file, ctx->tmpbuf, copy, &opos); ++ set_fs(oldfs); ++ __cpt_release_buf(ctx); ++ if (err != copy) { ++ if (err >= 0) ++ err = -EIO; ++ goto out; ++ } ++ count -= copy; ++ } ++ pos += pgb.cpt_next; ++ } ++ err = 0; ++ ++out: ++ fput(file); ++ return err; ++} ++ ++ ++static int fixup_file_content(struct file **file_p, struct cpt_file_image *fi, ++ struct cpt_context *ctx) ++{ ++ int err; ++ struct cpt_inode_image ii; ++ struct file *file = *file_p; ++ struct iattr newattrs; ++ ++ if (!S_ISREG(fi->cpt_i_mode)) ++ return 0; ++ ++ err = rst_get_object(CPT_OBJ_INODE, fi->cpt_inode, &ii, ctx); ++ if (err) ++ return err; ++ ++ if (file == NULL) { ++ file = shmem_file_setup("dev/zero", ii.cpt_size, 0); ++ if (IS_ERR(file)) ++ return PTR_ERR(file); ++ *file_p = file; ++ } ++ ++ if (ii.cpt_next > ii.cpt_hdrlen) { ++ err = fixup_reg_data(file, fi->cpt_inode+ii.cpt_hdrlen, ++ fi->cpt_inode+ii.cpt_next, 
ctx); ++ if (err) ++ return err; ++ } ++ ++ mutex_lock(&file->f_dentry->d_inode->i_mutex); ++ /* stage 1 - update size like do_truncate does */ ++ newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; ++ newattrs.ia_size = ii.cpt_size; ++ cpt_timespec_import(&newattrs.ia_ctime, ii.cpt_ctime); ++ err = notify_change(file->f_dentry, &newattrs); ++ if (err) ++ goto out; ++ ++ /* stage 2 - update times */ ++ newattrs.ia_valid = ATTR_MTIME | ATTR_ATIME | ++ ATTR_ATIME_SET | ATTR_MTIME_SET; ++ cpt_timespec_import(&newattrs.ia_atime, ii.cpt_atime); ++ cpt_timespec_import(&newattrs.ia_mtime, ii.cpt_mtime); ++ err = notify_change(file->f_dentry, &newattrs); ++ ++out: ++ mutex_unlock(&file->f_dentry->d_inode->i_mutex); ++ return err; ++} ++ ++static int fixup_file_flags(struct file *file, struct cpt_file_image *fi, ++ int was_dentry_open, loff_t pos, ++ cpt_context_t *ctx) ++{ ++ if (fi->cpt_pos != file->f_pos) { ++ int err = -ESPIPE; ++ if (file->f_op->llseek) ++ err = file->f_op->llseek(file, fi->cpt_pos, 0); ++ if (err < 0) { ++ dprintk_ctx("file %Ld lseek %Ld - %Ld\n", pos, file->f_pos, fi->cpt_pos); ++ file->f_pos = fi->cpt_pos; ++ } ++ } ++ file->f_uid = fi->cpt_uid; ++ file->f_gid = fi->cpt_gid; ++ file->f_owner.pid = 0; ++ if (fi->cpt_fown_pid) { ++ file->f_owner.pid = comb_vpid_to_pid(fi->cpt_fown_pid); ++ if (file->f_owner.pid == 0) { ++ wprintk_ctx("fixup_file_flags: owner %d does not exist anymore\n", file->f_owner.pid); ++ return -EINVAL; ++ } ++ } ++ file->f_owner.uid = fi->cpt_fown_uid; ++ file->f_owner.euid = fi->cpt_fown_euid; ++ file->f_owner.signum = fi->cpt_fown_signo; ++ ++ if (file->f_mode != fi->cpt_mode) { ++ if (was_dentry_open && ++ ((file->f_mode^fi->cpt_mode)&(FMODE_PREAD|FMODE_LSEEK))) { ++ file->f_mode &= ~(FMODE_PREAD|FMODE_LSEEK); ++ file->f_mode |= fi->cpt_mode&(FMODE_PREAD|FMODE_LSEEK); ++ } ++ if (file->f_mode != fi->cpt_mode) ++ wprintk_ctx("file %ld mode mismatch %08x %08x\n", (long)pos, file->f_mode, fi->cpt_mode); ++ } ++ if (file->f_flags != 
fi->cpt_flags) { ++ if (!(fi->cpt_flags&O_NOFOLLOW)) ++ file->f_flags &= ~O_NOFOLLOW; ++ if ((file->f_flags^fi->cpt_flags)&O_NONBLOCK) { ++ file->f_flags &= ~O_NONBLOCK; ++ file->f_flags |= fi->cpt_flags&O_NONBLOCK; ++ } ++ if (fi->cpt_flags&FASYNC) { ++ if (fi->cpt_fown_fd == -1) { ++ wprintk_ctx("No fd for FASYNC\n"); ++ return -EINVAL; ++ } else if (file->f_op && file->f_op->fasync) { ++ if (file->f_op->fasync(fi->cpt_fown_fd, file, 1) < 0) { ++ wprintk_ctx("FASYNC problem\n"); ++ return -EINVAL; ++ } else { ++ file->f_flags |= FASYNC; ++ } ++ } ++ } ++ if (file->f_flags != fi->cpt_flags) { ++ eprintk_ctx("file %ld flags mismatch %08x %08x\n", (long)pos, file->f_flags, fi->cpt_flags); ++ return -EINVAL; ++ } ++ } ++ return 0; ++} ++ ++static struct file * ++open_deleted(char *name, unsigned flags, struct cpt_file_image *fi, ++ cpt_context_t *ctx) ++{ ++ struct file * file; ++ char *suffix = NULL; ++ int attempt = 0; ++ int tmp_pass = 0; ++ mode_t mode = fi->cpt_i_mode; ++ ++ /* Strip (deleted) part... 
*/ ++ if (strlen(name) > strlen(" (deleted)")) { ++ if (strcmp(name + strlen(name) - strlen(" (deleted)"), " (deleted)") == 0) { ++ suffix = &name[strlen(name) - strlen(" (deleted)")]; ++ *suffix = 0; ++ } else if (memcmp(name, "(deleted) ", strlen("(deleted) ")) == 0) { ++ memmove(name, name + strlen("(deleted) "), strlen(name) - strlen(" (deleted)") + 1); ++ suffix = name + strlen(name); ++ } ++ } ++ ++try_again: ++ for (;;) { ++ if (attempt) { ++ if (attempt > 1000) { ++ eprintk_ctx("open_deleted: failed after %d attempts\n", attempt); ++ return ERR_PTR(-EEXIST); ++ } ++ if (suffix == NULL) { ++ eprintk_ctx("open_deleted: no suffix\n"); ++ return ERR_PTR(-EEXIST); ++ } ++ sprintf(suffix, ".%08x", (unsigned)((xtime.tv_nsec>>10)+attempt)); ++ } ++ attempt++; ++ ++ if (S_ISFIFO(mode)) { ++ int err; ++ err = sc_mknod(name, S_IFIFO|(mode&017777), 0); ++ if (err == -EEXIST) ++ continue; ++ if (err < 0 && !tmp_pass) ++ goto change_dir; ++ if (err < 0) ++ return ERR_PTR(err); ++ file = open_pipe(name, fi, flags, ctx); ++ sc_unlink(name); ++ } else if (S_ISCHR(mode)) { ++ int err; ++ struct cpt_inode_image *ii; ++ ++ ii = __rst_get_object(CPT_OBJ_INODE, fi->cpt_inode, ctx); ++ if (ii == NULL) ++ return ERR_PTR(-ENOMEM); ++ err = sc_mknod(name, S_IFCHR|(mode&017777), new_encode_dev(ii->cpt_rdev)); ++ kfree(ii); ++ if (err == -EEXIST) ++ continue; ++ if (err < 0 && !tmp_pass) ++ goto change_dir; ++ if (err < 0) ++ return ERR_PTR(err); ++ file = filp_open(name, flags, mode&017777); ++ sc_unlink(name); ++ } else if (S_ISDIR(mode)) { ++ int err; ++ err = sc_mkdir(name, mode&017777); ++ if (err == -EEXIST) ++ continue; ++ if (err < 0 && !tmp_pass) ++ goto change_dir; ++ if (err < 0) ++ return ERR_PTR(err); ++ file = filp_open(name, flags, mode&017777); ++ sc_rmdir(name); ++ } else { ++ file = filp_open(name, O_CREAT|O_EXCL|flags, mode&017777); ++ if (IS_ERR(file)) { ++ if (PTR_ERR(file) == -EEXIST) ++ continue; ++ if (!tmp_pass) ++ goto change_dir; ++ } else { ++ 
sc_unlink(name); ++ } ++ } ++ break; ++ } ++ ++ if (IS_ERR(file)) { ++ eprintk_ctx("filp_open %s: %ld\n", name, PTR_ERR(file)); ++ return file; ++ } else { ++ dprintk_ctx("deleted file created as %s, %p, %x\n", name, file, file->f_dentry->d_inode->i_mode); ++ } ++ return file; ++ ++change_dir: ++ sprintf(name, "/tmp/rst%u", current->pid); ++ suffix = name + strlen(name); ++ attempt = 1; ++ tmp_pass = 1; ++ goto try_again; ++} ++ ++struct file *rst_file(loff_t pos, int fd, struct cpt_context *ctx) ++{ ++ int err; ++ int was_dentry_open = 0; ++ cpt_object_t *obj; ++ cpt_object_t *iobj; ++ struct cpt_file_image fi; ++ __u8 *name = NULL; ++ struct file *file; ++ int flags; ++ ++ obj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, pos, ctx); ++ if (obj) { ++ file = obj->o_obj; ++ if (obj->o_index >= 0) { ++ dprintk_ctx("file is attached to a socket\n"); ++ err = rst_get_object(CPT_OBJ_FILE, pos, &fi, ctx); ++ if (err < 0) ++ goto err_out; ++ fixup_file_flags(file, &fi, 0, pos, ctx); ++ } ++ get_file(file); ++ return file; ++ } ++ ++ err = rst_get_object(CPT_OBJ_FILE, pos, &fi, ctx); ++ if (err < 0) ++ goto err_out; ++ ++ flags = make_flags(&fi); ++ ++ /* Easy way, inode has been already open. */ ++ if (fi.cpt_inode != CPT_NULL && ++ !(fi.cpt_lflags & CPT_DENTRY_CLONING) && ++ (iobj = lookup_cpt_obj_bypos(CPT_OBJ_INODE, fi.cpt_inode, ctx)) != NULL && ++ iobj->o_parent) { ++ struct file *filp = iobj->o_parent; ++ file = dentry_open(dget(filp->f_dentry), ++ mntget(filp->f_vfsmnt), flags); ++ dprintk_ctx("rst_file: file obtained by dentry_open\n"); ++ was_dentry_open = 1; ++ goto map_file; ++ } ++ ++ if (fi.cpt_next > fi.cpt_hdrlen) ++ name = rst_get_name(pos + sizeof(fi), ctx); ++ ++ if (fi.cpt_lflags == CPT_DENTRY_DELETED) { ++ if (fi.cpt_inode == CPT_NULL) { ++ eprintk_ctx("deleted file and no inode.\n"); ++ err = -EINVAL; ++ goto err_out; ++ } ++ ++ /* One very special case... 
*/ ++ if (S_ISREG(fi.cpt_i_mode) && ++ (!name || !name[0] || strcmp(name, "/dev/zero (deleted)") == 0)) { ++ /* MAP_ANON|MAP_SHARED mapping. ++ * kernel makes this damn ugly way, when file which ++ * is passed to mmap by user does not match ++ * file finally attached to VMA. Ok, rst_mm ++ * has to take care of this. Otherwise, it will fail. ++ */ ++ file = NULL; ++ } else if (S_ISREG(fi.cpt_i_mode) || ++ S_ISCHR(fi.cpt_i_mode) || ++ S_ISFIFO(fi.cpt_i_mode) || ++ S_ISDIR(fi.cpt_i_mode)) { ++ if (S_ISCHR(fi.cpt_i_mode)) { ++ file = open_special(&fi, flags, 1, ctx); ++ if (file != NULL) ++ goto map_file; ++ } ++ file = open_deleted(name, flags, &fi, ctx); ++ if (IS_ERR(file)) ++ goto out; ++ } else { ++ eprintk_ctx("not a regular deleted file.\n"); ++ err = -EINVAL; ++ goto err_out; ++ } ++ ++ err = fixup_file_content(&file, &fi, ctx); ++ if (err) ++ goto err_put; ++ goto map_file; ++ } else { ++ if (!name || !name[0]) { ++ eprintk_ctx("no name for file?\n"); ++ err = -EINVAL; ++ goto err_out; ++ } ++ if ((fi.cpt_lflags & CPT_DENTRY_EPOLL) && ++ (file = cpt_open_epolldev(&fi, flags, ctx)) != NULL) ++ goto map_file; ++ if (S_ISFIFO(fi.cpt_i_mode) && ++ (file = open_pipe(name, &fi, flags, ctx)) != NULL) ++ goto map_file; ++ if (!S_ISREG(fi.cpt_i_mode) && ++ (file = open_special(&fi, flags, 0, ctx)) != NULL) ++ goto map_file; ++ } ++ ++ file = filp_open(name, flags, 0); ++ ++map_file: ++ if (!IS_ERR(file)) { ++ fixup_file_flags(file, &fi, was_dentry_open, pos, ctx); ++ ++ if (S_ISFIFO(fi.cpt_i_mode) && !was_dentry_open) { ++ err = fixup_pipe_data(file, &fi, ctx); ++ if (err) ++ goto err_put; ++ } ++ ++ obj = cpt_object_get(CPT_OBJ_FILE, file, ctx); ++ if (!obj) { ++ obj = cpt_object_add(CPT_OBJ_FILE, file, ctx); ++ if (obj) ++ get_file(file); ++ } ++ if (obj) ++ cpt_obj_setpos(obj, pos, ctx); ++ ++ obj = cpt_object_add(CPT_OBJ_INODE, file->f_dentry->d_inode, ctx); ++ if (obj) { ++ cpt_obj_setpos(obj, fi.cpt_inode, ctx); ++ if (!obj->o_parent || fi.cpt_lflags != 
CPT_DENTRY_DELETED) ++ obj->o_parent = file; ++ } ++ ++ if (fi.cpt_next > fi.cpt_hdrlen) { ++ err = fixup_flocks(file, &fi, pos, ctx); ++ if (err) ++ goto err_put; ++ } ++ } else { ++ if (fi.cpt_lflags & CPT_DENTRY_PROC) { ++ dprintk_ctx("rst_file /proc delayed\n"); ++ file = NULL; ++ } ++ } ++ ++out: ++ if (name) ++ rst_put_name(name, ctx); ++ return file; ++ ++err_put: ++ if (file) ++ fput(file); ++err_out: ++ if (name) ++ rst_put_name(name, ctx); ++ return ERR_PTR(err); ++} ++ ++ ++__u32 rst_files_flag(struct cpt_task_image *ti, struct cpt_context *ctx) ++{ ++ __u32 flag = 0; ++ ++ if (ti->cpt_files == CPT_NULL || ++ lookup_cpt_obj_bypos(CPT_OBJ_FILES, ti->cpt_files, ctx)) ++ flag |= CLONE_FILES; ++ if (ti->cpt_fs == CPT_NULL || ++ lookup_cpt_obj_bypos(CPT_OBJ_FS, ti->cpt_fs, ctx)) ++ flag |= CLONE_FS; ++ return flag; ++} ++ ++static void local_close_files(struct files_struct * files) ++{ ++ int i, j; ++ ++ j = 0; ++ for (;;) { ++ unsigned long set; ++ i = j * __NFDBITS; ++ if (i >= files->fdt->max_fdset || i >= files->fdt->max_fds) ++ break; ++ set = files->fdt->open_fds->fds_bits[j]; ++ while (set) { ++ if (set & 1) { ++ struct file * file = xchg(&files->fdt->fd[i], NULL); ++ if (file) ++ filp_close(file, files); ++ } ++ i++; ++ set >>= 1; ++ } ++ files->fdt->open_fds->fds_bits[j] = 0; ++ files->fdt->close_on_exec->fds_bits[j] = 0; ++ j++; ++ } ++} ++ ++extern int expand_fdtable(struct files_struct *files, int nr); ++ ++ ++int rst_files_complete(struct cpt_task_image *ti, struct cpt_context *ctx) ++{ ++ struct cpt_files_struct_image fi; ++ struct files_struct *f = current->files; ++ cpt_object_t *obj; ++ loff_t pos, endpos; ++ int err; ++ ++ if (ti->cpt_files == CPT_NULL) { ++ current->files = NULL; ++ if (f) ++ put_files_struct(f); ++ return 0; ++ } ++ ++ obj = lookup_cpt_obj_bypos(CPT_OBJ_FILES, ti->cpt_files, ctx); ++ if (obj) { ++ if (obj->o_obj != f) { ++ put_files_struct(f); ++ f = obj->o_obj; ++ atomic_inc(&f->count); ++ current->files = f; ++ } ++ 
return 0; ++ } ++ ++ err = rst_get_object(CPT_OBJ_FILES, ti->cpt_files, &fi, ctx); ++ if (err) ++ return err; ++ ++ local_close_files(f); ++ ++ if (fi.cpt_max_fds > f->fdt->max_fds) { ++ spin_lock(&f->file_lock); ++ err = expand_fdtable(f, fi.cpt_max_fds-1); ++ spin_unlock(&f->file_lock); ++ if (err) ++ return err; ++ } ++ ++ pos = ti->cpt_files + fi.cpt_hdrlen; ++ endpos = ti->cpt_files + fi.cpt_next; ++ while (pos < endpos) { ++ struct cpt_fd_image fdi; ++ struct file *filp; ++ ++ err = rst_get_object(CPT_OBJ_FILEDESC, pos, &fdi, ctx); ++ if (err) ++ return err; ++ filp = rst_file(fdi.cpt_file, fdi.cpt_fd, ctx); ++ if (IS_ERR(filp)) { ++ eprintk_ctx("rst_file: %ld %Lu\n", PTR_ERR(filp), fdi.cpt_file); ++ return PTR_ERR(filp); ++ } ++ if (filp == NULL) { ++ int err = rst_filejob_queue(pos, ctx); ++ if (err) ++ return err; ++ } else { ++ if (fdi.cpt_fd >= f->fdt->max_fds) BUG(); ++ f->fdt->fd[fdi.cpt_fd] = filp; ++ FD_SET(fdi.cpt_fd, f->fdt->open_fds); ++ if (fdi.cpt_flags&CPT_FD_FLAG_CLOSEEXEC) ++ FD_SET(fdi.cpt_fd, f->fdt->close_on_exec); ++ } ++ pos += fdi.cpt_next; ++ } ++ f->fdt->next_fd = fi.cpt_next_fd; ++ ++ obj = cpt_object_add(CPT_OBJ_FILES, f, ctx); ++ if (obj) { ++ cpt_obj_setpos(obj, ti->cpt_files, ctx); ++ cpt_obj_setindex(obj, fi.cpt_index, ctx); ++ } ++ return 0; ++} ++ ++int rst_do_filejobs(cpt_context_t *ctx) ++{ ++ struct filejob *j; ++ ++ while ((j = ctx->filejob_queue) != NULL) { ++ int err; ++ task_t *tsk; ++ struct cpt_fd_image fdi; ++ struct file *filp; ++ ++ read_lock(&tasklist_lock); ++ tsk = find_task_by_pid_ve(j->pid); ++ if (tsk) ++ get_task_struct(tsk); ++ read_unlock(&tasklist_lock); ++ if (!tsk) ++ return -EINVAL; ++ ++ err = rst_get_object(CPT_OBJ_FILEDESC, j->fdi, &fdi, ctx); ++ if (err) { ++ put_task_struct(tsk); ++ return err; ++ } ++ ++ if (fdi.cpt_fd >= tsk->files->fdt->max_fds) BUG(); ++ if (tsk->files->fdt->fd[fdi.cpt_fd] || ++ FD_ISSET(fdi.cpt_fd, tsk->files->fdt->open_fds)) { ++ eprintk_ctx("doing filejob %Ld: fd is 
busy\n", j->fdi); ++ put_task_struct(tsk); ++ return -EBUSY; ++ } ++ ++ filp = rst_file(fdi.cpt_file, fdi.cpt_fd, ctx); ++ if (IS_ERR(filp)) { ++ eprintk_ctx("rst_do_filejobs: 1: %ld %Lu\n", PTR_ERR(filp), fdi.cpt_file); ++ put_task_struct(tsk); ++ return PTR_ERR(filp); ++ } ++ if (fdi.cpt_fd >= tsk->files->fdt->max_fds) BUG(); ++ tsk->files->fdt->fd[fdi.cpt_fd] = filp; ++ FD_SET(fdi.cpt_fd, tsk->files->fdt->open_fds); ++ if (fdi.cpt_flags&CPT_FD_FLAG_CLOSEEXEC) ++ FD_SET(fdi.cpt_fd, tsk->files->fdt->close_on_exec); ++ ++ dprintk_ctx("filejob %Ld done\n", j->fdi); ++ ++ put_task_struct(tsk); ++ ctx->filejob_queue = j->next; ++ kfree(j); ++ } ++ return 0; ++} ++ ++void rst_flush_filejobs(cpt_context_t *ctx) ++{ ++ struct filejob *j; ++ ++ while ((j = ctx->filejob_queue) != NULL) { ++ ctx->filejob_queue = j->next; ++ kfree(j); ++ } ++} ++ ++int rst_fs_complete(struct cpt_task_image *ti, struct cpt_context *ctx) ++{ ++ struct fs_struct *f = current->fs; ++ cpt_object_t *obj; ++ ++ if (ti->cpt_fs == CPT_NULL) { ++ exit_fs(current); ++ return 0; ++ } ++ ++ obj = lookup_cpt_obj_bypos(CPT_OBJ_FS, ti->cpt_fs, ctx); ++ if (obj) { ++ if (obj->o_obj != f) { ++ exit_fs(current); ++ f = obj->o_obj; ++ atomic_inc(&f->count); ++ current->fs = f; ++ } ++ return 0; ++ } ++ ++ /* Do _not_ restore root. Image contains absolute pathnames. ++ * So, we fix it in context of rst process. 
++ */ ++ ++ obj = cpt_object_add(CPT_OBJ_FS, f, ctx); ++ if (obj) ++ cpt_obj_setpos(obj, ti->cpt_fs, ctx); ++ ++ return 0; ++} ++ ++static int get_dir(struct dentry **dp, struct vfsmount **mp, ++ loff_t *pos, struct cpt_context *ctx) ++{ ++ struct cpt_file_image fi; ++ struct file * file; ++ int err; ++ ++ err = rst_get_object(CPT_OBJ_FILE, *pos, &fi, ctx); ++ if (err) ++ return err; ++ ++ file = rst_file(*pos, -1, ctx); ++ if (IS_ERR(file)) ++ return PTR_ERR(file); ++ ++ *dp = dget(file->f_dentry); ++ *mp = mntget(file->f_vfsmnt); ++ *pos += fi.cpt_next; ++ fput(file); ++ return 0; ++} ++ ++static void __set_fs_root(struct fs_struct *fs, struct vfsmount *mnt, ++ struct dentry *dentry) ++{ ++ struct dentry *old_root; ++ struct vfsmount *old_rootmnt; ++ write_lock(&fs->lock); ++ old_root = fs->root; ++ old_rootmnt = fs->rootmnt; ++ fs->rootmnt = mnt; ++ fs->root = dentry; ++ write_unlock(&fs->lock); ++ if (old_root) { ++ dput(old_root); ++ mntput(old_rootmnt); ++ } ++} ++ ++static void __set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt, ++ struct dentry *dentry) ++{ ++ struct dentry *old_pwd; ++ struct vfsmount *old_pwdmnt; ++ ++ write_lock(&fs->lock); ++ old_pwd = fs->pwd; ++ old_pwdmnt = fs->pwdmnt; ++ fs->pwdmnt = mnt; ++ fs->pwd = dentry; ++ write_unlock(&fs->lock); ++ ++ if (old_pwd) { ++ dput(old_pwd); ++ mntput(old_pwdmnt); ++ } ++} ++ ++ ++int rst_restore_fs(struct cpt_context *ctx) ++{ ++ loff_t pos; ++ cpt_object_t *obj; ++ int err = 0; ++ ++ for_each_object(obj, CPT_OBJ_FS) { ++ struct cpt_fs_struct_image fi; ++ struct fs_struct *fs = obj->o_obj; ++ int i; ++ struct dentry *d[3]; ++ struct vfsmount *m[3]; ++ ++ err = rst_get_object(CPT_OBJ_FS, obj->o_pos, &fi, ctx); ++ if (err) ++ return err; ++ ++ fs->umask = fi.cpt_umask; ++ ++ pos = obj->o_pos + fi.cpt_hdrlen; ++ d[0] = d[1] = d[2] = NULL; ++ m[0] = m[1] = m[2] = NULL; ++ i = 0; ++ while (pos < obj->o_pos + fi.cpt_next && i<3) { ++ err = get_dir(d+i, m+i, &pos, ctx); ++ if (err) { ++ 
eprintk_ctx("cannot get_dir: %d", err); ++ for (--i; i >= 0; i--) { ++ if (d[i]) ++ dput(d[i]); ++ if (m[i]) ++ mntput(m[i]); ++ } ++ return err; ++ } ++ i++; ++ } ++ if (d[0]) ++ __set_fs_root(fs, m[0], d[0]); ++ if (d[1]) ++ __set_fs_pwd(fs, m[1], d[1]); ++ if (d[2]) { ++ struct dentry *olddentry; ++ struct vfsmount *oldmnt; ++ write_lock(&fs->lock); ++ oldmnt = fs->altrootmnt; ++ olddentry = fs->altroot; ++ fs->altrootmnt = m[2]; ++ fs->altroot = d[2]; ++ write_unlock(&fs->lock); ++ ++ if (olddentry) { ++ dput(olddentry); ++ mntput(oldmnt); ++ } ++ } ++ } ++ return err; ++} ++ ++int do_one_mount(char *mntpnt, char *mnttype, char *mntbind, unsigned long flags, struct cpt_context *ctx) ++{ ++ int err; ++ ++ if (mntbind && (strcmp(mntbind, "/") == 0 || strcmp(mntbind, "") == 0)) ++ mntbind = NULL; ++ ++ if (mntbind) ++ flags |= MS_BIND; ++ ++ err = sc_mount(mntbind, mntpnt, mnttype, flags); ++ if (err < 0) { ++ eprintk_ctx("%d mounting %s %s %08lx\n", err, mntpnt, mnttype, flags); ++ return err; ++ } ++ return 0; ++} ++ ++static int undumptmpfs(void *arg) ++{ ++ int i; ++ int *pfd = arg; ++ char *argv[] = { "tar", "x", "-C", "/", "-S", NULL }; ++ ++ if (pfd[0] != 0) ++ sc_dup2(pfd[0], 0); ++ ++ for (i=1; i<current->files->fdt->max_fds; i++) ++ sc_close(i); ++ ++ module_put(THIS_MODULE); ++ ++ set_fs(KERNEL_DS); ++ i = sc_execve("/bin/tar", argv, NULL); ++ eprintk("failed to exec /bin/tar: %d\n", i); ++ return -1; ++} ++ ++static int rst_restore_tmpfs(loff_t *pos, struct cpt_context * ctx) ++{ ++ int err; ++ int pfd[2]; ++ struct file *f; ++ struct cpt_object_hdr v; ++ int n; ++ loff_t end; ++ int pid; ++ ++ err = rst_get_object(CPT_OBJ_NAME, *pos, &v, ctx); ++ if (err < 0) ++ return err; ++ ++ err = sc_pipe(pfd); ++ if (err < 0) ++ return err; ++ pid = err = local_kernel_thread(undumptmpfs, (void*)pfd, SIGCHLD, 0); ++ if (err < 0) ++ goto out; ++ f = fget(pfd[1]); ++ sc_close(pfd[1]); ++ sc_close(pfd[0]); ++ ++ ctx->file->f_pos = *pos + v.cpt_hdrlen; ++ end = *pos 
+ v.cpt_next; ++ *pos += v.cpt_next; ++ do { ++ char buf[16]; ++ mm_segment_t oldfs; ++ ++ n = end - ctx->file->f_pos; ++ if (n > sizeof(buf)) ++ n = sizeof(buf); ++ ++ if (ctx->read(buf, n, ctx)) ++ break; ++ oldfs = get_fs(); set_fs(KERNEL_DS); ++ f->f_op->write(f, buf, n, &f->f_pos); ++ set_fs(oldfs); ++ } while (ctx->file->f_pos < end); ++ ++ fput(f); ++ ++ clear_tsk_thread_flag(current,TIF_SIGPENDING); ++ ++ if ((err = sc_waitx(pid, 0)) < 0) ++ eprintk_ctx("wait4: %d\n", err); ++ ++ return 0; ++ ++out: ++ if (pfd[1] >= 0) ++ sc_close(pfd[1]); ++ if (pfd[0] >= 0) ++ sc_close(pfd[0]); ++ return err; ++} ++ ++int restore_one_vfsmount(struct cpt_vfsmount_image *mi, loff_t pos, struct cpt_context *ctx) ++{ ++ int err; ++ loff_t endpos; ++ ++ endpos = pos + mi->cpt_next; ++ pos += mi->cpt_hdrlen; ++ ++ while (pos < endpos) { ++ char *mntdev; ++ char *mntpnt; ++ char *mnttype; ++ char *mntbind; ++ ++ mntdev = __rst_get_name(&pos, ctx); ++ mntpnt = __rst_get_name(&pos, ctx); ++ mnttype = __rst_get_name(&pos, ctx); ++ mntbind = __rst_get_name(&pos, ctx); ++ err = -EINVAL; ++ if (mnttype && mntpnt) { ++ err = 0; ++ if (strcmp(mntpnt, "/")) ++ err = do_one_mount(mntpnt, mnttype, mntbind, mi->cpt_flags, ctx); ++ if (strcmp(mnttype, "tmpfs") == 0) { ++ rst_restore_tmpfs(&pos, ctx); ++ } ++ } ++ if (mntdev) ++ rst_put_name(mntdev, ctx); ++ if (mntpnt) ++ rst_put_name(mntpnt, ctx); ++ if (mnttype) ++ rst_put_name(mnttype, ctx); ++ if (mntbind) ++ rst_put_name(mntbind, ctx); ++ if (err) ++ return err; ++ } ++ return 0; ++} ++ ++int restore_one_namespace(loff_t pos, loff_t endpos, struct cpt_context *ctx) ++{ ++ int err; ++ struct cpt_vfsmount_image mi; ++ ++ while (pos < endpos) { ++ err = rst_get_object(CPT_OBJ_VFSMOUNT, pos, &mi, ctx); ++ if (err) ++ return err; ++ err = restore_one_vfsmount(&mi, pos, ctx); ++ if (err) ++ return err; ++ pos += mi.cpt_next; ++ } ++ return 0; ++} ++ ++int rst_root_namespace(struct cpt_context *ctx) ++{ ++ int err; ++ loff_t sec = 
ctx->sections[CPT_SECT_NAMESPACE]; ++ loff_t endsec; ++ struct cpt_section_hdr h; ++ struct cpt_object_hdr sbuf; ++ int done = 0; ++ ++ if (sec == CPT_NULL) ++ return 0; ++ ++ err = ctx->pread(&h, sizeof(h), ctx, sec); ++ if (err) ++ return err; ++ if (h.cpt_section != CPT_SECT_NAMESPACE || h.cpt_hdrlen < sizeof(h)) ++ return -EINVAL; ++ ++ endsec = sec + h.cpt_next; ++ sec += h.cpt_hdrlen; ++ while (sec < endsec) { ++ err = rst_get_object(CPT_OBJ_NAMESPACE, sec, &sbuf, ctx); ++ if (err) ++ return err; ++ if (done) { ++ eprintk_ctx("multiple namespaces are not supported\n"); ++ break; ++ } ++ done++; ++ err = restore_one_namespace(sec+sbuf.cpt_hdrlen, sec+sbuf.cpt_next, ctx); ++ if (err) ++ return err; ++ sec += sbuf.cpt_next; ++ } ++ ++ return 0; ++} ++ ++int rst_stray_files(struct cpt_context *ctx) ++{ ++ int err = 0; ++ loff_t sec = ctx->sections[CPT_SECT_FILES]; ++ loff_t endsec; ++ struct cpt_section_hdr h; ++ ++ if (sec == CPT_NULL) ++ return 0; ++ ++ err = ctx->pread(&h, sizeof(h), ctx, sec); ++ if (err) ++ return err; ++ if (h.cpt_section != CPT_SECT_FILES || h.cpt_hdrlen < sizeof(h)) ++ return -EINVAL; ++ ++ endsec = sec + h.cpt_next; ++ sec += h.cpt_hdrlen; ++ while (sec < endsec) { ++ struct cpt_object_hdr sbuf; ++ cpt_object_t *obj; ++ ++ err = _rst_get_object(CPT_OBJ_FILE, sec, &sbuf, sizeof(sbuf), ctx); ++ if (err) ++ break; ++ ++ obj = lookup_cpt_obj_bypos(CPT_OBJ_FILE, sec, ctx); ++ if (!obj) { ++ struct file *file; ++ ++ dprintk_ctx("stray file %Ld\n", sec); ++ ++ file = rst_sysv_shm(sec, ctx); ++ ++ if (IS_ERR(file)) { ++ eprintk_ctx("rst_stray_files: %ld\n", PTR_ERR(file)); ++ return PTR_ERR(file); ++ } else { ++ fput(file); ++ } ++ } ++ sec += sbuf.cpt_next; ++ } ++ ++ return err; ++} +diff -upr linux-2.6.16.orig/kernel/cpt/rst_i386.S linux-2.6.16-026test015/kernel/cpt/rst_i386.S +--- linux-2.6.16.orig/kernel/cpt/rst_i386.S 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/cpt/rst_i386.S 2006-07-04 14:41:39.000000000 +0400 
+@@ -0,0 +1,40 @@ ++#define ASSEMBLY 1 ++ ++#include <linux/config.h> ++#include <linux/linkage.h> ++#include <asm/thread_info.h> ++#include <asm/errno.h> ++#include <asm/segment.h> ++#include <asm/page.h> ++#include <asm/smp.h> ++#include <asm/page.h> ++ ++ .section .text ++ .align 4 ++ .global ret_last_siginfo ++ret_last_siginfo: ++ call rlsi ++ movl %eax,%esp ++ ret ++ ++ .align 8 ++ .global ret_child_tid ++ret_child_tid: ++ push %esp ++ call rct ++ movl %eax,%esp ++ ret ++ ++ .align 4 ++ .global ret_from_rst ++ret_from_rst: ++ pushl %eax ++ jmp ret_from_fork+6 ++ ++ .align 4 ++ .global pre_ret_from_fork ++pre_ret_from_fork: ++ pushl %eax ++ call schedule_tail ++ popl %eax ++ ret +diff -upr linux-2.6.16.orig/kernel/cpt/rst_mm.c linux-2.6.16-026test015/kernel/cpt/rst_mm.c +--- linux-2.6.16.orig/kernel/cpt/rst_mm.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/cpt/rst_mm.c 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,986 @@ ++/* ++ * ++ * kernel/cpt/rst_mm.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#include <linux/config.h> ++#include <linux/version.h> ++#include <linux/module.h> ++#include <linux/init.h> ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/slab.h> ++#include <linux/file.h> ++#include <linux/mm.h> ++#include <linux/virtinfo.h> ++#include <linux/hugetlb.h> ++#include <linux/errno.h> ++#include <linux/errno.h> ++#include <linux/pagemap.h> ++#include <linux/mman.h> ++#include <linux/vmalloc.h> ++#include <linux/rmap.h> ++#include <linux/hash.h> ++#include <asm/pgalloc.h> ++#include <asm/tlb.h> ++#include <asm/tlbflush.h> ++#include <asm/pgtable.h> ++#include <asm/mmu.h> ++#include <asm/ldt.h> ++#include <asm/desc.h> ++#include <asm/mmu_context.h> ++#include <linux/swapops.h> ++#include <linux/cpt_image.h> ++ ++#ifdef CONFIG_VE ++#include <ub/beancounter.h> ++#include <ub/ub_vmpages.h> ++#endif ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++#include "cpt_files.h" ++#include "cpt_ubc.h" ++#include "cpt_mm.h" ++#include "cpt_kernel.h" ++#ifdef CONFIG_VZ_CHECKPOINT_LAZY ++#include "cpt_pagein.h" ++#endif ++ ++#include "cpt_syscalls.h" ++ ++#define __PAGE_NX (1ULL<<63) ++ ++static unsigned long make_prot(struct cpt_vma_image *vmai) ++{ ++ unsigned long prot = 0; ++ ++ if (vmai->cpt_flags&VM_READ) ++ prot |= PROT_READ; ++ if (vmai->cpt_flags&VM_WRITE) ++ prot |= PROT_WRITE; ++ if (vmai->cpt_flags&VM_EXEC) ++ prot |= PROT_EXEC; ++ if (vmai->cpt_flags&VM_GROWSDOWN) ++ prot |= PROT_GROWSDOWN; ++ if (vmai->cpt_flags&VM_GROWSUP) ++ prot |= PROT_GROWSUP; ++ return prot; ++} ++ ++static unsigned long make_flags(struct cpt_vma_image *vmai) ++{ ++ unsigned long flags = MAP_FIXED; ++ ++ if (vmai->cpt_flags&(VM_SHARED|VM_MAYSHARE)) ++ flags |= MAP_SHARED; ++ else ++ flags |= MAP_PRIVATE; ++ ++ if (vmai->cpt_file == CPT_NULL) ++ flags |= MAP_ANONYMOUS; ++ if (vmai->cpt_flags&VM_GROWSDOWN) ++ flags |= MAP_GROWSDOWN; ++ if (vmai->cpt_flags&VM_DENYWRITE) ++ flags |= MAP_DENYWRITE; ++ if (vmai->cpt_flags&VM_EXECUTABLE) ++ 
flags |= MAP_EXECUTABLE; ++ if (!(vmai->cpt_flags&VM_ACCOUNT)) ++ flags |= MAP_NORESERVE; ++ return flags; ++} ++ ++ ++#if !defined(CONFIG_X86_64) && LINUX_VERSION_CODE < KERNEL_VERSION(2,6,15) ++static int __alloc_ldt(mm_context_t *pc, int mincount) ++{ ++ int oldsize, newsize, i; ++ ++ if (mincount <= pc->size) ++ return 0; ++ /* ++ * LDT got larger - reallocate if necessary. ++ */ ++ oldsize = pc->size; ++ mincount = (mincount+511)&(~511); ++ newsize = mincount*LDT_ENTRY_SIZE; ++ for (i = 0; i < newsize; i += PAGE_SIZE) { ++ int nr = i/PAGE_SIZE; ++ BUG_ON(i >= 64*1024); ++ if (!pc->ldt_pages[nr]) { ++ pc->ldt_pages[nr] = alloc_page(GFP_HIGHUSER|__GFP_UBC); ++ if (!pc->ldt_pages[nr]) ++ return -ENOMEM; ++ clear_highpage(pc->ldt_pages[nr]); ++ } ++ } ++ pc->size = mincount; ++ return 0; ++} ++ ++static int do_rst_ldt(struct cpt_obj_bits *li, loff_t pos, struct cpt_context *ctx) ++{ ++ struct mm_struct *mm = current->mm; ++ int i; ++ int err; ++ int size; ++ ++ err = __alloc_ldt(&mm->context, li->cpt_size/LDT_ENTRY_SIZE); ++ if (err) ++ return err; ++ ++ size = mm->context.size*LDT_ENTRY_SIZE; ++ ++ for (i = 0; i < size; i += PAGE_SIZE) { ++ int nr = i / PAGE_SIZE, bytes; ++ char *kaddr = kmap(mm->context.ldt_pages[nr]); ++ ++ bytes = size - i; ++ if (bytes > PAGE_SIZE) ++ bytes = PAGE_SIZE; ++ err = ctx->pread(kaddr, bytes, ctx, pos + li->cpt_hdrlen + i); ++ kunmap(mm->context.ldt_pages[nr]); ++ if (err) ++ return err; ++ } ++ ++ load_LDT(&mm->context); ++ return 0; ++} ++ ++#else ++ ++static int do_rst_ldt(struct cpt_obj_bits *li, loff_t pos, struct cpt_context *ctx) ++{ ++ struct mm_struct *mm = current->mm; ++ int oldsize = mm->context.size; ++ void *oldldt; ++ void *newldt; ++ int err; ++ ++ if (li->cpt_size > PAGE_SIZE) ++ newldt = vmalloc(li->cpt_size); ++ else ++ newldt = kmalloc(li->cpt_size, GFP_KERNEL); ++ ++ if (!newldt) ++ return -ENOMEM; ++ ++ err = ctx->pread(newldt, li->cpt_size, ctx, pos + li->cpt_hdrlen); ++ if (err) ++ return err; ++ ++ oldldt = 
mm->context.ldt; ++ mm->context.ldt = newldt; ++ mm->context.size = li->cpt_size/LDT_ENTRY_SIZE; ++ ++ load_LDT(&mm->context); ++ ++ if (oldsize) { ++ if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) ++ vfree(oldldt); ++ else ++ kfree(oldldt); ++ } ++ return 0; ++} ++#endif ++ ++static int ++restore_aio_ring(struct kioctx *aio_ctx, struct cpt_aio_ctx_image *aimg) ++{ ++ struct aio_ring_info *info = &aio_ctx->ring_info; ++ unsigned nr_events = aio_ctx->max_reqs; ++ unsigned long size; ++ int nr_pages; ++ ++ /* We recalculate parameters of the ring exactly like ++ * fs/aio.c does and then compare calculated values ++ * with ones, stored in dump. They must be the same. */ ++ ++ nr_events += 2; ++ ++ size = sizeof(struct aio_ring); ++ size += sizeof(struct io_event) * nr_events; ++ nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT; ++ ++ if (nr_pages != aimg->cpt_ring_pages) ++ return -EINVAL; ++ ++ info->nr_pages = nr_pages; ++ ++ nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); ++ ++ if (nr_events != aimg->cpt_nr) ++ return -EINVAL; ++ ++ info->nr = 0; ++ info->ring_pages = info->internal_pages; ++ if (nr_pages > AIO_RING_PAGES) { ++ info->ring_pages = kmalloc(sizeof(struct page *) * nr_pages, GFP_KERNEL); ++ if (!info->ring_pages) ++ return -ENOMEM; ++ memset(info->ring_pages, 0, sizeof(struct page *) * nr_pages); ++ } ++ ++ info->mmap_size = nr_pages * PAGE_SIZE; ++ ++ /* This piece of shit is not entirely my fault. Kernel aio.c makes ++ * something odd mmap()ping some pages and then pinning them. ++ * I guess it is just some mud remained of failed attempt to show ring ++ * to user space. The result is odd. :-) Immediately after ++ * creation of AIO context, kernel shares those pages with user ++ * and user can read and even write there. But after the first ++ * fork, pages are marked COW with evident consequences. 
++ * I remember, I did the same mistake in the first version ++ * of mmapped packet socket, luckily that crap never reached ++ * mainstream. ++ * ++ * So, what are we going to do? I can simulate this odd behaviour ++ * exactly, but I am not insane yet. For now just take the pages ++ * from user space. Alternatively, we could keep kernel copy ++ * in AIO context image, which would be more correct. ++ * ++ * What is wrong now? If the pages are COWed, ring is transferred ++ * incorrectly. ++ */ ++ down_read(¤t->mm->mmap_sem); ++ info->mmap_base = aimg->cpt_mmap_base; ++ info->nr_pages = get_user_pages(current, current->mm, ++ info->mmap_base, nr_pages, ++ 1, 0, info->ring_pages, NULL); ++ up_read(¤t->mm->mmap_sem); ++ ++ if (unlikely(info->nr_pages != nr_pages)) { ++ int i; ++ ++ for (i=0; i<info->nr_pages; i++) ++ put_page(info->ring_pages[i]); ++ if (info->ring_pages && info->ring_pages != info->internal_pages) ++ kfree(info->ring_pages); ++ return -EFAULT; ++ } ++ ++ aio_ctx->user_id = info->mmap_base; ++ ++ info->nr = nr_events; ++ info->tail = aimg->cpt_tail; ++ ++ return 0; ++} ++ ++static int do_rst_aio(struct cpt_aio_ctx_image *aimg, loff_t pos, cpt_context_t *ctx) ++{ ++ int err; ++ struct kioctx *aio_ctx; ++ extern spinlock_t aio_nr_lock; ++ ++ aio_ctx = kmem_cache_alloc(kioctx_cachep, GFP_KERNEL); ++ if (!aio_ctx) ++ return -ENOMEM; ++ ++ memset(aio_ctx, 0, sizeof(*aio_ctx)); ++ aio_ctx->max_reqs = aimg->cpt_max_reqs; ++ ++ if ((err = restore_aio_ring(aio_ctx, aimg)) < 0) { ++ kmem_cache_free(kioctx_cachep, aio_ctx); ++ eprintk_ctx("AIO %Ld restore_aio_ring: %d\n", pos, err); ++ return err; ++ } ++ ++ aio_ctx->mm = current->mm; ++ atomic_inc(&aio_ctx->mm->mm_count); ++ atomic_set(&aio_ctx->users, 1); ++ spin_lock_init(&aio_ctx->ctx_lock); ++ spin_lock_init(&aio_ctx->ring_info.ring_lock); ++ init_waitqueue_head(&aio_ctx->wait); ++ INIT_LIST_HEAD(&aio_ctx->active_reqs); ++ INIT_LIST_HEAD(&aio_ctx->run_list); ++ INIT_WORK(&aio_ctx->wq, aio_kick_handler, ctx); 
++ ++ spin_lock(&aio_nr_lock); ++ aio_nr += aio_ctx->max_reqs; ++ spin_unlock(&aio_nr_lock); ++ ++ write_lock(&aio_ctx->mm->ioctx_list_lock); ++ aio_ctx->next = aio_ctx->mm->ioctx_list; ++ aio_ctx->mm->ioctx_list = aio_ctx; ++ write_unlock(&aio_ctx->mm->ioctx_list_lock); ++ ++ return 0; ++} ++ ++struct anonvma_map ++{ ++ struct hlist_node list; ++ struct anon_vma *avma; ++ __u64 id; ++}; ++ ++static int verify_create_anonvma(struct mm_struct *mm, ++ struct cpt_vma_image *vmai, ++ cpt_context_t *ctx) ++{ ++ struct anon_vma *avma = NULL; ++ struct anon_vma *new_avma; ++ struct vm_area_struct *vma; ++ int h; ++ ++ if (!ctx->anonvmas) { ++ if (CPT_ANONVMA_HSIZE*sizeof(struct hlist_head) > PAGE_SIZE) ++ return -EINVAL; ++ if ((ctx->anonvmas = (void*)__get_free_page(GFP_KERNEL)) == NULL) ++ return -ENOMEM; ++ for (h = 0; h < CPT_ANONVMA_HSIZE; h++) ++ INIT_HLIST_HEAD(&ctx->anonvmas[h]); ++ } else { ++ struct anonvma_map *map; ++ struct hlist_node *elem; ++ ++ h = hash_long((unsigned long)vmai->cpt_anonvmaid, CPT_ANONVMA_HBITS); ++ hlist_for_each_entry(map, elem, &ctx->anonvmas[h], list) { ++ if (map->id == vmai->cpt_anonvmaid) { ++ avma = map->avma; ++ break; ++ } ++ } ++ } ++ ++ down_read(&mm->mmap_sem); ++ if ((vma = find_vma(mm, vmai->cpt_start)) == NULL) { ++ up_read(&mm->mmap_sem); ++ return -ESRCH; ++ } ++ if (vma->vm_start != vmai->cpt_start) { ++ up_read(&mm->mmap_sem); ++ eprintk_ctx("vma start mismatch\n"); ++ return -EINVAL; ++ } ++ if (vma->vm_pgoff != vmai->cpt_pgoff) { ++ dprintk_ctx("vma pgoff mismatch, fixing\n"); ++ if (vma->vm_file || (vma->vm_flags&(VM_SHARED|VM_MAYSHARE))) { ++ eprintk_ctx("cannot fixup vma pgoff\n"); ++ up_read(&mm->mmap_sem); ++ return -EINVAL; ++ } ++ vma->vm_pgoff = vmai->cpt_pgoff; ++ } ++ ++ if (!vma->anon_vma) { ++ if (avma) { ++ vma->anon_vma = avma; ++ anon_vma_link(vma); ++ } else { ++ int err; ++ ++ err = anon_vma_prepare(vma); ++ ++ if (err) { ++ up_read(&mm->mmap_sem); ++ return err; ++ } ++ } ++ } else { ++ /* Note, we 
_can_ arrive to the situation, when two ++ * different anonvmaid's point to one anon_vma, this happens ++ * f.e. when mmap() merged new area to previous one and ++ * they will share one anon_vma even if they did not on ++ * original host. ++ * ++ * IT IS OK. To all that I understand, we may merge all ++ * the anon_vma's and rmap can scan all the huge list of vmas ++ * searching for page. It is just "suboptimal". ++ * ++ * Real disaster would happen, if vma already got an anon_vma ++ * with different id. It is very rare case, kernel does the ++ * best efforts to merge anon_vmas when some attributes are ++ * different. In this case we will fall to copying memory. ++ */ ++ if (avma && vma->anon_vma != avma) { ++ up_read(&mm->mmap_sem); ++ wprintk_ctx("anon_vma mismatch\n"); ++ return 0; ++ } ++ } ++ ++ new_avma = vma->anon_vma; ++ up_read(&mm->mmap_sem); ++ ++ if (!avma) { ++ struct anonvma_map *map; ++ ++ if (!new_avma) ++ return -EINVAL; ++ ++ if ((map = kmalloc(sizeof(*map), GFP_KERNEL)) == NULL) ++ return -ENOMEM; ++ ++ map->id = vmai->cpt_anonvmaid; ++ map->avma = new_avma; ++ h = hash_long((unsigned long)vmai->cpt_anonvmaid, CPT_ANONVMA_HBITS); ++ hlist_add_head(&map->list, &ctx->anonvmas[h]); ++ } ++ return 0; ++} ++ ++static int copy_mm_pages(struct mm_struct *src, unsigned long start, ++ unsigned long end) ++{ ++ int err; ++ ++ for (; start < end; start += PAGE_SIZE) { ++ struct page *page; ++ struct page *spage; ++ void *maddr, *srcaddr; ++ ++ err = get_user_pages(current, current->mm, ++ start, 1, 1, 1, &page, NULL); ++ if (err == 0) ++ err = -EFAULT; ++ if (err < 0) ++ return err; ++ ++ err = get_user_pages(current, src, ++ start, 1, 0, 1, &spage, NULL); ++ ++ if (err == 0) ++ err = -EFAULT; ++ if (err < 0) { ++ page_cache_release(page); ++ return err; ++ } ++ ++ srcaddr = kmap(spage); ++ maddr = kmap(page); ++ memcpy(maddr, srcaddr, PAGE_SIZE); ++ set_page_dirty_lock(page); ++ kunmap(page); ++ kunmap(spage); ++ page_cache_release(page); ++ 
page_cache_release(spage); ++ } ++ return 0; ++} ++ ++static int do_rst_vma(struct cpt_vma_image *vmai, loff_t vmapos, loff_t mmpos, struct cpt_context *ctx) ++{ ++ int err = 0; ++ unsigned long addr; ++ struct mm_struct *mm = current->mm; ++ struct vm_area_struct *vma; ++ struct file *file = NULL; ++ unsigned long prot; ++ int checked = 0; ++ ++ prot = make_prot(vmai); ++ ++ if (vmai->cpt_file != CPT_NULL) { ++ if (vmai->cpt_type == CPT_VMA_TYPE_0) { ++ file = rst_file(vmai->cpt_file, -1, ctx); ++ if (IS_ERR(file)) { ++ eprintk_ctx("do_rst_vma: rst_file: %Ld\n", vmai->cpt_file); ++ return PTR_ERR(file); ++ } ++ } else if (vmai->cpt_type == CPT_VMA_TYPE_SHM) { ++ file = rst_sysv_shm(vmai->cpt_file, ctx); ++ if (IS_ERR(file)) ++ return PTR_ERR(file); ++ } ++ } ++ ++ down_write(&mm->mmap_sem); ++ addr = do_mmap_pgoff(file, vmai->cpt_start, ++ vmai->cpt_end-vmai->cpt_start, ++ prot, make_flags(vmai), ++ vmai->cpt_pgoff); ++ ++ if (addr != vmai->cpt_start) { ++ up_write(&mm->mmap_sem); ++ ++ err = -EINVAL; ++ if (IS_ERR((void*)addr)) ++ err = addr; ++ goto out; ++ } ++ ++ vma = find_vma(mm, vmai->cpt_start); ++ if (vma == NULL) { ++ up_write(&mm->mmap_sem); ++ eprintk_ctx("cannot find mmapped vma\n"); ++ err = -ESRCH; ++ goto out; ++ } ++ ++ /* do_mmap_pgoff() can merge new area to previous one (not to the next, ++ * we mmap in order, the rest of mm is still unmapped). This can happen ++ * f.e. if flags are to be adjusted later, or if we had different ++ * anon_vma on two adjacent regions. Split it by brute force. 
*/ ++ if (vma->vm_start != vmai->cpt_start) { ++ dprintk_ctx("vma %Ld merged, split\n", vmapos); ++ err = split_vma(mm, vma, (unsigned long)vmai->cpt_start, 0); ++ if (err) { ++ up_write(&mm->mmap_sem); ++ eprintk_ctx("cannot split vma\n"); ++ goto out; ++ } ++ } ++ up_write(&mm->mmap_sem); ++ ++ if (vmai->cpt_anonvma && vmai->cpt_anonvmaid) { ++ err = verify_create_anonvma(mm, vmai, ctx); ++ if (err) { ++ eprintk_ctx("cannot verify_create_anonvma %Ld\n", vmapos); ++ goto out; ++ } ++ } ++ ++ if (vmai->cpt_next > vmai->cpt_hdrlen) { ++ loff_t offset = vmapos + vmai->cpt_hdrlen; ++ ++ do { ++ union { ++ struct cpt_page_block pb; ++ struct cpt_remappage_block rpb; ++ struct cpt_copypage_block cpb; ++ struct cpt_lazypage_block lpb; ++ } u; ++ loff_t pos; ++ ++ err = rst_get_object(-1, offset, &u, ctx); ++ if (err) { ++ eprintk_ctx("vma fix object: %d\n", err); ++ goto out; ++ } ++ if (u.rpb.cpt_object == CPT_OBJ_REMAPPAGES) { ++ err = sc_remap_file_pages(u.rpb.cpt_start, ++ u.rpb.cpt_end-u.rpb.cpt_start, ++ 0, u.rpb.cpt_pgoff, 0); ++ if (err < 0) { ++ eprintk_ctx("remap_file_pages: %d (%08x,%u,%u)\n", err, ++ (__u32)u.rpb.cpt_start, (__u32)(u.rpb.cpt_end-u.rpb.cpt_start), ++ (__u32)u.rpb.cpt_pgoff); ++ goto out; ++ } ++ offset += u.rpb.cpt_next; ++ continue; ++ } else if (u.cpb.cpt_object == CPT_OBJ_LAZYPAGES) { ++#ifdef CONFIG_VZ_CHECKPOINT_LAZY ++ unsigned long addr = u.lpb.cpt_start; ++ ++ down_read(&mm->mmap_sem); ++ if ((vma = find_vma(mm, u.lpb.cpt_start)) == NULL) { ++ up_read(&mm->mmap_sem); ++ eprintk_ctx("lost vm_area_struct\n"); ++ err = -ESRCH; ++ goto out; ++ } ++ err = anon_vma_prepare(vma); ++ if (err) { ++ up_read(&mm->mmap_sem); ++ goto out; ++ } ++ while (addr < u.lpb.cpt_end) { ++ err = rst_pagein(vma, u.lpb.cpt_index + (addr-u.lpb.cpt_start)/PAGE_SIZE, ++ addr, ctx); ++ if (err) ++ break; ++ addr += PAGE_SIZE; ++ } ++ up_read(&mm->mmap_sem); ++#else ++ err = -EINVAL; ++#endif ++ if (err) ++ goto out; ++ offset += u.cpb.cpt_next; ++ continue; ++ } 
else if (u.cpb.cpt_object == CPT_OBJ_COPYPAGES) { ++ struct vm_area_struct *vma, *vma1; ++ struct mm_struct *src; ++ struct anon_vma *src_anon; ++ cpt_object_t *mobj; ++ ++ if (!vmai->cpt_anonvmaid) { ++ err = -EINVAL; ++ eprintk_ctx("CPT_OBJ_COPYPAGES in !anonvma\n"); ++ goto out; ++ } ++ ++ mobj = lookup_cpt_obj_bypos(CPT_OBJ_MM, u.cpb.cpt_source, ctx); ++ if (!mobj) { ++ eprintk_ctx("lost mm_struct to clone pages from\n"); ++ err = -ESRCH; ++ goto out; ++ } ++ src = mobj->o_obj; ++ ++ down_read(&src->mmap_sem); ++ src_anon = NULL; ++ vma1 = find_vma(src, u.cpb.cpt_start); ++ if (vma1) ++ src_anon = vma1->anon_vma; ++ up_read(&src->mmap_sem); ++ ++ if (!vma1) { ++ eprintk_ctx("lost src vm_area_struct\n"); ++ err = -ESRCH; ++ goto out; ++ } ++ ++ down_read(&mm->mmap_sem); ++ if ((vma = find_vma(mm, u.cpb.cpt_start)) == NULL) { ++ up_read(&mm->mmap_sem); ++ eprintk_ctx("lost vm_area_struct\n"); ++ err = -ESRCH; ++ goto out; ++ } ++ ++ if (!src_anon || ++ !vma->anon_vma || ++ vma->anon_vma != src_anon || ++ vma->vm_start - vma1->vm_start != ++ (vma->vm_pgoff - vma1->vm_pgoff) << PAGE_SHIFT) { ++ up_read(&mm->mmap_sem); ++ wprintk_ctx("anon_vma mismatch in vm_area_struct %Ld\n", vmapos); ++ err = copy_mm_pages(mobj->o_obj, ++ u.cpb.cpt_start, ++ u.cpb.cpt_end); ++ } else { ++ err = __copy_page_range(vma, vma1, ++ u.cpb.cpt_start, ++ u.cpb.cpt_end-u.cpb.cpt_start); ++ up_read(&mm->mmap_sem); ++ } ++ if (err) { ++ eprintk_ctx("clone_page_range: %d (%08x,%u,%ld)\n", err, ++ (__u32)u.cpb.cpt_start, (__u32)(u.cpb.cpt_end-u.cpb.cpt_start), ++ (long)u.cpb.cpt_source); ++ goto out; ++ } ++ ++ offset += u.cpb.cpt_next; ++ continue; ++ } ++ if (u.pb.cpt_object != CPT_OBJ_PAGES) { ++ eprintk_ctx("unknown vma fix object %d\n", u.pb.cpt_object); ++ err = -EINVAL; ++ goto out; ++ } ++ pos = offset + sizeof(u.pb); ++ if (!(vmai->cpt_flags&VM_ACCOUNT) && !(prot&PROT_WRITE)) { ++ /* I guess this is get_user_pages() messed things, ++ * this happens f.e. when gdb inserts breakpoints. 
++ */ ++ int i; ++ for (i=0; i<(u.pb.cpt_end-u.pb.cpt_start)/PAGE_SIZE; i++) { ++ struct page *page; ++ void *maddr; ++ err = get_user_pages(current, current->mm, ++ (unsigned long)u.pb.cpt_start + i*PAGE_SIZE, ++ 1, 1, 1, &page, NULL); ++ if (err == 0) ++ err = -EFAULT; ++ if (err < 0) { ++ eprintk_ctx("get_user_pages: %d\n", err); ++ goto out; ++ } ++ err = 0; ++ maddr = kmap(page); ++ if (u.pb.cpt_content == CPT_CONTENT_VOID) { ++ memset(maddr, 0, PAGE_SIZE); ++ } else if (u.pb.cpt_content == CPT_CONTENT_DATA) { ++ err = ctx->pread(maddr, PAGE_SIZE, ++ ctx, pos + i*PAGE_SIZE); ++ if (err) { ++ kunmap(page); ++ goto out; ++ } ++ } else { ++ err = -EINVAL; ++ kunmap(page); ++ goto out; ++ } ++ set_page_dirty_lock(page); ++ kunmap(page); ++ page_cache_release(page); ++ } ++ } else { ++ if (!(prot&PROT_WRITE)) ++ sc_mprotect(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start, prot | PROT_WRITE); ++ if (u.pb.cpt_content == CPT_CONTENT_VOID) { ++ int i; ++ for (i=0; i<(u.pb.cpt_end-u.pb.cpt_start)/sizeof(unsigned long); i++) { ++ err = __put_user(0UL, ((unsigned long __user*)(unsigned long)u.pb.cpt_start) + i); ++ if (err) { ++ eprintk_ctx("__put_user 2 %d\n", err); ++ goto out; ++ } ++ } ++ } else if (u.pb.cpt_content == CPT_CONTENT_DATA) { ++ loff_t tpos = pos; ++ err = ctx->file->f_op->read(ctx->file, cpt_ptr_import(u.pb.cpt_start), ++ u.pb.cpt_end-u.pb.cpt_start, ++ &tpos); ++ if (err != u.pb.cpt_end-u.pb.cpt_start) { ++ if (err >= 0) ++ err = -EIO; ++ goto out; ++ } ++ } else { ++ err = -EINVAL; ++ goto out; ++ } ++ if (!(prot&PROT_WRITE)) ++ sc_mprotect(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start, prot); ++ } ++ err = 0; ++ offset += u.pb.cpt_next; ++ } while (offset < vmapos + vmai->cpt_next); ++ } ++ ++check: ++ do { ++ struct vm_area_struct *vma; ++ down_read(&mm->mmap_sem); ++ vma = find_vma(mm, addr); ++ if (vma) { ++ if ((vma->vm_flags^vmai->cpt_flags)&VM_READHINTMASK) { ++ VM_ClearReadHint(vma); ++ vma->vm_flags |= vmai->cpt_flags&VM_READHINTMASK; ++ } ++ 
if ((vma->vm_flags^vmai->cpt_flags)&VM_LOCKED) { ++ dprintk_ctx("fixing up VM_LOCKED %Ld\n", vmapos); ++ up_read(&mm->mmap_sem); ++ if (vma->vm_flags&VM_LOCKED) ++ err = sc_munlock(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start); ++ else ++ err = sc_mlock(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start); ++ if (err) ++ goto out; ++ goto check; ++ } ++ if ((vma->vm_page_prot.pgprot^vmai->cpt_pgprot)&~__PAGE_NX) ++ wprintk_ctx("VMA %08lx@%ld pgprot mismatch %08Lx %08Lx\n", addr, (long)vmapos, ++ (__u64)vma->vm_page_prot.pgprot, (__u64)vmai->cpt_pgprot); ++#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) ++ if (((vma->vm_page_prot.pgprot^vmai->cpt_pgprot)&__PAGE_NX) && ++ (ctx->kernel_config_flags&CPT_KERNEL_CONFIG_PAE)) ++ wprintk_ctx("VMA %08lx@%ld pgprot mismatch %08Lx %08Lx\n", addr, (long)vmapos, ++ (__u64)vma->vm_page_prot.pgprot, (__u64)vmai->cpt_pgprot); ++#endif ++ if (vma->vm_flags != vmai->cpt_flags) { ++ unsigned long x = vma->vm_flags ^ vmai->cpt_flags; ++ if (x & VM_EXEC) { ++ /* Crap. On i386 this is OK. ++ * It is impossible to make via mmap/mprotect ++ * exec.c clears VM_EXEC on stack. 
*/ ++ vma->vm_flags &= ~VM_EXEC; ++ } else if ((x & VM_ACCOUNT) && !checked) { ++ checked = 1; ++ if (!(prot&PROT_WRITE)) { ++ up_read(&mm->mmap_sem); ++ sc_mprotect(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start, prot | PROT_WRITE); ++ sc_mprotect(vmai->cpt_start, vmai->cpt_end-vmai->cpt_start, prot); ++ goto check; ++ } ++ wprintk_ctx("VMA %08lx@%ld flag mismatch %08x %08x\n", addr, (long)vmapos, ++ (__u32)vma->vm_flags, (__u32)vmai->cpt_flags); ++ } else { ++ wprintk_ctx("VMA %08lx@%ld flag mismatch %08x %08x\n", addr, (long)vmapos, ++ (__u32)vma->vm_flags, (__u32)vmai->cpt_flags); ++ } ++ } ++ } else { ++ wprintk_ctx("no VMA for %08lx@%ld\n", addr, (long)vmapos); ++ } ++ up_read(&mm->mmap_sem); ++ } while (0); ++ ++out: ++ if (file) ++ fput(file); ++ return err; ++} ++ ++static int do_rst_mm(struct cpt_mm_image *vmi, loff_t pos, struct cpt_context *ctx) ++{ ++ int err = 0; ++ unsigned int def_flags; ++ struct mm_struct *mm = current->mm; ++ ++ down_write(&mm->mmap_sem); ++ do_munmap(mm, 0, TASK_SIZE); ++ ++ mm->start_code = vmi->cpt_start_code; ++ mm->end_code = vmi->cpt_end_code; ++ mm->start_data = vmi->cpt_start_data; ++ mm->end_data = vmi->cpt_end_data; ++ mm->start_brk = vmi->cpt_start_brk; ++ mm->brk = vmi->cpt_brk; ++ mm->start_stack = vmi->cpt_start_stack; ++ mm->arg_start = vmi->cpt_start_arg; ++ mm->arg_end = vmi->cpt_end_arg; ++ mm->env_start = vmi->cpt_start_env; ++ mm->env_end = vmi->cpt_end_env; ++ mm->def_flags = 0; ++ def_flags = vmi->cpt_def_flags; ++ ++ mm->dumpable = (vmi->cpt_dumpable != 0); ++ mm->vps_dumpable = (vmi->cpt_vps_dumpable != 0); ++ ++#if 0 /* def CONFIG_HUGETLB_PAGE*/ ++/* NB: ? 
*/ ++ int used_hugetlb; ++#endif ++ up_write(&mm->mmap_sem); ++ ++ if (vmi->cpt_next > vmi->cpt_hdrlen) { ++ loff_t offset = pos + vmi->cpt_hdrlen; ++ do { ++ union { ++ struct cpt_vma_image vmai; ++ struct cpt_aio_ctx_image aioi; ++ struct cpt_obj_bits bits; ++ } u; ++ err = rst_get_object(-1, offset, &u, ctx); ++ if (err) ++ goto out; ++ if (u.vmai.cpt_object == CPT_OBJ_VMA) { ++ err = do_rst_vma(&u.vmai, offset, pos, ctx); ++ if (err) ++ goto out; ++ } else if (u.bits.cpt_object == CPT_OBJ_BITS && ++ u.bits.cpt_content == CPT_CONTENT_MM_CONTEXT) { ++ err = do_rst_ldt(&u.bits, offset, ctx); ++ if (err) ++ goto out; ++ } else if (u.aioi.cpt_object == CPT_OBJ_AIO_CONTEXT) { ++ err = do_rst_aio(&u.aioi, offset, ctx); ++ if (err) ++ goto out; ++ } else { ++ eprintk_ctx("unknown object %u in mm image\n", u.vmai.cpt_object); ++ err = -EINVAL; ++ goto out; ++ } ++ offset += u.vmai.cpt_next; ++ } while (offset < pos + vmi->cpt_next); ++ } ++ ++ down_write(&mm->mmap_sem); ++ mm->def_flags = def_flags; ++ up_write(&mm->mmap_sem); ++ ++ ++out: ++ return err; ++} ++ ++extern void exit_mm(struct task_struct * tsk); ++ ++int rst_mm_complete(struct cpt_task_image *ti, struct cpt_context *ctx) ++{ ++ int err = 0; ++ cpt_object_t *mobj; ++ void *tmp = (void*)__get_free_page(GFP_KERNEL); ++ struct cpt_mm_image *vmi = (struct cpt_mm_image *)tmp; ++ ++ if (!tmp) ++ return -ENOMEM; ++ ++ if (ti->cpt_mm == CPT_NULL) { ++ if (current->mm) ++ exit_mm(current); ++ goto out; ++ } ++ ++ mobj = lookup_cpt_obj_bypos(CPT_OBJ_MM, ti->cpt_mm, ctx); ++ if (mobj) { ++ if (current->mm != mobj->o_obj) BUG(); ++ goto out; ++ } ++ ++ if (current->mm == NULL) { ++ struct mm_struct *mm = mm_alloc(); ++ if (mm == NULL) { ++ err = -ENOMEM; ++ goto out; ++ } ++ err = init_new_context(current, mm); ++ if (err) { ++ mmdrop(mm); ++ goto out; ++ } ++ current->mm = mm; ++ } ++ ++ if ((err = rst_get_object(CPT_OBJ_MM, ti->cpt_mm, vmi, ctx)) != 0) ++ goto out; ++ if ((err = do_rst_mm(vmi, ti->cpt_mm, ctx)) != 0) 
{ ++ eprintk_ctx("do_rst_mm %Ld\n", ti->cpt_mm); ++ goto out; ++ } ++ err = -ENOMEM; ++ mobj = cpt_object_add(CPT_OBJ_MM, current->mm, ctx); ++ if (mobj != NULL) { ++ err = 0; ++ cpt_obj_setpos(mobj, ti->cpt_mm, ctx); ++ } ++ ++out: ++ if (tmp) ++ free_page((unsigned long)tmp); ++ return err; ++} ++ ++/* This is part of mm setup, made in parent context. Mostly, it is the place, ++ * where we graft mm of another process to child. ++ */ ++ ++int rst_mm_basic(cpt_object_t *obj, struct cpt_task_image *ti, struct cpt_context *ctx) ++{ ++ task_t *tsk = obj->o_obj; ++ cpt_object_t *mobj; ++ ++ /* Task without mm. Just get rid of this. */ ++ if (ti->cpt_mm == CPT_NULL) { ++ if (tsk->mm) { ++ mmput(tsk->mm); ++ tsk->mm = NULL; ++ } ++ return 0; ++ } ++ ++ mobj = lookup_cpt_obj_bypos(CPT_OBJ_MM, ti->cpt_mm, ctx); ++ if (mobj) { ++ struct mm_struct *newmm = mobj->o_obj; ++ /* Good, the MM is already created. */ ++ if (newmm == tsk->mm) { ++ /* Already done by clone(). */ ++ return 0; ++ } ++ mmput(tsk->mm); ++ atomic_inc(&newmm->mm_users); ++ tsk->mm = newmm; ++ tsk->active_mm = newmm; ++ } ++ return 0; ++} ++ ++/* We use CLONE_VM when mm of child is going to be shared with parent. ++ * Otherwise mm is copied. ++ */ ++ ++__u32 rst_mm_flag(struct cpt_task_image *ti, struct cpt_context *ctx) ++{ ++ if (ti->cpt_mm == CPT_NULL || ++ lookup_cpt_obj_bypos(CPT_OBJ_MM, ti->cpt_mm, ctx)) ++ return CLONE_VM; ++ return 0; ++} +diff -upr linux-2.6.16.orig/kernel/cpt/rst_net.c linux-2.6.16-026test015/kernel/cpt/rst_net.c +--- linux-2.6.16.orig/kernel/cpt/rst_net.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/cpt/rst_net.c 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,481 @@ ++/* ++ * ++ * kernel/cpt/rst_net.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#include <linux/config.h> ++#include <linux/version.h> ++#include <linux/module.h> ++#include <linux/init.h> ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/slab.h> ++#include <linux/file.h> ++#include <linux/mm.h> ++#include <linux/errno.h> ++#include <linux/fs.h> ++#include <linux/socket.h> ++#include <linux/netdevice.h> ++#include <linux/inetdevice.h> ++#include <linux/rtnetlink.h> ++#include <linux/ve.h> ++#include <linux/ve_proto.h> ++#include <net/route.h> ++#include <net/ip_fib.h> ++#include <net/addrconf.h> ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++#include "cpt_kernel.h" ++#include "cpt_net.h" ++ ++#include "cpt_syscalls.h" ++ ++extern struct in_ifaddr *inet_alloc_ifa(void); ++extern int inet_insert_ifa(struct in_ifaddr *ifa); ++ ++int rst_restore_ifaddr(struct cpt_context *ctx) ++{ ++ int err; ++ loff_t sec = ctx->sections[CPT_SECT_NET_IFADDR]; ++ loff_t endsec; ++ struct cpt_section_hdr h; ++ struct cpt_ifaddr_image di; ++ struct net_device *dev; ++ ++ if (sec == CPT_NULL) ++ return 0; ++ ++ err = ctx->pread(&h, sizeof(h), ctx, sec); ++ if (err) ++ return err; ++ if (h.cpt_section != CPT_SECT_NET_IFADDR || h.cpt_hdrlen < sizeof(h)) ++ return -EINVAL; ++ ++ endsec = sec + h.cpt_next; ++ sec += h.cpt_hdrlen; ++ while (sec < endsec) { ++ int cindex = -1; ++ int err; ++ err = rst_get_object(CPT_OBJ_NET_IFADDR, sec, &di, ctx); ++ if (err) ++ return err; ++ if (di.cpt_index == ctx->lo_index_old) ++ cindex = ctx->lo_index; ++ else if (di.cpt_index == ctx->venet_index_old) ++ cindex = ctx->venet_index; ++ if (cindex <= 0) ++ eprintk_ctx("unknown ifaddr for %d\n", di.cpt_index); ++ rtnl_lock(); ++ dev = __dev_get_by_index(cindex); ++ if (dev && di.cpt_family == AF_INET) { ++ struct in_device *in_dev; ++ struct in_ifaddr *ifa; ++ if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) ++ in_dev = inetdev_init(dev); ++ ifa = inet_alloc_ifa(); ++ if (ifa) { ++ ifa->ifa_local = di.cpt_address[0]; ++ ifa->ifa_address = 
di.cpt_peer[0]; ++ ifa->ifa_broadcast = di.cpt_broadcast[0]; ++ ifa->ifa_prefixlen = di.cpt_masklen; ++ ifa->ifa_mask = inet_make_mask(ifa->ifa_prefixlen); ++ ifa->ifa_flags = di.cpt_flags; ++ ifa->ifa_scope = di.cpt_scope; ++ memcpy(ifa->ifa_label, di.cpt_label, IFNAMSIZ); ++ in_dev_hold(in_dev); ++ ifa->ifa_dev = in_dev; ++ err = inet_insert_ifa(ifa); ++ if (err && err != -EEXIST) { ++ rtnl_unlock(); ++ eprintk_ctx("add ifaddr err %d for %d %s\n", err, di.cpt_index, di.cpt_label); ++ return err; ++ } ++ } ++#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) ++ } else if (dev && di.cpt_family == AF_INET6) { ++ err = inet6_addr_add(dev->ifindex, ++ (struct in6_addr *)di.cpt_address, ++ di.cpt_masklen); ++ if (err && err != -EEXIST) { ++ rtnl_unlock(); ++ eprintk_ctx("add ifaddr err %d for %d %s\n", err, di.cpt_index, di.cpt_label); ++ return err; ++ } ++#endif ++ } else { ++ rtnl_unlock(); ++ eprintk_ctx("unknown ifaddr 2 for %d\n", di.cpt_index); ++ return -EINVAL; ++ } ++ rtnl_unlock(); ++ sec += di.cpt_next; ++ } ++ return 0; ++} ++ ++static int rewrite_rtmsg(struct nlmsghdr *nlh, struct cpt_context *ctx) ++{ ++ int min_len = NLMSG_LENGTH(sizeof(struct rtmsg)); ++ struct rtmsg *rtm = NLMSG_DATA(nlh); ++ int idx = -1; ++ __u32 prefix0 = 0; ++ ++ if (nlh->nlmsg_len > min_len) { ++ int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len); ++ struct rtattr *rta = (void*)nlh + NLMSG_ALIGN(min_len); ++ ++ while (RTA_OK(rta, attrlen)) { ++ if (rta->rta_type == RTA_OIF) { ++ idx = *(int*)RTA_DATA(rta); ++ if (idx == ctx->lo_index_old) ++ idx = ctx->lo_index; ++ else if (idx == ctx->venet_index_old) ++ idx = ctx->venet_index; ++ else { ++ eprintk_ctx("unknown iface %d\n", idx); ++ return -ENODEV; ++ } ++ *(int*)RTA_DATA(rta) = idx; ++ } else if (rta->rta_type == RTA_DST) { ++ prefix0 = *(__u32*)RTA_DATA(rta); ++ } ++ rta = RTA_NEXT(rta, attrlen); ++ } ++ } ++#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) ++ if (rtm->rtm_family == AF_INET6) { ++ if 
(rtm->rtm_type == RTN_LOCAL) ++ return 2; ++ if (rtm->rtm_flags & RTM_F_CLONED) ++ return 2; ++ if (rtm->rtm_protocol == RTPROT_UNSPEC || ++ rtm->rtm_protocol == RTPROT_RA || ++ rtm->rtm_protocol == RTPROT_REDIRECT || ++ rtm->rtm_protocol == RTPROT_KERNEL) ++ return 2; ++ if (rtm->rtm_protocol == RTPROT_BOOT && ++ ((rtm->rtm_dst_len == 8 && prefix0 == htonl(0xFF000000)) || ++ (rtm->rtm_dst_len == 64 && prefix0 == htonl(0xFE800000)))) ++ return 2; ++ } ++#endif ++ return rtm->rtm_protocol == RTPROT_KERNEL; ++} ++ ++int rst_restore_route(struct cpt_context *ctx) ++{ ++ int err; ++ struct socket *sock; ++ struct msghdr msg; ++ struct iovec iov; ++ struct sockaddr_nl nladdr; ++ mm_segment_t oldfs; ++ loff_t sec = ctx->sections[CPT_SECT_NET_ROUTE]; ++ loff_t endsec; ++ struct cpt_section_hdr h; ++ struct cpt_object_hdr v; ++ char *pg; ++ ++ if (sec == CPT_NULL) ++ return 0; ++ ++ err = ctx->pread(&h, sizeof(h), ctx, sec); ++ if (err) ++ return err; ++ if (h.cpt_section != CPT_SECT_NET_ROUTE || h.cpt_hdrlen < sizeof(h)) ++ return -EINVAL; ++ ++ if (h.cpt_hdrlen >= h.cpt_next) ++ return 0; ++ ++ sec += h.cpt_hdrlen; ++ err = rst_get_object(CPT_OBJ_NET_ROUTE, sec, &v, ctx); ++ if (err < 0) ++ return err; ++ ++ err = sock_create_kern(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE, &sock); ++ if (err) ++ return err; ++ ++ pg = (char*)__get_free_page(GFP_KERNEL); ++ if (pg == NULL) { ++ err = -ENOMEM; ++ goto out_sock; ++ } ++ ++ memset(&nladdr, 0, sizeof(nladdr)); ++ nladdr.nl_family = AF_NETLINK; ++ ++ endsec = sec + v.cpt_next; ++ sec += v.cpt_hdrlen; ++ ++ while (sec < endsec) { ++ struct nlmsghdr *n; ++ struct nlmsghdr nh; ++ int kernel_flag; ++ ++ err = ctx->pread(&nh, sizeof(nh), ctx, sec); ++ if (err) ++ goto out_sock_pg; ++ if (nh.nlmsg_len > PAGE_SIZE) { ++ err = -EINVAL; ++ goto out_sock_pg; ++ } ++ err = ctx->pread(pg, nh.nlmsg_len, ctx, sec); ++ if (err) ++ goto out_sock_pg; ++ ++ n = (struct nlmsghdr*)pg; ++ n->nlmsg_flags = NLM_F_REQUEST|NLM_F_APPEND|NLM_F_CREATE; ++ ++ 
err = rewrite_rtmsg(n, ctx); ++ if (err < 0) ++ goto out_sock_pg; ++ kernel_flag = err; ++ ++ if (kernel_flag == 2) ++ goto do_next; ++ ++ iov.iov_base=n; ++ iov.iov_len=nh.nlmsg_len; ++ msg.msg_name=&nladdr; ++ msg.msg_namelen=sizeof(nladdr); ++ msg.msg_iov=&iov; ++ msg.msg_iovlen=1; ++ msg.msg_control=NULL; ++ msg.msg_controllen=0; ++ msg.msg_flags=MSG_DONTWAIT; ++ ++ oldfs = get_fs(); set_fs(KERNEL_DS); ++ err = sock_sendmsg(sock, &msg, nh.nlmsg_len); ++ set_fs(oldfs); ++ ++ if (err < 0) ++ goto out_sock_pg; ++ err = 0; ++ ++ iov.iov_base=pg; ++ iov.iov_len=PAGE_SIZE; ++ ++ oldfs = get_fs(); set_fs(KERNEL_DS); ++ err = sock_recvmsg(sock, &msg, PAGE_SIZE, MSG_DONTWAIT); ++ set_fs(oldfs); ++ if (err != -EAGAIN) { ++ if (err == NLMSG_LENGTH(sizeof(struct nlmsgerr)) && ++ n->nlmsg_type == NLMSG_ERROR) { ++ struct nlmsgerr *e = NLMSG_DATA(n); ++ if (e->error != -EEXIST || !kernel_flag) ++ eprintk_ctx("NLMERR: %d\n", e->error); ++ } else { ++ eprintk_ctx("Res: %d %d\n", err, n->nlmsg_type); ++ } ++ } ++do_next: ++ err = 0; ++ sec += NLMSG_ALIGN(nh.nlmsg_len); ++ } ++ ++out_sock_pg: ++ free_page((unsigned long)pg); ++out_sock: ++ sock_release(sock); ++ return err; ++} ++ ++int rst_resume_network(struct cpt_context *ctx) ++{ ++ struct ve_struct *env; ++ ++ env = get_ve_by_id(ctx->ve_id); ++ if (!env) ++ return -ESRCH; ++ env->disable_net = 0; ++ put_ve(env); ++ return 0; ++} ++ ++int rst_restore_netdev(struct cpt_context *ctx) ++{ ++ int err; ++ loff_t sec = ctx->sections[CPT_SECT_NET_DEVICE]; ++ loff_t endsec; ++ struct cpt_section_hdr h; ++ struct cpt_netdev_image di; ++ struct net_device *dev; ++ ++ get_exec_env()->disable_net = 1; ++ ++ dev = __dev_get_by_name("lo"); ++ if (!dev) { ++ eprintk_ctx("cannot find loopback netdevice\n"); ++ return -EINVAL; ++ } ++ ctx->lo_index = dev->ifindex; ++ ctx->lo_index_old = -1; ++ dev = __dev_get_by_name("venet0"); ++ if (!dev) { ++ eprintk_ctx("cannot find venet0 netdevice\n"); ++ return -EINVAL; ++ } ++ ctx->venet_index = 
dev->ifindex; ++ ctx->venet_index_old = -1; ++ ++ if (sec == CPT_NULL) ++ return 0; ++ ++ err = ctx->pread(&h, sizeof(h), ctx, sec); ++ if (err) ++ return err; ++ if (h.cpt_section != CPT_SECT_NET_DEVICE || h.cpt_hdrlen < sizeof(h)) ++ return -EINVAL; ++ ++ endsec = sec + h.cpt_next; ++ sec += h.cpt_hdrlen; ++ while (sec < endsec) { ++ int err; ++ err = rst_get_object(CPT_OBJ_NET_DEVICE, sec, &di, ctx); ++ if (err) ++ return err; ++ if (strcmp(di.cpt_name, "lo") == 0) { ++ ctx->lo_index_old = di.cpt_index; ++ } else if (strcmp(di.cpt_name, "venet0") == 0) { ++ ctx->venet_index_old = di.cpt_index; ++ } else { ++ eprintk_ctx("unknown interface %s\n", di.cpt_name); ++ } ++ dev = __dev_get_by_name(di.cpt_name); ++ if (dev) { ++ if (di.cpt_flags^dev->flags) { ++ rtnl_lock(); ++ err = dev_change_flags(dev, di.cpt_flags); ++ rtnl_unlock(); ++ if (err) ++ eprintk_ctx("dev_change_flags err: %d\n", err); ++ } ++ } else { ++ eprintk_ctx("unknown interface 2 %s\n", di.cpt_name); ++ } ++ sec += di.cpt_next; ++ } ++ return 0; ++} ++ ++static int dumpfn(void *arg) ++{ ++ int i; ++ int *pfd = arg; ++ char *argv[] = { "iptables-restore", "-c", NULL }; ++ ++ if (pfd[0] != 0) ++ sc_dup2(pfd[0], 0); ++ ++ for (i=1; i<current->files->fdt->max_fds; i++) ++ sc_close(i); ++ ++ module_put(THIS_MODULE); ++ ++ set_fs(KERNEL_DS); ++ i = sc_execve("/sbin/iptables-restore", argv, NULL); ++ eprintk("failed to exec /sbin/iptables-restore: %d\n", i); ++ return -1; ++} ++ ++static int rst_restore_iptables(struct cpt_context * ctx) ++{ ++ int err; ++ int pfd[2]; ++ struct file *f; ++ struct cpt_object_hdr v; ++ int n; ++ struct cpt_section_hdr h; ++ loff_t sec = ctx->sections[CPT_SECT_NET_IPTABLES]; ++ loff_t end; ++ int pid; ++ ++ if (sec == CPT_NULL) ++ return 0; ++ ++ err = ctx->pread(&h, sizeof(h), ctx, sec); ++ if (err) ++ return err; ++ if (h.cpt_section != CPT_SECT_NET_IPTABLES || h.cpt_hdrlen < sizeof(h)) ++ return -EINVAL; ++ ++ if (h.cpt_hdrlen == h.cpt_next) ++ return 0; ++ if 
(h.cpt_hdrlen > h.cpt_next) ++ return -EINVAL; ++ sec += h.cpt_hdrlen; ++ err = rst_get_object(CPT_OBJ_NAME, sec, &v, ctx); ++ if (err < 0) ++ return err; ++ ++ err = sc_pipe(pfd); ++ if (err < 0) ++ return err; ++ pid = err = local_kernel_thread(dumpfn, (void*)pfd, SIGCHLD, 0); ++ if (err < 0) ++ goto out; ++ f = fget(pfd[1]); ++ sc_close(pfd[1]); ++ sc_close(pfd[0]); ++ ++ ctx->file->f_pos = sec + v.cpt_hdrlen; ++ end = sec + v.cpt_next; ++ do { ++ char *p; ++ char buf[16]; ++ mm_segment_t oldfs; ++ ++ n = end - ctx->file->f_pos; ++ if (n > sizeof(buf)) ++ n = sizeof(buf); ++ ++ if (ctx->read(buf, n, ctx)) ++ break; ++ if ((p = memchr(buf, 0, n)) != NULL) ++ n = p - buf; ++ oldfs = get_fs(); set_fs(KERNEL_DS); ++ f->f_op->write(f, buf, n, &f->f_pos); ++ set_fs(oldfs); ++ } while (ctx->file->f_pos < end); ++ ++ fput(f); ++ ++ clear_tsk_thread_flag(current,TIF_SIGPENDING); ++ ++ if ((err = sc_waitx(pid, 0)) < 0) ++ eprintk_ctx("wait4: %d\n", err); ++ ++ return 0; ++ ++out: ++ if (pfd[1] >= 0) ++ sc_close(pfd[1]); ++ if (pfd[0] >= 0) ++ sc_close(pfd[0]); ++ return err; ++} ++ ++int rst_restore_net(struct cpt_context *ctx) ++{ ++ int err; ++ ++ err = rst_restore_netdev(ctx); ++ if (!err) ++ err = rst_restore_ifaddr(ctx); ++ if (!err) ++ err = rst_restore_route(ctx); ++ if (!err) ++ err = rst_restore_iptables(ctx); ++ if (!err) ++ err = rst_restore_ip_conntrack(ctx); ++ return err; ++} +diff -upr linux-2.6.16.orig/kernel/cpt/rst_proc.c linux-2.6.16-026test015/kernel/cpt/rst_proc.c +--- linux-2.6.16.orig/kernel/cpt/rst_proc.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/cpt/rst_proc.c 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,604 @@ ++/* ++ * ++ * kernel/cpt/rst_proc.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#include <linux/config.h> ++#include <linux/version.h> ++#include <linux/module.h> ++#include <linux/init.h> ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/slab.h> ++#include <linux/file.h> ++#include <linux/errno.h> ++#include <linux/mm.h> ++#include <linux/proc_fs.h> ++#include <linux/smp_lock.h> ++#include <asm/uaccess.h> ++#include <linux/cpt_ioctl.h> ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++#include "cpt_dump.h" ++#include "cpt_files.h" ++#include "cpt_mm.h" ++#include "cpt_kernel.h" ++ ++MODULE_AUTHOR("Alexey Kuznetsov <alexey@sw.ru>"); ++MODULE_LICENSE("GPL"); ++ ++/* List of contexts and lock protecting the list */ ++static struct list_head cpt_context_list; ++static spinlock_t cpt_context_lock; ++ ++static int proc_read(char *buffer, char **start, off_t offset, ++ int length, int *eof, void *data) ++{ ++ off_t pos = 0; ++ off_t begin = 0; ++ int len = 0; ++ cpt_context_t *ctx; ++ ++ len += sprintf(buffer, "Ctx Id VE State\n"); ++ ++ spin_lock(&cpt_context_lock); ++ ++ list_for_each_entry(ctx, &cpt_context_list, ctx_list) { ++ len += sprintf(buffer+len,"%p %08x %-8u %d", ++ ctx, ++ ctx->contextid, ++ ctx->ve_id, ++ ctx->ctx_state ++ ); ++#ifdef CONFIG_VZ_CHECKPOINT_LAZY ++ len += pagein_info_printf(buffer+len, ctx); ++#endif ++ ++ buffer[len++] = '\n'; ++ ++ pos = begin+len; ++ if (pos < offset) { ++ len = 0; ++ begin = pos; ++ } ++ if (pos > offset+length) ++ goto done; ++ } ++ *eof = 1; ++ ++done: ++ spin_unlock(&cpt_context_lock); ++ *start = buffer + (offset - begin); ++ len -= (offset - begin); ++ if(len > length) ++ len = length; ++ if(len < 0) ++ len = 0; ++ return len; ++} ++ ++void rst_context_release(cpt_context_t *ctx) ++{ ++ list_del(&ctx->ctx_list); ++ spin_unlock(&cpt_context_lock); ++ ++ if (ctx->ctx_state > 0) ++ rst_resume(ctx); ++ ctx->ctx_state = CPT_CTX_ERROR; ++ ++ rst_close_dumpfile(ctx); ++ ++ if (ctx->anonvmas) { ++ int h; ++ for (h = 0; h < CPT_ANONVMA_HSIZE; h++) { ++ while 
(!hlist_empty(&ctx->anonvmas[h])) { ++ struct hlist_node *elem = ctx->anonvmas[h].first; ++ hlist_del(elem); ++ kfree(elem); ++ } ++ } ++ free_page((unsigned long)ctx->anonvmas); ++ } ++ cpt_flush_error(ctx); ++ if (ctx->errorfile) { ++ fput(ctx->errorfile); ++ ctx->errorfile = NULL; ++ } ++ if (ctx->error_msg) { ++ free_page((unsigned long)ctx->error_msg); ++ ctx->error_msg = NULL; ++ } ++#ifdef CONFIG_VZ_CHECKPOINT_LAZY ++ if (ctx->pagein_file_out) ++ fput(ctx->pagein_file_out); ++ if (ctx->pagein_file_in) ++ fput(ctx->pagein_file_in); ++ if (ctx->pgin_task) ++ put_task_struct(ctx->pgin_task); ++#endif ++ if (ctx->filejob_queue) ++ rst_flush_filejobs(ctx); ++ if (ctx->objcount) ++ eprintk_ctx("%d objects leaked\n", ctx->objcount); ++ kfree(ctx); ++ ++ spin_lock(&cpt_context_lock); ++} ++ ++static void __cpt_context_put(cpt_context_t *ctx) ++{ ++ if (!--ctx->refcount) ++ rst_context_release(ctx); ++} ++ ++static void cpt_context_put(cpt_context_t *ctx) ++{ ++ spin_lock(&cpt_context_lock); ++ __cpt_context_put(ctx); ++ spin_unlock(&cpt_context_lock); ++} ++ ++cpt_context_t * rst_context_open(void) ++{ ++ cpt_context_t *ctx; ++ ++ if ((ctx = kmalloc(sizeof(*ctx), GFP_KERNEL)) != NULL) { ++ rst_context_init(ctx); ++ spin_lock(&cpt_context_lock); ++ list_add_tail(&ctx->ctx_list, &cpt_context_list); ++ spin_unlock(&cpt_context_lock); ++ ctx->error_msg = (char*)__get_free_page(GFP_KERNEL); ++ if (ctx->error_msg != NULL) ++ ctx->error_msg[0] = 0; ++ } ++ return ctx; ++} ++ ++void rst_report_error(int err, cpt_context_t *ctx) ++{ ++ if (ctx->statusfile) { ++ mm_segment_t oldfs; ++ int status = 7 /* VZ_ENVCREATE_ERROR */; ++ ++ oldfs = get_fs(); set_fs(KERNEL_DS); ++ if (ctx->statusfile->f_op && ctx->statusfile->f_op->write) ++ ctx->statusfile->f_op->write(ctx->statusfile, (char*)&status, sizeof(status), &ctx->statusfile->f_pos); ++ set_fs(oldfs); ++ fput(ctx->statusfile); ++ ctx->statusfile = NULL; ++ } ++} ++ ++ ++static cpt_context_t * cpt_context_lookup(unsigned int 
ctxid) ++{ ++ cpt_context_t *ctx; ++ ++ spin_lock(&cpt_context_lock); ++ list_for_each_entry(ctx, &cpt_context_list, ctx_list) { ++ if (ctx->contextid == ctxid) { ++ ctx->refcount++; ++ spin_unlock(&cpt_context_lock); ++ return ctx; ++ } ++ } ++ spin_unlock(&cpt_context_lock); ++ return NULL; ++} ++ ++static int rst_ioctl(struct inode * inode, struct file * file, unsigned int cmd, unsigned long arg) ++{ ++ int err = 0; ++ cpt_context_t *ctx; ++ struct file *dfile = NULL; ++ ++ unlock_kernel(); ++ ++ if (cmd == CPT_TEST_CAPS) { ++ err = test_cpu_caps(); ++ goto out_lock; ++ } ++ ++ if (cmd == CPT_JOIN_CONTEXT || cmd == CPT_PUT_CONTEXT) { ++ cpt_context_t *old_ctx; ++ ++ ctx = NULL; ++ if (cmd == CPT_JOIN_CONTEXT) { ++ err = -ENOENT; ++ ctx = cpt_context_lookup(arg); ++ if (!ctx) ++ goto out_lock; ++ } ++ ++ spin_lock(&cpt_context_lock); ++ old_ctx = (cpt_context_t*)file->private_data; ++ file->private_data = ctx; ++ ++ if (old_ctx) { ++ if (cmd == CPT_PUT_CONTEXT && old_ctx->sticky) { ++ old_ctx->sticky = 0; ++ old_ctx->refcount--; ++ } ++ __cpt_context_put(old_ctx); ++ } ++ spin_unlock(&cpt_context_lock); ++ err = 0; ++ goto out_lock; ++ } ++ ++ spin_lock(&cpt_context_lock); ++ ctx = (cpt_context_t*)file->private_data; ++ if (ctx) ++ ctx->refcount++; ++ spin_unlock(&cpt_context_lock); ++ ++ if (!ctx) { ++ cpt_context_t *old_ctx; ++ ++ err = -ENOMEM; ++ ctx = rst_context_open(); ++ if (!ctx) ++ goto out_lock; ++ ++ spin_lock(&cpt_context_lock); ++ old_ctx = (cpt_context_t*)file->private_data; ++ if (!old_ctx) { ++ ctx->refcount++; ++ file->private_data = ctx; ++ } else { ++ old_ctx->refcount++; ++ } ++ if (old_ctx) { ++ __cpt_context_put(ctx); ++ ctx = old_ctx; ++ } ++ spin_unlock(&cpt_context_lock); ++ } ++ ++ if (cmd == CPT_GET_CONTEXT) { ++ unsigned int contextid = (unsigned int)arg; ++ ++ err = -EINVAL; ++ if (ctx->contextid && ctx->contextid != contextid) ++ goto out_nosem; ++ if (!ctx->contextid) { ++ cpt_context_t *c1 = cpt_context_lookup(contextid); ++ if 
(c1) { ++ cpt_context_put(c1); ++ err = -EEXIST; ++ goto out_nosem; ++ } ++ ctx->contextid = contextid; ++ } ++ spin_lock(&cpt_context_lock); ++ if (!ctx->sticky) { ++ ctx->sticky = 1; ++ ctx->refcount++; ++ } ++ spin_unlock(&cpt_context_lock); ++ err = 0; ++ goto out_nosem; ++ } ++ ++ down(&ctx->main_sem); ++ ++ err = -EBUSY; ++ if (ctx->ctx_state < 0) ++ goto out; ++ ++ err = 0; ++ switch (cmd) { ++ case CPT_SET_DUMPFD: ++ if (ctx->ctx_state > 0) { ++ err = -EBUSY; ++ break; ++ } ++ if (arg >= 0) { ++ dfile = fget(arg); ++ if (IS_ERR(dfile)) { ++ err = PTR_ERR(dfile); ++ break; ++ } ++ if (dfile->f_op == NULL || ++ dfile->f_op->read == NULL) { ++ fput(dfile); ++ err = -EBADF; ++ break; ++ } ++ } ++ if (ctx->file) ++ fput(ctx->file); ++ ctx->file = dfile; ++ break; ++#ifdef CONFIG_VZ_CHECKPOINT_LAZY ++ case CPT_SET_PAGEINFDIN: ++ if (ctx->ctx_state > 0) { ++ err = -EBUSY; ++ break; ++ } ++ if (arg >= 0) { ++ dfile = fget(arg); ++ if (IS_ERR(dfile)) { ++ err = PTR_ERR(dfile); ++ break; ++ } ++ } ++ if (ctx->pagein_file_in) ++ fput(ctx->pagein_file_in); ++ ctx->pagein_file_in = dfile; ++ break; ++ case CPT_SET_PAGEINFDOUT: ++ if (ctx->ctx_state > 0) { ++ err = -EBUSY; ++ break; ++ } ++ if (arg >= 0) { ++ dfile = fget(arg); ++ if (IS_ERR(dfile)) { ++ err = PTR_ERR(dfile); ++ break; ++ } ++ } ++ if (ctx->pagein_file_out) ++ fput(ctx->pagein_file_out); ++ ctx->pagein_file_out = dfile; ++ break; ++ case CPT_PAGEIND: ++ err = rst_pageind(ctx); ++ break; ++#endif ++ case CPT_SET_LOCKFD: ++ if (ctx->ctx_state > 0) { ++ err = -EBUSY; ++ break; ++ } ++ if (arg >= 0) { ++ dfile = fget(arg); ++ if (IS_ERR(dfile)) { ++ err = PTR_ERR(dfile); ++ break; ++ } ++ } ++ if (ctx->lockfile) ++ fput(ctx->lockfile); ++ ctx->lockfile = dfile; ++ break; ++ case CPT_SET_STATUSFD: ++ if (ctx->ctx_state > 0) { ++ err = -EBUSY; ++ break; ++ } ++ if (arg >= 0) { ++ dfile = fget(arg); ++ if (IS_ERR(dfile)) { ++ err = PTR_ERR(dfile); ++ break; ++ } ++ } ++ if (ctx->statusfile) ++ 
fput(ctx->statusfile); ++ ctx->statusfile = dfile; ++ break; ++ case CPT_SET_ERRORFD: ++ if (arg >= 0) { ++ dfile = fget(arg); ++ if (IS_ERR(dfile)) { ++ err = PTR_ERR(dfile); ++ break; ++ } ++ } ++ if (ctx->errorfile) ++ fput(ctx->errorfile); ++ ctx->errorfile = dfile; ++ break; ++ case CPT_SET_VEID: ++ if (ctx->ctx_state > 0) { ++ err = -EBUSY; ++ break; ++ } ++ ctx->ve_id = arg; ++ break; ++ case CPT_UNDUMP: ++ if (ctx->ctx_state > 0) { ++ err = -ENOENT; ++ break; ++ } ++ ctx->ctx_state = CPT_CTX_UNDUMPING; ++ err = vps_rst_undump(ctx); ++ if (err) { ++ rst_report_error(err, ctx); ++ if (rst_kill(ctx) == 0) ++ ctx->ctx_state = CPT_CTX_IDLE; ++ } else { ++ ctx->ctx_state = CPT_CTX_UNDUMPED; ++ } ++ break; ++ case CPT_RESUME: ++ if (!ctx->ctx_state) { ++ err = -ENOENT; ++ break; ++ } ++ err = rst_resume(ctx); ++ if (!err) ++ ctx->ctx_state = CPT_CTX_IDLE; ++ break; ++ case CPT_KILL: ++ if (!ctx->ctx_state) { ++ err = -ENOENT; ++ break; ++ } ++ err = rst_kill(ctx); ++ if (!err) ++ ctx->ctx_state = CPT_CTX_IDLE; ++ break; ++ default: ++ err = -EINVAL; ++ break; ++ } ++ ++out: ++ cpt_flush_error(ctx); ++ up(&ctx->main_sem); ++out_nosem: ++ cpt_context_put(ctx); ++out_lock: ++ lock_kernel(); ++ return err; ++} ++ ++static int rst_open(struct inode * inode, struct file * file) ++{ ++ if (!try_module_get(THIS_MODULE)) ++ return -EBUSY; ++ ++ return 0; ++} ++ ++static int rst_release(struct inode * inode, struct file * file) ++{ ++ cpt_context_t *ctx; ++ ++ spin_lock(&cpt_context_lock); ++ ctx = (cpt_context_t*)file->private_data; ++ file->private_data = NULL; ++ if (ctx) ++ __cpt_context_put(ctx); ++ spin_unlock(&cpt_context_lock); ++ ++ ++ module_put(THIS_MODULE); ++ return 0; ++} ++ ++static struct file_operations rst_fops = ++{ ++ .owner = THIS_MODULE, ++ .ioctl = rst_ioctl, ++ .open = rst_open, ++ .release = rst_release, ++}; ++ ++ ++static struct proc_dir_entry *proc_ent; ++extern void *schedule_tail_p; ++extern void schedule_tail_hook(void); ++ ++static struct 
ctl_table_header *ctl_header; ++ ++static ctl_table debug_table[] = { ++ { ++ .ctl_name = 9476, ++ .procname = "rst", ++ .data = &debug_level, ++ .maxlen = sizeof(debug_level), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { .ctl_name = 0 } ++}; ++static ctl_table root_table[] = { ++ { ++ .ctl_name = CTL_DEBUG, ++ .procname = "debug", ++ .mode = 0555, ++ .child = debug_table, ++ }, ++ { .ctl_name = 0 } ++}; ++ ++#ifdef CONFIG_X86_64 ++ ++static void *vzentry_forkret_get(void) ++{ ++ unsigned char *p; ++ ++ p = (unsigned char *)ret_from_fork; ++ return (void *)(*(u32 *)(p + 1) + p + 5); ++} ++ ++static void vzentry_forkret_set(void *data) ++{ ++ unsigned char *p; ++ long offset; ++ ++ p = (unsigned char *)ret_from_fork; ++ offset = (unsigned long)data - (unsigned long)(p + 5); ++ if ((long)(s32)offset != offset) { ++ printk("vzentry_forkret_set: too long hook offset\n"); ++ BUG(); ++ } ++ *(u32 *)(p + 1) = offset; ++} ++#endif ++ ++static int __init init_rst(void) ++{ ++ int err; ++ ++ err = -ENOMEM; ++ ctl_header = register_sysctl_table(root_table, 0); ++ if (!ctl_header) ++ goto err_mon; ++ ++ spin_lock_init(&cpt_context_lock); ++ INIT_LIST_HEAD(&cpt_context_list); ++ ++ err = -EINVAL; ++ proc_ent = create_proc_entry("rst", 0600, NULL); ++ if (!proc_ent) ++ goto err_out; ++ ++ rst_fops.read = proc_ent->proc_fops->read; ++ rst_fops.write = proc_ent->proc_fops->write; ++ rst_fops.llseek = proc_ent->proc_fops->llseek; ++ proc_ent->proc_fops = &rst_fops; ++ ++ proc_ent->read_proc = proc_read; ++ proc_ent->data = NULL; ++ proc_ent->owner = THIS_MODULE; ++#ifdef CONFIG_X86_64 ++ schedule_tail_p = vzentry_forkret_get(); ++ vzentry_forkret_set(&schedule_tail_hook); ++#endif ++ return 0; ++ ++err_out: ++ unregister_sysctl_table(ctl_header); ++err_mon: ++ return err; ++} ++module_init(init_rst); ++ ++static void __exit exit_rst(void) ++{ ++#ifdef CONFIG_X86_64 ++ /* This is wrong, of course. But still the best what we can do. 
*/ ++ vzentry_forkret_set(schedule_tail_p); ++#endif ++ ++ remove_proc_entry("rst", NULL); ++ unregister_sysctl_table(ctl_header); ++ ++ spin_lock(&cpt_context_lock); ++ while (!list_empty(&cpt_context_list)) { ++ cpt_context_t *ctx; ++ ctx = list_entry(cpt_context_list.next, cpt_context_t, ctx_list); ++ ++ if (!ctx->sticky) ++ ctx->refcount++; ++ ctx->sticky = 0; ++ ++ BUG_ON(ctx->refcount != 1); ++ ++ __cpt_context_put(ctx); ++ } ++ spin_unlock(&cpt_context_lock); ++} ++module_exit(exit_rst); +diff -upr linux-2.6.16.orig/kernel/cpt/rst_process.c linux-2.6.16-026test015/kernel/cpt/rst_process.c +--- linux-2.6.16.orig/kernel/cpt/rst_process.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/cpt/rst_process.c 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,1257 @@ ++/* ++ * ++ * kernel/cpt/rst_process.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include <linux/config.h> ++#include <linux/version.h> ++#include <linux/module.h> ++#include <linux/init.h> ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/slab.h> ++#include <linux/file.h> ++#include <linux/mm.h> ++#include <linux/virtinfo.h> ++#include <linux/kmem_cache.h> ++#include <linux/errno.h> ++#include <linux/pagemap.h> ++#include <linux/ptrace.h> ++#include <linux/tty.h> ++#include <asm/desc.h> ++#include <asm/unistd.h> ++ ++#include <ub/beancounter.h> ++#include <ub/ub_misc.h> ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++#include "cpt_files.h" ++#include "cpt_mm.h" ++#include "cpt_ubc.h" ++#include "cpt_process.h" ++#include "cpt_kernel.h" ++ ++#ifdef CONFIG_X86_64 ++ ++#define _TIF_RESUME (1<<22) ++ ++#define SYSCALL_NR(regs) ((regs)->orig_rax) ++#define SYSCALL_RETVAL(regs) ((regs)->rax) ++#define SYSCALL_PC(regs) ((regs)->rip) ++ ++#define ESP(tsk) (tsk)->thread.rsp ++ ++#define __NR32_restart_syscall 0 ++#define __NR32_rt_sigtimedwait 177 ++#define 
__NR32_pause 29 ++#define __NR32_futex 240 ++ ++#define syscall_is(tsk,regs,name) ((!((tsk)->thread_info->flags&_TIF_IA32) && \ ++ SYSCALL_NR(regs) == __NR_##name) || \ ++ (((tsk)->thread_info->flags&_TIF_IA32) && \ ++ SYSCALL_NR(regs) == __NR32_##name)) ++#else ++ ++#define SYSCALL_NR(regs) ((regs)->orig_eax) ++#define SYSCALL_RETVAL(regs) ((regs)->eax) ++#define SYSCALL_PC(regs) ((regs)->eip) ++ ++#define ESP(tsk) (tsk)->thread.esp ++ ++#define syscall_is(tsk,regs,name) (SYSCALL_NR(regs) == __NR_##name) ++ ++#undef task_pt_regs ++#define task_pt_regs(t) ((struct pt_regs *)((t)->thread.esp0) - 1) ++ ++#endif ++ ++static void decode_siginfo(siginfo_t *info, struct cpt_siginfo_image *si) ++{ ++ memset(info, 0, sizeof(*info)); ++ switch(si->cpt_code & __SI_MASK) { ++ case __SI_TIMER: ++ info->si_tid = si->cpt_pid; ++ info->si_overrun = si->cpt_uid; ++ info->_sifields._timer._sigval.sival_ptr = cpt_ptr_import(si->cpt_sigval); ++ info->si_sys_private = si->cpt_utime; ++ break; ++ case __SI_POLL: ++ info->si_band = si->cpt_pid; ++ info->si_fd = si->cpt_uid; ++ break; ++ case __SI_FAULT: ++ info->si_addr = cpt_ptr_import(si->cpt_sigval); ++#ifdef __ARCH_SI_TRAPNO ++ info->si_trapno = si->cpt_pid; ++#endif ++ break; ++ case __SI_CHLD: ++ info->si_pid = si->cpt_pid; ++ info->si_uid = si->cpt_uid; ++ info->si_status = si->cpt_sigval; ++ info->si_stime = si->cpt_stime; ++ info->si_utime = si->cpt_utime; ++ break; ++ case __SI_KILL: ++ case __SI_RT: ++ case __SI_MESGQ: ++ default: ++ info->si_pid = si->cpt_pid; ++ info->si_uid = si->cpt_uid; ++ info->si_ptr = cpt_ptr_import(si->cpt_sigval); ++ break; ++ } ++ info->si_signo = si->cpt_signo; ++ info->si_errno = si->cpt_errno; ++ info->si_code = si->cpt_code; ++} ++ ++static int restore_sigqueue(task_t *tsk, ++ struct sigpending *queue, unsigned long start, ++ unsigned long end) ++{ ++ while (start < end) { ++ struct cpt_siginfo_image *si = (struct cpt_siginfo_image *)start; ++ if (si->cpt_object == CPT_OBJ_SIGINFO) { ++ struct 
sigqueue *q = NULL; ++ struct user_struct *up; ++ up = alloc_uid(si->cpt_user); ++ if (!up) ++ return -ENOMEM; ++ q = kmem_cache_alloc(sigqueue_cachep, GFP_ATOMIC); ++ if (!q) { ++ free_uid(up); ++ return -ENOMEM; ++ } ++ if (ub_siginfo_charge(q, get_exec_ub())) { ++ kmem_cache_free(sigqueue_cachep, q); ++ free_uid(up); ++ return -ENOMEM; ++ } ++ ++ INIT_LIST_HEAD(&q->list); ++ /* Preallocated elements (posix timers) are not ++ * supported yet. It is safe to replace them with ++ * a private one. */ ++ q->flags = 0; ++ q->user = up; ++ atomic_inc(&q->user->sigpending); ++ ++ decode_siginfo(&q->info, si); ++ list_add_tail(&q->list, &queue->list); ++ } ++ start += si->cpt_next; ++ } ++ return 0; ++} ++ ++int rst_process_linkage(cpt_context_t *ctx) ++{ ++ cpt_object_t *obj; ++ ++ for_each_object(obj, CPT_OBJ_TASK) { ++ task_t *tsk = obj->o_obj; ++ struct cpt_task_image *ti = obj->o_image; ++ ++ if (tsk == NULL) { ++ eprintk_ctx("task %u(%s) is missing\n", ti->cpt_pid, ti->cpt_comm); ++ return -EINVAL; ++ } ++ ++ if (virt_pgid(tsk) != ti->cpt_pgrp) { ++ int pid; ++ ++ if ((pid = vpid_to_pid(ti->cpt_pgrp)) < 0) { ++ eprintk_ctx("illegal PGRP " CPT_FID "\n", CPT_TID(tsk)); ++ return -EINVAL; ++ } ++ ++ write_lock_irq(&tasklist_lock); ++ detach_pid(tsk, PIDTYPE_PGID); ++ tsk->signal->pgrp = pid; ++ set_virt_pgid(tsk, ti->cpt_pgrp); ++ if (thread_group_leader(tsk)) ++ attach_pid(tsk, PIDTYPE_PGID, pid); ++ write_unlock_irq(&tasklist_lock); ++ } ++ if (virt_sid(tsk) != ti->cpt_session) { ++ int pid; ++ ++ if ((pid = vpid_to_pid(ti->cpt_session)) < 0) { ++ eprintk_ctx("illegal SID " CPT_FID "\n", CPT_TID(tsk)); ++ return -EINVAL; ++ } ++ ++ write_lock_irq(&tasklist_lock); ++ detach_pid(tsk, PIDTYPE_SID); ++ tsk->signal->session = pid; ++ set_virt_sid(tsk, ti->cpt_session); ++ if (thread_group_leader(tsk)) ++ attach_pid(tsk, PIDTYPE_SID, pid); ++ write_unlock_irq(&tasklist_lock); ++ } ++ if (ti->cpt_old_pgrp > 0 && tsk->signal->tty_old_pgrp == 0) { ++ int pid; ++ ++ if ((pid = 
vpid_to_pid(ti->cpt_old_pgrp)) < 0) { ++ eprintk_ctx("illegal OLD_PGRP " CPT_FID "\n", CPT_TID(tsk)); ++ return -EINVAL; ++ } ++ ++ tsk->signal->tty_old_pgrp = pid; ++ } ++ } ++ ++ return 0; ++} ++ ++static int restore_one_signal_struct(struct cpt_task_image *ti, struct cpt_context *ctx) ++{ ++ int err; ++ struct cpt_signal_image *si = cpt_get_buf(ctx); ++ ++ current->signal->tty = NULL; ++ ++ err = rst_get_object(CPT_OBJ_SIGNAL_STRUCT, ti->cpt_signal, si, ctx); ++ if (err) { ++ cpt_release_buf(ctx); ++ return err; ++ } ++ ++ if (virt_pgid(current) != si->cpt_pgrp) { ++ int err; ++ int pid = 0; ++ ++ if (si->cpt_pgrp_type == CPT_PGRP_ORPHAN) { ++ pid = alloc_pidmap(); ++ if (pid < 0) { ++ cpt_release_buf(ctx); ++ return -EINVAL; ++ } ++ if ((err = alloc_vpid(pid, si->cpt_pgrp)) < 0) { ++ free_pidmap(pid); ++ pid = 0; ++ if (err != -EEXIST) { ++ cpt_release_buf(ctx); ++ return err; ++ } ++ } ++ } ++ if (pid || ++ (pid = vpid_to_pid(si->cpt_pgrp)) > 0) { ++ write_lock_irq(&tasklist_lock); ++ detach_pid(current, PIDTYPE_PGID); ++ current->signal->pgrp = pid; ++ set_virt_pgid(current, si->cpt_pgrp); ++ if (thread_group_leader(current)) ++ attach_pid(current, PIDTYPE_PGID, pid); ++ write_unlock_irq(&tasklist_lock); ++ } ++ } ++ ++ current->signal->tty_old_pgrp = 0; ++ if ((int)si->cpt_old_pgrp > 0) { ++ if (si->cpt_old_pgrp_type == CPT_PGRP_STRAY) { ++ current->signal->tty_old_pgrp = alloc_pidmap(); ++ if (current->signal->tty_old_pgrp < 0) { ++ eprintk_ctx("failed to allocate stray tty_old_pgrp\n"); ++ cpt_release_buf(ctx); ++ return -EINVAL; ++ } ++ free_pidmap(current->signal->tty_old_pgrp); ++ } else { ++ current->signal->tty_old_pgrp = vpid_to_pid(si->cpt_old_pgrp); ++ if (current->signal->tty_old_pgrp < 0) { ++ dprintk_ctx("forward old tty PGID\n"); ++ current->signal->tty_old_pgrp = 0; ++ } ++ } ++ } ++ ++ if (virt_sid(current) != si->cpt_session) { ++ int err; ++ int pid = 0; ++ ++ if (si->cpt_session_type == CPT_PGRP_ORPHAN) { ++ pid = alloc_pidmap(); ++ if 
(pid < 0) { ++ cpt_release_buf(ctx); ++ return -EINVAL; ++ } ++ if ((err = alloc_vpid(pid, si->cpt_session)) < 0) { ++ free_pidmap(pid); ++ pid = 0; ++ if (err != -EEXIST) { ++ cpt_release_buf(ctx); ++ return err; ++ } ++ } ++ } ++ if (pid || ++ (pid = vpid_to_pid(si->cpt_session)) > 0) { ++ write_lock_irq(&tasklist_lock); ++ detach_pid(current, PIDTYPE_SID); ++ set_virt_sid(current, si->cpt_session); ++ current->signal->session = pid; ++ if (thread_group_leader(current)) ++ attach_pid(current, PIDTYPE_SID, pid); ++ write_unlock_irq(&tasklist_lock); ++ } ++ } ++ ++ cpt_sigset_import(¤t->signal->shared_pending.signal, si->cpt_sigpending); ++ current->signal->leader = si->cpt_leader; ++ if (si->cpt_ctty != CPT_NULL) { ++ cpt_object_t *obj = lookup_cpt_obj_bypos(CPT_OBJ_TTY, si->cpt_ctty, ctx); ++ if (obj) { ++ struct tty_struct *tty = obj->o_obj; ++ if (tty->session == 0 || tty->session == current->signal->session) { ++ tty->session = current->signal->session; ++ current->signal->tty = tty; ++ } else { ++ wprintk_ctx("tty session mismatch\n"); ++ } ++ } ++ } ++ ++ if (si->cpt_curr_target) ++ current->signal->curr_target = find_task_by_pid_ve(si->cpt_curr_target); ++ current->signal->flags = 0; ++ if (si->cpt_group_exit) ++ current->signal->flags |= SIGNAL_GROUP_EXIT; ++ current->signal->group_exit_code = si->cpt_group_exit_code; ++ if (si->cpt_group_exit_task) { ++ current->signal->group_exit_task = find_task_by_pid_ve(si->cpt_group_exit_task); ++ if (current->signal->group_exit_task == NULL) { ++ eprintk_ctx("oops, group_exit_task=NULL, pid=%u\n", si->cpt_group_exit_task); ++ cpt_release_buf(ctx); ++ return -EINVAL; ++ } ++ } ++ current->signal->notify_count = si->cpt_notify_count; ++ current->signal->group_stop_count = si->cpt_group_stop_count; ++ ++ if (si->cpt_next > si->cpt_hdrlen) { ++ char *buf = kmalloc(si->cpt_next - si->cpt_hdrlen, GFP_KERNEL); ++ if (buf == NULL) { ++ cpt_release_buf(ctx); ++ return -ENOMEM; ++ } ++ err = ctx->pread(buf, si->cpt_next - 
si->cpt_hdrlen, ctx, ++ ti->cpt_signal + si->cpt_hdrlen); ++ if (err) { ++ kfree(buf); ++ cpt_release_buf(ctx); ++ return err; ++ } ++ restore_sigqueue(current, ++ ¤t->signal->shared_pending, (unsigned long)buf, ++ (unsigned long)buf + si->cpt_next - si->cpt_hdrlen); ++ kfree(buf); ++ } ++ cpt_release_buf(ctx); ++ return 0; ++} ++ ++int restore_one_sighand_struct(struct cpt_task_image *ti, struct cpt_context *ctx) ++{ ++ int err; ++ struct cpt_sighand_image si; ++ int i; ++ loff_t pos, endpos; ++ ++ err = rst_get_object(CPT_OBJ_SIGHAND_STRUCT, ti->cpt_sighand, &si, ctx); ++ if (err) ++ return err; ++ ++ for (i=0; i<_NSIG; i++) { ++ current->sighand->action[i].sa.sa_handler = SIG_DFL; ++ current->sighand->action[i].sa.sa_restorer = 0; ++ current->sighand->action[i].sa.sa_flags = SA_ONESHOT | SA_NOMASK; ++ memset(¤t->sighand->action[i].sa.sa_mask, 0, sizeof(sigset_t)); ++ } ++ ++ pos = ti->cpt_sighand + si.cpt_hdrlen; ++ endpos = ti->cpt_sighand + si.cpt_next; ++ while (pos < endpos) { ++ struct cpt_sighandler_image shi; ++ ++ err = rst_get_object(CPT_OBJ_SIGHANDLER, pos, &shi, ctx); ++ if (err) ++ return err; ++ current->sighand->action[shi.cpt_signo].sa.sa_handler = (void*)(unsigned long)shi.cpt_handler; ++ current->sighand->action[shi.cpt_signo].sa.sa_restorer = (void*)(unsigned long)shi.cpt_restorer; ++ current->sighand->action[shi.cpt_signo].sa.sa_flags = shi.cpt_flags; ++ cpt_sigset_import(¤t->sighand->action[shi.cpt_signo].sa.sa_mask, shi.cpt_mask); ++ pos += shi.cpt_next; ++ } ++ ++ return 0; ++} ++ ++ ++__u32 rst_signal_flag(struct cpt_task_image *ti, struct cpt_context *ctx) ++{ ++ __u32 flag = 0; ++ ++ if (lookup_cpt_obj_bypos(CPT_OBJ_SIGNAL_STRUCT, ti->cpt_signal, ctx)) ++ flag |= CLONE_THREAD; ++ if (ti->cpt_sighand == CPT_NULL || ++ lookup_cpt_obj_bypos(CPT_OBJ_SIGHAND_STRUCT, ti->cpt_sighand, ctx)) ++ flag |= CLONE_SIGHAND; ++ return flag; ++} ++ ++int rst_signal_complete(struct cpt_task_image *ti, struct cpt_context *ctx) ++{ ++ int err; ++ 
cpt_object_t *obj; ++ ++ if (ti->cpt_signal == CPT_NULL || ti->cpt_sighand == CPT_NULL) { ++ return -EINVAL; ++ } ++ ++ obj = lookup_cpt_obj_bypos(CPT_OBJ_SIGHAND_STRUCT, ti->cpt_sighand, ctx); ++ if (obj) { ++ struct sighand_struct *sig = current->sighand; ++ if (obj->o_obj != sig) { ++ return -EINVAL; ++ } ++ } else { ++ obj = cpt_object_add(CPT_OBJ_SIGHAND_STRUCT, current->sighand, ctx); ++ if (obj == NULL) ++ return -ENOMEM; ++ cpt_obj_setpos(obj, ti->cpt_sighand, ctx); ++ err = restore_one_sighand_struct(ti, ctx); ++ if (err) ++ return err; ++ } ++ ++ ++ obj = lookup_cpt_obj_bypos(CPT_OBJ_SIGNAL_STRUCT, ti->cpt_signal, ctx); ++ if (obj) { ++ struct signal_struct *sig = current->signal; ++ if (obj->o_obj != sig) { ++ return -EINVAL; ++ } ++ if (current->signal) { ++ set_virt_pgid(current, pid_type_to_vpid(PIDTYPE_PGID, current->signal->pgrp)); ++ set_virt_sid(current, pid_type_to_vpid(PIDTYPE_SID, current->signal->session)); ++ } ++ } else { ++ obj = cpt_object_add(CPT_OBJ_SIGNAL_STRUCT, current->signal, ctx); ++ if (obj == NULL) ++ return -ENOMEM; ++ cpt_obj_setpos(obj, ti->cpt_signal, ctx); ++ err = restore_one_signal_struct(ti, ctx); ++ if (err) ++ return err; ++ } ++ ++ return 0; ++} ++ ++static u32 decode_segment(u32 segid) ++{ ++ if (segid == CPT_SEG_ZERO) ++ return 0; ++ ++ /* TLS descriptors */ ++ if (segid <= CPT_SEG_TLS3) ++ return ((GDT_ENTRY_TLS_MIN + segid-CPT_SEG_TLS1)<<3) + 3; ++ ++ /* LDT descriptor, it is just an index to LDT array */ ++ if (segid >= CPT_SEG_LDT) ++ return ((segid - CPT_SEG_LDT) << 3) | 7; ++ ++ /* Check for one of standard descriptors */ ++#ifdef CONFIG_X86_64 ++ if (segid == CPT_SEG_USER32_DS) ++ return __USER32_DS; ++ if (segid == CPT_SEG_USER32_CS) ++ return __USER32_CS; ++ if (segid == CPT_SEG_USER64_DS) ++ return __USER_DS; ++ if (segid == CPT_SEG_USER64_CS) ++ return __USER_CS; ++#else ++ if (segid == CPT_SEG_USER32_DS) ++ return __USER_DS; ++ if (segid == CPT_SEG_USER32_CS) ++ return __USER_CS; ++#endif ++ 
wprintk("Invalid segment reg %d\n", segid); ++ return 0; ++} ++ ++unsigned long rct(unsigned long *child_tids) ++{ ++ dprintk("rct: " CPT_FID "\n", CPT_TID(current)); ++ current->clear_child_tid = (void*)child_tids[0]; ++ current->set_child_tid = (void*)child_tids[1]; ++ module_put(THIS_MODULE); ++ return (unsigned long)(child_tids+2); ++} ++ ++unsigned long rlsi(void) ++{ ++ int signr; ++ siginfo_t *info = current->last_siginfo; ++ struct pt_regs *regs = task_pt_regs(current); ++ struct k_sigaction *ka; ++ int ptrace_id; ++ ++ dprintk("rlsi: " CPT_FID "\n", CPT_TID(current)); ++ ++ spin_lock_irq(¤t->sighand->siglock); ++ current->last_siginfo = NULL; ++ recalc_sigpending(); ++ ++ ptrace_id = current->pn_state; ++ clear_pn_state(current); ++ ++ switch (ptrace_id) { ++ case PN_STOP_TF: ++ case PN_STOP_TF_RT: ++ /* frame_*signal */ ++ dprintk("SIGTRAP %u/%u(%s) %u/%u %u %ld %lu %lu\n", ++ virt_pid(current), current->pid, current->comm, ++ info->si_signo, info->si_code, ++ current->exit_code, SYSCALL_NR(regs), ++ current->ptrace, current->ptrace_message); ++ goto out; ++ case PN_STOP_ENTRY: ++ case PN_STOP_LEAVE: ++ /* do_syscall_trace */ ++ spin_unlock_irq(¤t->sighand->siglock); ++ dprintk("ptrace do_syscall_trace: %d %d\n", ptrace_id, current->exit_code); ++ if (current->exit_code) { ++ send_sig(current->exit_code, current, 1); ++ current->exit_code = 0; ++ } ++ if (ptrace_id == PN_STOP_ENTRY && SYSCALL_RETVAL(regs) == -ENOSYS) { ++ SYSCALL_RETVAL(regs) = SYSCALL_NR(regs); ++ SYSCALL_PC(regs) -= 2; ++ } else if (syscall_is(current, regs, rt_sigtimedwait)) { ++ if (SYSCALL_RETVAL(regs) == -EAGAIN || SYSCALL_RETVAL(regs) == -EINTR) { ++ SYSCALL_RETVAL(regs) = SYSCALL_NR(regs); ++ SYSCALL_PC(regs) -= 2; ++ } ++ } ++ goto out_nolock; ++ case PN_STOP_FORK: ++ /* fork */ ++ SYSCALL_RETVAL(regs) = current->ptrace_message; ++ dprintk("ptrace fork returns pid %ld\n", SYSCALL_RETVAL(regs)); ++ goto out; ++ case PN_STOP_VFORK: ++ /* after vfork */ ++ SYSCALL_RETVAL(regs) = 
current->ptrace_message; ++ dprintk("ptrace after vfork returns pid %ld\n", SYSCALL_RETVAL(regs)); ++ goto out; ++ case PN_STOP_SIGNAL: ++ /* normal case : dequeue signal */ ++ break; ++ case PN_STOP_EXIT: ++ dprintk("ptrace exit caught\n"); ++ current->ptrace &= ~PT_TRACE_EXIT; ++ spin_unlock_irq(¤t->sighand->siglock); ++ module_put(THIS_MODULE); ++ complete_and_exit(NULL, current->ptrace_message); ++ BUG(); ++ case PN_STOP_EXEC: ++ eprintk("ptrace after exec caught: must not happen\n"); ++ BUG(); ++ default: ++ eprintk("ptrace with unknown identity %d\n", ptrace_id); ++ BUG(); ++ } ++ ++ signr = current->exit_code; ++ if (signr == 0) { ++ dprintk("rlsi: canceled signal %d\n", info->si_signo); ++ goto out; ++ } ++ current->exit_code = 0; ++ ++ if (signr != info->si_signo) { ++ info->si_signo = signr; ++ info->si_errno = 0; ++ info->si_code = SI_USER; ++ info->si_pid = virt_pid(current->parent); ++ info->si_uid = current->parent->uid; ++ } ++ ++ /* If the (new) signal is now blocked, requeue it. */ ++ if (sigismember(¤t->blocked, signr)) { ++ dprintk("going to requeue signal %d\n", signr); ++ goto out_resend_sig; ++ } ++ ++ ka = ¤t->sighand->action[signr-1]; ++ if (ka->sa.sa_handler == SIG_IGN) { ++ dprintk("going to resend signal %d (ignored)\n", signr); ++ goto out; ++ } ++ if (ka->sa.sa_handler != SIG_DFL) { ++ dprintk("going to resend signal %d (not SIG_DFL)\n", signr); ++ goto out_resend_sig; ++ } ++ if (signr == SIGCONT || ++ signr == SIGCHLD || ++ signr == SIGWINCH || ++ signr == SIGURG || ++ current->pid == 1) ++ goto out; ++ ++ /* All the rest, which we cannot handle are requeued. 
*/ ++ dprintk("going to resend signal %d (sigh)\n", signr); ++out_resend_sig: ++ spin_unlock_irq(¤t->sighand->siglock); ++ send_sig_info(signr, info, current); ++ module_put(THIS_MODULE); ++ return (unsigned long)(info+1); ++ ++out: ++ spin_unlock_irq(¤t->sighand->siglock); ++out_nolock: ++ module_put(THIS_MODULE); ++ return (unsigned long)(info+1); ++} ++ ++static void ret_finish_stop(void) ++{ ++ /* ... ++ * do_signal() -> ++ * get_signal_to_deliver() -> ++ * do_signal_stop() -> ++ * finish_stop() ++ * ++ * Normally after SIGCONT it will dequeue the next signal. If no signal ++ * is found, do_signal restarts syscall unconditionally. ++ * Otherwise signal handler is pushed on user stack. ++ */ ++ ++ dprintk("rfs: " CPT_FID "\n", CPT_TID(current)); ++ ++ clear_stop_state(current); ++ current->exit_code = 0; ++ ++ module_put(THIS_MODULE); ++} ++ ++static void ret_restart_sys(void) ++{ ++ struct pt_regs *regs = task_pt_regs(current); ++ ++ /* This hook is supposed to be executed, when we have ++ * to complete some interrupted syscall. 
++ */ ++ dprintk("rrs: " CPT_FID "\n", CPT_TID(current)); ++ ++ if (syscall_is(current,regs,pause)) { ++ if (SYSCALL_RETVAL(regs) == -ERESTARTNOHAND) { ++ current->state = TASK_INTERRUPTIBLE; ++ schedule(); ++ } ++ } else if (syscall_is(current,regs,rt_sigtimedwait)) { ++ if (SYSCALL_RETVAL(regs) == -EAGAIN || SYSCALL_RETVAL(regs) == -EINTR) { ++ SYSCALL_RETVAL(regs) = SYSCALL_NR(regs); ++ SYSCALL_PC(regs) -= 2; ++ } ++ } else if (syscall_is(current,regs,futex)) { ++ if (SYSCALL_RETVAL(regs) == -EINTR) { ++ SYSCALL_RETVAL(regs) = SYSCALL_NR(regs); ++ SYSCALL_PC(regs) -= 2; ++ } ++ } ++ ++ if (!signal_pending(current)) { ++ if (SYSCALL_RETVAL(regs) == -ERESTARTSYS || ++ SYSCALL_RETVAL(regs) == -ERESTARTNOINTR || ++ SYSCALL_RETVAL(regs) == -ERESTARTNOHAND) { ++ SYSCALL_RETVAL(regs) = SYSCALL_NR(regs); ++ SYSCALL_PC(regs) -= 2; ++ } else if (SYSCALL_RETVAL(regs) == -ERESTART_RESTARTBLOCK) { ++ SYSCALL_RETVAL(regs) = __NR_restart_syscall; ++#ifdef CONFIG_X86_64 ++ if (current->thread_info->flags&_TIF_IA32) ++ SYSCALL_RETVAL(regs) = __NR32_restart_syscall; ++#endif ++ SYSCALL_PC(regs) -= 2; ++ } ++ } ++ ++ module_put(THIS_MODULE); ++} ++ ++extern void ret_last_siginfo(void); ++extern void ret_child_tid(void); ++extern void ret_from_rst(void); ++extern void pre_ret_from_fork(void); ++ ++#ifndef CONFIG_X86_64 ++ ++/* tsk->thread.eip points to pre_ret_from_fork ++ * Stack layout: ++ * [eip of the last hook] ++ * [args of the last hook] ++ * [eip of previous hook] ++ * [args of previous hook] ++ * ... 
++ * [eip of the first hook] ++ * [args of the first hook] ++ * [ret_from_rst] ++ */ ++ ++static void * add_hook(task_t *tsk, void (*hook)(void), int argsize, int *hooks) ++{ ++ ESP(tsk) -= sizeof(unsigned long); ++ *(unsigned long*)ESP(tsk) = tsk->thread.eip; ++ ESP(tsk) -= argsize; ++ tsk->thread.eip = (unsigned long)hook; ++ if (!try_module_get(THIS_MODULE)) BUG(); ++ (*hooks)++; ++ return (void*)ESP(tsk); ++} ++ ++static int restore_registers(task_t *tsk, struct pt_regs *regs, ++ struct cpt_task_image *ti, struct cpt_x86_regs *b) ++{ ++ if (b->cpt_object != CPT_OBJ_X86_REGS) ++ return -EINVAL; ++ ++ tsk->thread.esp = (unsigned long) regs; ++ tsk->thread.esp0 = (unsigned long) (regs+1); ++ tsk->thread.eip = (unsigned long) ret_from_rst; ++ ++ tsk->thread.fs = decode_segment(b->cpt_fs); ++ tsk->thread.gs = decode_segment(b->cpt_gs); ++ tsk->thread.debugreg[0] = b->cpt_debugreg[0]; ++ tsk->thread.debugreg[1] = b->cpt_debugreg[1]; ++ tsk->thread.debugreg[2] = b->cpt_debugreg[2]; ++ tsk->thread.debugreg[3] = b->cpt_debugreg[3]; ++ tsk->thread.debugreg[4] = b->cpt_debugreg[4]; ++ tsk->thread.debugreg[5] = b->cpt_debugreg[5]; ++ tsk->thread.debugreg[6] = b->cpt_debugreg[6]; ++ tsk->thread.debugreg[7] = b->cpt_debugreg[7]; ++ ++ memcpy(regs, &b->cpt_ebx, sizeof(struct pt_regs)); ++ ++ regs->xcs = decode_segment(b->cpt_xcs); ++ regs->xss = decode_segment(b->cpt_xss); ++ regs->xds = decode_segment(b->cpt_xds); ++ regs->xes = decode_segment(b->cpt_xes); ++ ++ return 0; ++} ++ ++#else ++ ++/* Stack layout: ++ * ++ * [eip of the last hook] ++ * [args of the last hook] ++ * ... 
++ * [eip of the first hook] ++ * [args of the first hook] ++ * [ret_from_fork+5] ++ */ ++ ++static void * add_hook(task_t *tsk, void (*hook)(void), int argsize, int *hooks) ++{ ++ if (!*hooks) { ++ extern void ret_from_fork2(void); ++ ESP(tsk) -= sizeof(unsigned long); ++ *(unsigned long*)ESP(tsk) = (unsigned long)ret_from_fork2; ++ tsk->thread_info->flags |= _TIF_RESUME; ++ } ++ ESP(tsk) -= argsize + sizeof(unsigned long); ++ *(unsigned long*)ESP(tsk) = (unsigned long)hook; ++ if (!try_module_get(THIS_MODULE)) BUG(); ++ (*hooks)++; ++ return (void*)(ESP(tsk) + sizeof(unsigned long)); ++} ++ ++static void xlate_ptregs_32_to_64(struct pt_regs *d, struct cpt_x86_regs *s) ++{ ++ memset(d, 0, sizeof(struct pt_regs)); ++ d->rbp = s->cpt_ebp; ++ d->rbx = s->cpt_ebx; ++ d->rax = (s32)s->cpt_eax; ++ d->rcx = s->cpt_ecx; ++ d->rdx = s->cpt_edx; ++ d->rsi = s->cpt_esi; ++ d->rdi = s->cpt_edi; ++ d->orig_rax = (s32)s->cpt_orig_eax; ++ d->rip = s->cpt_eip; ++ d->cs = s->cpt_xcs; ++ d->eflags = s->cpt_eflags; ++ d->rsp = s->cpt_esp; ++ d->ss = s->cpt_xss; ++} ++ ++static int restore_registers(task_t *tsk, struct pt_regs *regs, ++ struct cpt_task_image *ti, struct cpt_obj_bits *hdr) ++{ ++ if (hdr->cpt_object == CPT_OBJ_X86_64_REGS) { ++ struct cpt_x86_64_regs *b = (void*)hdr; ++ ++ tsk->thread.rsp = (unsigned long) regs; ++ tsk->thread.rsp0 = (unsigned long) (regs+1); ++ ++ tsk->thread.fs = b->cpt_fsbase; ++ tsk->thread.gs = b->cpt_gsbase; ++ tsk->thread.fsindex = decode_segment(b->cpt_fsindex); ++ tsk->thread.gsindex = decode_segment(b->cpt_gsindex); ++ tsk->thread.ds = decode_segment(b->cpt_ds); ++ tsk->thread.es = decode_segment(b->cpt_es); ++ tsk->thread.debugreg0 = b->cpt_debugreg[0]; ++ tsk->thread.debugreg1 = b->cpt_debugreg[1]; ++ tsk->thread.debugreg2 = b->cpt_debugreg[2]; ++ tsk->thread.debugreg3 = b->cpt_debugreg[3]; ++ tsk->thread.debugreg6 = b->cpt_debugreg[6]; ++ tsk->thread.debugreg7 = b->cpt_debugreg[7]; ++ ++ memcpy(regs, &b->cpt_r15, sizeof(struct pt_regs)); 
++ ++ tsk->thread.userrsp = regs->rsp; ++ regs->cs = decode_segment(b->cpt_cs); ++ regs->ss = decode_segment(b->cpt_ss); ++ } else if (hdr->cpt_object == CPT_OBJ_X86_REGS) { ++ struct cpt_x86_regs *b = (void*)hdr; ++ ++ tsk->thread.rsp = (unsigned long) regs; ++ tsk->thread.rsp0 = (unsigned long) (regs+1); ++ ++ tsk->thread.fs = 0; ++ tsk->thread.gs = 0; ++ tsk->thread.fsindex = decode_segment(b->cpt_fs); ++ tsk->thread.gsindex = decode_segment(b->cpt_gs); ++ tsk->thread.debugreg0 = b->cpt_debugreg[0]; ++ tsk->thread.debugreg1 = b->cpt_debugreg[1]; ++ tsk->thread.debugreg2 = b->cpt_debugreg[2]; ++ tsk->thread.debugreg3 = b->cpt_debugreg[3]; ++ tsk->thread.debugreg6 = b->cpt_debugreg[6]; ++ tsk->thread.debugreg7 = b->cpt_debugreg[7]; ++ ++ xlate_ptregs_32_to_64(regs, b); ++ ++ tsk->thread.userrsp = regs->rsp; ++ regs->cs = decode_segment(b->cpt_xcs); ++ regs->ss = decode_segment(b->cpt_xss); ++ tsk->thread.ds = decode_segment(b->cpt_xds); ++ tsk->thread.es = decode_segment(b->cpt_xes); ++ } else { ++ return -EINVAL; ++ } ++ return 0; ++} ++ ++#endif ++ ++int rst_restore_process(struct cpt_context *ctx) ++{ ++ cpt_object_t *obj; ++ ++ for_each_object(obj, CPT_OBJ_TASK) { ++ task_t *tsk = obj->o_obj; ++ struct cpt_task_image *ti = obj->o_image; ++ struct pt_regs * regs; ++ struct cpt_object_hdr *b; ++ struct cpt_siginfo_image *lsi = NULL; ++ struct group_info *gids, *ogids; ++ int hooks = 0; ++ int i; ++ ++ if (tsk == NULL) { ++ eprintk_ctx("oops, task %d/%s is missing\n", ti->cpt_pid, ti->cpt_comm); ++ return -EFAULT; ++ } ++ ++ wait_task_inactive(tsk); ++ regs = task_pt_regs(tsk); ++ ++ if (!tsk->exit_state) { ++ tsk->lock_depth = -1; ++#ifdef CONFIG_PREEMPT ++ tsk->thread_info->preempt_count--; ++#endif ++ } ++ ++ if (tsk->static_prio != ti->cpt_static_prio) ++ set_user_nice(tsk, PRIO_TO_NICE(ti->cpt_static_prio)); ++ ++ cpt_sigset_import(&tsk->blocked, ti->cpt_sigblocked); ++ cpt_sigset_import(&tsk->real_blocked, ti->cpt_sigrblocked); ++ 
cpt_sigset_import(&tsk->saved_sigmask, ti->cpt_sigsuspend_blocked); ++ cpt_sigset_import(&tsk->pending.signal, ti->cpt_sigpending); ++ ++ tsk->uid = ti->cpt_uid; ++ tsk->euid = ti->cpt_euid; ++ tsk->suid = ti->cpt_suid; ++ tsk->fsuid = ti->cpt_fsuid; ++ tsk->gid = ti->cpt_gid; ++ tsk->egid = ti->cpt_egid; ++ tsk->sgid = ti->cpt_sgid; ++ tsk->fsgid = ti->cpt_fsgid; ++ memcpy(&tsk->cap_effective, &ti->cpt_ecap, sizeof(tsk->cap_effective)); ++ memcpy(&tsk->cap_inheritable, &ti->cpt_icap, sizeof(tsk->cap_inheritable)); ++ memcpy(&tsk->cap_permitted, &ti->cpt_pcap, sizeof(tsk->cap_permitted)); ++ tsk->keep_capabilities = (ti->cpt_keepcap != 0); ++ tsk->did_exec = (ti->cpt_did_exec != 0); ++ gids = groups_alloc(ti->cpt_ngids); ++ ogids = tsk->group_info; ++ if (gids) { ++ int i; ++ for (i=0; i<32; i++) ++ gids->small_block[i] = ti->cpt_gids[i]; ++ tsk->group_info = gids; ++ } ++ if (ogids) ++ put_group_info(ogids); ++ tsk->utime = ti->cpt_utime; ++ tsk->stime = ti->cpt_stime; ++ if (ctx->image_version == 0) { ++ tsk->start_time = _ns_to_timespec(ti->cpt_starttime*TICK_NSEC); ++ } else { ++ cpt_timespec_import(&tsk->start_time, ti->cpt_starttime); ++ } ++ _set_normalized_timespec(&tsk->start_time, ++ tsk->start_time.tv_sec - ++ get_exec_env()->init_entry->start_time.tv_sec, ++ tsk->start_time.tv_nsec - ++ get_exec_env()->init_entry->start_time.tv_nsec); ++ ++ tsk->nvcsw = ti->cpt_nvcsw; ++ tsk->nivcsw = ti->cpt_nivcsw; ++ tsk->min_flt = ti->cpt_min_flt; ++ tsk->maj_flt = ti->cpt_maj_flt; ++ ++#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,8) ++ tsk->cutime = ti->cpt_cutime; ++ tsk->cstime = ti->cpt_cstime; ++ tsk->cnvcsw = ti->cpt_cnvcsw; ++ tsk->cnivcsw = ti->cpt_cnivcsw; ++ tsk->cmin_flt = ti->cpt_cmin_flt; ++ tsk->cmaj_flt = ti->cpt_cmaj_flt; ++ ++ if (RLIM_NLIMITS > CPT_RLIM_NLIMITS) ++ __asm__("undefined\n"); ++ ++ for (i=0; i<RLIM_NLIMITS; i++) { ++ tsk->rlim[i].rlim_cur = ti->cpt_rlim_cur[i]; ++ tsk->rlim[i].rlim_max = ti->cpt_rlim_max[i]; ++ } ++#else ++ if 
(thread_group_leader(tsk) && tsk->signal) { ++ tsk->signal->utime = ti->cpt_utime; ++ tsk->signal->stime = ti->cpt_stime; ++ tsk->signal->cutime = ti->cpt_cutime; ++ tsk->signal->cstime = ti->cpt_cstime; ++ tsk->signal->nvcsw = ti->cpt_nvcsw; ++ tsk->signal->nivcsw = ti->cpt_nivcsw; ++ tsk->signal->cnvcsw = ti->cpt_cnvcsw; ++ tsk->signal->cnivcsw = ti->cpt_cnivcsw; ++ tsk->signal->min_flt = ti->cpt_min_flt; ++ tsk->signal->maj_flt = ti->cpt_maj_flt; ++ tsk->signal->cmin_flt = ti->cpt_cmin_flt; ++ tsk->signal->cmaj_flt = ti->cpt_cmaj_flt; ++ ++ if (RLIM_NLIMITS > CPT_RLIM_NLIMITS) ++ __asm__("undefined\n"); ++ ++ for (i=0; i<RLIM_NLIMITS; i++) { ++ tsk->signal->rlim[i].rlim_cur = ti->cpt_rlim_cur[i]; ++ tsk->signal->rlim[i].rlim_max = ti->cpt_rlim_max[i]; ++ } ++ } ++#endif ++ ++ for (i=0; i<3; i++) { ++ if (i >= GDT_ENTRY_TLS_ENTRIES) { ++ eprintk_ctx("too many tls descs\n"); ++ } else { ++#ifndef CONFIG_X86_64 ++ tsk->thread.tls_array[i].a = ti->cpt_tls[i]&0xFFFFFFFF; ++ tsk->thread.tls_array[i].b = ti->cpt_tls[i]>>32; ++#else ++ tsk->thread.tls_array[i] = ti->cpt_tls[i]; ++#endif ++ } ++ } ++ ++ clear_stopped_child_used_math(tsk); ++ ++ b = (void *)(ti+1); ++ while ((void*)b < ((void*)ti) + ti->cpt_next) { ++ /* Siginfo objects are at the end of obj array */ ++ if (b->cpt_object == CPT_OBJ_SIGINFO) { ++ struct ve_struct *env = set_exec_env(VE_TASK_INFO(tsk)->owner_env); ++ restore_sigqueue(tsk, &tsk->pending, (unsigned long)b, (unsigned long)ti + ti->cpt_next); ++ set_exec_env(env); ++ break; ++ } ++ ++ switch (b->cpt_object) { ++ case CPT_OBJ_BITS: ++ if (b->cpt_content == CPT_CONTENT_X86_FPUSTATE && ++ cpu_has_fxsr) { ++ memcpy(&tsk->thread.i387, ++ (void*)b + b->cpt_hdrlen, ++ sizeof(struct i387_fxsave_struct)); ++ if (ti->cpt_used_math) ++ set_stopped_child_used_math(tsk); ++ } ++#ifdef CONFIG_X86_32 ++ else if (b->cpt_content == CPT_CONTENT_X86_FPUSTATE_OLD && ++ !cpu_has_fxsr) { ++ memcpy(&tsk->thread.i387, ++ (void*)b + b->cpt_hdrlen, ++ sizeof(struct 
i387_fsave_struct)); ++ if (ti->cpt_used_math) ++ set_stopped_child_used_math(tsk); ++ } ++#endif ++ break; ++ case CPT_OBJ_LASTSIGINFO: ++ lsi = (void*)b; ++ break; ++ case CPT_OBJ_X86_REGS: ++ case CPT_OBJ_X86_64_REGS: ++ if (restore_registers(tsk, regs, ti, (void*)b)) { ++ eprintk_ctx("cannot restore registers: image is corrupted\n"); ++ return -EINVAL; ++ } ++ break; ++ case CPT_OBJ_SIGALTSTACK: { ++ struct cpt_sigaltstack_image *sas; ++ sas = (struct cpt_sigaltstack_image *)b; ++ tsk->sas_ss_sp = sas->cpt_stack; ++ tsk->sas_ss_size = sas->cpt_stacksize; ++ break; ++ } ++ } ++ b = ((void*)b) + b->cpt_next; ++ } ++ ++ if (ti->cpt_ppid != ti->cpt_rppid) { ++ task_t *parent; ++ struct ve_struct *env = set_exec_env(VE_TASK_INFO(tsk)->owner_env); ++ write_lock_irq(&tasklist_lock); ++ parent = find_task_by_pid_ve(ti->cpt_ppid); ++ if (parent && parent != tsk->parent) { ++ list_add(&tsk->ptrace_list, &tsk->parent->ptrace_children); ++ REMOVE_LINKS(tsk); ++ tsk->parent = parent; ++ SET_LINKS(tsk); ++ } ++ write_unlock_irq(&tasklist_lock); ++ set_exec_env(env); ++ } ++ ++ tsk->ptrace_message = ti->cpt_ptrace_message; ++ tsk->pn_state = ti->cpt_pn_state; ++ tsk->stopped_state = ti->cpt_stopped_state; ++ tsk->thread_info->flags = ti->cpt_thrflags; ++ ++ /* The image was created with kernel < 2.6.16, while ++ * task hanged in sigsuspend -> do_signal. ++ * ++ * FIXME! This needs more brain efforts... 
++ */ ++ if (ti->cpt_sigsuspend_state) { ++ tsk->thread_info->flags |= _TIF_RESTORE_SIGMASK; ++ } ++ ++#ifdef CONFIG_X86_64 ++ tsk->thread_info->flags |= _TIF_FORK; ++ if (!ti->cpt_64bit) ++ tsk->thread_info->flags |= _TIF_IA32; ++#endif ++ ++#ifndef CONFIG_X86_64 ++ do { ++ if (regs->orig_eax == __NR__newselect && regs->edi) { ++ struct timeval tv; ++ if (access_process_vm(tsk, regs->edi, &tv, ++ sizeof(tv), 0) != sizeof(tv)) { ++ wprintk_ctx("task %d/%d(%s): Error 1 in access_process_vm: edi %ld\n", ++ virt_pid(tsk), tsk->pid, tsk->comm, ++ regs->edi); ++ break; ++ } ++ dprintk_ctx("task %d/%d(%s): Old timeval in newselect: %ld.%ld\n", ++ virt_pid(tsk), tsk->pid, tsk->comm, ++ tv.tv_sec, tv.tv_usec); ++ tv.tv_sec -= ctx->delta_time.tv_sec; ++ if (tv.tv_usec < ctx->delta_time.tv_nsec / 1000) { ++ tv.tv_usec += 1000000 - ctx->delta_time.tv_nsec / 1000; ++ tv.tv_sec--; ++ } else { ++ tv.tv_usec -= ctx->delta_time.tv_nsec / 1000; ++ } ++ if (tv.tv_sec < 0) { ++ tv.tv_sec = 0; ++ tv.tv_usec = 0; ++ } ++ dprintk_ctx("task %d/%d(%s): New timeval in newselect: %ld.%ld\n", ++ virt_pid(tsk), tsk->pid, tsk->comm, ++ tv.tv_sec, tv.tv_usec); ++ if (access_process_vm(tsk, regs->edi, &tv, ++ sizeof(tv), 1) != sizeof(tv)) { ++ wprintk_ctx("task %d/%d(%s): Error 1 in access_process_vm write: edi %ld\n", ++ virt_pid(tsk), tsk->pid, tsk->comm, regs->edi); ++ } ++ ++ } else if (regs->orig_eax == __NR_select && regs->edi) { ++ struct { ++ unsigned long n; ++ fd_set __user *inp, *outp, *exp; ++ struct timeval __user *tvp; ++ } a; ++ struct timeval tv; ++ if (access_process_vm(tsk, regs->ebx, &a, ++ sizeof(a), 0) != sizeof(a)) { ++ wprintk_ctx("task %d: Error 2 in access_process_vm\n", tsk->pid); ++ break; ++ } ++ if (access_process_vm(tsk, (unsigned long)a.tvp, ++ &tv, sizeof(tv), 0) != sizeof(tv)) { ++ wprintk_ctx("task %d: Error 3 in access_process_vm\n", tsk->pid); ++ break; ++ } ++ dprintk_ctx("task %d: Old timeval in select: %ld.%ld\n", ++ tsk->pid, tv.tv_sec, tv.tv_usec); ++ 
tv.tv_sec -= ctx->delta_time.tv_sec; ++ if (tv.tv_usec < ctx->delta_time.tv_nsec / 1000) { ++ tv.tv_usec += 1000000 - ctx->delta_time.tv_nsec / 1000; ++ tv.tv_sec--; ++ } else { ++ tv.tv_usec -= ctx->delta_time.tv_nsec / 1000; ++ } ++ if (tv.tv_sec < 0) { ++ tv.tv_sec = 0; ++ tv.tv_usec = 0; ++ } ++ dprintk_ctx("task %d: New timeval in select: %ld.%ld\n", ++ tsk->pid, tv.tv_sec, tv.tv_usec); ++ if (access_process_vm(tsk, (unsigned long)a.tvp, ++ &tv, sizeof(tv), 1) != sizeof(tv)) { ++ wprintk_ctx("task %d: Error 3 in access_process_vm write\n", tsk->pid); ++ } ++ } ++ } while (0); ++#endif ++ ++ if (!tsk->exit_state && (long)SYSCALL_NR(regs) >= 0) { ++ if (SYSCALL_RETVAL(regs) == -ERESTARTSYS || ++ SYSCALL_RETVAL(regs) == -ERESTARTNOINTR || ++ SYSCALL_RETVAL(regs) == -ERESTARTNOHAND || ++ SYSCALL_RETVAL(regs) == -ERESTART_RESTARTBLOCK || ++ syscall_is(tsk,regs,pause) || ++ (syscall_is(tsk,regs,rt_sigtimedwait) && ++ (SYSCALL_RETVAL(regs) == -EAGAIN || SYSCALL_RETVAL(regs) == -EINTR)) || ++ (syscall_is(tsk,regs,futex) && ++ (SYSCALL_RETVAL(regs) == -EINTR))) ++ add_hook(tsk, ret_restart_sys, 0, &hooks); ++ } ++ ++ if (lsi || tsk->pn_state) { ++ /* ... -> ptrace_notify() ++ * or ++ * ... 
-> do_signal() -> get_signal_to_deliver() -> ++ * ptrace stop ++ */ ++ tsk->last_siginfo = add_hook(tsk, ret_last_siginfo, sizeof(siginfo_t), &hooks); ++ memset(tsk->last_siginfo, 0, sizeof(siginfo_t)); ++ if (lsi) ++ decode_siginfo(tsk->last_siginfo, lsi); ++ } ++ ++ tsk->ptrace = ti->cpt_ptrace; ++ tsk->flags = ti->cpt_flags & ~PF_FROZEN; ++ clear_tsk_thread_flag(tsk, TIF_FREEZE); ++ tsk->exit_signal = ti->cpt_exit_signal; ++ ++ if (tsk->stopped_state) { ++ dprintk_ctx("finish_stop\n"); ++ if (ti->cpt_state != TASK_STOPPED) ++ eprintk_ctx("Hellooo, state is %u\n", (unsigned)ti->cpt_state); ++ add_hook(tsk, ret_finish_stop, 0, &hooks); ++ } ++ ++ if (!tsk->exit_state && ++ (ti->cpt_set_tid || ti->cpt_clear_tid)) { ++ unsigned long *ptr = add_hook(tsk, ret_child_tid, sizeof(unsigned long)*2, &hooks); ++ ptr[0] = ti->cpt_clear_tid; ++ ptr[1] = ti->cpt_set_tid; ++ dprintk_ctx("settids\n"); ++ } ++ ++#ifdef CONFIG_X86_64 ++ if (!hooks && (long)SYSCALL_NR(regs) < 0) { ++ extern void ret_from_fork2(void); ++ ESP(tsk) -= sizeof(unsigned long); ++ *(unsigned long*)ESP(tsk) = (unsigned long)ret_from_fork2; ++ tsk->thread_info->flags |= _TIF_RESUME; ++ } ++#else ++ tsk->thread.esp -= 4; ++ *(__u32*)tsk->thread.esp = tsk->thread.eip; ++ tsk->thread.eip = (unsigned long)pre_ret_from_fork; ++#endif ++ ++ if (ti->cpt_state == TASK_TRACED) ++ tsk->state = TASK_TRACED; ++ else if (ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD)) { ++ tsk->signal->it_virt_expires = 0; ++ tsk->signal->it_prof_expires = 0; ++ if (tsk->state != EXIT_DEAD) ++ eprintk_ctx("oops, schedule() did not make us dead\n"); ++ } ++ ++ if (thread_group_leader(tsk) && ++ ti->cpt_it_real_value && ++ !(ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD))) { ++ DEFINE_KTIME(val); ++ ++ if (ctx->image_version != 0) { ++ ktime_t delta; ++ ++ val = ktime_add_ns(val, ti->cpt_it_real_value); ++ delta = timespec_to_ktime(ctx->delta_time); ++ val = ktime_sub(val, delta); ++ if (val.tv64 <= 0) ++ val.tv64 = NSEC_PER_USEC; ++ dprintk("rst 
itimer " CPT_FID " +%Ld %Ld %Lu\n", CPT_TID(tsk), val.tv64, delta.tv64, ti->cpt_it_real_value); ++ } else { ++ unsigned long jif = ti->cpt_it_real_value - ++ timespec_to_jiffies(&ctx->delta_time); ++ if ((long)jif <= 0) ++ jif = 1; ++ val = ktime_add_ns(val, (u64)jif*TICK_NSEC); ++ } ++ spin_lock_irq(&tsk->sighand->siglock); ++ if (hrtimer_try_to_cancel(&tsk->signal->real_timer) >= 0) { ++ /* FIXME. Check!!!! */ ++ hrtimer_start(&tsk->signal->real_timer, val, HRTIMER_REL); ++ } else { ++ wprintk_ctx("Timer clash. Impossible?\n"); ++ } ++ spin_unlock_irq(&tsk->sighand->siglock); ++ ++ dprintk_ctx("itimer " CPT_FID " +%Lu\n", CPT_TID(tsk), val.tv64); ++ } ++ ++ module_put(THIS_MODULE); ++ } ++ return 0; ++} +diff -upr linux-2.6.16.orig/kernel/cpt/rst_socket.c linux-2.6.16-026test015/kernel/cpt/rst_socket.c +--- linux-2.6.16.orig/kernel/cpt/rst_socket.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/cpt/rst_socket.c 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,876 @@ ++/* ++ * ++ * kernel/cpt/rst_socket.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#include <linux/config.h> ++#include <linux/version.h> ++#include <linux/module.h> ++#include <linux/init.h> ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/slab.h> ++#include <linux/file.h> ++#include <linux/mm.h> ++#include <linux/errno.h> ++#include <linux/fs.h> ++#include <linux/namei.h> ++#include <linux/socket.h> ++#include <linux/un.h> ++#include <net/tcp.h> ++#include <net/sock.h> ++#include <net/scm.h> ++#include <net/af_unix.h> ++ ++#include <ub/ub_mem.h> ++#include <ub/ub_orphan.h> ++#include <ub/ub_orphan.h> ++#include <ub/ub_net.h> ++#include <ub/ub_tcp.h> ++ ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++#include "cpt_mm.h" ++#include "cpt_files.h" ++#include "cpt_socket.h" ++#include "cpt_kernel.h" ++ ++#include "cpt_syscalls.h" ++ ++ ++static int setup_sock_common(struct sock *sk, struct cpt_sock_image *si, ++ loff_t pos, struct cpt_context *ctx) ++{ ++ if (sk->sk_socket) { ++ sk->sk_socket->flags = si->cpt_ssflags; ++ sk->sk_socket->state = si->cpt_sstate; ++ } ++ sk->sk_reuse = si->cpt_reuse; ++ sk->sk_shutdown = si->cpt_shutdown; ++ sk->sk_userlocks = si->cpt_userlocks; ++ sk->sk_no_check = si->cpt_no_check; ++ sock_reset_flag(sk, SOCK_DBG); ++ if (si->cpt_debug) ++ sock_set_flag(sk, SOCK_DBG); ++ sock_reset_flag(sk, SOCK_RCVTSTAMP); ++ if (si->cpt_rcvtstamp) ++ sock_set_flag(sk, SOCK_RCVTSTAMP); ++ sock_reset_flag(sk, SOCK_LOCALROUTE); ++ if (si->cpt_localroute) ++ sock_set_flag(sk, SOCK_LOCALROUTE); ++ sk->sk_protocol = si->cpt_protocol; ++ sk->sk_err = si->cpt_err; ++ sk->sk_err_soft = si->cpt_err_soft; ++ sk->sk_priority = si->cpt_priority; ++ sk->sk_rcvlowat = si->cpt_rcvlowat; ++ sk->sk_rcvtimeo = si->cpt_rcvtimeo; ++ if (si->cpt_rcvtimeo == CPT_NULL) ++ sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; ++ sk->sk_sndtimeo = si->cpt_sndtimeo; ++ if (si->cpt_sndtimeo == CPT_NULL) ++ sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; ++ sk->sk_rcvbuf = si->cpt_rcvbuf; ++ sk->sk_sndbuf = si->cpt_sndbuf; ++ 
sk->sk_bound_dev_if = si->cpt_bound_dev_if; ++ sk->sk_flags = si->cpt_flags; ++ sk->sk_lingertime = si->cpt_lingertime; ++ if (si->cpt_lingertime == CPT_NULL) ++ sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT; ++ sk->sk_peercred.pid = si->cpt_peer_pid; ++ sk->sk_peercred.uid = si->cpt_peer_uid; ++ sk->sk_peercred.gid = si->cpt_peer_gid; ++ cpt_timeval_import(&sk->sk_stamp, si->cpt_stamp); ++ return 0; ++} ++ ++static struct file *sock_mapfile(struct socket *sock) ++{ ++ int fd = sock_map_fd(sock); ++ ++ if (fd >= 0) { ++ struct file *file = sock->file; ++ get_file(file); ++ sc_close(fd); ++ return file; ++ } ++ return ERR_PTR(fd); ++} ++ ++/* Assumption is that /tmp exists and writable. ++ * In previous versions we assumed that listen() will autobind ++ * the socket. It does not do this for AF_UNIX by evident reason: ++ * socket in abstract namespace is accessible, unlike socket bound ++ * to deleted FS object. ++ */ ++ ++static int ++select_deleted_name(char * name, cpt_context_t *ctx) ++{ ++ int i; ++ ++ for (i=0; i<100; i++) { ++ struct nameidata nd; ++ unsigned int rnd = net_random(); ++ ++ sprintf(name, "/tmp/SOCK.%08x", rnd); ++ ++ if (path_lookup(name, 0, &nd) != 0) ++ return 0; ++ ++ path_release(&nd); ++ } ++ ++ eprintk_ctx("failed to allocate deleted socket inode\n"); ++ return -ELOOP; ++} ++ ++static int ++bind_unix_socket(struct socket *sock, struct cpt_sock_image *si, ++ cpt_context_t *ctx) ++{ ++ int err; ++ char *name; ++ struct sockaddr* addr; ++ int addrlen; ++ struct sockaddr_un sun; ++ struct nameidata nd; ++ ++ if ((addrlen = si->cpt_laddrlen) <= 2) ++ return 0; ++ ++ nd.dentry = NULL; ++ name = ((char*)si->cpt_laddr) + 2; ++ addr = (struct sockaddr *)si->cpt_laddr; ++ ++ if (name[0]) { ++ err = path_lookup(name, 0, &nd); ++ if (err) { ++ nd.dentry = NULL; ++ } else { ++ if (si->cpt_deleted) { ++ path_release(&nd); ++ nd.dentry = NULL; ++ addr = (struct sockaddr*)&sun; ++ addr->sa_family = AF_UNIX; ++ name = ((char*)addr) + 2; ++ err = 
select_deleted_name(name, ctx); ++ if (err) ++ return err; ++ addrlen = 2 + strlen(name); ++ } else if (!S_ISSOCK(nd.dentry->d_inode->i_mode)) { ++ eprintk_ctx("bind_unix_socket: not a socket dentry\n"); ++ path_release(&nd); ++ return -EINVAL; ++ } ++ } ++ if (nd.dentry) ++ sc_unlink(name); ++ } ++ ++ err = sock->ops->bind(sock, addr, addrlen); ++ ++ if (!err) { ++ if (nd.dentry) { ++ sc_chown(name, nd.dentry->d_inode->i_uid, ++ nd.dentry->d_inode->i_gid); ++ sc_chmod(name, nd.dentry->d_inode->i_mode); ++ } ++ if (si->cpt_deleted && name[0]) ++ sc_unlink(name); ++ } ++ if (nd.dentry) ++ path_release(&nd); ++ return err; ++} ++ ++static int fixup_unix_address(struct socket *sock, struct cpt_sock_image *si, ++ struct cpt_context *ctx) ++{ ++ struct sock *sk = sock->sk; ++ cpt_object_t *obj; ++ struct sock *parent; ++ ++ if (sk->sk_family != AF_UNIX || sk->sk_state == TCP_LISTEN) ++ return 0; ++ ++ if (si->cpt_parent == -1) ++ return bind_unix_socket(sock, si, ctx); ++ ++ obj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, si->cpt_parent, ctx); ++ if (!obj) ++ return 0; ++ ++ parent = obj->o_obj; ++ if (unix_sk(parent)->addr) { ++ if (unix_sk(sk)->addr && ++ atomic_dec_and_test(&unix_sk(sk)->addr->refcnt)) ++ kfree(unix_sk(sk)->addr); ++ atomic_inc(&unix_sk(parent)->addr->refcnt); ++ unix_sk(sk)->addr = unix_sk(parent)->addr; ++ } ++ return 0; ++} ++ ++ ++static int open_socket(cpt_object_t *obj, struct cpt_sock_image *si, ++ struct cpt_context *ctx) ++{ ++ int err; ++ struct socket *sock; ++ struct socket *sock2 = NULL; ++ struct file *file; ++ cpt_object_t *fobj; ++ cpt_object_t *pobj = NULL; ++ ++ err = sock_create_kern(si->cpt_family, si->cpt_type, si->cpt_protocol, ++ &sock); ++ if (err) ++ return err; ++ ++ if (si->cpt_socketpair) { ++ err = sock_create_kern(si->cpt_family, si->cpt_type, ++ si->cpt_protocol, &sock2); ++ if (err) ++ goto err_out; ++ ++ err = sock->ops->socketpair(sock, sock2); ++ if (err < 0) ++ goto err_out; ++ ++ /* Socketpair with a peer outside our 
environment. ++ * So, we create real half-open pipe and do not worry ++ * about dead end anymore. */ ++ if (si->cpt_peer == -1) { ++ sock_release(sock2); ++ sock2 = NULL; ++ } ++ } ++ ++ cpt_obj_setobj(obj, sock->sk, ctx); ++ ++ if (si->cpt_file != CPT_NULL) { ++ file = sock_mapfile(sock); ++ err = PTR_ERR(file); ++ if (IS_ERR(file)) ++ goto err_out; ++ ++ err = -ENOMEM; ++ ++ obj->o_parent = file; ++ ++ if ((fobj = cpt_object_add(CPT_OBJ_FILE, file, ctx)) == NULL) ++ goto err_out; ++ cpt_obj_setpos(fobj, si->cpt_file, ctx); ++ cpt_obj_setindex(fobj, si->cpt_index, ctx); ++ } ++ ++ if (sock2) { ++ struct file *file2; ++ ++ pobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, si->cpt_peer, ctx); ++ if (!pobj) BUG(); ++ if (pobj->o_obj) BUG(); ++ cpt_obj_setobj(pobj, sock2->sk, ctx); ++ ++ if (pobj->o_ppos != CPT_NULL) { ++ file2 = sock_mapfile(sock2); ++ err = PTR_ERR(file2); ++ if (IS_ERR(file2)) ++ goto err_out; ++ ++ err = -ENOMEM; ++ if ((fobj = cpt_object_add(CPT_OBJ_FILE, file2, ctx)) == NULL) ++ goto err_out; ++ cpt_obj_setpos(fobj, pobj->o_ppos, ctx); ++ cpt_obj_setindex(fobj, si->cpt_peer, ctx); ++ ++ pobj->o_parent = file2; ++ } ++ } ++ ++ setup_sock_common(sock->sk, si, obj->o_pos, ctx); ++ if (sock->sk->sk_family == AF_INET || sock->sk->sk_family == AF_INET6) { ++ inet_sk(sock->sk)->freebind = 1; ++ if (si->cpt_laddrlen) { ++ err = sock->ops->bind(sock, (struct sockaddr *)&si->cpt_laddr, si->cpt_laddrlen); ++ if (err) { ++ dprintk_ctx("binding failed: %d, do not worry\n", err); ++ } ++ } ++ rst_socket_in(si, obj->o_pos, sock->sk, ctx); ++ } else if (sock->sk->sk_family == AF_NETLINK) { ++ err = sock->ops->bind(sock, (struct sockaddr *)&si->cpt_laddr, si->cpt_laddrlen); ++ if (err) { ++ eprintk_ctx("AF_NETLINK binding failed: %d\n", err); ++ } ++ if (si->cpt_raddrlen) { ++ err = sock->ops->connect(sock, (struct sockaddr *)&si->cpt_raddr, si->cpt_raddrlen, O_NONBLOCK); ++ if (err) { ++ eprintk_ctx("oops, AF_NETLINK connect failed: %d\n", err); ++ } ++ } ++ } ++ 
fixup_unix_address(sock, si, ctx); ++ ++ if (sock2) { ++ err = rst_get_object(CPT_OBJ_SOCKET, pobj->o_pos, si, ctx); ++ if (err) ++ return err; ++ setup_sock_common(sock2->sk, si, pobj->o_pos, ctx); ++ fixup_unix_address(sock2, si, ctx); ++ } ++ ++ if ((sock->sk->sk_family == AF_INET || sock->sk->sk_family == AF_INET6) ++ && (int)si->cpt_parent != -1) { ++ cpt_object_t *lobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, si->cpt_parent, ctx); ++ if (lobj && cpt_attach_accept(lobj->o_obj, sock->sk, ctx) == 0) ++ sock->sk = NULL; ++ } ++ ++ ++ if (si->cpt_file == CPT_NULL && sock->sk && ++ sock->sk->sk_family == AF_INET) { ++ struct sock *sk = sock->sk; ++ ++ if (sk) { ++ sock->sk = NULL; ++ ++ local_bh_disable(); ++ bh_lock_sock(sk); ++ if (sock_owned_by_user(sk)) ++ eprintk_ctx("oops, sock is locked by user\n"); ++ ++ sock_hold(sk); ++ sock_orphan(sk); ++ ub_inc_orphan_count(sk); ++ bh_unlock_sock(sk); ++ local_bh_enable(); ++ sock_put(sk); ++ dprintk_ctx("orphaning socket %p\n", sk); ++ } ++ } ++ ++ if (si->cpt_file == CPT_NULL && sock->sk == NULL) ++ sock_release(sock); ++ ++ return 0; ++ ++err_out: ++ if (sock2) ++ sock_release(sock2); ++ sock_release(sock); ++ return err; ++} ++ ++static int open_listening_socket(loff_t pos, struct cpt_sock_image *si, ++ struct cpt_context *ctx) ++{ ++ int err; ++ struct socket *sock; ++ struct file *file; ++ cpt_object_t *obj, *fobj; ++ ++ err = sock_create_kern(si->cpt_family, si->cpt_type, si->cpt_protocol, ++ &sock); ++ if (err) { ++ eprintk_ctx("open_listening_socket: sock_create_kern: %d\n", err); ++ return err; ++ } ++ ++ sock->sk->sk_reuse = 2; ++ sock->sk->sk_bound_dev_if = si->cpt_bound_dev_if; ++ ++ if (sock->sk->sk_family == AF_UNIX) { ++ err = bind_unix_socket(sock, si, ctx); ++ } else if (si->cpt_laddrlen) { ++ if (sock->sk->sk_family == AF_INET || sock->sk->sk_family == AF_INET6) ++ inet_sk(sock->sk)->freebind = 1; ++ ++ err = sock->ops->bind(sock, (struct sockaddr *)&si->cpt_laddr, si->cpt_laddrlen); ++ ++ if (err) { 
++ eprintk_ctx("open_listening_socket: bind: %d\n", err); ++ goto err_out; ++ } ++ } ++ ++ err = sock->ops->listen(sock, si->cpt_max_ack_backlog); ++ if (err) { ++ eprintk_ctx("open_listening_socket: listen: %d, %Ld, %d\n", err, pos, si->cpt_deleted); ++ goto err_out; ++ } ++ ++ /* Now we may access socket body directly and fixup all the things. */ ++ ++ file = sock_mapfile(sock); ++ err = PTR_ERR(file); ++ if (IS_ERR(file)) { ++ eprintk_ctx("open_listening_socket: map: %d\n", err); ++ goto err_out; ++ } ++ ++ err = -ENOMEM; ++ if ((fobj = cpt_object_add(CPT_OBJ_FILE, file, ctx)) == NULL) ++ goto err_out; ++ if ((obj = cpt_object_add(CPT_OBJ_SOCKET, sock->sk, ctx)) == NULL) ++ goto err_out; ++ cpt_obj_setpos(obj, pos, ctx); ++ cpt_obj_setindex(obj, si->cpt_index, ctx); ++ obj->o_parent = file; ++ cpt_obj_setpos(fobj, si->cpt_file, ctx); ++ cpt_obj_setindex(fobj, si->cpt_index, ctx); ++ ++ setup_sock_common(sock->sk, si, pos, ctx); ++ ++ if (si->cpt_family == AF_INET || si->cpt_family == AF_INET6) ++ rst_restore_synwait_queue(sock->sk, si, pos, ctx); ++ ++ return 0; ++ ++err_out: ++ sock_release(sock); ++ return err; ++} ++ ++static int ++rst_sock_attr_mcfilter(loff_t *pos_p, struct sock *sk, cpt_context_t *ctx) ++{ ++ int err; ++ loff_t pos = *pos_p; ++ struct cpt_sockmc_image v; ++ ++ err = rst_get_object(CPT_OBJ_SOCK_MCADDR, pos, &v, ctx); ++ if (err) ++ return err; ++ ++ *pos_p += v.cpt_next; ++ ++ if (v.cpt_family == AF_INET) ++ return rst_sk_mcfilter_in(sk, &v, pos, ctx); ++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) ++ else if (v.cpt_family == AF_INET6) ++ return rst_sk_mcfilter_in6(sk, &v, pos, ctx); ++#endif ++ else ++ return -EAFNOSUPPORT; ++} ++ ++ ++static int ++rst_sock_attr_skfilter(loff_t *pos_p, struct sock *sk, cpt_context_t *ctx) ++{ ++ int err; ++ struct sk_filter *fp, *old_fp; ++ loff_t pos = *pos_p; ++ struct cpt_obj_bits v; ++ ++ err = rst_get_object(CPT_OBJ_SKFILTER, pos, &v, ctx); ++ if (err) ++ return err; ++ ++ *pos_p += 
v.cpt_next; ++ ++ if (v.cpt_size % sizeof(struct sock_filter)) ++ return -EINVAL; ++ ++ fp = sock_kmalloc(sk, v.cpt_size+sizeof(*fp), GFP_KERNEL_UBC); ++ if (fp == NULL) ++ return -ENOMEM; ++ atomic_set(&fp->refcnt, 1); ++ fp->len = v.cpt_size/sizeof(struct sock_filter); ++ ++ err = ctx->pread(fp->insns, v.cpt_size, ctx, pos+v.cpt_hdrlen); ++ if (err) { ++ sk_filter_release(sk, fp); ++ return err; ++ } ++ ++ old_fp = sk->sk_filter; ++ sk->sk_filter = fp; ++ if (old_fp) ++ sk_filter_release(sk, old_fp); ++ return 0; ++} ++ ++ ++int rst_sock_attr(loff_t *pos_p, struct sock *sk, cpt_context_t *ctx) ++{ ++ int err; ++ loff_t pos = *pos_p; ++ ++ err = rst_sock_attr_skfilter(pos_p, sk, ctx); ++ if (err && pos == *pos_p) ++ err = rst_sock_attr_mcfilter(pos_p, sk, ctx); ++ return err; ++} ++ ++struct sk_buff * rst_skb(loff_t *pos_p, __u32 *owner, __u32 *queue, struct cpt_context *ctx) ++{ ++ int err; ++ struct sk_buff *skb; ++ struct cpt_skb_image v; ++ loff_t pos = *pos_p; ++ struct scm_fp_list *fpl = NULL; ++ struct timeval tmptv; ++ ++ err = rst_get_object(CPT_OBJ_SKB, pos, &v, ctx); ++ if (err) ++ return ERR_PTR(err); ++ *pos_p = pos + v.cpt_next; ++ ++ if (owner) ++ *owner = v.cpt_owner; ++ if (queue) ++ *queue = v.cpt_queue; ++ ++ skb = alloc_skb(v.cpt_len + v.cpt_hspace + v.cpt_tspace, GFP_KERNEL); ++ if (skb == NULL) ++ return ERR_PTR(-ENOMEM); ++ skb_reserve(skb, v.cpt_hspace); ++ skb_put(skb, v.cpt_len); ++ skb->h.raw = skb->head + v.cpt_h; ++ skb->nh.raw = skb->head + v.cpt_nh; ++ skb->mac.raw = skb->head + v.cpt_mac; ++ if (sizeof(skb->cb) < sizeof(v.cpt_cb)) BUG(); ++ memcpy(skb->cb, v.cpt_cb, sizeof(v.cpt_cb)); ++ skb->mac_len = v.cpt_mac_len; ++ ++ skb->csum = v.cpt_csum; ++ skb->local_df = v.cpt_local_df; ++ skb->pkt_type = v.cpt_pkt_type; ++ skb->ip_summed = v.cpt_ip_summed; ++ skb->priority = v.cpt_priority; ++ skb->protocol = v.cpt_protocol; ++ cpt_timeval_import(&tmptv, v.cpt_stamp); ++ skb_set_timestamp(skb, &tmptv); ++ ++ skb_shinfo(skb)->tso_segs = 
v.cpt_tso_segs; ++ skb_shinfo(skb)->tso_size = v.cpt_tso_size; ++ if (ctx->image_version == 0) { ++ skb_shinfo(skb)->tso_segs = 1; ++ skb_shinfo(skb)->tso_size = 0; ++ } ++ ++ if (v.cpt_next > v.cpt_hdrlen) { ++ pos = pos + v.cpt_hdrlen; ++ while (pos < *pos_p) { ++ union { ++ struct cpt_obj_bits b; ++ struct cpt_fd_image f; ++ } u; ++ ++ err = rst_get_object(-1, pos, &u, ctx); ++ if (err) { ++ kfree_skb(skb); ++ return ERR_PTR(err); ++ } ++ if (u.b.cpt_object == CPT_OBJ_BITS) { ++ if (u.b.cpt_size != v.cpt_hspace + skb->len) { ++ eprintk_ctx("invalid skb image %u != %u + %u\n", u.b.cpt_size, v.cpt_hspace, skb->len); ++ kfree_skb(skb); ++ return ERR_PTR(-EINVAL); ++ } ++ ++ err = ctx->pread(skb->head, u.b.cpt_size, ctx, pos+u.b.cpt_hdrlen); ++ if (err) { ++ kfree_skb(skb); ++ return ERR_PTR(err); ++ } ++ } else if (u.f.cpt_object == CPT_OBJ_FILEDESC) { ++ if (!fpl) { ++ fpl = ub_kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL); ++ if (!fpl) { ++ kfree_skb(skb); ++ return ERR_PTR(-ENOMEM); ++ } ++ fpl->count = 0; ++ UNIXCB(skb).fp = fpl; ++ } ++ fpl->fp[fpl->count] = rst_file(u.f.cpt_file, -1, ctx); ++ if (!IS_ERR(fpl->fp[fpl->count])) ++ fpl->count++; ++ } ++ pos += u.b.cpt_next; ++ } ++ } ++ ++ return skb; ++} ++ ++static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) ++{ ++ int i; ++ scm->fp = UNIXCB(skb).fp; ++ skb->destructor = sock_wfree; ++ UNIXCB(skb).fp = NULL; ++ ++ for (i=scm->fp->count-1; i>=0; i--) ++ unix_notinflight(scm->fp->fp[i]); ++} ++ ++static void unix_destruct_fds(struct sk_buff *skb) ++{ ++ struct scm_cookie scm; ++ memset(&scm, 0, sizeof(scm)); ++ unix_detach_fds(&scm, skb); ++ scm_destroy(&scm); ++ sock_wfree(skb); ++ module_put(THIS_MODULE); ++} ++ ++ ++static int restore_unix_rqueue(struct sock *sk, struct cpt_sock_image *si, ++ loff_t pos, struct cpt_context *ctx) ++{ ++ loff_t endpos; ++ ++ pos = pos + si->cpt_hdrlen; ++ endpos = pos + si->cpt_next; ++ while (pos < endpos) { ++ struct sk_buff *skb; ++ struct sock 
*owner_sk; ++ __u32 owner; ++ ++ skb = rst_skb(&pos, &owner, NULL, ctx); ++ if (IS_ERR(skb)) { ++ if (PTR_ERR(skb) == -EINVAL) { ++ int err; ++ ++ err = rst_sock_attr(&pos, sk, ctx); ++ if (err) ++ return err; ++ } ++ return PTR_ERR(skb); ++ } ++ ++ owner_sk = unix_peer(sk); ++ if (owner != -1) { ++ cpt_object_t *pobj; ++ pobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, owner, ctx); ++ if (pobj == NULL) { ++ eprintk_ctx("orphan af_unix skb?\n"); ++ kfree_skb(skb); ++ continue; ++ } ++ owner_sk = pobj->o_obj; ++ } ++ if (owner_sk == NULL) { ++ dprintk_ctx("orphan af_unix skb 2?\n"); ++ kfree_skb(skb); ++ continue; ++ } ++ skb_set_owner_w(skb, owner_sk); ++ if (UNIXCB(skb).fp) { ++ skb->destructor = unix_destruct_fds; ++ if (!try_module_get(THIS_MODULE)) BUG(); ++ } ++ skb_queue_tail(&sk->sk_receive_queue, skb); ++ if (sk->sk_state == TCP_LISTEN) { ++ struct socket *sock = skb->sk->sk_socket; ++ if (sock == NULL) BUG(); ++ if (sock->file) BUG(); ++ skb->sk->sk_socket = NULL; ++ skb->sk->sk_sleep = NULL; ++ sock->sk = NULL; ++ sock_release(sock); ++ } ++ } ++ return 0; ++} ++ ++ ++/* All the sockets are created before we start to open files */ ++ ++int rst_sockets(struct cpt_context *ctx) ++{ ++ int err; ++ loff_t sec = ctx->sections[CPT_SECT_SOCKET]; ++ loff_t endsec; ++ cpt_object_t *obj; ++ struct cpt_section_hdr h; ++ ++ if (sec == CPT_NULL) ++ return 0; ++ ++ err = ctx->pread(&h, sizeof(h), ctx, sec); ++ if (err) { ++ eprintk_ctx("rst_sockets: ctx->pread: %d\n", err); ++ return err; ++ } ++ if (h.cpt_section != CPT_SECT_SOCKET || h.cpt_hdrlen < sizeof(h)) { ++ eprintk_ctx("rst_sockets: hdr err\n"); ++ return -EINVAL; ++ } ++ ++ /* The first pass: we create socket index and open listening sockets. 
*/ ++ endsec = sec + h.cpt_next; ++ sec += h.cpt_hdrlen; ++ while (sec < endsec) { ++ struct cpt_sock_image *sbuf = cpt_get_buf(ctx); ++ err = rst_get_object(CPT_OBJ_SOCKET, sec, sbuf, ctx); ++ if (err) { ++ eprintk_ctx("rst_sockets: rst_get_object: %d\n", err); ++ cpt_release_buf(ctx); ++ return err; ++ } ++ if (sbuf->cpt_state == TCP_LISTEN) { ++ err = open_listening_socket(sec, sbuf, ctx); ++ cpt_release_buf(ctx); ++ if (err) { ++ eprintk_ctx("rst_sockets: open_listening_socket: %d\n", err); ++ return err; ++ } ++ } else { ++ cpt_release_buf(ctx); ++ obj = alloc_cpt_object(GFP_KERNEL, ctx); ++ if (obj == NULL) ++ return -ENOMEM; ++ cpt_obj_setindex(obj, sbuf->cpt_index, ctx); ++ cpt_obj_setpos(obj, sec, ctx); ++ obj->o_ppos = sbuf->cpt_file; ++ intern_cpt_object(CPT_OBJ_SOCKET, obj, ctx); ++ } ++ sec += sbuf->cpt_next; ++ } ++ ++ /* Pass 2: really restore sockets */ ++ for_each_object(obj, CPT_OBJ_SOCKET) { ++ struct cpt_sock_image *sbuf; ++ if (obj->o_obj != NULL) ++ continue; ++ sbuf = cpt_get_buf(ctx); ++ err = rst_get_object(CPT_OBJ_SOCKET, obj->o_pos, sbuf, ctx); ++ if (err) { ++ eprintk_ctx("rst_sockets: rst_get_object: %d\n", err); ++ cpt_release_buf(ctx); ++ return err; ++ } ++ if (sbuf->cpt_state == TCP_LISTEN) BUG(); ++ err = open_socket(obj, sbuf, ctx); ++ cpt_release_buf(ctx); ++ if (err) { ++ eprintk_ctx("rst_sockets: open_socket: %d\n", err); ++ return err; ++ } ++ } ++ ++ return 0; ++} ++ ++int rst_orphans(struct cpt_context *ctx) ++{ ++ int err; ++ loff_t sec = ctx->sections[CPT_SECT_ORPHANS]; ++ loff_t endsec; ++ cpt_object_t *obj; ++ struct cpt_section_hdr h; ++ ++ if (sec == CPT_NULL) ++ return 0; ++ ++ err = ctx->pread(&h, sizeof(h), ctx, sec); ++ if (err) ++ return err; ++ if (h.cpt_section != CPT_SECT_ORPHANS || h.cpt_hdrlen < sizeof(h)) ++ return -EINVAL; ++ ++ endsec = sec + h.cpt_next; ++ sec += h.cpt_hdrlen; ++ while (sec < endsec) { ++ struct cpt_sock_image *sbuf = cpt_get_buf(ctx); ++ err = rst_get_object(CPT_OBJ_SOCKET, sec, sbuf, 
ctx); ++ if (err) { ++ cpt_release_buf(ctx); ++ return err; ++ } ++ obj = alloc_cpt_object(GFP_KERNEL, ctx); ++ if (obj == NULL) { ++ cpt_release_buf(ctx); ++ return -ENOMEM; ++ } ++ obj->o_pos = sec; ++ obj->o_ppos = sbuf->cpt_file; ++ err = open_socket(obj, sbuf, ctx); ++ dprintk_ctx("Restoring orphan: %d\n", err); ++ free_cpt_object(obj, ctx); ++ cpt_release_buf(ctx); ++ if (err) ++ return err; ++ sec += sbuf->cpt_next; ++ } ++ ++ return 0; ++} ++ ++ ++/* Pass 3: I understand, this is not funny already :-), ++ * but we have to do another pass to establish links between ++ * not-paired AF_UNIX SOCK_DGRAM sockets and to restore AF_UNIX ++ * skb queues with proper skb->sk links. ++ * ++ * This could be made at the end of rst_sockets(), but we defer ++ * restoring af_unix queues up to the end of restoring files to ++ * make restoring passed FDs cleaner. ++ */ ++ ++int rst_sockets_complete(struct cpt_context *ctx) ++{ ++ int err; ++ cpt_object_t *obj; ++ ++ for_each_object(obj, CPT_OBJ_SOCKET) { ++ struct cpt_sock_image *sbuf; ++ struct sock *sk = obj->o_obj; ++ struct sock *peer; ++ ++ if (!sk) BUG(); ++ ++ if (sk->sk_family != AF_UNIX) ++ continue; ++ ++ sbuf = cpt_get_buf(ctx); ++ err = rst_get_object(CPT_OBJ_SOCKET, obj->o_pos, sbuf, ctx); ++ if (err) { ++ cpt_release_buf(ctx); ++ return err; ++ } ++ ++ if (sbuf->cpt_next > sbuf->cpt_hdrlen) ++ restore_unix_rqueue(sk, sbuf, obj->o_pos, ctx); ++ ++ cpt_release_buf(ctx); ++ ++ if (sk->sk_type == SOCK_DGRAM && unix_peer(sk) == NULL) { ++ cpt_object_t *pobj; ++ ++ sbuf = cpt_get_buf(ctx); ++ err = rst_get_object(CPT_OBJ_SOCKET, obj->o_pos, sbuf, ctx); ++ if (err) { ++ cpt_release_buf(ctx); ++ return err; ++ } ++ ++ if (sbuf->cpt_peer != -1) { ++ pobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, sbuf->cpt_peer, ctx); ++ if (pobj) { ++ peer = pobj->o_obj; ++ sock_hold(peer); ++ unix_peer(sk) = peer; ++ } ++ } ++ cpt_release_buf(ctx); ++ } ++ } ++ ++ rst_orphans(ctx); ++ ++ return 0; ++} ++ +diff -upr 
linux-2.6.16.orig/kernel/cpt/rst_socket_in.c linux-2.6.16-026test015/kernel/cpt/rst_socket_in.c +--- linux-2.6.16.orig/kernel/cpt/rst_socket_in.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/cpt/rst_socket_in.c 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,494 @@ ++/* ++ * ++ * kernel/cpt/rst_socket_in.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include <linux/config.h> ++#include <linux/version.h> ++#include <linux/module.h> ++#include <linux/init.h> ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/slab.h> ++#include <linux/file.h> ++#include <linux/mm.h> ++#include <linux/errno.h> ++#include <linux/fs.h> ++#include <linux/socket.h> ++#include <linux/tcp.h> ++#include <linux/jhash.h> ++#include <net/sock.h> ++#include <net/tcp.h> ++#include <linux/ipv6.h> ++#include <linux/igmp.h> ++#include <net/addrconf.h> ++#include <net/inet6_connection_sock.h> ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++#include "cpt_mm.h" ++#include "cpt_socket.h" ++#include "cpt_kernel.h" ++ ++static inline unsigned long jiffies_import(__u32 tmo) ++{ ++ __s32 delta = tmo; ++ return jiffies + (long)delta; ++} ++ ++static inline __u32 tcp_jiffies_import(__u32 tmo) ++{ ++ return ((__u32)jiffies) + tmo; ++} ++ ++ ++static int restore_queues(struct sock *sk, struct cpt_sock_image *si, ++ loff_t pos, struct cpt_context *ctx) ++{ ++ loff_t endpos; ++ ++ pos = pos + si->cpt_hdrlen; ++ endpos = pos + si->cpt_next; ++ while (pos < endpos) { ++ struct sk_buff *skb; ++ __u32 type; ++ ++ skb = rst_skb(&pos, NULL, &type, ctx); ++ if (IS_ERR(skb)) { ++ if (PTR_ERR(skb) == -EINVAL) { ++ int err; ++ ++ err = rst_sock_attr(&pos, sk, ctx); ++ if (err) ++ return err; ++ } ++ return PTR_ERR(skb); ++ } ++ ++ if (sk->sk_type == SOCK_STREAM) { ++ if (type == CPT_SKB_RQ) { ++ sk_stream_set_owner_r(skb, sk); ++ ub_tcprcvbuf_charge_forced(sk, skb); ++ 
skb_queue_tail(&sk->sk_receive_queue, skb); ++ } else if (type == CPT_SKB_OFOQ) { ++ struct tcp_sock *tp = tcp_sk(sk); ++ sk_stream_set_owner_r(skb, sk); ++ ub_tcprcvbuf_charge_forced(sk, skb); ++ skb_queue_tail(&tp->out_of_order_queue, skb); ++ } else if (type == CPT_SKB_WQ) { ++ sk->sk_wmem_queued += skb->truesize; ++ sk->sk_forward_alloc -= skb->truesize; ++ ub_tcpsndbuf_charge_forced(sk, skb); ++ skb_queue_tail(&sk->sk_write_queue, skb); ++ } else { ++ wprintk_ctx("strange stream queue type %u\n", type); ++ kfree_skb(skb); ++ } ++ } else { ++ if (type == CPT_SKB_RQ) { ++ skb_set_owner_r(skb, sk); ++ skb_queue_tail(&sk->sk_receive_queue, skb); ++ } else if (type == CPT_SKB_WQ) { ++ struct inet_sock *inet = inet_sk(sk); ++ if (inet->cork.fragsize) { ++ skb_set_owner_w(skb, sk); ++ skb_queue_tail(&sk->sk_write_queue, skb); ++ } else { ++ eprintk_ctx("cork skb is dropped\n"); ++ kfree_skb(skb); ++ } ++ } else { ++ wprintk_ctx("strange dgram queue type %u\n", type); ++ kfree_skb(skb); ++ } ++ } ++ } ++ return 0; ++} ++ ++static struct sock *find_parent(__u16 sport, cpt_context_t *ctx) ++{ ++ cpt_object_t *obj; ++ for_each_object(obj, CPT_OBJ_SOCKET) { ++ struct sock *sk = obj->o_obj; ++ if (sk && ++ sk->sk_state == TCP_LISTEN && ++ (sk->sk_family == AF_INET || sk->sk_family == AF_INET6) && ++ inet_sk(sk)->sport == sport) ++ return sk; ++ } ++ return NULL; ++} ++ ++static int rst_socket_tcp(struct cpt_sock_image *si, loff_t pos, struct sock *sk, ++ struct cpt_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct sk_buff *skb; ++ tp->pred_flags = si->cpt_pred_flags; ++ tp->rcv_nxt = si->cpt_rcv_nxt; ++ tp->snd_nxt = si->cpt_snd_nxt; ++ tp->snd_una = si->cpt_snd_una; ++ tp->snd_sml = si->cpt_snd_sml; ++ tp->rcv_tstamp = tcp_jiffies_import(si->cpt_rcv_tstamp); ++ tp->lsndtime = tcp_jiffies_import(si->cpt_lsndtime); ++ tp->tcp_header_len = si->cpt_tcp_header_len; ++ inet_csk(sk)->icsk_ack.pending = si->cpt_ack_pending; ++ inet_csk(sk)->icsk_ack.quick = 
si->cpt_quick; ++ inet_csk(sk)->icsk_ack.pingpong = si->cpt_pingpong; ++ inet_csk(sk)->icsk_ack.blocked = si->cpt_blocked; ++ inet_csk(sk)->icsk_ack.ato = si->cpt_ato; ++ inet_csk(sk)->icsk_ack.timeout = jiffies_import(si->cpt_ack_timeout); ++ inet_csk(sk)->icsk_ack.lrcvtime = tcp_jiffies_import(si->cpt_lrcvtime); ++ inet_csk(sk)->icsk_ack.last_seg_size = si->cpt_last_seg_size; ++ inet_csk(sk)->icsk_ack.rcv_mss = si->cpt_rcv_mss; ++ tp->snd_wl1 = si->cpt_snd_wl1; ++ tp->snd_wnd = si->cpt_snd_wnd; ++ tp->max_window = si->cpt_max_window; ++ inet_csk(sk)->icsk_pmtu_cookie = si->cpt_pmtu_cookie; ++ tp->mss_cache = si->cpt_mss_cache; ++ tp->rx_opt.mss_clamp = si->cpt_mss_clamp; ++ inet_csk(sk)->icsk_ext_hdr_len = si->cpt_ext_header_len; ++ inet_csk(sk)->icsk_ca_state = si->cpt_ca_state; ++ inet_csk(sk)->icsk_retransmits = si->cpt_retransmits; ++ tp->reordering = si->cpt_reordering; ++ tp->frto_counter = si->cpt_frto_counter; ++ tp->frto_highmark = si->cpt_frto_highmark; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10) ++ // // tp->adv_cong = si->cpt_adv_cong; ++#endif ++ inet_csk(sk)->icsk_accept_queue.rskq_defer_accept = si->cpt_defer_accept; ++ inet_csk(sk)->icsk_backoff = si->cpt_backoff; ++ tp->srtt = si->cpt_srtt; ++ tp->mdev = si->cpt_mdev; ++ tp->mdev_max = si->cpt_mdev_max; ++ tp->rttvar = si->cpt_rttvar; ++ tp->rtt_seq = si->cpt_rtt_seq; ++ inet_csk(sk)->icsk_rto = si->cpt_rto; ++ tp->packets_out = si->cpt_packets_out; ++ tp->left_out = si->cpt_left_out; ++ tp->retrans_out = si->cpt_retrans_out; ++ tp->lost_out = si->cpt_lost_out; ++ tp->sacked_out = si->cpt_sacked_out; ++ tp->fackets_out = si->cpt_fackets_out; ++ tp->snd_ssthresh = si->cpt_snd_ssthresh; ++ tp->snd_cwnd = si->cpt_snd_cwnd; ++ tp->snd_cwnd_cnt = si->cpt_snd_cwnd_cnt; ++ tp->snd_cwnd_clamp = si->cpt_snd_cwnd_clamp; ++ tp->snd_cwnd_used = si->cpt_snd_cwnd_used; ++ tp->snd_cwnd_stamp = tcp_jiffies_import(si->cpt_snd_cwnd_stamp); ++ inet_csk(sk)->icsk_timeout = tcp_jiffies_import(si->cpt_timeout); 
++ tp->rcv_wnd = si->cpt_rcv_wnd; ++ tp->rcv_wup = si->cpt_rcv_wup; ++ tp->write_seq = si->cpt_write_seq; ++ tp->pushed_seq = si->cpt_pushed_seq; ++ tp->copied_seq = si->cpt_copied_seq; ++ tp->rx_opt.tstamp_ok = si->cpt_tstamp_ok; ++ tp->rx_opt.wscale_ok = si->cpt_wscale_ok; ++ tp->rx_opt.sack_ok = si->cpt_sack_ok; ++ tp->rx_opt.saw_tstamp = si->cpt_saw_tstamp; ++ tp->rx_opt.snd_wscale = si->cpt_snd_wscale; ++ tp->rx_opt.rcv_wscale = si->cpt_rcv_wscale; ++ tp->nonagle = si->cpt_nonagle; ++ tp->keepalive_probes = si->cpt_keepalive_probes; ++ tp->rx_opt.rcv_tsval = si->cpt_rcv_tsval; ++ tp->rx_opt.rcv_tsecr = si->cpt_rcv_tsecr; ++ tp->rx_opt.ts_recent = si->cpt_ts_recent; ++ tp->rx_opt.ts_recent_stamp = si->cpt_ts_recent_stamp; ++ tp->rx_opt.user_mss = si->cpt_user_mss; ++ tp->rx_opt.dsack = si->cpt_dsack; ++ tp->rx_opt.eff_sacks = si->cpt_num_sacks; ++ tp->duplicate_sack[0].start_seq = si->cpt_sack_array[0]; ++ tp->duplicate_sack[0].end_seq = si->cpt_sack_array[1]; ++ tp->selective_acks[0].start_seq = si->cpt_sack_array[2]; ++ tp->selective_acks[0].end_seq = si->cpt_sack_array[3]; ++ tp->selective_acks[1].start_seq = si->cpt_sack_array[4]; ++ tp->selective_acks[1].end_seq = si->cpt_sack_array[5]; ++ tp->selective_acks[2].start_seq = si->cpt_sack_array[6]; ++ tp->selective_acks[2].end_seq = si->cpt_sack_array[7]; ++ tp->selective_acks[3].start_seq = si->cpt_sack_array[8]; ++ tp->selective_acks[3].end_seq = si->cpt_sack_array[9]; ++ ++ tp->window_clamp = si->cpt_window_clamp; ++ tp->rcv_ssthresh = si->cpt_rcv_ssthresh; ++ inet_csk(sk)->icsk_probes_out = si->cpt_probes_out; ++ tp->rx_opt.num_sacks = si->cpt_num_sacks; ++ tp->advmss = si->cpt_advmss; ++ inet_csk(sk)->icsk_syn_retries = si->cpt_syn_retries; ++ tp->ecn_flags = si->cpt_ecn_flags; ++ tp->prior_ssthresh = si->cpt_prior_ssthresh; ++ tp->high_seq = si->cpt_high_seq; ++ tp->retrans_stamp = si->cpt_retrans_stamp; ++ tp->undo_marker = si->cpt_undo_marker; ++ tp->undo_retrans = si->cpt_undo_retrans; ++ tp->urg_seq 
= si->cpt_urg_seq; ++ tp->urg_data = si->cpt_urg_data; ++ inet_csk(sk)->icsk_pending = si->cpt_pending; ++ tp->urg_mode = si->cpt_urg_mode; ++ tp->snd_up = si->cpt_snd_up; ++ tp->keepalive_time = si->cpt_keepalive_time; ++ tp->keepalive_intvl = si->cpt_keepalive_intvl; ++ tp->linger2 = si->cpt_linger2; ++ ++ sk->sk_send_head = NULL; ++ for (skb = skb_peek(&sk->sk_write_queue); ++ skb && skb != (struct sk_buff*)&sk->sk_write_queue; ++ skb = skb->next) { ++ if (!after(tp->snd_nxt, TCP_SKB_CB(skb)->seq)) { ++ sk->sk_send_head = skb; ++ break; ++ } ++ } ++ ++ if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN) { ++ struct inet_sock *inet = inet_sk(sk); ++ if (inet->num == 0) { ++ cpt_object_t *lobj = NULL; ++ ++ if ((int)si->cpt_parent != -1) ++ lobj = lookup_cpt_obj_byindex(CPT_OBJ_SOCKET, si->cpt_parent, ctx); ++ ++ if (lobj && lobj->o_obj) { ++ inet->num = ntohs(inet->sport); ++ local_bh_disable(); ++ __inet_inherit_port(&tcp_hashinfo, lobj->o_obj, sk); ++ local_bh_enable(); ++ dprintk_ctx("port inherited from parent\n"); ++ } else { ++ struct sock *lsk = find_parent(inet->sport, ctx); ++ if (lsk) { ++ inet->num = ntohs(inet->sport); ++ local_bh_disable(); ++ __inet_inherit_port(&tcp_hashinfo, lsk, sk); ++ local_bh_enable(); ++ dprintk_ctx("port inherited\n"); ++ } else { ++ eprintk_ctx("we are kinda lost...\n"); ++ } ++ } ++ } ++ ++ sk->sk_prot->hash(sk); ++ ++ if (inet_csk(sk)->icsk_ack.pending&ICSK_ACK_TIMER) ++ sk_reset_timer(sk, &inet_csk(sk)->icsk_delack_timer, inet_csk(sk)->icsk_ack.timeout); ++ if (inet_csk(sk)->icsk_pending) ++ sk_reset_timer(sk, &inet_csk(sk)->icsk_retransmit_timer, ++ inet_csk(sk)->icsk_timeout); ++ if (sock_flag(sk, SOCK_KEEPOPEN)) { ++ unsigned long expires = jiffies_import(si->cpt_ka_timeout); ++ if (time_after(jiffies, expires)) ++ expires = jiffies + HZ; ++ sk_reset_timer(sk, &sk->sk_timer, expires); ++ } ++ } ++ ++ return 0; ++} ++ ++ ++int rst_socket_in(struct cpt_sock_image *si, loff_t pos, struct sock *sk, ++ struct 
cpt_context *ctx) ++{ ++ struct inet_sock *inet = inet_sk(sk); ++ ++ lock_sock(sk); ++ ++ sk->sk_state = si->cpt_state; ++ ++ inet->daddr = si->cpt_daddr; ++ inet->dport = si->cpt_dport; ++ inet->saddr = si->cpt_saddr; ++ inet->rcv_saddr = si->cpt_rcv_saddr; ++ inet->sport = si->cpt_sport; ++ inet->uc_ttl = si->cpt_uc_ttl; ++ inet->tos = si->cpt_tos; ++ inet->cmsg_flags = si->cpt_cmsg_flags; ++ inet->mc_index = si->cpt_mc_index; ++ inet->mc_addr = si->cpt_mc_addr; ++ inet->hdrincl = si->cpt_hdrincl; ++ inet->mc_ttl = si->cpt_mc_ttl; ++ inet->mc_loop = si->cpt_mc_loop; ++ inet->pmtudisc = si->cpt_pmtudisc; ++ inet->recverr = si->cpt_recverr; ++ inet->freebind = si->cpt_freebind; ++ inet->id = si->cpt_idcounter; ++ ++ inet->cork.flags = si->cpt_cork_flags; ++ inet->cork.fragsize = si->cpt_cork_fragsize; ++ inet->cork.length = si->cpt_cork_length; ++ inet->cork.addr = si->cpt_cork_addr; ++ inet->cork.fl.fl4_src = si->cpt_cork_saddr; ++ inet->cork.fl.fl4_dst = si->cpt_cork_daddr; ++ inet->cork.fl.oif = si->cpt_cork_oif; ++ if (inet->cork.fragsize) { ++ if (ip_route_output_key(&inet->cork.rt, &inet->cork.fl)) { ++ eprintk_ctx("failed to restore cork route\n"); ++ inet->cork.fragsize = 0; ++ } ++ } ++ ++ if (sk->sk_type == SOCK_DGRAM && sk->sk_protocol == IPPROTO_UDP) { ++ struct udp_sock *up = udp_sk(sk); ++ up->pending = si->cpt_udp_pending; ++ up->corkflag = si->cpt_udp_corkflag; ++ up->encap_type = si->cpt_udp_encap; ++ up->len = si->cpt_udp_len; ++ } ++ ++ if (sk->sk_family == AF_INET6) { ++ struct ipv6_pinfo *np = inet6_sk(sk); ++ ++ memcpy(&np->saddr, si->cpt_saddr6, 16); ++ memcpy(&np->rcv_saddr, si->cpt_rcv_saddr6, 16); ++ memcpy(&np->daddr, si->cpt_daddr6, 16); ++ np->flow_label = si->cpt_flow_label6; ++ np->frag_size = si->cpt_frag_size6; ++ np->hop_limit = si->cpt_hop_limit6; ++ np->mcast_hops = si->cpt_mcast_hops6; ++ np->mcast_oif = si->cpt_mcast_oif6; ++ np->rxopt.all = si->cpt_rxopt6; ++ np->mc_loop = si->cpt_mc_loop6; ++ np->recverr = si->cpt_recverr6; 
++ np->sndflow = si->cpt_sndflow6; ++ np->pmtudisc = si->cpt_pmtudisc6; ++ np->ipv6only = si->cpt_ipv6only6; ++ ++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) ++ if (si->cpt_mapped) { ++ extern struct inet_connection_sock_af_ops ipv6_mapped; ++ if (sk->sk_type == SOCK_STREAM && ++ sk->sk_protocol == IPPROTO_TCP) { ++ inet_csk(sk)->icsk_af_ops = &ipv6_mapped; ++ sk->sk_backlog_rcv = tcp_v4_do_rcv; ++ } ++ } ++#endif ++ } ++ ++ restore_queues(sk, si, pos, ctx); ++ ++ if (sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP) ++ rst_socket_tcp(si, pos, sk, ctx); ++ ++ release_sock(sk); ++ return 0; ++} ++ ++int cpt_attach_accept(struct sock *lsk, struct sock *sk, cpt_context_t *ctx) ++{ ++ struct request_sock *req; ++ ++ if (lsk->sk_state != TCP_LISTEN) ++ return -EINVAL; ++ ++ req = reqsk_alloc(&tcp_request_sock_ops); ++ if (!req) ++ return -ENOMEM; ++ ++ sk->sk_socket = NULL; ++ sk->sk_sleep = NULL; ++ inet_csk_reqsk_queue_add(lsk, req, sk); ++ return 0; ++} ++ ++static __inline__ u32 __tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd) ++{ ++ return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1)); ++} ++ ++int rst_restore_synwait_queue(struct sock *sk, struct cpt_sock_image *si, ++ loff_t pos, struct cpt_context *ctx) ++{ ++ int err; ++ loff_t end = si->cpt_next; ++ ++ pos += si->cpt_hdrlen; ++ while (pos < end) { ++ struct cpt_openreq_image oi; ++ ++ err = rst_get_object(CPT_OBJ_OPENREQ, pos, &oi, ctx); ++ if (err) { ++ err = rst_sock_attr(&pos, sk, ctx); ++ if (err) ++ return err; ++ continue; ++ } ++ ++ if (oi.cpt_object == CPT_OBJ_OPENREQ) { ++ struct request_sock *req = reqsk_alloc(&tcp_request_sock_ops); ++ if (req == NULL) ++ return -ENOMEM; ++ ++ memset(req, 0, sizeof(*req)); ++ tcp_rsk(req)->rcv_isn = oi.cpt_rcv_isn; ++ tcp_rsk(req)->snt_isn = oi.cpt_snt_isn; ++ inet_rsk(req)->rmt_port = oi.cpt_rmt_port; ++ req->mss = oi.cpt_mss; ++ req->retrans = oi.cpt_retrans; ++ inet_rsk(req)->snd_wscale = oi.cpt_snd_wscale; ++ 
inet_rsk(req)->rcv_wscale = oi.cpt_rcv_wscale; ++ inet_rsk(req)->tstamp_ok = oi.cpt_tstamp_ok; ++ inet_rsk(req)->sack_ok = oi.cpt_sack_ok; ++ inet_rsk(req)->wscale_ok = oi.cpt_wscale_ok; ++ inet_rsk(req)->ecn_ok = oi.cpt_ecn_ok; ++ inet_rsk(req)->acked = oi.cpt_acked; ++ req->window_clamp = oi.cpt_window_clamp; ++ req->rcv_wnd = oi.cpt_rcv_wnd; ++ req->ts_recent = oi.cpt_ts_recent; ++ req->expires = jiffies_import(oi.cpt_expires); ++ ++ if (oi.cpt_family == AF_INET) { ++ memcpy(&inet_rsk(req)->loc_addr, oi.cpt_loc_addr, 4); ++ memcpy(&inet_rsk(req)->rmt_addr, oi.cpt_rmt_addr, 4); ++ inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); ++ } else { ++#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) ++ memcpy(&inet6_rsk(req)->loc_addr, oi.cpt_loc_addr, 16); ++ memcpy(&inet6_rsk(req)->rmt_addr, oi.cpt_rmt_addr, 16); ++ inet6_rsk(req)->iif = oi.cpt_iif; ++ inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); ++#endif ++ } ++ } ++ pos += oi.cpt_next; ++ } ++ return 0; ++} ++ ++int rst_sk_mcfilter_in(struct sock *sk, struct cpt_sockmc_image *v, ++ loff_t pos, cpt_context_t *ctx) ++{ ++ struct ip_mreqn imr; ++ ++ if (v->cpt_mode || v->cpt_next != v->cpt_hdrlen) { ++ eprintk_ctx("IGMPv3 is still not supported\n"); ++ return -EINVAL; ++ } ++ ++ memset(&imr, 0, sizeof(imr)); ++ imr.imr_ifindex = v->cpt_ifindex; ++ imr.imr_multiaddr.s_addr = v->cpt_mcaddr[0]; ++ return ip_mc_join_group(sk, &imr); ++} ++ ++#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) ++int rst_sk_mcfilter_in6(struct sock *sk, struct cpt_sockmc_image *v, ++ loff_t pos, cpt_context_t *ctx) ++{ ++ ++ if (v->cpt_mode || v->cpt_next != v->cpt_hdrlen) { ++ eprintk_ctx("IGMPv3 is still not supported\n"); ++ return -EINVAL; ++ } ++ ++ return ipv6_sock_mc_join(sk, v->cpt_ifindex, ++ (struct in6_addr*)v->cpt_mcaddr); ++} ++#endif +diff -upr linux-2.6.16.orig/kernel/cpt/rst_sysvipc.c linux-2.6.16-026test015/kernel/cpt/rst_sysvipc.c +--- linux-2.6.16.orig/kernel/cpt/rst_sysvipc.c 2006-07-04 
14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/cpt/rst_sysvipc.c 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,409 @@ ++/* ++ * ++ * kernel/cpt/rst_sysvipc.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include <linux/config.h> ++#include <linux/version.h> ++#include <linux/module.h> ++#include <linux/init.h> ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/slab.h> ++#include <linux/file.h> ++#include <linux/mm.h> ++#include <linux/errno.h> ++#include <linux/major.h> ++#include <linux/pipe_fs_i.h> ++#include <linux/mman.h> ++#include <linux/shm.h> ++/* FIXME. x86_64 has asm/ipc.h forgotten? */ ++#include <asm-generic/ipc.h> ++#include <asm/uaccess.h> ++#include <asm/unistd.h> ++#include <ub/ub_mem.h> ++#include <linux/cpt_image.h> ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++#include "cpt_kernel.h" ++ ++struct _warg { ++ struct file *file; ++ struct cpt_sysvshm_image *v; ++}; ++ ++static int fixup_one_shm(struct shmid_kernel *shp, void *arg) ++{ ++ struct _warg *warg = arg; ++ ++ if (shp->shm_file != warg->file) ++ return 0; ++ if (shp->shm_nattch) ++ return -EEXIST; ++ ++ shp->shm_perm.uid = warg->v->cpt_uid; ++ shp->shm_perm.gid = warg->v->cpt_gid; ++ shp->shm_perm.cuid = warg->v->cpt_cuid; ++ shp->shm_perm.cgid = warg->v->cpt_cgid; ++ shp->shm_perm.mode = warg->v->cpt_mode; ++ ++ shp->shm_atim = warg->v->cpt_atime; ++ shp->shm_dtim = warg->v->cpt_dtime; ++ shp->shm_ctim = warg->v->cpt_ctime; ++ shp->shm_cprid = warg->v->cpt_creator; ++ shp->shm_lprid = warg->v->cpt_last; ++ ++ /* TODO: fix shp->mlock_user? 
*/ ++ return 1; ++} ++ ++static int fixup_shm(struct file *file, struct cpt_sysvshm_image *v) ++{ ++ struct _warg warg; ++ ++ warg.file = file; ++ warg.v = v; ++ ++ return sysvipc_walk_shm(fixup_one_shm, &warg); ++} ++ ++static int fixup_shm_data(struct file *file, loff_t pos, loff_t end, ++ struct cpt_context *ctx) ++{ ++ struct cpt_page_block pgb; ++ ssize_t (*do_write)(struct file *, const char __user *, size_t, loff_t *ppos); ++ ++ do_write = file->f_dentry->d_inode->i_fop->write; ++ if (do_write == NULL) { ++ eprintk_ctx("No TMPFS? Cannot restore content of SYSV SHM\n"); ++ return -EINVAL; ++ } ++ ++ while (pos < end) { ++ loff_t opos; ++ loff_t ipos; ++ int count; ++ int err; ++ ++ err = rst_get_object(CPT_OBJ_PAGES, pos, &pgb, ctx); ++ if (err) ++ return err; ++ dprintk_ctx("restoring SHM block: %08x-%08x\n", ++ (__u32)pgb.cpt_start, (__u32)pgb.cpt_end); ++ ipos = pos + pgb.cpt_hdrlen; ++ opos = pgb.cpt_start; ++ count = pgb.cpt_end-pgb.cpt_start; ++ while (count > 0) { ++ mm_segment_t oldfs; ++ int copy = count; ++ ++ if (copy > PAGE_SIZE) ++ copy = PAGE_SIZE; ++ (void)cpt_get_buf(ctx); ++ oldfs = get_fs(); set_fs(KERNEL_DS); ++ err = ctx->pread(ctx->tmpbuf, copy, ctx, ipos); ++ set_fs(oldfs); ++ if (err) { ++ __cpt_release_buf(ctx); ++ return err; ++ } ++ oldfs = get_fs(); set_fs(KERNEL_DS); ++ ipos += copy; ++ err = do_write(file, ctx->tmpbuf, copy, &opos); ++ set_fs(oldfs); ++ __cpt_release_buf(ctx); ++ if (err != copy) { ++ eprintk_ctx("write() failure\n"); ++ if (err >= 0) ++ err = -EIO; ++ return err; ++ } ++ count -= copy; ++ } ++ pos += pgb.cpt_next; ++ } ++ return 0; ++} ++ ++struct file * rst_sysv_shm(loff_t pos, struct cpt_context *ctx) ++{ ++ struct file *file; ++ int err; ++ loff_t dpos, epos; ++ union { ++ struct cpt_file_image fi; ++ struct cpt_sysvshm_image shmi; ++ struct cpt_inode_image ii; ++ } u; ++ ++ err = rst_get_object(CPT_OBJ_FILE, pos, &u.fi, ctx); ++ if (err < 0) ++ goto err_out; ++ pos = u.fi.cpt_inode; ++ err = 
rst_get_object(CPT_OBJ_INODE, pos, &u.ii, ctx); ++ if (err < 0) ++ goto err_out; ++ dpos = pos + u.ii.cpt_hdrlen; ++ epos = pos + u.ii.cpt_next; ++ err = rst_get_object(CPT_OBJ_SYSV_SHM, pos + u.ii.cpt_hdrlen, &u.shmi, ctx); ++ if (err < 0) ++ goto err_out; ++ dpos += u.shmi.cpt_next; ++ ++ file = sysvipc_setup_shm(u.shmi.cpt_key, u.shmi.cpt_id, ++ u.shmi.cpt_segsz, u.shmi.cpt_mode); ++ if (!IS_ERR(file)) { ++ err = fixup_shm(file, &u.shmi); ++ if (err != -EEXIST && dpos < epos) ++ err = fixup_shm_data(file, dpos, epos, ctx); ++ } ++ ++ return file; ++ ++err_out: ++ return ERR_PTR(err); ++} ++ ++static int attach_one_undo(int semid, struct sem_array *sma, void *arg) ++{ ++ struct sem_undo *su = arg; ++ struct sem_undo_list *undo_list = current->sysvsem.undo_list; ++ ++ if (semid != su->semid) ++ return 0; ++ ++ su->proc_next = undo_list->proc_list; ++ undo_list->proc_list = su; ++ ++ su->id_next = sma->undo; ++ sma->undo = su; ++ ++ return 1; ++} ++ ++static int attach_undo(struct sem_undo *su) ++{ ++ return sysvipc_walk_sem(attach_one_undo, su); ++} ++ ++static int do_rst_semundo(struct cpt_object_hdr *sui, loff_t pos, struct cpt_context *ctx) ++{ ++ int err; ++ struct sem_undo_list *undo_list; ++ ++ if (current->sysvsem.undo_list) { ++ eprintk_ctx("Funny undo_list\n"); ++ return 0; ++ } ++ ++ undo_list = ub_kmalloc(sizeof(struct sem_undo_list), GFP_KERNEL); ++ if (undo_list == NULL) ++ return -ENOMEM; ++ memset(undo_list, 0, sizeof(struct sem_undo_list)); ++ atomic_set(&undo_list->refcnt, 1); ++ spin_lock_init(&undo_list->lock); ++ current->sysvsem.undo_list = undo_list; ++ ++ if (sui->cpt_next > sui->cpt_hdrlen) { ++ loff_t offset = pos + sui->cpt_hdrlen; ++ do { ++ struct sem_undo *new; ++ struct cpt_sysvsem_undo_image spi; ++ err = rst_get_object(CPT_OBJ_SYSVSEM_UNDO_REC, offset, &spi, ctx); ++ if (err) ++ goto out; ++ new = ub_kmalloc(sizeof(struct sem_undo) + ++ sizeof(short)*spi.cpt_nsem, GFP_KERNEL); ++ if (!new) { ++ err = -ENOMEM; ++ goto out; ++ } ++ ++ 
memset(new, 0, sizeof(struct sem_undo) + sizeof(short)*spi.cpt_nsem); ++ new->semadj = (short *) &new[1]; ++ new->semid = spi.cpt_id; ++ err = ctx->pread(new->semadj, spi.cpt_nsem*sizeof(short), ctx, offset + spi.cpt_hdrlen); ++ if (err) { ++ kfree(new); ++ goto out; ++ } ++ err = attach_undo(new); ++ if (err <= 0) { ++ if (err == 0) ++ err = -ENOENT; ++ kfree(new); ++ goto out; ++ } ++ offset += spi.cpt_next; ++ } while (offset < pos + sui->cpt_next); ++ } ++ err = 0; ++ ++out: ++ return err; ++} ++ ++__u32 rst_semundo_flag(struct cpt_task_image *ti, struct cpt_context *ctx) ++{ ++ __u32 flag = 0; ++ ++#if 0 ++ if (ti->cpt_sysvsem_undo == CPT_NULL || ++ lookup_cpt_obj_bypos(CPT_OBJ_SYSVSEM_UNDO, ti->cpt_sysvsem_undo)) ++ flag |= CLONE_SYSVSEM; ++#endif ++ return flag; ++} ++ ++int rst_semundo_complete(struct cpt_task_image *ti, struct cpt_context *ctx) ++{ ++ int err; ++ struct sem_undo_list *f = current->sysvsem.undo_list; ++ cpt_object_t *obj; ++ struct cpt_object_hdr sui; ++ ++ if (ti->cpt_sysvsem_undo == CPT_NULL) { ++ exit_sem(current); ++ return 0; ++ } ++ ++ obj = lookup_cpt_obj_bypos(CPT_OBJ_SYSVSEM_UNDO, ti->cpt_sysvsem_undo, ctx); ++ if (obj) { ++ if (obj->o_obj != f) { ++ exit_sem(current); ++ f = obj->o_obj; ++ atomic_inc(&f->refcnt); ++ current->sysvsem.undo_list = f; ++ } ++ return 0; ++ } ++ ++ if ((err = rst_get_object(CPT_OBJ_SYSVSEM_UNDO, ti->cpt_sysvsem_undo, &sui, ctx)) != 0) ++ goto out; ++ ++ if ((err = do_rst_semundo(&sui, ti->cpt_sysvsem_undo, ctx)) != 0) ++ goto out; ++ ++ err = -ENOMEM; ++ obj = cpt_object_add(CPT_OBJ_SYSVSEM_UNDO, f, ctx); ++ if (obj) { ++ err = 0; ++ cpt_obj_setpos(obj, ti->cpt_sysvsem_undo, ctx); ++ } ++ ++ return 0; ++ ++out: ++ return err; ++} ++ ++struct _sarg { ++ int semid; ++ struct cpt_sysvsem_image *v; ++ __u32 *arr; ++}; ++ ++static int fixup_one_sem(int semid, struct sem_array *sma, void *arg) ++{ ++ struct _sarg *warg = arg; ++ ++ if (semid != warg->semid) ++ return 0; ++ ++ sma->sem_perm.uid = 
warg->v->cpt_uid; ++ sma->sem_perm.gid = warg->v->cpt_gid; ++ sma->sem_perm.cuid = warg->v->cpt_cuid; ++ sma->sem_perm.cgid = warg->v->cpt_cgid; ++ sma->sem_perm.mode = warg->v->cpt_mode; ++ sma->sem_perm.seq = warg->v->cpt_seq; ++ ++ sma->sem_ctime = warg->v->cpt_ctime; ++ sma->sem_otime = warg->v->cpt_otime; ++ memcpy(sma->sem_base, warg->arr, sma->sem_nsems*8); ++ return 1; ++} ++ ++static int fixup_sem(int semid, struct cpt_sysvsem_image *v, __u32 *arr) ++{ ++ struct _sarg warg; ++ ++ warg.semid = semid; ++ warg.v = v; ++ warg.arr = arr; ++ ++ return sysvipc_walk_sem(fixup_one_sem, &warg); ++} ++ ++ ++static int restore_sem(loff_t pos, struct cpt_sysvsem_image *si, ++ struct cpt_context *ctx) ++{ ++ int err; ++ __u32 *arr; ++ int nsems = (si->cpt_next - si->cpt_hdrlen)/8; ++ ++ arr = kmalloc(nsems*8, GFP_KERNEL); ++ if (!arr) ++ return -ENOMEM; ++ ++ err = ctx->pread(arr, nsems*8, ctx, pos+si->cpt_hdrlen); ++ if (err) ++ goto out; ++ err = sysvipc_setup_sem(si->cpt_key, si->cpt_id, nsems, si->cpt_mode); ++ if (err < 0) { ++ eprintk_ctx("SEM 3\n"); ++ goto out; ++ } ++ err = fixup_sem(si->cpt_id, si, arr); ++ if (err == 0) ++ err = -ESRCH; ++ if (err > 0) ++ err = 0; ++out: ++ kfree(arr); ++ return err; ++} ++ ++static int rst_sysv_sem(struct cpt_context *ctx) ++{ ++ int err; ++ loff_t sec = ctx->sections[CPT_SECT_SYSV_SEM]; ++ loff_t endsec; ++ struct cpt_section_hdr h; ++ struct cpt_sysvsem_image sbuf; ++ ++ if (sec == CPT_NULL) ++ return 0; ++ ++ err = ctx->pread(&h, sizeof(h), ctx, sec); ++ if (err) ++ return err; ++ if (h.cpt_section != CPT_SECT_SYSV_SEM || h.cpt_hdrlen < sizeof(h)) ++ return -EINVAL; ++ ++ endsec = sec + h.cpt_next; ++ sec += h.cpt_hdrlen; ++ while (sec < endsec) { ++ int err; ++ err = rst_get_object(CPT_OBJ_SYSV_SEM, sec, &sbuf, ctx); ++ if (err) ++ return err; ++ err = restore_sem(sec, &sbuf, ctx); ++ if (err) ++ return err; ++ sec += sbuf.cpt_next; ++ } ++ return 0; ++} ++ ++int rst_sysv_ipc(struct cpt_context *ctx) ++{ ++ return 
rst_sysv_sem(ctx); ++} +diff -upr linux-2.6.16.orig/kernel/cpt/rst_tty.c linux-2.6.16-026test015/kernel/cpt/rst_tty.c +--- linux-2.6.16.orig/kernel/cpt/rst_tty.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/cpt/rst_tty.c 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,380 @@ ++/* ++ * ++ * kernel/cpt/rst_tty.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include <linux/config.h> ++#include <linux/version.h> ++#include <linux/module.h> ++#include <linux/init.h> ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/slab.h> ++#include <linux/file.h> ++#include <linux/mm.h> ++#include <linux/errno.h> ++#include <linux/major.h> ++#include <linux/pipe_fs_i.h> ++#include <linux/mman.h> ++#include <linux/mount.h> ++#include <linux/tty.h> ++#include <linux/vmalloc.h> ++#include <asm/unistd.h> ++#include <asm/uaccess.h> ++#include <linux/cpt_image.h> ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++#include "cpt_mm.h" ++#include "cpt_files.h" ++#include "cpt_kernel.h" ++ ++static int pty_setup(struct tty_struct *stty, loff_t pos, ++ struct cpt_tty_image *pi, struct cpt_context *ctx) ++{ ++ unsigned long flags; ++ ++ stty->pgrp = -1; ++ stty->session = 0; ++ stty->packet = pi->cpt_packet; ++ stty->stopped = pi->cpt_stopped; ++ stty->hw_stopped = pi->cpt_hw_stopped; ++ stty->flow_stopped = pi->cpt_flow_stopped; ++#define DONOT_CHANGE ((1<<TTY_CHARGED)|(1<<TTY_CLOSING)|(1<<TTY_LDISC)) ++ flags = stty->flags & DONOT_CHANGE; ++ stty->flags = flags | (pi->cpt_flags & ~DONOT_CHANGE); ++ stty->ctrl_status = pi->cpt_ctrl_status; ++ stty->winsize.ws_row = pi->cpt_ws_row; ++ stty->winsize.ws_col = pi->cpt_ws_col; ++ stty->winsize.ws_ypixel = pi->cpt_ws_prow; ++ stty->winsize.ws_xpixel = pi->cpt_ws_pcol; ++ stty->canon_column = pi->cpt_canon_column; ++ stty->column = pi->cpt_column; ++ stty->raw = pi->cpt_raw; ++ stty->real_raw = 
pi->cpt_real_raw; ++ stty->erasing = pi->cpt_erasing; ++ stty->lnext = pi->cpt_lnext; ++ stty->icanon = pi->cpt_icanon; ++ stty->closing = pi->cpt_closing; ++ stty->minimum_to_wake = pi->cpt_minimum_to_wake; ++ ++ stty->termios->c_iflag = pi->cpt_c_iflag; ++ stty->termios->c_oflag = pi->cpt_c_oflag; ++ stty->termios->c_lflag = pi->cpt_c_lflag; ++ stty->termios->c_cflag = pi->cpt_c_cflag; ++ memcpy(&stty->termios->c_cc, &pi->cpt_c_cc, NCCS); ++ memcpy(stty->read_flags, pi->cpt_read_flags, sizeof(stty->read_flags)); ++ ++ if (pi->cpt_next > pi->cpt_hdrlen) { ++ int err; ++ struct cpt_obj_bits b; ++ err = rst_get_object(CPT_OBJ_BITS, pos + pi->cpt_hdrlen, &b, ctx); ++ if (err) ++ return err; ++ if (b.cpt_size == 0) ++ return 0; ++ err = ctx->pread(stty->read_buf, b.cpt_size, ctx, pos + pi->cpt_hdrlen + b.cpt_hdrlen); ++ if (err) ++ return err; ++ ++ spin_lock_irq(&stty->read_lock); ++ stty->read_tail = 0; ++ stty->read_cnt = b.cpt_size; ++ stty->read_head = b.cpt_size; ++ stty->canon_head = stty->read_tail + pi->cpt_canon_head; ++ stty->canon_data = pi->cpt_canon_data; ++ spin_unlock_irq(&stty->read_lock); ++ } ++ ++ return 0; ++} ++ ++/* Find slave/master tty in image, when we already know master/slave. ++ * It might be optimized, of course. 
*/ ++static loff_t find_pty_pair(struct tty_struct *stty, loff_t pos, struct cpt_tty_image *pi, struct cpt_context *ctx) ++{ ++ int err; ++ loff_t sec = ctx->sections[CPT_SECT_TTY]; ++ loff_t endsec; ++ struct cpt_section_hdr h; ++ struct cpt_tty_image *pibuf; ++ ++ err = ctx->pread(&h, sizeof(h), ctx, sec); ++ if (err) ++ return CPT_NULL; ++ if (h.cpt_section != CPT_SECT_TTY || h.cpt_hdrlen < sizeof(h)) ++ return CPT_NULL; ++ pibuf = kmalloc(sizeof(*pibuf), GFP_KERNEL); ++ if (pibuf == NULL) { ++ eprintk_ctx("cannot allocate buffer\n"); ++ return CPT_NULL; ++ } ++ endsec = sec + h.cpt_next; ++ sec += h.cpt_hdrlen; ++ while (sec < endsec) { ++ if (rst_get_object(CPT_OBJ_TTY, sec, pibuf, ctx)) ++ return CPT_NULL; ++ if (pibuf->cpt_index == pi->cpt_index && ++ !((pi->cpt_drv_flags^pibuf->cpt_drv_flags)&TTY_DRIVER_DEVPTS_MEM) && ++ pos != sec) { ++ pty_setup(stty, sec, pibuf, ctx); ++ return sec; ++ } ++ sec += pibuf->cpt_next; ++ } ++ kfree(pibuf); ++ return CPT_NULL; ++} ++ ++static int fixup_tty_attrs(struct cpt_inode_image *ii, struct file *master, ++ struct cpt_context *ctx) ++{ ++ int err; ++ struct iattr newattrs; ++ struct dentry *d = master->f_dentry; ++ ++ newattrs.ia_valid = ATTR_UID|ATTR_GID|ATTR_MODE; ++ newattrs.ia_uid = ii->cpt_uid; ++ newattrs.ia_gid = ii->cpt_gid; ++ newattrs.ia_mode = ii->cpt_mode; ++ ++ mutex_lock(&d->d_inode->i_mutex); ++ err = notify_change(d, &newattrs); ++ mutex_unlock(&d->d_inode->i_mutex); ++ ++ return err; ++} ++ ++/* NOTE: "portable", but ugly thing. To allocate /dev/pts/N, we open ++ * /dev/ptmx until we get pty with desired index. 
++ */ ++ ++struct file *ptmx_open(int index, unsigned int flags) ++{ ++ struct file *file; ++ struct file **stack = NULL; ++ int depth = 0; ++ ++ for (;;) { ++ struct tty_struct *tty; ++ ++ file = filp_open("/dev/ptmx", flags|O_NONBLOCK|O_NOCTTY|O_RDWR, 0); ++ if (IS_ERR(file)) ++ break; ++ tty = file->private_data; ++ if (tty->index == index) ++ break; ++ ++ if (depth == PAGE_SIZE/sizeof(struct file *)) { ++ fput(file); ++ file = ERR_PTR(-EBUSY); ++ break; ++ } ++ if (stack == NULL) { ++ stack = (struct file **)__get_free_page(GFP_KERNEL); ++ if (!stack) { ++ fput(file); ++ file = ERR_PTR(-ENOMEM); ++ break; ++ } ++ } ++ stack[depth] = file; ++ depth++; ++ } ++ while (depth > 0) { ++ depth--; ++ fput(stack[depth]); ++ } ++ if (stack) ++ free_page((unsigned long)stack); ++ return file; ++} ++ ++ ++struct file * rst_open_tty(struct cpt_file_image *fi, struct cpt_inode_image *ii, ++ unsigned flags, struct cpt_context *ctx) ++{ ++ int err; ++ cpt_object_t *obj; ++ struct file *master, *slave; ++ struct tty_struct *stty; ++ struct cpt_tty_image *pi; ++ static char *a = "pqrstuvwxyzabcde"; ++ static char *b = "0123456789abcdef"; ++ char pairname[16]; ++ unsigned master_flags, slave_flags; ++ ++ if (fi->cpt_priv == CPT_NULL) ++ return ERR_PTR(-EINVAL); ++ ++ obj = lookup_cpt_obj_bypos(CPT_OBJ_TTY, fi->cpt_priv, ctx); ++ if (obj && obj->o_parent) { ++ dprintk_ctx("obtained pty as pair to existing\n"); ++ master = obj->o_parent; ++ stty = master->private_data; ++ ++ if (stty->driver->subtype == PTY_TYPE_MASTER && ++ (stty->driver->flags&TTY_DRIVER_DEVPTS_MEM)) { ++ wprintk_ctx("cloning ptmx\n"); ++ get_file(master); ++ return master; ++ } ++ ++ master = dentry_open(dget(master->f_dentry), ++ mntget(master->f_vfsmnt), flags); ++ if (!IS_ERR(master)) { ++ stty = master->private_data; ++ if (stty->driver->subtype != PTY_TYPE_MASTER) ++ fixup_tty_attrs(ii, master, ctx); ++ } ++ return master; ++ } ++ ++ pi = cpt_get_buf(ctx); ++ err = rst_get_object(CPT_OBJ_TTY, fi->cpt_priv, 
pi, ctx); ++ if (err) { ++ cpt_release_buf(ctx); ++ return ERR_PTR(err); ++ } ++ ++ master_flags = slave_flags = 0; ++ if (pi->cpt_drv_subtype == PTY_TYPE_MASTER) ++ master_flags = flags; ++ else ++ slave_flags = flags; ++ ++ /* ++ * Open pair master/slave. ++ */ ++ if (pi->cpt_drv_flags&TTY_DRIVER_DEVPTS_MEM) { ++ master = ptmx_open(pi->cpt_index, master_flags); ++ } else { ++ sprintf(pairname, "/dev/pty%c%c", a[pi->cpt_index/16], b[pi->cpt_index%16]); ++ master = filp_open(pairname, master_flags|O_NONBLOCK|O_NOCTTY|O_RDWR, 0); ++ } ++ if (IS_ERR(master)) { ++ eprintk_ctx("filp_open master: %Ld %ld\n", fi->cpt_priv, PTR_ERR(master)); ++ cpt_release_buf(ctx); ++ return master; ++ } ++ stty = master->private_data; ++ clear_bit(TTY_PTY_LOCK, &stty->flags); ++ if (pi->cpt_drv_flags&TTY_DRIVER_DEVPTS_MEM) ++ sprintf(pairname, "/dev/pts/%d", stty->index); ++ else ++ sprintf(pairname, "/dev/tty%c%c", a[stty->index/16], b[stty->index%16]); ++ slave = filp_open(pairname, slave_flags|O_NONBLOCK|O_NOCTTY|O_RDWR, 0); ++ if (IS_ERR(slave)) { ++ eprintk_ctx("filp_open slave %s: %ld\n", pairname, PTR_ERR(slave)); ++ fput(master); ++ cpt_release_buf(ctx); ++ return slave; ++ } ++ ++ if (pi->cpt_drv_subtype != PTY_TYPE_MASTER) ++ fixup_tty_attrs(ii, slave, ctx); ++ ++ cpt_object_add(CPT_OBJ_TTY, master->private_data, ctx); ++ cpt_object_add(CPT_OBJ_TTY, slave->private_data, ctx); ++ cpt_object_add(CPT_OBJ_FILE, master, ctx); ++ cpt_object_add(CPT_OBJ_FILE, slave, ctx); ++ ++ if (pi->cpt_drv_subtype == PTY_TYPE_MASTER) { ++ loff_t pos; ++ obj = lookup_cpt_object(CPT_OBJ_TTY, master->private_data, ctx); ++ obj->o_parent = master; ++ cpt_obj_setpos(obj, fi->cpt_priv, ctx); ++ pty_setup(stty, fi->cpt_priv, pi, ctx); ++ ++ obj = lookup_cpt_object(CPT_OBJ_TTY, slave->private_data, ctx); ++ obj->o_parent = slave; ++ pos = find_pty_pair(stty->link, fi->cpt_priv, pi, ctx); ++ cpt_obj_setpos(obj, pos, ctx); ++ ++ obj = lookup_cpt_object(CPT_OBJ_FILE, slave, ctx); ++ cpt_obj_setpos(obj, 
CPT_NULL, ctx); ++ get_file(master); ++ cpt_release_buf(ctx); ++ return master; ++ } else { ++ loff_t pos; ++ obj = lookup_cpt_object(CPT_OBJ_TTY, slave->private_data, ctx); ++ obj->o_parent = slave; ++ cpt_obj_setpos(obj, fi->cpt_priv, ctx); ++ pty_setup(stty->link, fi->cpt_priv, pi, ctx); ++ ++ obj = lookup_cpt_object(CPT_OBJ_TTY, master->private_data, ctx); ++ obj->o_parent = master; ++ pos = find_pty_pair(stty, fi->cpt_priv, pi, ctx); ++ cpt_obj_setpos(obj, pos, ctx); ++ ++ obj = lookup_cpt_object(CPT_OBJ_FILE, master, ctx); ++ cpt_obj_setpos(obj, CPT_NULL, ctx); ++ get_file(slave); ++ cpt_release_buf(ctx); ++ return slave; ++ } ++} ++ ++int rst_tty_jobcontrol(struct cpt_context *ctx) ++{ ++ int err; ++ loff_t sec = ctx->sections[CPT_SECT_TTY]; ++ loff_t endsec; ++ struct cpt_section_hdr h; ++ ++ err = ctx->pread(&h, sizeof(h), ctx, sec); ++ if (err) ++ return err; ++ if (h.cpt_section != CPT_SECT_TTY || h.cpt_hdrlen < sizeof(h)) ++ return -EINVAL; ++ endsec = sec + h.cpt_next; ++ sec += h.cpt_hdrlen; ++ while (sec < endsec) { ++ cpt_object_t *obj; ++ struct cpt_tty_image *pibuf = cpt_get_buf(ctx); ++ ++ if (rst_get_object(CPT_OBJ_TTY, sec, pibuf, ctx)) { ++ cpt_release_buf(ctx); ++ return -EINVAL; ++ } ++ ++ obj = lookup_cpt_obj_bypos(CPT_OBJ_TTY, sec, ctx); ++ if (obj) { ++ struct tty_struct *stty = obj->o_obj; ++ if ((int)pibuf->cpt_pgrp > 0) { ++ stty->pgrp = vpid_to_pid(pibuf->cpt_pgrp); ++ if (stty->pgrp == -1) ++ dprintk_ctx("unknown tty pgrp %d\n", pibuf->cpt_pgrp); ++ } else if (pibuf->cpt_pgrp) { ++ stty->pgrp = alloc_pidmap(); ++ if (stty->pgrp < 0) { ++ eprintk_ctx("cannot allocate stray tty->pgrp"); ++ cpt_release_buf(ctx); ++ return -EINVAL; ++ } ++ free_pidmap(stty->pgrp); ++ } ++ if ((int)pibuf->cpt_session > 0) { ++ int sess; ++ sess = vpid_to_pid(pibuf->cpt_session); ++ if (sess == -1) { ++ dprintk_ctx("unknown tty session %d\n", pibuf->cpt_session); ++ } else if (stty->session <= 0) { ++ stty->session = sess; ++ } else if (stty->session != 
sess) { ++ wprintk_ctx("tty session mismatch 2\n"); ++ } ++ } ++ } ++ sec += pibuf->cpt_next; ++ cpt_release_buf(ctx); ++ } ++ return 0; ++} +diff -upr linux-2.6.16.orig/kernel/cpt/rst_ubc.c linux-2.6.16-026test015/kernel/cpt/rst_ubc.c +--- linux-2.6.16.orig/kernel/cpt/rst_ubc.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/cpt/rst_ubc.c 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,108 @@ ++/* ++ * ++ * kernel/cpt/rst_ubc.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include <linux/types.h> ++#include <ub/beancounter.h> ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++ ++struct user_beancounter *rst_lookup_ubc(__u64 pos, struct cpt_context *ctx) ++{ ++ cpt_object_t *obj; ++ ++ obj = lookup_cpt_obj_bypos(CPT_OBJ_UBC, pos, ctx); ++ if (obj == NULL) { ++ printk(KERN_ERR "RST: unknown ub @%Lu\n", pos); ++ return get_beancounter(get_exec_ub()); ++ } ++ return get_beancounter(obj->o_obj); ++} ++ ++static void restore_one_bc_parm(__u64 *dmp, struct ubparm *prm, int held) ++{ ++ prm->barrier = (dmp[0] == CPT_NULL ? UB_MAXVALUE : dmp[0]); ++ prm->limit = (dmp[1] == CPT_NULL ? 
UB_MAXVALUE : dmp[1]); ++ if (held) ++ prm->held = dmp[2]; ++ prm->maxheld = dmp[3]; ++ prm->minheld = dmp[4]; ++ prm->failcnt = dmp[5]; ++} ++ ++static int restore_one_bc(struct cpt_beancounter_image *v, ++ cpt_object_t *obj, struct cpt_context *ctx) ++{ ++ struct user_beancounter *bc; ++ cpt_object_t *pobj; ++ int i; ++ ++ if (v->cpt_parent != CPT_NULL) { ++ pobj = lookup_cpt_obj_bypos(CPT_OBJ_UBC, v->cpt_parent, ctx); ++ if (pobj == NULL) ++ return -ESRCH; ++ bc = get_subbeancounter_byid(pobj->o_obj, v->cpt_id, 1); ++ } else { ++ bc = get_exec_ub(); ++ while (bc->parent) ++ bc = bc->parent; ++ get_beancounter(bc); ++ } ++ if (bc == NULL) ++ return -ENOMEM; ++ obj->o_obj = bc; ++ ++ for (i = 0; i < UB_RESOURCES; i++) ++ restore_one_bc_parm(v->cpt_parms, bc->ub_parms, 0); ++ for (i = 0; i < UB_RESOURCES; i++) ++ restore_one_bc_parm(v->cpt_parms + UB_RESOURCES * 6, ++ bc->ub_store, 1); ++ return 0; ++} ++ ++int rst_undump_ubc(struct cpt_context *ctx) ++{ ++ loff_t start, end; ++ struct cpt_beancounter_image *v; ++ cpt_object_t *obj; ++ int err; ++ ++ err = rst_get_section(CPT_SECT_UBC, ctx, &start, &end); ++ if (err) ++ return err; ++ ++ while (start < end) { ++ v = cpt_get_buf(ctx); ++ err = rst_get_object(CPT_OBJ_UBC, start, v, ctx); ++ if (err) { ++ cpt_release_buf(ctx); ++ return err; ++ } ++ ++ obj = alloc_cpt_object(GFP_KERNEL, ctx); ++ cpt_obj_setpos(obj, start, ctx); ++ intern_cpt_object(CPT_OBJ_UBC, obj, ctx); ++ ++ restore_one_bc(v, obj, ctx); ++ ++ cpt_release_buf(ctx); ++ start += v->cpt_next; ++ } ++ return 0; ++} ++ ++void rst_finish_ubc(struct cpt_context *ctx) ++{ ++ cpt_object_t *obj; ++ ++ for_each_object(obj, CPT_OBJ_UBC) ++ put_beancounter(obj->o_obj); ++} +diff -upr linux-2.6.16.orig/kernel/cpt/rst_undump.c linux-2.6.16-026test015/kernel/cpt/rst_undump.c +--- linux-2.6.16.orig/kernel/cpt/rst_undump.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/cpt/rst_undump.c 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,819 @@ 
++/* ++ * ++ * kernel/cpt/rst_undump.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include <linux/config.h> ++#include <linux/version.h> ++#include <linux/module.h> ++#include <linux/init.h> ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/slab.h> ++#include <linux/file.h> ++#include <linux/mm.h> ++#include <linux/errno.h> ++#include <linux/pagemap.h> ++#include <linux/namespace.h> ++#include <linux/personality.h> ++#include <linux/binfmts.h> ++#include <linux/smp_lock.h> ++#include <linux/ve_proto.h> ++#include <linux/virtinfo.h> ++#include <linux/compat.h> ++#include <linux/vzcalluser.h> ++#include <ub/beancounter.h> ++#include <asm/desc.h> ++#include <asm/unistd.h> ++ ++#include "cpt_obj.h" ++#include "cpt_context.h" ++#include "cpt_files.h" ++#include "cpt_mm.h" ++#include "cpt_process.h" ++#include "cpt_socket.h" ++#include "cpt_net.h" ++#include "cpt_ubc.h" ++#include "cpt_kernel.h" ++ ++static int rst_utsname(cpt_context_t *ctx); ++ ++ ++struct thr_context { ++ struct completion init_complete; ++ struct completion task_done; ++ int error; ++ struct cpt_context *ctx; ++ cpt_object_t *tobj; ++}; ++ ++static int rst_clone_children(cpt_object_t *obj, struct cpt_context *ctx); ++ ++static int vps_rst_veinfo(struct cpt_context *ctx) ++{ ++ int err; ++ struct cpt_veinfo_image *i; ++ struct ve_struct *ve; ++ struct timespec delta; ++ loff_t start, end; ++ ++ err = rst_get_section(CPT_SECT_VEINFO, ctx, &start, &end); ++ if (err) ++ goto out; ++ ++ i = cpt_get_buf(ctx); ++ err = rst_get_object(CPT_OBJ_VEINFO, start, i, ctx); ++ if (err) ++ goto out_rel; ++ ++ ve = get_exec_env(); ++ ve->_shm_ctlall = i->shm_ctl_all; ++ ve->_shm_ctlmax = i->shm_ctl_max; ++ ve->_shm_ctlmni = i->shm_ctl_mni; ++ ++ ve->_msg_ctlmax = i->msg_ctl_max; ++ ve->_msg_ctlmni = i->msg_ctl_mni; ++ ve->_msg_ctlmnb = i->msg_ctl_mnb; ++ ++ BUG_ON(sizeof(ve->_sem_ctls) != 
sizeof(i->sem_ctl_arr)); ++ ve->_sem_ctls[0] = i->sem_ctl_arr[0]; ++ ve->_sem_ctls[1] = i->sem_ctl_arr[1]; ++ ve->_sem_ctls[2] = i->sem_ctl_arr[2]; ++ ve->_sem_ctls[3] = i->sem_ctl_arr[3]; ++ ++ cpt_timespec_import(&delta, i->start_timespec_delta); ++ _set_normalized_timespec(&ve->start_timespec, ++ ve->start_timespec.tv_sec - delta.tv_sec, ++ ve->start_timespec.tv_nsec - delta.tv_nsec); ++ ve->start_jiffies -= i->start_jiffies_delta; ++ // // FIXME: what??? ++ // // ve->start_cycles -= i->start_jiffies_delta * cycles_per_jiffy; ++ ++ err = 0; ++out_rel: ++ cpt_release_buf(ctx); ++out: ++ return err; ++} ++ ++static int vps_rst_reparent_root(cpt_object_t *obj, struct cpt_context *ctx) ++{ ++ int err; ++ struct env_create_param2 param; ++ ++ ctx->cpt_jiffies64 = get_jiffies_64(); ++ do_gettimespec(&ctx->delta_time); ++ ++ ctx->delta_time.tv_sec -= ctx->start_time.tv_sec; ++ if (ctx->start_time.tv_nsec > ctx->delta_time.tv_nsec) { ++ ctx->delta_time.tv_sec--; ++ ctx->delta_time.tv_nsec = 1000000000 - (ctx->start_time.tv_nsec - ctx->delta_time.tv_nsec); ++ } else { ++ ctx->delta_time.tv_nsec -= ctx->start_time.tv_nsec; ++ } ++ ++ memset(¶m, 0, sizeof(param)); ++ param.iptables_mask = ctx->iptables_mask; ++ ++ err = real_env_create(ctx->ve_id, VE_CREATE|VE_LOCK, 2, ¶m, sizeof(param)); ++ if (err < 0) ++ eprintk_ctx("real_env_create: %d\n", err); ++ get_exec_env()->jiffies_fixup = ((ctx->delta_time.tv_sec < 0) ? ++ 0 : timespec_to_jiffies(&ctx->delta_time)) - ++ (unsigned long)(ctx->cpt_jiffies64 - ctx->virt_jiffies64); ++ return err < 0 ? 
err : 0; ++} ++ ++ ++static int hook(void *arg) ++{ ++ struct thr_context *thr_ctx = arg; ++ struct cpt_context *ctx; ++ cpt_object_t *tobj; ++ struct cpt_task_image *ti; ++ int err = 0; ++ ++ current->state = TASK_UNINTERRUPTIBLE; ++ complete(&thr_ctx->init_complete); ++ schedule(); ++ ++ ctx = thr_ctx->ctx; ++ tobj = thr_ctx->tobj; ++ ti = tobj->o_image; ++ ++ current->fs->umask = 0; ++ ++ if (ti->cpt_pid == 1) { ++ err = vps_rst_reparent_root(tobj, ctx); ++ ++ if (err) { ++ rst_report_error(err, ctx); ++ goto out; ++ } ++ ++ memcpy(&get_exec_env()->cap_default, &ti->cpt_ecap, sizeof(kernel_cap_t)); ++ ++ if (ctx->statusfile) { ++ fput(ctx->statusfile); ++ ctx->statusfile = NULL; ++ } ++ ++ if (ctx->lockfile) { ++ mm_segment_t oldfs; ++ ssize_t err = -EINVAL; ++ char b; ++ ++ oldfs = get_fs(); set_fs(KERNEL_DS); ++ if (ctx->lockfile->f_op && ctx->lockfile->f_op->read) ++ err = ctx->lockfile->f_op->read(ctx->lockfile, &b, 1, &ctx->lockfile->f_pos); ++ set_fs(oldfs); ++ fput(ctx->lockfile); ++ ctx->lockfile = NULL; ++ } ++ ++ err = vps_rst_veinfo(ctx); ++ if (err) { ++ eprintk_ctx("rst_veinfo: %d\n", err); ++ goto out; ++ } ++ ++ err = rst_utsname(ctx); ++ if (err) { ++ eprintk_ctx("rst_utsname: %d\n", err); ++ goto out; ++ } ++ ++ err = rst_root_namespace(ctx); ++ if (err) { ++ eprintk_ctx("rst_namespace: %d\n", err); ++ goto out; ++ } ++ ++ if ((err = rst_restore_net(ctx)) != 0) { ++ eprintk_ctx("rst_restore_net: %d\n", err); ++ goto out; ++ } ++ ++ err = rst_sockets(ctx); ++ if (err) { ++ eprintk_ctx("rst_sockets: %d\n", err); ++ goto out; ++ } ++ err = rst_sysv_ipc(ctx); ++ if (err) { ++ eprintk_ctx("rst_sysv_ipc: %d\n", err); ++ goto out; ++ } ++ } ++ ++ do { ++ if (current->user->uid != ti->cpt_user) { ++ struct user_struct *u = alloc_uid(ti->cpt_user); ++ if (!u) { ++ eprintk_ctx("alloc_user\n"); ++ } else { ++ switch_uid(u); ++ } ++ } ++ } while (0); ++ ++ if ((err = rst_mm_complete(ti, ctx)) != 0) { ++ eprintk_ctx("rst_mm: %d\n", err); ++ goto out; ++ } ++ 
++ if ((err = rst_files_complete(ti, ctx)) != 0) { ++ eprintk_ctx("rst_files: %d\n", err); ++ goto out; ++ } ++ ++ if ((err = rst_fs_complete(ti, ctx)) != 0) { ++ eprintk_ctx("rst_fs: %d\n", err); ++ goto out; ++ } ++ ++ if ((err = rst_semundo_complete(ti, ctx)) != 0) { ++ eprintk_ctx("rst_semundo: %d\n", err); ++ goto out; ++ } ++ ++ if ((err = rst_signal_complete(ti, ctx)) != 0) { ++ eprintk_ctx("rst_signal: %d\n", err); ++ goto out; ++ } ++ ++ if (ti->cpt_namespace == CPT_NULL) ++ exit_namespace(current); ++ ++ if (ti->cpt_personality != 0) ++ __set_personality(ti->cpt_personality); ++ ++ current->set_child_tid = NULL; ++ current->clear_child_tid = NULL; ++ current->flags &= ~(PF_FORKNOEXEC|PF_SUPERPRIV); ++ current->flags |= ti->cpt_flags&(PF_FORKNOEXEC|PF_SUPERPRIV); ++ current->exit_code = ti->cpt_exit_code; ++ current->pdeath_signal = ti->cpt_pdeath_signal; ++ ++ if (ti->cpt_restart.fn != CPT_RBL_0) { ++ if (ti->cpt_restart.fn != CPT_RBL_NANOSLEEP ++ && ti->cpt_restart.fn != CPT_RBL_COMPAT_NANOSLEEP ++ ) { ++ eprintk_ctx("unknown restart block\n"); ++ } else { ++ current->thread_info->restart_block.fn = nanosleep_restart; ++#ifdef CONFIG_X86_64 ++ if (!ti->cpt_64bit) ++ current->thread_info->restart_block.fn = compat_nanosleep_restart; ++#endif ++ if (ctx->image_version != 0) { ++ current->thread_info->restart_block.arg0 = ti->cpt_restart.arg0; ++ current->thread_info->restart_block.arg1 = ti->cpt_restart.arg1; ++ current->thread_info->restart_block.arg2 = ti->cpt_restart.arg2; ++ current->thread_info->restart_block.arg3 = ti->cpt_restart.arg3; ++ if (debug_level > 2) { ++ ktime_t e, e1; ++ struct timespec now; ++ ++ do_posix_clock_monotonic_gettime(&now); ++ e = timespec_to_ktime(now); ++ e1.tv64 = ((u64)current->thread_info->restart_block.arg1 << 32) | (u64) current->thread_info->restart_block.arg0; ++ e = ktime_sub(e1, e); ++ dprintk("rst " CPT_FID " RBL %ld/%ld %Ld\n", CPT_TID(current), ++ current->thread_info->restart_block.arg1, ++ 
current->thread_info->restart_block.arg0, e.tv64); ++ } ++ } else { ++ struct timespec now; ++ ktime_t expire; ++ unsigned long val = ti->cpt_restart.arg0 - ++ timespec_to_jiffies(&ctx->delta_time); ++ if ((long)val <= 0) ++ val = 1; ++ do_posix_clock_monotonic_gettime(&now); ++ expire = ktime_add_ns(timespec_to_ktime(now), (u64)val*TICK_NSEC); ++ current->thread_info->restart_block.arg0 = expire.tv64 & 0xFFFFFFFF; ++ current->thread_info->restart_block.arg1 = expire.tv64 >> 32; ++ current->thread_info->restart_block.arg2 = ti->cpt_restart.arg1; ++ current->thread_info->restart_block.arg3 = CLOCK_MONOTONIC; ++ } ++ } ++ } ++ ++ if (thread_group_leader(current)) { ++ current->signal->it_real_incr.tv64 = 0; ++ if (ctx->image_version != 0) { ++ ktime_add_ns(current->signal->it_real_incr, ti->cpt_it_real_incr); ++ } else { ++ ktime_add_ns(current->signal->it_real_incr, ti->cpt_it_real_incr*TICK_NSEC); ++ } ++ current->signal->it_prof_incr = ti->cpt_it_prof_incr; ++ current->signal->it_virt_incr = ti->cpt_it_virt_incr; ++ current->signal->it_prof_expires = ti->cpt_it_prof_value; ++ current->signal->it_virt_expires = ti->cpt_it_virt_value; ++ } ++ ++ err = rst_clone_children(tobj, ctx); ++ if (err) { ++ eprintk_ctx("rst_clone_children\n"); ++ goto out; ++ } ++ ++ if (ti->cpt_pid == 1) { ++ if ((err = rst_process_linkage(ctx)) != 0) { ++ eprintk_ctx("rst_process_linkage: %d\n", err); ++ goto out; ++ } ++ if ((err = rst_do_filejobs(ctx)) != 0) { ++ eprintk_ctx("rst_do_filejobs: %d\n", err); ++ goto out; ++ } ++ if ((err = rst_eventpoll(ctx)) != 0) { ++ eprintk_ctx("rst_eventpoll: %d\n", err); ++ goto out; ++ } ++ if ((err = rst_sockets_complete(ctx)) != 0) { ++ eprintk_ctx("rst_sockets_complete: %d\n", err); ++ goto out; ++ } ++ if ((err = rst_stray_files(ctx)) != 0) { ++ eprintk_ctx("rst_stray_files: %d\n", err); ++ goto out; ++ } ++ if ((err = rst_posix_locks(ctx)) != 0) { ++ eprintk_ctx("rst_posix_locks: %d\n", err); ++ goto out; ++ } ++ if ((err = 
rst_tty_jobcontrol(ctx)) != 0) { ++ eprintk_ctx("rst_tty_jobcontrol: %d\n", err); ++ goto out; ++ } ++ if ((err = rst_restore_fs(ctx)) != 0) { ++ eprintk_ctx("rst_restore_fs: %d\n", err); ++ goto out; ++ } ++ } ++ ++out: ++ thr_ctx->error = err; ++ lock_kernel(); ++ complete(&thr_ctx->task_done); ++ ++ if (!err && (ti->cpt_state & (EXIT_ZOMBIE|EXIT_DEAD))) { ++ preempt_disable(); ++ current->exit_state = EXIT_ZOMBIE; ++ write_lock_irq(&tasklist_lock); ++ nr_zombie++; ++ write_unlock_irq(&tasklist_lock); ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9) ++ atomic_dec(¤t->signal->live); ++#endif ++ current->flags |= PF_DEAD; ++ if (!(ti->cpt_flags&PF_DEAD)) ++ wprintk_ctx("zombie %d,%d(%s) is not pf_dead\n", current->pid, virt_pid(current), current->comm); ++ module_put(current->thread_info->exec_domain->module); ++ if (current->binfmt) ++ module_put(current->binfmt->module); ++ } else { ++ __set_current_state(TASK_UNINTERRUPTIBLE); ++ } ++ ++ schedule(); ++ ++ dprintk_ctx("leaked through %d/%d %p\n", current->pid, virt_pid(current), current->mm); ++ ++ module_put(THIS_MODULE); ++ complete_and_exit(NULL, 0); ++ return 0; ++} ++ ++#if 0 ++static void set_task_ubs(struct cpt_task_image *ti, struct cpt_context *ctx) ++{ ++ struct task_beancounter *tbc; ++ ++ tbc = task_bc(current); ++ ++ put_beancounter(tbc->fork_sub); ++ tbc->fork_sub = rst_lookup_ubc(ti->cpt_task_ub, ctx); ++ if (ti->cpt_mm_ub != CPT_NULL) { ++ put_beancounter(tbc->exec_ub); ++ tbc->exec_ub = rst_lookup_ubc(ti->cpt_mm_ub, ctx); ++ } ++} ++#endif ++ ++static int create_root_task(cpt_object_t *obj, struct cpt_context *ctx, ++ struct thr_context *thr_ctx) ++{ ++ task_t *tsk; ++ int pid; ++ ++ thr_ctx->ctx = ctx; ++ thr_ctx->error = 0; ++ init_completion(&thr_ctx->init_complete); ++ init_completion(&thr_ctx->task_done); ++#if 0 ++ set_task_ubs(obj->o_image, ctx); ++#endif ++ ++ pid = local_kernel_thread(hook, thr_ctx, 0, 0); ++ if (pid < 0) ++ return pid; ++ read_lock(&tasklist_lock); ++ tsk = 
find_task_by_pid_ve(pid); ++ if (tsk) ++ get_task_struct(tsk); ++ read_unlock(&tasklist_lock); ++ if (tsk == NULL) ++ return -ESRCH; ++ cpt_obj_setobj(obj, tsk, ctx); ++ thr_ctx->tobj = obj; ++ return 0; ++} ++ ++static int rst_basic_init_task(cpt_object_t *obj, struct cpt_context *ctx) ++{ ++ task_t *tsk = obj->o_obj; ++ struct cpt_task_image *ti = obj->o_image; ++ ++ memcpy(tsk->comm, ti->cpt_comm, sizeof(tsk->comm)); ++ rst_mm_basic(obj, ti, ctx); ++ return 0; ++} ++ ++static int make_baby(cpt_object_t *cobj, ++ struct cpt_task_image *pi, ++ struct cpt_context *ctx) ++{ ++ unsigned long flags; ++ struct cpt_task_image *ci = cobj->o_image; ++ struct thr_context thr_ctx; ++ task_t *tsk; ++ pid_t pid; ++ ++ flags = rst_mm_flag(ci, ctx) | rst_files_flag(ci, ctx) ++ | rst_signal_flag(ci, ctx) | rst_semundo_flag(ci, ctx); ++ if (ci->cpt_rppid != pi->cpt_pid) { ++ flags |= CLONE_THREAD|CLONE_PARENT; ++ if (ci->cpt_signal != pi->cpt_signal || ++ !(flags&CLONE_SIGHAND) || ++ (!(flags&CLONE_VM) && pi->cpt_mm != CPT_NULL)) { ++ eprintk_ctx("something is wrong with threads: %d %d %d %Ld %Ld %08lx\n", ++ (int)ci->cpt_pid, (int)ci->cpt_rppid, (int)pi->cpt_pid, ++ ci->cpt_signal, pi->cpt_signal, flags ++ ); ++ return -EINVAL; ++ } ++ } ++ ++ thr_ctx.ctx = ctx; ++ thr_ctx.error = 0; ++ init_completion(&thr_ctx.init_complete); ++ init_completion(&thr_ctx.task_done); ++ thr_ctx.tobj = cobj; ++ ++#if 0 ++ set_task_ubs(ci, ctx); ++#endif ++ ++ pid = local_kernel_thread(hook, &thr_ctx, flags, ci->cpt_pid); ++ if (pid < 0) ++ return pid; ++ ++ read_lock(&tasklist_lock); ++ tsk = find_task_by_pid_ve(pid); ++ if (tsk) ++ get_task_struct(tsk); ++ read_unlock(&tasklist_lock); ++ if (tsk == NULL) ++ return -ESRCH; ++ cpt_obj_setobj(cobj, tsk, ctx); ++ thr_ctx.tobj = cobj; ++ wait_for_completion(&thr_ctx.init_complete); ++#ifdef CONFIG_SMP ++ wait_task_inactive(cobj->o_obj); ++#endif ++ rst_basic_init_task(cobj, ctx); ++ ++ /* clone() increases group_stop_count if it was not zero and ++ * 
CLONE_THREAD was asked. Undo. ++ */ ++ if (current->signal->group_stop_count && (flags & CLONE_THREAD)) { ++ if (tsk->signal != current->signal) BUG(); ++ current->signal->group_stop_count--; ++ } ++ ++ wake_up_process(tsk); ++ wait_for_completion(&thr_ctx.task_done); ++ wait_task_inactive(tsk); ++ ++ return thr_ctx.error; ++} ++ ++static int rst_clone_children(cpt_object_t *obj, struct cpt_context *ctx) ++{ ++ int err = 0; ++ struct cpt_task_image *ti = obj->o_image; ++ cpt_object_t *cobj; ++ ++ for_each_object(cobj, CPT_OBJ_TASK) { ++ struct cpt_task_image *ci = cobj->o_image; ++ if (cobj == obj) ++ continue; ++ if ((ci->cpt_rppid == ti->cpt_pid && ci->cpt_tgid == ci->cpt_pid) || ++ (ci->cpt_leader == ti->cpt_pid && ++ ci->cpt_tgid != ci->cpt_pid && ci->cpt_pid != 1)) { ++ err = make_baby(cobj, ti, ctx); ++ if (err) { ++ eprintk_ctx("make_baby: %d\n", err); ++ return err; ++ } ++ } ++ } ++ return 0; ++} ++ ++static int read_task_images(struct cpt_context *ctx) ++{ ++ int err; ++ loff_t start, end; ++ ++ err = rst_get_section(CPT_SECT_TASKS, ctx, &start, &end); ++ if (err) ++ return err; ++ ++ while (start < end) { ++ cpt_object_t *obj; ++ struct cpt_task_image *ti = cpt_get_buf(ctx); ++ ++ err = rst_get_object(CPT_OBJ_TASK, start, ti, ctx); ++ if (err) { ++ cpt_release_buf(ctx); ++ return err; ++ } ++ if (ti->cpt_pid != 1 && !__is_virtual_pid(ti->cpt_pid)) { ++ eprintk_ctx("BUG: pid %d is not virtual\n", ti->cpt_pid); ++ cpt_release_buf(ctx); ++ return -EINVAL; ++ } ++ obj = alloc_cpt_object(GFP_KERNEL, ctx); ++ cpt_obj_setpos(obj, start, ctx); ++ intern_cpt_object(CPT_OBJ_TASK, obj, ctx); ++ obj->o_image = kmalloc(ti->cpt_next, GFP_KERNEL); ++ if (obj->o_image == NULL) { ++ cpt_release_buf(ctx); ++ return -ENOMEM; ++ } ++ memcpy(obj->o_image, ti, sizeof(*ti)); ++ err = ctx->pread(obj->o_image + sizeof(*ti), ++ ti->cpt_next - sizeof(*ti), ctx, start + sizeof(*ti)); ++ cpt_release_buf(ctx); ++ if (err) ++ return err; ++ start += ti->cpt_next; ++ } ++ return 0; ++} 
++ ++ ++static int vps_rst_restore_tree(struct cpt_context *ctx) ++{ ++ int err; ++ cpt_object_t *obj; ++ struct thr_context thr_ctx_root; ++ ++ err = read_task_images(ctx); ++ if (err) ++ return err; ++ ++ err = rst_undump_ubc(ctx); ++ if (err) ++ return err; ++ ++ for_each_object(obj, CPT_OBJ_TASK) { ++ err = create_root_task(obj, ctx, &thr_ctx_root); ++ if (err) ++ return err; ++ ++ wait_for_completion(&thr_ctx_root.init_complete); ++#ifdef CONFIG_SMP ++ wait_task_inactive(obj->o_obj); ++#endif ++ rst_basic_init_task(obj, ctx); ++ ++ wake_up_process(obj->o_obj); ++ wait_for_completion(&thr_ctx_root.task_done); ++ wait_task_inactive(obj->o_obj); ++ err = thr_ctx_root.error; ++ if (err) ++ return err; ++ break; ++ } ++ ++ return err; ++} ++ ++ ++int vps_rst_undump(struct cpt_context *ctx) ++{ ++ int err; ++ unsigned long umask; ++ ++ err = rst_open_dumpfile(ctx); ++ if (err) ++ return err; ++ ++#ifndef CONFIG_X86_64 ++ if (ctx->tasks64) { ++ eprintk_ctx("Cannot restore 64 bit VE on this architecture\n"); ++ return -EINVAL; ++ } ++#endif ++ ++ umask = current->fs->umask; ++ current->fs->umask = 0; ++ ++#ifdef CONFIG_VZ_CHECKPOINT_LAZY ++ err = rst_setup_pagein(ctx); ++#endif ++ ++ if (err == 0) ++ err = vps_rst_restore_tree(ctx); ++ ++ if (err == 0) ++ err = rst_restore_process(ctx); ++ ++ current->fs->umask = umask; ++ ++ return err; ++} ++ ++static int rst_unlock_ve(struct cpt_context *ctx) ++{ ++ struct ve_struct *env; ++ ++ env = get_ve_by_id(ctx->ve_id); ++ if (!env) ++ return -ESRCH; ++ down_write(&env->op_sem); ++ env->is_locked = 0; ++ up_write(&env->op_sem); ++ put_ve(env); ++ return 0; ++} ++ ++int rst_resume(struct cpt_context *ctx) ++{ ++ cpt_object_t *obj; ++ int err = 0; ++ ++ for_each_object(obj, CPT_OBJ_FILE) { ++ struct file *file = obj->o_obj; ++ ++ fput(file); ++ } ++ ++ rst_resume_network(ctx); ++ ++ for_each_object(obj, CPT_OBJ_TASK) { ++ task_t *tsk = obj->o_obj; ++ struct cpt_task_image *ti = obj->o_image; ++ ++ if (!tsk) ++ continue; ++ ++ 
if (ti->cpt_state == TASK_UNINTERRUPTIBLE) { ++ dprintk_ctx("task %d/%d(%s) is started\n", virt_pid(tsk), tsk->pid, tsk->comm); ++ ++ /* Weird... If a signal is sent to stopped task, ++ * nobody makes recalc_sigpending(). We have to do ++ * this by hands after wake_up_process(). ++ * if we did this before a signal could arrive before ++ * wake_up_process() and stall. ++ */ ++ spin_lock_irq(&tsk->sighand->siglock); ++ if (!signal_pending(tsk)) ++ recalc_sigpending_tsk(tsk); ++ spin_unlock_irq(&tsk->sighand->siglock); ++ ++ wake_up_process(tsk); ++ } else { ++ if (ti->cpt_state == TASK_STOPPED || ++ ti->cpt_state == TASK_TRACED) { ++ set_task_state(tsk, ti->cpt_state); ++ } ++ } ++ put_task_struct(tsk); ++ } ++ ++ rst_unlock_ve(ctx); ++ ++#ifdef CONFIG_VZ_CHECKPOINT_LAZY ++ rst_complete_pagein(ctx, 0); ++#endif ++ ++ rst_finish_ubc(ctx); ++ cpt_object_destroy(ctx); ++ ++ return err; ++} ++ ++int rst_kill(struct cpt_context *ctx) ++{ ++ cpt_object_t *obj; ++ int err = 0; ++ ++ for_each_object(obj, CPT_OBJ_FILE) { ++ struct file *file = obj->o_obj; ++ ++ fput(file); ++ } ++ ++ for_each_object(obj, CPT_OBJ_TASK) { ++ task_t *tsk = obj->o_obj; ++ ++ if (tsk == NULL) ++ continue; ++ ++ if (tsk->exit_state == 0) { ++ send_sig(SIGKILL, tsk, 1); ++ ++ spin_lock_irq(&tsk->sighand->siglock); ++ sigfillset(&tsk->blocked); ++ sigdelsetmask(&tsk->blocked, sigmask(SIGKILL)); ++ set_tsk_thread_flag(tsk, TIF_SIGPENDING); ++ clear_tsk_thread_flag(tsk, TIF_FREEZE); ++ if (tsk->flags & PF_FROZEN) ++ tsk->flags &= ~PF_FROZEN; ++ spin_unlock_irq(&tsk->sighand->siglock); ++ ++ wake_up_process(tsk); ++ } ++ ++ put_task_struct(tsk); ++ } ++ ++#ifdef CONFIG_VZ_CHECKPOINT_LAZY ++ rst_complete_pagein(ctx, 1); ++#endif ++ ++ rst_finish_ubc(ctx); ++ cpt_object_destroy(ctx); ++ ++ return err; ++} ++ ++static int rst_utsname(cpt_context_t *ctx) ++{ ++ int err; ++ loff_t sec = ctx->sections[CPT_SECT_UTSNAME]; ++ loff_t endsec; ++ struct cpt_section_hdr h; ++ struct cpt_object_hdr o; ++ int i; ++ ++ 
if (sec == CPT_NULL) ++ return 0; ++ ++ err = ctx->pread(&h, sizeof(h), ctx, sec); ++ if (err) ++ return err; ++ if (h.cpt_section != CPT_SECT_UTSNAME || h.cpt_hdrlen < sizeof(h)) ++ return -EINVAL; ++ ++ i = 0; ++ endsec = sec + h.cpt_next; ++ sec += h.cpt_hdrlen; ++ while (sec < endsec) { ++ int len; ++ char *ptr; ++ err = rst_get_object(CPT_OBJ_NAME, sec, &o, ctx); ++ if (err) ++ return err; ++ len = o.cpt_next - o.cpt_hdrlen; ++ if (len > __NEW_UTS_LEN+1) ++ return -ENAMETOOLONG; ++ switch (i) { ++ case 0: ++ ptr = ve_utsname.nodename; break; ++ case 1: ++ ptr = ve_utsname.domainname; break; ++ default: ++ return -EINVAL; ++ } ++ err = ctx->pread(ptr, len, ctx, sec+o.cpt_hdrlen); ++ if (err) ++ return err; ++ i++; ++ sec += o.cpt_next; ++ } ++ ++ return 0; ++} +diff -upr linux-2.6.16.orig/kernel/cpt/rst_x8664.S linux-2.6.16-026test015/kernel/cpt/rst_x8664.S +--- linux-2.6.16.orig/kernel/cpt/rst_x8664.S 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/cpt/rst_x8664.S 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,61 @@ ++#define ASSEMBLY 1 ++ ++#include <linux/config.h> ++#include <linux/linkage.h> ++#include <asm/segment.h> ++#include <asm/smp.h> ++#include <asm/cache.h> ++#include <asm/errno.h> ++#include <asm/dwarf2.h> ++#include <asm/calling.h> ++#include <asm/msr.h> ++#include <asm/unistd.h> ++#include <asm/thread_info.h> ++#include <asm/hw_irq.h> ++#include <asm/errno.h> ++#include <asm/asm-offsets.h> ++ ++ .code64 ++ .global schedule_tail_hook, schedule_tail_p ++ .align 8 ++schedule_tail_hook: ++ movq schedule_tail_p(%rip),%r11 ++ call *%r11 ++ GET_THREAD_INFO(%rcx) ++ btr $22,threadinfo_flags(%rcx) /* TIF_RESUME */ ++ jc 1f ++ retq ++ ++ /* If TIF_RESUME is set, (%rsp) is pointer to hook function ++ * the hook will do the work and jump to the next hook, ++ * everything should end at ret_from_fork+5. 
++ */ ++1: addq $8,%rsp ++ retq ++ ++ .align 8 ++ .global ret_from_fork2 ++ret_from_fork2: ++ cmpq $0,ORIG_RAX(%rsp) ++ jge ret_from_fork+5 ++ RESTORE_REST ++ jmp int_ret_from_sys_call ++ ++ .align 8 ++ .global ret_last_siginfo ++ret_last_siginfo: ++ call rlsi ++ movq %rax,%rsp ++ retq ++ ++ .align 8 ++ .global ret_child_tid ++ret_child_tid: ++ movq %rsp,%rdi ++ call rct ++ movq %rax,%rsp ++ retq ++ ++ .data ++schedule_tail_p: ++ .quad 0 +diff -upr linux-2.6.16.orig/kernel/cpu.c linux-2.6.16-026test015/kernel/cpu.c +--- linux-2.6.16.orig/kernel/cpu.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/kernel/cpu.c 2006-07-04 14:41:39.000000000 +0400 +@@ -21,6 +21,11 @@ static DECLARE_MUTEX(cpucontrol); + static struct notifier_block *cpu_chain; + + #ifdef CONFIG_HOTPLUG_CPU ++ ++#ifdef CONFIG_SCHED_VCPU ++#error "CONFIG_HOTPLUG_CPU isn't supported with CONFIG_SCHED_VCPU" ++#endif ++ + static struct task_struct *lock_cpu_hotplug_owner; + static int lock_cpu_hotplug_depth; + +@@ -95,8 +100,8 @@ static inline void check_for_tasks(int c + struct task_struct *p; + + write_lock_irq(&tasklist_lock); +- for_each_process(p) { +- if (task_cpu(p) == cpu && ++ for_each_process_all(p) { ++ if (task_pcpu(p) == cpu && + (!cputime_eq(p->utime, cputime_zero) || + !cputime_eq(p->stime, cputime_zero))) + printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\ +@@ -106,6 +111,13 @@ static inline void check_for_tasks(int c + write_unlock_irq(&tasklist_lock); + } + ++#ifdef CONFIG_SCHED_VCPU ++#error VCPU vs. HOTPLUG: fix hotplug code below ++/* ++ * What should be fixed: ++ * - check for if (idle_cpu()) yield() ++ */ ++#endif + /* Take this CPU down. 
*/ + static int take_cpu_down(void *unused) + { +diff -upr linux-2.6.16.orig/kernel/cpuset.c linux-2.6.16-026test015/kernel/cpuset.c +--- linux-2.6.16.orig/kernel/cpuset.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/kernel/cpuset.c 2006-07-04 14:41:38.000000000 +0400 +@@ -897,7 +897,7 @@ static int update_nodemask(struct cpuset + n = 0; + + /* Load up mmarray[] with mm reference for each task in cpuset. */ +- do_each_thread(g, p) { ++ do_each_thread_all(g, p) { + struct mm_struct *mm; + + if (n >= ntasks) { +@@ -911,7 +911,7 @@ static int update_nodemask(struct cpuset + if (!mm) + continue; + mmarray[n++] = mm; +- } while_each_thread(g, p); ++ } while_each_thread_all(g, p); + write_unlock_irq(&tasklist_lock); + + /* +@@ -1125,7 +1125,7 @@ static int attach_task(struct cpuset *cs + if (pid) { + read_lock(&tasklist_lock); + +- tsk = find_task_by_pid(pid); ++ tsk = find_task_by_pid_all(pid); + if (!tsk || tsk->flags & PF_EXITING) { + read_unlock(&tasklist_lock); + return -ESRCH; +@@ -1561,13 +1561,13 @@ static int pid_array_load(pid_t *pidarra + + read_lock(&tasklist_lock); + +- do_each_thread(g, p) { ++ do_each_thread_all(g, p) { + if (p->cpuset == cs) { + pidarray[n++] = p->pid; + if (unlikely(n == npids)) + goto array_full; + } +- } while_each_thread(g, p); ++ } while_each_thread_all(g, p); + + array_full: + read_unlock(&tasklist_lock); +diff -upr linux-2.6.16.orig/kernel/exec_domain.c linux-2.6.16-026test015/kernel/exec_domain.c +--- linux-2.6.16.orig/kernel/exec_domain.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/kernel/exec_domain.c 2006-07-04 14:41:36.000000000 +0400 +@@ -140,6 +140,7 @@ __set_personality(u_long personality) + ep = lookup_exec_domain(personality); + if (ep == current_thread_info()->exec_domain) { + current->personality = personality; ++ module_put(ep->module); + return 0; + } + +diff -upr linux-2.6.16.orig/kernel/exit.c linux-2.6.16-026test015/kernel/exit.c +--- linux-2.6.16.orig/kernel/exit.c 
2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/kernel/exit.c 2006-07-04 14:41:39.000000000 +0400 +@@ -42,7 +42,7 @@ extern struct task_struct *child_reaper; + + int getrusage(struct task_struct *, int, struct rusage __user *); + +-static void exit_mm(struct task_struct * tsk); ++void exit_mm(struct task_struct * tsk); + + static void __unhash_process(struct task_struct *p) + { +@@ -57,18 +57,19 @@ static void __unhash_process(struct task + } + + REMOVE_LINKS(p); ++ REMOVE_VE_LINKS(p); + } + + void release_task(struct task_struct * p) + { + int zap_leader; + task_t *leader; +- struct dentry *proc_dentry; ++ struct dentry *proc_dentry[2]; + + repeat: + atomic_dec(&p->user->processes); + spin_lock(&p->proc_lock); +- proc_dentry = proc_pid_unhash(p); ++ proc_pid_unhash(p, proc_dentry); + write_lock_irq(&tasklist_lock); + if (unlikely(p->ptrace)) + __ptrace_unlink(p); +@@ -80,6 +81,8 @@ repeat: + * the process by __unhash_process. + */ + __unhash_process(p); ++ nr_zombie--; ++ atomic_inc(&nr_dead); + + /* + * If we are the last non-leader member of the thread +@@ -107,6 +110,10 @@ repeat: + spin_unlock(&p->proc_lock); + proc_pid_flush(proc_dentry); + release_thread(p); ++#ifdef CONFIG_VE ++ if (atomic_dec_and_test(&VE_TASK_INFO(p)->owner_env->pcounter)) ++ do_env_cleanup(VE_TASK_INFO(p)->owner_env); ++#endif + put_task_struct(p); + + p = leader; +@@ -118,10 +125,10 @@ repeat: + + void unhash_process(struct task_struct *p) + { +- struct dentry *proc_dentry; ++ struct dentry *proc_dentry[2]; + + spin_lock(&p->proc_lock); +- proc_dentry = proc_pid_unhash(p); ++ proc_pid_unhash(p, proc_dentry); + write_lock_irq(&tasklist_lock); + __unhash_process(p); + write_unlock_irq(&tasklist_lock); +@@ -139,14 +146,16 @@ int session_of_pgrp(int pgrp) + struct task_struct *p; + int sid = -1; + ++ WARN_ON(is_virtual_pid(pgrp)); ++ + read_lock(&tasklist_lock); +- do_each_task_pid(pgrp, PIDTYPE_PGID, p) { ++ do_each_task_pid_ve(pgrp, PIDTYPE_PGID, p) { + if 
(p->signal->session > 0) { + sid = p->signal->session; + goto out; + } +- } while_each_task_pid(pgrp, PIDTYPE_PGID, p); +- p = find_task_by_pid(pgrp); ++ } while_each_task_pid_ve(pgrp, PIDTYPE_PGID, p); ++ p = find_task_by_pid_ve(pgrp); + if (p) + sid = p->signal->session; + out: +@@ -168,17 +177,19 @@ static int will_become_orphaned_pgrp(int + struct task_struct *p; + int ret = 1; + +- do_each_task_pid(pgrp, PIDTYPE_PGID, p) { ++ WARN_ON(is_virtual_pid(pgrp)); ++ ++ do_each_task_pid_ve(pgrp, PIDTYPE_PGID, p) { + if (p == ignored_task + || p->exit_state +- || p->real_parent->pid == 1) ++ || virt_pid(p->real_parent) == 1) + continue; + if (process_group(p->real_parent) != pgrp + && p->real_parent->signal->session == p->signal->session) { + ret = 0; + break; + } +- } while_each_task_pid(pgrp, PIDTYPE_PGID, p); ++ } while_each_task_pid_ve(pgrp, PIDTYPE_PGID, p); + return ret; /* (sighing) "Often!" */ + } + +@@ -186,6 +197,8 @@ int is_orphaned_pgrp(int pgrp) + { + int retval; + ++ WARN_ON(is_virtual_pid(pgrp)); ++ + read_lock(&tasklist_lock); + retval = will_become_orphaned_pgrp(pgrp, NULL); + read_unlock(&tasklist_lock); +@@ -198,7 +211,7 @@ static int has_stopped_jobs(int pgrp) + int retval = 0; + struct task_struct *p; + +- do_each_task_pid(pgrp, PIDTYPE_PGID, p) { ++ do_each_task_pid_ve(pgrp, PIDTYPE_PGID, p) { + if (p->state != TASK_STOPPED) + continue; + +@@ -214,7 +227,7 @@ static int has_stopped_jobs(int pgrp) + + retval = 1; + break; +- } while_each_task_pid(pgrp, PIDTYPE_PGID, p); ++ } while_each_task_pid_ve(pgrp, PIDTYPE_PGID, p); + return retval; + } + +@@ -263,6 +276,9 @@ void __set_special_pids(pid_t session, p + { + struct task_struct *curr = current->group_leader; + ++ WARN_ON(is_virtual_pid(pgrp)); ++ WARN_ON(is_virtual_pid(session)); ++ + if (curr->signal->session != session) { + detach_pid(curr, PIDTYPE_SID); + curr->signal->session = session; +@@ -281,6 +297,7 @@ void set_special_pids(pid_t session, pid + __set_special_pids(session, pgrp); + 
write_unlock_irq(&tasklist_lock); + } ++EXPORT_SYMBOL(set_special_pids); + + /* + * Let kernel threads use this to say that they +@@ -500,7 +517,7 @@ EXPORT_SYMBOL_GPL(exit_fs); + * Turn us into a lazy TLB process if we + * aren't already.. + */ +-static void exit_mm(struct task_struct * tsk) ++void exit_mm(struct task_struct * tsk) + { + struct mm_struct *mm = tsk->mm; + +@@ -535,6 +552,7 @@ static void exit_mm(struct task_struct * + task_unlock(tsk); + mmput(mm); + } ++EXPORT_SYMBOL_GPL(exit_mm); + + static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_reaper) + { +@@ -613,13 +631,12 @@ static void reparent_thread(task_t *p, t + static void forget_original_parent(struct task_struct * father, + struct list_head *to_release) + { +- struct task_struct *p, *reaper = father; ++ struct task_struct *p, *tsk_reaper, *reaper = father; + struct list_head *_p, *_n; + + do { + reaper = next_thread(reaper); + if (reaper == father) { +- reaper = child_reaper; + break; + } + } while (reaper->exit_state); +@@ -641,9 +658,16 @@ static void forget_original_parent(struc + /* if father isn't the real parent, then ptrace must be enabled */ + BUG_ON(father != p->real_parent && !ptrace); + ++ tsk_reaper = reaper; ++ if (tsk_reaper == father) ++#ifdef CONFIG_VE ++ tsk_reaper = VE_TASK_INFO(p)->owner_env->init_entry; ++ if (tsk_reaper == p) ++#endif ++ tsk_reaper = child_reaper; + if (father == p->real_parent) { +- /* reparent with a reaper, real father it's us */ +- choose_new_parent(p, reaper, child_reaper); ++ /* reparent with a tsk_reaper, real father it's us */ ++ choose_new_parent(p, tsk_reaper, child_reaper); + reparent_thread(p, father, 0); + } else { + /* reparent ptraced task to its real parent */ +@@ -664,7 +688,15 @@ static void forget_original_parent(struc + } + list_for_each_safe(_p, _n, &father->ptrace_children) { + p = list_entry(_p,struct task_struct,ptrace_list); +- choose_new_parent(p, reaper, child_reaper); ++ ++ tsk_reaper = reaper; ++ if 
(tsk_reaper == father) ++#ifdef CONFIG_VE ++ tsk_reaper = VE_TASK_INFO(p)->owner_env->init_entry; ++ if (tsk_reaper == p) ++#endif ++ tsk_reaper = child_reaper; ++ choose_new_parent(p, tsk_reaper, child_reaper); + reparent_thread(p, father, 1); + } + } +@@ -760,6 +792,9 @@ static void exit_notify(struct task_stru + && !capable(CAP_KILL)) + tsk->exit_signal = SIGCHLD; + ++ if (tsk->exit_signal != -1 && t == child_reaper) ++ /* We dont want people slaying init. */ ++ tsk->exit_signal = SIGCHLD; + + /* If something other than our normal parent is ptracing us, then + * send it a SIGCHLD instead of honoring exit_signal. exit_signal +@@ -778,6 +813,7 @@ static void exit_notify(struct task_stru + unlikely(tsk->parent->signal->flags & SIGNAL_GROUP_EXIT))) + state = EXIT_DEAD; + tsk->exit_state = state; ++ nr_zombie++; + + write_unlock_irq(&tasklist_lock); + +@@ -792,6 +828,82 @@ static void exit_notify(struct task_stru + release_task(tsk); + } + ++#ifdef CONFIG_VE ++/* ++ * Handle exitting of init process, it's a special case for VE. ++ */ ++static void do_initproc_exit(void) ++{ ++ struct task_struct *tsk; ++ struct ve_struct *env; ++ struct siginfo info; ++ struct task_struct *g, *p; ++ long delay = 1L; ++ ++ tsk = current; ++ env = VE_TASK_INFO(current)->owner_env; ++ if (env->init_entry != tsk) ++ return; ++ ++ if (ve_is_super(env) && tsk->pid == 1) ++ panic("Attempted to kill init!"); ++ ++ memset(&info, 0, sizeof(info)); ++ info.si_errno = 0; ++ info.si_code = SI_KERNEL; ++ info.si_pid = virt_pid(tsk); ++ info.si_uid = current->uid; ++ info.si_signo = SIGKILL; ++ ++ /* ++ * Here the VE changes its state into "not running". ++ * op_sem taken for write is a barrier to all VE manipulations from ++ * ioctl: it waits for operations currently in progress and blocks all ++ * subsequent operations until is_running is set to 0 and op_sem is ++ * released. 
++ */ ++ down_write(&env->op_sem); ++ env->is_running = 0; ++ up_write(&env->op_sem); ++ ++ /* send kill to all processes of VE */ ++ read_lock(&tasklist_lock); ++ do_each_thread_ve(g, p) { ++ force_sig_info(SIGKILL, &info, p); ++ } while_each_thread_ve(g, p); ++ read_unlock(&tasklist_lock); ++ ++ /* wait for all init childs exit */ ++ while (atomic_read(&env->pcounter) > 1) { ++ if (sys_wait4(-1, NULL, __WALL | WNOHANG, NULL) > 0) ++ continue; ++ /* it was ENOCHLD or no more children somehow */ ++ if (atomic_read(&env->pcounter) == 1) ++ break; ++ ++ /* clear all signals to avoid wakeups */ ++ if (signal_pending(tsk)) ++ flush_signals(tsk); ++ /* we have child without signal sent */ ++ __set_current_state(TASK_INTERRUPTIBLE); ++ schedule_timeout(delay); ++ delay = (delay < HZ) ? (delay << 1) : HZ; ++ read_lock(&tasklist_lock); ++ do_each_thread_ve(g, p) { ++ if (p != tsk) ++ force_sig_info(SIGKILL, &info, p); ++ } while_each_thread_ve(g, p); ++ read_unlock(&tasklist_lock); ++ } ++ env->init_entry = child_reaper; ++ write_lock_irq(&tasklist_lock); ++ REMOVE_LINKS(tsk); ++ tsk->parent = tsk->real_parent = child_reaper; ++ SET_LINKS(tsk); ++ write_unlock_irq(&tasklist_lock); ++} ++#endif ++ + fastcall NORET_TYPE void do_exit(long code) + { + struct task_struct *tsk = current; +@@ -805,14 +917,20 @@ fastcall NORET_TYPE void do_exit(long co + panic("Aiee, killing interrupt handler!"); + if (unlikely(!tsk->pid)) + panic("Attempted to kill the idle task!"); ++#ifdef CONFIG_VE ++ do_initproc_exit(); ++#else + if (unlikely(tsk->pid == 1)) + panic("Attempted to kill init!"); ++#endif + if (tsk->io_context) + exit_io_context(); + + if (unlikely(current->ptrace & PT_TRACE_EXIT)) { + current->ptrace_message = code; ++ set_pn_state(current, PN_STOP_EXIT); + ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP); ++ clear_pn_state(current); + } + + /* +@@ -828,14 +946,6 @@ fastcall NORET_TYPE void do_exit(long co + + tsk->flags |= PF_EXITING; + +- /* +- * Make sure we don't try to 
process any timer firings +- * while we are already exiting. +- */ +- tsk->it_virt_expires = cputime_zero; +- tsk->it_prof_expires = cputime_zero; +- tsk->it_sched_expires = 0; +- + if (unlikely(in_atomic())) + printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", + current->comm, current->pid, +@@ -911,7 +1021,14 @@ asmlinkage long sys_exit(int error_code) + + task_t fastcall *next_thread(const task_t *p) + { +- return pid_task(p->pids[PIDTYPE_TGID].pid_list.next, PIDTYPE_TGID); ++ task_t *tsk; ++ ++ tsk = pid_task(p->pids[PIDTYPE_TGID].pid_list.next, PIDTYPE_TGID); ++#ifdef CONFIG_VE ++ /* all threads should belong to ONE ve! */ ++ BUG_ON(VE_TASK_INFO(tsk)->owner_env != VE_TASK_INFO(p)->owner_env); ++#endif ++ return tsk; + } + + EXPORT_SYMBOL(next_thread); +@@ -960,14 +1077,19 @@ asmlinkage void sys_exit_group(int error + static int eligible_child(pid_t pid, int options, task_t *p) + { + if (pid > 0) { +- if (p->pid != pid) ++ if ((is_virtual_pid(pid) ? virt_pid(p) : p->pid) != pid) + return 0; + } else if (!pid) { + if (process_group(p) != process_group(current)) + return 0; + } else if (pid != -1) { +- if (process_group(p) != -pid) +- return 0; ++ if (__is_virtual_pid(-pid)) { ++ if (virt_pgid(p) != -pid) ++ return 0; ++ } else { ++ if (process_group(p) != -pid) ++ return 0; ++ } + } + + /* +@@ -1157,7 +1279,7 @@ static int wait_task_zombie(task_t *p, i + p->exit_state = EXIT_ZOMBIE; + return retval; + } +- retval = p->pid; ++ retval = get_task_pid(p); + if (p->real_parent != p->parent) { + write_lock_irq(&tasklist_lock); + /* Double-check with lock held. 
*/ +@@ -1292,7 +1414,7 @@ bail_ref: + if (!retval && infop) + retval = put_user(p->uid, &infop->si_uid); + if (!retval) +- retval = p->pid; ++ retval = get_task_pid(p); + put_task_struct(p); + + BUG_ON(!retval); +@@ -1574,6 +1696,7 @@ asmlinkage long sys_wait4(pid_t pid, int + prevent_tail_call(ret); + return ret; + } ++EXPORT_SYMBOL_GPL(sys_wait4); + + #ifdef __ARCH_WANT_SYS_WAITPID + +diff -upr linux-2.6.16.orig/kernel/fairsched.c linux-2.6.16-026test015/kernel/fairsched.c +--- linux-2.6.16.orig/kernel/fairsched.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/fairsched.c 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,1288 @@ ++/* ++ * Fair Scheduler ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ * Start-tag scheduling follows the theory presented in ++ * http://www.cs.utexas.edu/users/dmcl/papers/ps/SIGCOMM96.ps ++ */ ++ ++#include <linux/config.h> ++#include <linux/kernel.h> ++#include <asm/timex.h> ++#include <asm/atomic.h> ++#include <linux/spinlock.h> ++#include <asm/semaphore.h> ++#include <linux/init.h> ++#include <linux/slab.h> ++#include <linux/proc_fs.h> ++#include <linux/seq_file.h> ++#include <linux/fs.h> ++#include <linux/dcache.h> ++#include <linux/sysctl.h> ++#include <linux/module.h> ++#include <linux/vmalloc.h> ++#include <linux/sched.h> ++#include <linux/fairsched.h> ++#include <linux/vsched.h> ++ ++/* we need it for vsched routines in sched.c */ ++spinlock_t fairsched_lock = SPIN_LOCK_UNLOCKED; ++ ++#ifdef CONFIG_FAIRSCHED ++ ++#define FAIRSHED_DEBUG " debug" ++ ++ ++/*********************************************************************/ ++/* ++ * Special arithmetics ++ */ ++/*********************************************************************/ ++ ++#define CYCLES_SHIFT (8) ++#define SCYCLES_TIME(time) \ ++ ((scycles_t) {((time) + (1 << CYCLES_SHIFT) - 1) >> CYCLES_SHIFT}) ++ ++#define CYCLES_ZERO (0) ++static inline int 
CYCLES_BEFORE(cycles_t x, cycles_t y) ++{ ++ return (__s64)(x-y) < 0; ++} ++static inline int CYCLES_AFTER(cycles_t x, cycles_t y) ++{ ++ return (__s64)(y-x) < 0; ++} ++static inline void CYCLES_DADD(cycles_t *x, fschdur_t y) {*x+=y.d;} ++ ++#define FSCHDUR_ZERO (0) ++#define TICK_DUR ((fschdur_t){cycles_per_jiffy}) ++static inline fschdur_t FSCHDURATION(cycles_t x, cycles_t y) ++{ ++ return (fschdur_t){x - y}; ++} ++static inline int FSCHDUR_CMP(fschdur_t x, fschdur_t y) ++{ ++ if (x.d < y.d) return -1; ++ if (x.d > y.d) return 1; ++ return 0; ++} ++static inline fschdur_t FSCHDUR_SUB(fschdur_t x, fschdur_t y) ++{ ++ return (fschdur_t){x.d - y.d}; ++} ++ ++#define FSCHTAG_ZERO ((fschtag_t){0}) ++static inline int FSCHTAG_CMP(fschtag_t x, fschtag_t y) ++{ ++ if (x.t < y.t) return -1; ++ if (x.t > y.t) return 1; ++ return 0; ++} ++static inline fschtag_t FSCHTAG_MAX(fschtag_t x, fschtag_t y) ++{ ++ return x.t >= y.t ? x : y; ++} ++static inline int FSCHTAG_DADD(fschtag_t *tag, fschdur_t dur, unsigned w) ++{ ++ cycles_t new_tag; ++ new_tag = tag->t + (cycles_t)dur.d * w; ++ if (new_tag < tag->t) ++ return -1; ++ /* DEBUG */ ++ if (new_tag >= (1ULL << 48)) ++ return -1; ++ tag->t = new_tag; ++ return 0; ++} ++static inline int FSCHTAG_ADD(fschtag_t *tag, fschtag_t y) ++{ ++ cycles_t new_tag; ++ new_tag = tag->t + y.t; ++ if (new_tag < tag->t) ++ return -1; ++ tag->t = new_tag; ++ return 0; ++} ++static inline fschtag_t FSCHTAG_SUB(fschtag_t x, fschtag_t y) ++{ ++ return (fschtag_t){x.t - y.t}; ++} ++ ++#define FSCHVALUE_ZERO ((fschvalue_t){0}) ++#define TICK_VALUE ((fschvalue_t){(cycles_t)cycles_per_jiffy << FSCHRATE_SHIFT}) ++static inline fschvalue_t FSCHVALUE(unsigned long t) ++{ ++ return (fschvalue_t){(cycles_t)t << FSCHRATE_SHIFT}; ++} ++static inline int FSCHVALUE_CMP(fschvalue_t x, fschvalue_t y) ++{ ++ if (x.v < y.v) return -1; ++ if (x.v > y.v) return 1; ++ return 0; ++} ++static inline void FSCHVALUE_DADD(fschvalue_t *val, fschdur_t dur, ++ unsigned rate) 
++{ ++ val->v += (cycles_t)dur.d * rate; ++} ++static inline fschvalue_t FSCHVALUE_SUB(fschvalue_t x, fschvalue_t y) ++{ ++ return (fschvalue_t){x.v - y.v}; ++} ++static inline cycles_t FSCHVALUE_TO_DELAY(fschvalue_t val, unsigned rate) ++{ ++ unsigned long t; ++ /* ++ * Here we lose precision to make the division 32-bit on IA-32. ++ * The value is not greater than TICK_VALUE. ++ * (TICK_VALUE >> FSCHRATE_SHIFT) fits unsigned long. ++ */ ++ t = (val.v + (1 << FSCHRATE_SHIFT) - 1) >> FSCHRATE_SHIFT; ++ return (cycles_t)((t + rate - 1) / rate) << FSCHRATE_SHIFT; ++} ++ ++ ++/*********************************************************************/ ++/* ++ * Global data ++ */ ++/*********************************************************************/ ++ ++#define fsch_assert(x) \ ++ do { \ ++ static int count; \ ++ if (!(x) && count++ < 10) \ ++ printk("fsch_assert " #x " failed\n"); \ ++ } while (0) ++ ++/* ++ * Configurable parameters ++ */ ++unsigned fairsched_max_latency = 25; /* jiffies */ ++ ++/* ++ * Parameters initialized at startup ++ */ ++/* Number of online CPUs */ ++unsigned fairsched_nr_cpus; ++/* Token Bucket depth (burst size) */ ++static fschvalue_t max_value; ++ ++struct fairsched_node fairsched_init_node = { ++ .id = INT_MAX, ++#ifdef CONFIG_VE ++ .owner_env = get_ve0(), ++#endif ++ .weight = 1, ++}; ++EXPORT_SYMBOL(fairsched_init_node); ++ ++struct fairsched_node fairsched_idle_node = { ++ .id = -1, ++}; ++ ++static int fairsched_nr_nodes; ++static LIST_HEAD(fairsched_node_head); ++static LIST_HEAD(fairsched_running_head); ++static LIST_HEAD(fairsched_delayed_head); ++ ++DEFINE_PER_CPU(cycles_t, prev_schedule); ++static fschtag_t max_latency; ++ ++static DECLARE_MUTEX(fairsched_mutex); ++ ++/*********************************************************************/ ++/* ++ * Small helper routines ++ */ ++/*********************************************************************/ ++ ++/* this didn't proved to be very valuable statistics... 
*/ ++#define fairsched_inc_ve_strv(node, cycles) do {} while(0) ++#define fairsched_dec_ve_strv(node, cycles) do {} while(0) ++ ++/*********************************************************************/ ++/* ++ * Runlist management ++ */ ++/*********************************************************************/ ++ ++/* ++ * Returns the start_tag of the first runnable node, or 0. ++ */ ++static inline fschtag_t virtual_time(void) ++{ ++ struct fairsched_node *p; ++ ++ if (!list_empty(&fairsched_running_head)) { ++ p = list_first_entry(&fairsched_running_head, ++ struct fairsched_node, runlist); ++ return p->start_tag; ++ } ++ return FSCHTAG_ZERO; ++} ++ ++static void fairsched_recompute_max_latency(void) ++{ ++ struct fairsched_node *p; ++ unsigned w; ++ fschtag_t tag; ++ ++ w = FSCHWEIGHT_MAX; ++ list_for_each_entry(p, &fairsched_node_head, nodelist) { ++ if (p->weight < w) ++ w = p->weight; ++ } ++ tag = FSCHTAG_ZERO; ++ (void) FSCHTAG_DADD(&tag, TICK_DUR, ++ fairsched_nr_cpus * fairsched_max_latency * w); ++ max_latency = tag; ++} ++ ++static void fairsched_reset_start_tags(void) ++{ ++ struct fairsched_node *cnode; ++ fschtag_t min_tag; ++ ++ min_tag = virtual_time(); ++ list_for_each_entry(cnode, &fairsched_node_head, nodelist) { ++ if (FSCHTAG_CMP(cnode->start_tag, min_tag) > 0) ++ cnode->start_tag = FSCHTAG_SUB(cnode->start_tag, ++ min_tag); ++ else ++ cnode->start_tag = FSCHTAG_ZERO; ++ } ++} ++ ++static void fairsched_running_insert(struct fairsched_node *node) ++{ ++ struct list_head *tmp; ++ struct fairsched_node *p; ++ fschtag_t start_tag_max; ++ ++ if (!list_empty(&fairsched_running_head)) { ++ start_tag_max = virtual_time(); ++ if (!FSCHTAG_ADD(&start_tag_max, max_latency) && ++ FSCHTAG_CMP(start_tag_max, node->start_tag) < 0) ++ node->start_tag = start_tag_max; ++ } ++ ++ list_for_each(tmp, &fairsched_running_head) { ++ p = list_entry(tmp, struct fairsched_node, runlist); ++ if (FSCHTAG_CMP(node->start_tag, p->start_tag) <= 0) ++ break; ++ } ++ /* 
insert node just before tmp */ ++ list_add_tail(&node->runlist, tmp); ++} ++ ++static inline void fairsched_running_insert_fromsleep( ++ struct fairsched_node *node) ++{ ++ node->start_tag = FSCHTAG_MAX(node->start_tag, virtual_time()); ++ fairsched_running_insert(node); ++} ++ ++ ++/*********************************************************************/ ++/* ++ * CPU limiting helper functions ++ * ++ * These functions compute rates, delays and manipulate with sleep ++ * lists and so on. ++ */ ++/*********************************************************************/ ++ ++/* ++ * Insert a node into the list of nodes removed from scheduling, ++ * sorted by the time at which the the node is allowed to run, ++ * historically called `delay'. ++ */ ++static void fairsched_delayed_insert(struct fairsched_node *node) ++{ ++ struct fairsched_node *p; ++ struct list_head *tmp; ++ ++ list_for_each(tmp, &fairsched_delayed_head) { ++ p = list_entry(tmp, struct fairsched_node, ++ runlist); ++ if (CYCLES_AFTER(p->delay, node->delay)) ++ break; ++ } ++ /* insert node just before tmp */ ++ list_add_tail(&node->runlist, tmp); ++} ++ ++static inline void nodevalue_add(struct fairsched_node *node, ++ fschdur_t duration, unsigned rate) ++{ ++ FSCHVALUE_DADD(&node->value, duration, rate); ++ if (FSCHVALUE_CMP(node->value, max_value) > 0) ++ node->value = max_value; ++} ++ ++/* ++ * The node has been selected to run. ++ * This function accounts in advance for the time that the node will run. ++ * The advance not used by the node will be credited back. ++ */ ++static void fairsched_ratelimit_charge_advance( ++ struct fairsched_node *node, ++ cycles_t time) ++{ ++ fsch_assert(!node->delayed); ++ fsch_assert(FSCHVALUE_CMP(node->value, TICK_VALUE) >= 0); ++ ++ /* ++ * Account for the time passed since last update. ++ * It might be needed if the node has become runnable because of ++ * a wakeup, but hasn't gone through other functions updating ++ * the bucket value. 
++ */ ++ if (CYCLES_AFTER(time, node->last_updated_at)) { ++ nodevalue_add(node, FSCHDURATION(time, node->last_updated_at), ++ node->rate); ++ node->last_updated_at = time; ++ } ++ ++ /* charge for the full tick the node might be running */ ++ node->value = FSCHVALUE_SUB(node->value, TICK_VALUE); ++ if (FSCHVALUE_CMP(node->value, TICK_VALUE) < 0) { ++ list_del(&node->runlist); ++ node->delayed = 1; ++ node->delay = node->last_updated_at + FSCHVALUE_TO_DELAY( ++ FSCHVALUE_SUB(TICK_VALUE, node->value), ++ node->rate); ++ node->nr_ready = 0; ++ fairsched_delayed_insert(node); ++ } ++} ++ ++static void fairsched_ratelimit_credit_unused( ++ struct fairsched_node *node, ++ cycles_t time, fschdur_t duration) ++{ ++ /* account for the time passed since last update */ ++ if (CYCLES_AFTER(time, node->last_updated_at)) { ++ nodevalue_add(node, FSCHDURATION(time, node->last_updated_at), ++ node->rate); ++ node->last_updated_at = time; ++ } ++ ++ /* ++ * When the node was given this CPU, it was charged for 1 tick. ++ * Credit back the unused time. ++ */ ++ if (FSCHDUR_CMP(duration, TICK_DUR) < 0) ++ nodevalue_add(node, FSCHDUR_SUB(TICK_DUR, duration), ++ 1 << FSCHRATE_SHIFT); ++ ++ /* check if the node is allowed to run */ ++ if (FSCHVALUE_CMP(node->value, TICK_VALUE) < 0) { ++ /* ++ * The node was delayed and remain such. ++ * But since the bucket value has been updated, ++ * update the delay time and move the node in the list. ++ */ ++ fsch_assert(node->delayed); ++ node->delay = node->last_updated_at + FSCHVALUE_TO_DELAY( ++ FSCHVALUE_SUB(TICK_VALUE, node->value), ++ node->rate); ++ } else if (node->delayed) { ++ /* ++ * The node was delayed, but now it is allowed to run. ++ * We do not manipulate with lists, it will be done by the ++ * caller. 
++ */ ++ node->nr_ready = node->nr_runnable; ++ node->delayed = 0; ++ } ++} ++ ++static void fairsched_delayed_wake(cycles_t time) ++{ ++ struct fairsched_node *p; ++ ++ while (!list_empty(&fairsched_delayed_head)) { ++ p = list_entry(fairsched_delayed_head.next, ++ struct fairsched_node, ++ runlist); ++ if (CYCLES_AFTER(p->delay, time)) ++ break; ++ ++ /* ok, the delay period is completed */ ++ /* account for the time passed since last update */ ++ if (CYCLES_AFTER(time, p->last_updated_at)) { ++ nodevalue_add(p, FSCHDURATION(time, p->last_updated_at), ++ p->rate); ++ p->last_updated_at = time; ++ } ++ ++ fsch_assert(FSCHVALUE_CMP(p->value, TICK_VALUE) >= 0); ++ p->nr_ready = p->nr_runnable; ++ p->delayed = 0; ++ list_del_init(&p->runlist); ++ if (p->nr_ready) ++ fairsched_running_insert_fromsleep(p); ++ } ++} ++ ++static struct fairsched_node *fairsched_find(unsigned int id); ++ ++void fairsched_cpu_online_map(int id, cpumask_t *mask) ++{ ++ /* FIXME - obtain real map */ ++ *mask = cpu_online_map; ++#if 0 ++ struct fairsched_node *node; ++ ++ down(&fairsched_mutex); ++ node = fairsched_find(id); ++ if (node == NULL) ++ *mask = CPU_MASK_NONE; ++ else ++ vsched_cpu_online_map(node->vsched, mask); ++ up(&fairsched_mutex); ++#endif ++} ++ ++/*********************************************************************/ ++/* ++ * The heart of the algorithm: ++ * fairsched_incrun, fairsched_decrun, fairsched_schedule ++ * ++ * Note: old property nr_ready >= nr_pcpu doesn't hold anymore. ++ * However, nr_runnable, nr_ready and delayed are maintained in sync. ++ */ ++/*********************************************************************/ ++ ++/* ++ * Called on a wakeup inside the node. 
++ */ ++void fairsched_incrun(struct fairsched_node *node) ++{ ++ if (!node->delayed && !node->nr_ready++) ++ /* the node wasn't on the running list, insert */ ++ fairsched_running_insert_fromsleep(node); ++ node->nr_runnable++; ++} ++ ++/* ++ * Called from inside schedule() when a sleeping state is entered. ++ */ ++void fairsched_decrun(struct fairsched_node *node) ++{ ++ if (!node->delayed && !--node->nr_ready) ++ /* nr_ready changed 1->0, remove from the running list */ ++ list_del_init(&node->runlist); ++ --node->nr_runnable; ++} ++ ++void fairsched_inccpu(struct fairsched_node *node) ++{ ++ node->nr_pcpu++; ++ fairsched_dec_ve_strv(node, cycles); ++} ++ ++static inline void __fairsched_deccpu(struct fairsched_node *node) ++{ ++ node->nr_pcpu--; ++ fairsched_inc_ve_strv(node, cycles); ++} ++ ++void fairsched_deccpu(struct fairsched_node *node) ++{ ++ if (node == &fairsched_idle_node) ++ return; ++ ++ __fairsched_deccpu(node); ++} ++ ++static void fairsched_account(struct fairsched_node *node, ++ cycles_t time) ++{ ++ fschdur_t duration; ++ ++ duration = FSCHDURATION(time, __get_cpu_var(prev_schedule)); ++#ifdef CONFIG_VE ++ CYCLES_DADD(&node->owner_env->cpu_used_ve, duration); ++#endif ++ ++ /* ++ * The duration is not greater than TICK_DUR since ++ * task->need_resched is always 1. ++ */ ++ if (FSCHTAG_DADD(&node->start_tag, duration, node->weight)) { ++ fairsched_reset_start_tags(); ++ (void) FSCHTAG_DADD(&node->start_tag, duration, ++ node->weight); ++ } ++ ++ list_del_init(&node->runlist); ++ if (node->rate_limited) ++ fairsched_ratelimit_credit_unused(node, time, duration); ++ if (!node->delayed) { ++ if (node->nr_ready) ++ fairsched_running_insert(node); ++ } else ++ fairsched_delayed_insert(node); ++} ++ ++/* ++ * Scheduling decision ++ * ++ * Updates CPU usage for the node releasing the CPU and selects a new node. 
++ */ ++struct fairsched_node *fairsched_schedule( ++ struct fairsched_node *prev_node, ++ struct fairsched_node *cur_node, ++ int cur_node_active, ++ cycles_t time) ++{ ++ struct fairsched_node *p; ++ ++ if (prev_node != &fairsched_idle_node) ++ fairsched_account(prev_node, time); ++ __get_cpu_var(prev_schedule) = time; ++ ++ fairsched_delayed_wake(time); ++ ++ list_for_each_entry(p, &fairsched_running_head, runlist) { ++ if (p->nr_pcpu < p->nr_ready || ++ (cur_node_active && p == cur_node)) { ++ if (p->rate_limited) ++ fairsched_ratelimit_charge_advance(p, time); ++ return p; ++ } ++ } ++ return NULL; ++} ++ ++ ++/*********************************************************************/ ++/* ++ * System calls ++ * ++ * All do_xxx functions are called under fairsched semaphore and after ++ * capability check. ++ * ++ * The binary interfaces follow some other Fair Scheduler implementations ++ * (although some system call arguments are not needed for our implementation). ++ */ ++/*********************************************************************/ ++ ++static struct fairsched_node *fairsched_find(unsigned int id) ++{ ++ struct fairsched_node *p; ++ ++ list_for_each_entry(p, &fairsched_node_head, nodelist) { ++ if (p->id == id) ++ return p; ++ } ++ return NULL; ++} ++ ++static int do_fairsched_mknod(unsigned int parent, unsigned int weight, ++ unsigned int newid) ++{ ++ struct fairsched_node *node; ++ int retval; ++ ++ retval = -EINVAL; ++ if (weight < 1 || weight > FSCHWEIGHT_MAX) ++ goto out; ++ if (newid < 0 || newid > INT_MAX) ++ goto out; ++ ++ retval = -EBUSY; ++ if (fairsched_find(newid) != NULL) ++ goto out; ++ ++ retval = -ENOMEM; ++ node = kmalloc(sizeof(*node), GFP_KERNEL); ++ if (node == NULL) ++ goto out; ++ ++ memset(node, 0, sizeof(*node)); ++ node->weight = weight; ++ INIT_LIST_HEAD(&node->runlist); ++ node->id = newid; ++#ifdef CONFIG_VE ++ node->owner_env = get_exec_env(); ++#endif ++ ++ spin_lock_irq(&fairsched_lock); ++ list_add(&node->nodelist, 
&fairsched_node_head); ++ fairsched_nr_nodes++; ++ fairsched_recompute_max_latency(); ++ spin_unlock_irq(&fairsched_lock); ++ ++ retval = newid; ++out: ++ return retval; ++} ++ ++asmlinkage int sys_fairsched_mknod(unsigned int parent, unsigned int weight, ++ unsigned int newid) ++{ ++ int retval; ++ ++ if (!capable(CAP_SETVEID)) ++ return -EPERM; ++ ++ down(&fairsched_mutex); ++ retval = do_fairsched_mknod(parent, weight, newid); ++ up(&fairsched_mutex); ++ ++ return retval; ++} ++EXPORT_SYMBOL(sys_fairsched_mknod); ++ ++static int do_fairsched_rmnod(unsigned int id) ++{ ++ struct fairsched_node *node; ++ int retval; ++ ++ retval = -EINVAL; ++ node = fairsched_find(id); ++ if (node == NULL) ++ goto out; ++ if (node == &fairsched_init_node) ++ goto out; ++ ++ retval = vsched_destroy(node->vsched); ++ if (retval) ++ goto out; ++ ++ spin_lock_irq(&fairsched_lock); ++ list_del(&node->runlist); /* required for delayed nodes */ ++ list_del(&node->nodelist); ++ fairsched_nr_nodes--; ++ fairsched_recompute_max_latency(); ++ spin_unlock_irq(&fairsched_lock); ++ ++ kfree(node); ++ retval = 0; ++out: ++ return retval; ++} ++ ++asmlinkage int sys_fairsched_rmnod(unsigned int id) ++{ ++ int retval; ++ ++ if (!capable(CAP_SETVEID)) ++ return -EPERM; ++ ++ down(&fairsched_mutex); ++ retval = do_fairsched_rmnod(id); ++ up(&fairsched_mutex); ++ ++ return retval; ++} ++EXPORT_SYMBOL(sys_fairsched_rmnod); ++ ++int do_fairsched_chwt(unsigned int id, unsigned weight) ++{ ++ struct fairsched_node *node; ++ ++ if (id == 0) ++ return -EINVAL; ++ if (weight < 1 || weight > FSCHWEIGHT_MAX) ++ return -EINVAL; ++ ++ node = fairsched_find(id); ++ if (node == NULL) ++ return -ENOENT; ++ ++ spin_lock_irq(&fairsched_lock); ++ node->weight = weight; ++ fairsched_recompute_max_latency(); ++ spin_unlock_irq(&fairsched_lock); ++ ++ return 0; ++} ++ ++asmlinkage int sys_fairsched_chwt(unsigned int id, unsigned weight) ++{ ++ int retval; ++ ++ if (!capable(CAP_SETVEID)) ++ return -EPERM; ++ ++ 
down(&fairsched_mutex); ++ retval = do_fairsched_chwt(id, weight); ++ up(&fairsched_mutex); ++ ++ return retval; ++} ++ ++int do_fairsched_rate(unsigned int id, int op, unsigned rate) ++{ ++ struct fairsched_node *node; ++ cycles_t time; ++ int retval; ++ ++ if (id == 0) ++ return -EINVAL; ++ if (op == 0 && (rate < 1 || rate >= (1UL << 31))) ++ return -EINVAL; ++ ++ node = fairsched_find(id); ++ if (node == NULL) ++ return -ENOENT; ++ ++ retval = -EINVAL; ++ spin_lock_irq(&fairsched_lock); ++ time = get_cycles(); ++ switch (op) { ++ case 0: ++ node->rate = rate; ++ if (node->rate > (fairsched_nr_cpus << FSCHRATE_SHIFT)) ++ node->rate = ++ fairsched_nr_cpus << FSCHRATE_SHIFT; ++ node->rate_limited = 1; ++ node->value = max_value; ++ if (node->delayed) { ++ list_del(&node->runlist); ++ node->delay = time; ++ fairsched_delayed_insert(node); ++ node->last_updated_at = time; ++ fairsched_delayed_wake(time); ++ } ++ retval = node->rate; ++ break; ++ case 1: ++ node->rate = 0; /* This assignment is not needed ++ for the kernel code, and it should ++ not rely on rate being 0 when it's ++ unset. This is a band-aid for some ++ existing tools (don't know which one ++ exactly). --SAW */ ++ node->rate_limited = 0; ++ node->value = max_value; ++ if (node->delayed) { ++ list_del(&node->runlist); ++ node->delay = time; ++ fairsched_delayed_insert(node); ++ node->last_updated_at = time; ++ fairsched_delayed_wake(time); ++ } ++ retval = 0; ++ break; ++ case 2: ++ if (node->rate_limited) ++ retval = node->rate; ++ else ++ retval = -ENODATA; ++ break; ++ } ++ spin_unlock_irq(&fairsched_lock); ++ ++ return retval; ++} ++ ++asmlinkage int sys_fairsched_rate(unsigned int id, int op, unsigned rate) ++{ ++ int retval; ++ ++ if (!capable(CAP_SETVEID)) ++ return -EPERM; ++ ++ down(&fairsched_mutex); ++ retval = do_fairsched_rate(id, op, rate); ++ up(&fairsched_mutex); ++ ++ return retval; ++} ++ ++/* ++ * Called under fairsched_mutex. 
++ */ ++static int __do_fairsched_mvpr(struct task_struct *p, ++ struct fairsched_node *node) ++{ ++ int retval; ++ ++ if (node->vsched == NULL) { ++ retval = vsched_create(node->id, node); ++ if (retval < 0) ++ return retval; ++ } ++ ++ /* no need to destroy vsched in case of mvpr failure */ ++ return vsched_mvpr(p, node->vsched); ++} ++ ++int do_fairsched_mvpr(pid_t pid, unsigned int nodeid) ++{ ++ struct task_struct *p; ++ struct fairsched_node *node; ++ int retval; ++ ++ retval = -ENOENT; ++ node = fairsched_find(nodeid); ++ if (node == NULL) ++ goto out; ++ ++ read_lock(&tasklist_lock); ++ retval = -ESRCH; ++ p = find_task_by_pid_all(pid); ++ if (p == NULL) ++ goto out_unlock; ++ get_task_struct(p); ++ read_unlock(&tasklist_lock); ++ ++ retval = __do_fairsched_mvpr(p, node); ++ put_task_struct(p); ++ return retval; ++ ++out_unlock: ++ read_unlock(&tasklist_lock); ++out: ++ return retval; ++} ++ ++asmlinkage int sys_fairsched_mvpr(pid_t pid, unsigned int nodeid) ++{ ++ int retval; ++ ++ if (!capable(CAP_SETVEID)) ++ return -EPERM; ++ ++ down(&fairsched_mutex); ++ retval = do_fairsched_mvpr(pid, nodeid); ++ up(&fairsched_mutex); ++ ++ return retval; ++} ++EXPORT_SYMBOL(sys_fairsched_mvpr); ++ ++ ++/*********************************************************************/ ++/* ++ * proc interface ++ */ ++/*********************************************************************/ ++ ++struct fairsched_node_dump { ++#ifdef CONFIG_VE ++ envid_t veid; ++#endif ++ int id; ++ unsigned weight; ++ unsigned rate; ++ unsigned rate_limited : 1, ++ delayed : 1; ++ fschtag_t start_tag; ++ fschvalue_t value; ++ cycles_t delay; ++ int nr_ready; ++ int nr_runnable; ++ int nr_pcpu; ++ int nr_tasks, nr_runtasks; ++}; ++ ++struct fairsched_dump { ++ int len, compat; ++ struct fairsched_node_dump nodes[0]; ++}; ++ ++static struct fairsched_dump *fairsched_do_dump(int compat) ++{ ++ int nr_nodes; ++ int len, i; ++ struct fairsched_dump *dump; ++ struct fairsched_node *node; ++ struct 
fairsched_node_dump *p; ++ unsigned long flags; ++ ++start: ++ nr_nodes = (ve_is_super(get_exec_env()) ? fairsched_nr_nodes + 16 : 1); ++ len = sizeof(*dump) + nr_nodes * sizeof(dump->nodes[0]); ++ dump = ub_vmalloc(len); ++ if (dump == NULL) ++ goto out; ++ ++ spin_lock_irqsave(&fairsched_lock, flags); ++ if (ve_is_super(get_exec_env()) && nr_nodes < fairsched_nr_nodes) ++ goto repeat; ++ p = dump->nodes; ++ list_for_each_entry_reverse(node, &fairsched_node_head, nodelist) { ++ if ((char *)p - (char *)dump >= len) ++ break; ++ p->nr_tasks = 0; ++ p->nr_runtasks = 0; ++#ifdef CONFIG_VE ++ if (!ve_accessible(node->owner_env, get_exec_env())) ++ continue; ++ p->veid = node->owner_env->veid; ++ if (compat) { ++ p->nr_tasks = atomic_read(&node->owner_env->pcounter); ++ for (i = 0; i < NR_CPUS; i++) ++ p->nr_runtasks += ++ VE_CPU_STATS(node->owner_env, i) ++ ->nr_running; ++ if (p->nr_runtasks < 0) ++ p->nr_runtasks = 0; ++ } ++#endif ++ p->id = node->id; ++ p->weight = node->weight; ++ p->rate = node->rate; ++ p->rate_limited = node->rate_limited; ++ p->delayed = node->delayed; ++ p->start_tag = node->start_tag; ++ p->value = node->value; ++ p->delay = node->delay; ++ p->nr_ready = node->nr_ready; ++ p->nr_runnable = node->nr_runnable; ++ p->nr_pcpu = node->nr_pcpu; ++ p++; ++ } ++ dump->len = p - dump->nodes; ++ dump->compat = compat; ++ spin_unlock_irqrestore(&fairsched_lock, flags); ++ ++out: ++ return dump; ++ ++repeat: ++ spin_unlock_irqrestore(&fairsched_lock, flags); ++ vfree(dump); ++ goto start; ++} ++ ++#define FAIRSCHED_PROC_HEADLINES 2 ++ ++#if defined(CONFIG_VE) ++/* ++ * File format is dictated by compatibility reasons. 
++ */ ++static int fairsched_seq_show(struct seq_file *m, void *v) ++{ ++ struct fairsched_dump *dump; ++ struct fairsched_node_dump *p; ++ unsigned vid, nid, pid, r; ++ ++ dump = m->private; ++ p = (struct fairsched_node_dump *)((unsigned long)v & ~3UL); ++ if (p - dump->nodes < FAIRSCHED_PROC_HEADLINES) { ++ if (p == dump->nodes) ++ seq_printf(m, "Version: 2.6 debug\n"); ++ else if (p == dump->nodes + 1) ++ seq_printf(m, ++ " veid " ++ " id " ++ " parent " ++ "weight " ++ " rate " ++ "tasks " ++ " run " ++ "cpus" ++ " " ++ "flg " ++ "ready " ++ " start_tag " ++ " value " ++ " delay" ++ "\n"); ++ } else { ++ p -= FAIRSCHED_PROC_HEADLINES; ++ vid = nid = pid = 0; ++ r = (unsigned long)v & 3; ++ if (p == dump->nodes) { ++ if (r == 2) ++ nid = p->id; ++ } else { ++ if (!r) ++ nid = p->id; ++ else if (r == 1) ++ vid = pid = p->id; ++ else ++ vid = p->id, nid = 1; ++ } ++ seq_printf(m, ++ "%10u " ++ "%10u %10u %6u %5u %5u %5u %4u" ++ " " ++ " %c%c %5u %20Lu %20Lu %20Lu" ++ "\n", ++ vid, ++ nid, ++ pid, ++ p->weight, ++ p->rate, ++ p->nr_tasks, ++ p->nr_runtasks, ++ p->nr_pcpu, ++ p->rate_limited ? 'L' : '.', ++ p->delayed ? 'D' : '.', ++ p->nr_ready, ++ p->start_tag.t, ++ p->value.v, ++ p->delay ++ ); ++ } ++ ++ return 0; ++} ++ ++static void *fairsched_seq_start(struct seq_file *m, loff_t *pos) ++{ ++ struct fairsched_dump *dump; ++ unsigned long l; ++ ++ dump = m->private; ++ if (*pos >= dump->len * 3 - 1 + FAIRSCHED_PROC_HEADLINES) ++ return NULL; ++ if (*pos < FAIRSCHED_PROC_HEADLINES) ++ return dump->nodes + *pos; ++ /* guess why... 
*/ ++ l = (unsigned long)(dump->nodes + ++ ((unsigned long)*pos + FAIRSCHED_PROC_HEADLINES * 2 + 1) / 3); ++ l |= ((unsigned long)*pos + FAIRSCHED_PROC_HEADLINES * 2 + 1) % 3; ++ return (void *)l; ++} ++static void *fairsched_seq_next(struct seq_file *m, void *v, loff_t *pos) ++{ ++ ++*pos; ++ return fairsched_seq_start(m, pos); ++} ++#endif ++ ++static int fairsched2_seq_show(struct seq_file *m, void *v) ++{ ++ struct fairsched_dump *dump; ++ struct fairsched_node_dump *p; ++ ++ dump = m->private; ++ p = v; ++ if (p - dump->nodes < FAIRSCHED_PROC_HEADLINES) { ++ if (p == dump->nodes) ++ seq_printf(m, "Version: 2.7" FAIRSHED_DEBUG "\n"); ++ else if (p == dump->nodes + 1) ++ seq_printf(m, ++ " id " ++ "weight " ++ " rate " ++ " run " ++ "cpus" ++#ifdef FAIRSHED_DEBUG ++ " " ++ "flg " ++ "ready " ++ " start_tag " ++ " value " ++ " delay" ++#endif ++ "\n"); ++ } else { ++ p -= FAIRSCHED_PROC_HEADLINES; ++ seq_printf(m, ++ "%10u %6u %5u %5u %4u" ++#ifdef FAIRSHED_DEBUG ++ " " ++ " %c%c %5u %20Lu %20Lu %20Lu" ++#endif ++ "\n", ++ p->id, ++ p->weight, ++ p->rate, ++ p->nr_runnable, ++ p->nr_pcpu ++#ifdef FAIRSHED_DEBUG ++ , ++ p->rate_limited ? 'L' : '.', ++ p->delayed ? 
'D' : '.', ++ p->nr_ready, ++ p->start_tag.t, ++ p->value.v, ++ p->delay ++#endif ++ ); ++ } ++ ++ return 0; ++} ++ ++static void *fairsched2_seq_start(struct seq_file *m, loff_t *pos) ++{ ++ struct fairsched_dump *dump; ++ ++ dump = m->private; ++ if (*pos >= dump->len + FAIRSCHED_PROC_HEADLINES) ++ return NULL; ++ return dump->nodes + *pos; ++} ++static void *fairsched2_seq_next(struct seq_file *m, void *v, loff_t *pos) ++{ ++ ++*pos; ++ return fairsched2_seq_start(m, pos); ++} ++static void fairsched2_seq_stop(struct seq_file *m, void *v) ++{ ++} ++ ++#ifdef CONFIG_VE ++static struct seq_operations fairsched_seq_op = { ++ .start = fairsched_seq_start, ++ .next = fairsched_seq_next, ++ .stop = fairsched2_seq_stop, ++ .show = fairsched_seq_show ++}; ++#endif ++static struct seq_operations fairsched2_seq_op = { ++ .start = fairsched2_seq_start, ++ .next = fairsched2_seq_next, ++ .stop = fairsched2_seq_stop, ++ .show = fairsched2_seq_show ++}; ++static int fairsched_seq_open(struct inode *inode, struct file *file) ++{ ++ int ret; ++ struct seq_file *m; ++ int compat; ++ ++#ifdef CONFIG_VE ++ compat = (file->f_dentry->d_name.len == sizeof("fairsched") - 1); ++ ret = seq_open(file, compat ? 
&fairsched_seq_op : &fairsched2_seq_op); ++#else ++ compat = 0; ++ ret = seq_open(file, fairsched2_seq_op); ++#endif ++ if (ret) ++ return ret; ++ m = file->private_data; ++ m->private = fairsched_do_dump(compat); ++ if (m->private == NULL) { ++ seq_release(inode, file); ++ ret = -ENOMEM; ++ } ++ return ret; ++} ++static int fairsched_seq_release(struct inode *inode, struct file *file) ++{ ++ struct seq_file *m; ++ struct fairsched_dump *dump; ++ ++ m = file->private_data; ++ dump = m->private; ++ m->private = NULL; ++ vfree(dump); ++ seq_release(inode, file); ++ return 0; ++} ++static struct file_operations proc_fairsched_operations = { ++ .open = fairsched_seq_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = fairsched_seq_release ++}; ++ ++ ++/*********************************************************************/ ++/* ++ * Fairsched initialization ++ */ ++/*********************************************************************/ ++ ++int fsch_sysctl_latency(ctl_table *ctl, int write, struct file *filp, ++ void *buffer, size_t *lenp, loff_t *ppos) ++{ ++ int *valp = ctl->data; ++ int val = *valp; ++ int ret; ++ ++ ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); ++ ++ if (!write || *valp == val) ++ return ret; ++ ++ spin_lock_irq(&fairsched_lock); ++ fairsched_recompute_max_latency(); ++ spin_unlock_irq(&fairsched_lock); ++ return ret; ++} ++ ++static void fairsched_calibrate(void) ++{ ++ fairsched_nr_cpus = num_online_cpus(); ++ max_value = FSCHVALUE(cycles_per_jiffy * (fairsched_nr_cpus + 1)); ++} ++ ++void __init fairsched_init_early(void) ++{ ++ list_add(&fairsched_init_node.nodelist, &fairsched_node_head); ++ fairsched_nr_nodes++; ++} ++ ++/* ++ * Note: this function is execute late in the initialization sequence. ++ * We ourselves need calibrated cycles and initialized procfs... ++ * The consequence of this late initialization is that start tags are ++ * efficiently ignored and each node preempts others on insertion. 
++ * But it isn't a problem (only init node can be runnable). ++ */ ++void __init fairsched_init_late(void) ++{ ++ struct proc_dir_entry *entry; ++ ++ if (get_cycles() == 0) ++ panic("FAIRSCHED: no TSC!\n"); ++ fairsched_calibrate(); ++ fairsched_recompute_max_latency(); ++ ++ entry = create_proc_glob_entry("fairsched", S_IRUGO, NULL); ++ if (entry) ++ entry->proc_fops = &proc_fairsched_operations; ++ entry = create_proc_glob_entry("fairsched2", S_IRUGO, NULL); ++ if (entry) ++ entry->proc_fops = &proc_fairsched_operations; ++} ++ ++ ++#else /* CONFIG_FAIRSCHED */ ++ ++ ++/*********************************************************************/ ++/* ++ * No Fairsched ++ */ ++/*********************************************************************/ ++ ++asmlinkage int sys_fairsched_mknod(unsigned int parent, unsigned int weight, ++ unsigned int newid) ++{ ++ return -ENOSYS; ++} ++ ++asmlinkage int sys_fairsched_rmnod(unsigned int id) ++{ ++ return -ENOSYS; ++} ++ ++asmlinkage int sys_fairsched_chwt(unsigned int id, unsigned int weight) ++{ ++ return -ENOSYS; ++} ++ ++asmlinkage int sys_fairsched_mvpr(pid_t pid, unsigned int nodeid) ++{ ++ return -ENOSYS; ++} ++ ++asmlinkage int sys_fairsched_rate(unsigned int id, int op, unsigned rate) ++{ ++ return -ENOSYS; ++} ++ ++void __init fairsched_init_late(void) ++{ ++} ++ ++#endif /* CONFIG_FAIRSCHED */ +diff -upr linux-2.6.16.orig/kernel/fork.c linux-2.6.16-026test015/kernel/fork.c +--- linux-2.6.16.orig/kernel/fork.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/kernel/fork.c 2006-07-04 14:41:39.000000000 +0400 +@@ -20,6 +20,7 @@ + #include <linux/vmalloc.h> + #include <linux/completion.h> + #include <linux/namespace.h> ++#include <linux/file.h> + #include <linux/personality.h> + #include <linux/mempolicy.h> + #include <linux/sem.h> +@@ -52,11 +53,15 @@ + #include <asm/cacheflush.h> + #include <asm/tlbflush.h> + ++#include <ub/ub_vmpages.h> ++#include <ub/ub_misc.h> ++ + /* + * Protected counters by 
write_lock_irq(&tasklist_lock) + */ + unsigned long total_forks; /* Handle normal Linux uptimes. */ + int nr_threads; /* The idle threads do not count.. */ ++EXPORT_SYMBOL(nr_threads); + + int max_threads; /* tunable limit on nr_threads */ + +@@ -103,6 +108,7 @@ static kmem_cache_t *mm_cachep; + + void free_task(struct task_struct *tsk) + { ++ ub_task_uncharge(tsk); + free_thread_info(tsk->thread_info); + free_task_struct(tsk); + } +@@ -122,9 +128,14 @@ void __put_task_struct_cb(struct rcu_hea + free_uid(tsk->user); + put_group_info(tsk->group_info); + ++#ifdef CONFIG_VE ++ put_ve(VE_TASK_INFO(tsk)->owner_env); ++ atomic_dec(&nr_dead); ++#endif + if (!profile_handoff_task(tsk)) + free_task(tsk); + } ++EXPORT_SYMBOL_GPL(__put_task_struct_cb); + + void __init fork_init(unsigned long mempages) + { +@@ -135,7 +146,7 @@ void __init fork_init(unsigned long memp + /* create a slab on which task_structs can be allocated */ + task_struct_cachep = + kmem_cache_create("task_struct", sizeof(struct task_struct), +- ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL, NULL); ++ ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_UBC, NULL, NULL); + #endif + + /* +@@ -166,22 +177,30 @@ static struct task_struct *dup_task_stru + + tsk = alloc_task_struct(); + if (!tsk) +- return NULL; ++ goto out; + + ti = alloc_thread_info(tsk); +- if (!ti) { +- free_task_struct(tsk); +- return NULL; +- } ++ if (!ti) ++ goto out_tsk; + + *tsk = *orig; + tsk->thread_info = ti; + setup_thread_stack(tsk, orig); + ++ if (ub_task_charge(orig, tsk)) ++ goto out_ti; ++ + /* One for us, one for whoever does the "release_task()" (usually parent) */ + atomic_set(&tsk->usage,2); + atomic_set(&tsk->fs_excl, 0); + return tsk; ++ ++out_ti: ++ free_thread_info(ti); ++out_tsk: ++ free_task_struct(tsk); ++out: ++ return NULL; + } + + #ifdef CONFIG_MMU +@@ -219,7 +238,12 @@ static inline int dup_mmap(struct mm_str + -pages); + continue; + } ++ + charge = 0; ++ if (ub_memory_charge(mm, mpnt->vm_end - mpnt->vm_start, ++ mpnt->vm_flags & 
~VM_LOCKED, ++ mpnt->vm_file, UB_HARD)) ++ goto fail_noch; + if (mpnt->vm_flags & VM_ACCOUNT) { + unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; + if (security_vm_enough_memory(len)) +@@ -238,6 +262,7 @@ static inline int dup_mmap(struct mm_str + tmp->vm_flags &= ~VM_LOCKED; + tmp->vm_mm = mm; + tmp->vm_next = NULL; ++ set_vma_rss(tmp, 0); + anon_vma_link(tmp); + file = tmp->vm_file; + if (file) { +@@ -266,7 +291,7 @@ static inline int dup_mmap(struct mm_str + rb_parent = &tmp->vm_rb; + + mm->map_count++; +- retval = copy_page_range(mm, oldmm, mpnt); ++ retval = copy_page_range(mm, oldmm, tmp, mpnt); + + if (tmp->vm_ops && tmp->vm_ops->open) + tmp->vm_ops->open(tmp); +@@ -283,6 +308,9 @@ out: + fail_nomem_policy: + kmem_cache_free(vm_area_cachep, tmp); + fail_nomem: ++ ub_memory_uncharge(mm, mpnt->vm_end - mpnt->vm_start, ++ mpnt->vm_flags & ~VM_LOCKED, mpnt->vm_file); ++fail_noch: + retval = -ENOMEM; + vm_unacct_memory(charge); + goto out; +@@ -313,7 +341,8 @@ static inline void mm_free_pgd(struct mm + + #include <linux/init_task.h> + +-static struct mm_struct * mm_init(struct mm_struct * mm) ++static struct mm_struct * mm_init(struct mm_struct * mm, ++ struct task_struct *tsk) + { + atomic_set(&mm->mm_users, 1); + atomic_set(&mm->mm_count, 1); +@@ -328,11 +357,14 @@ static struct mm_struct * mm_init(struct + mm->ioctx_list = NULL; + mm->free_area_cache = TASK_UNMAPPED_BASE; + mm->cached_hole_size = ~0UL; ++ set_mm_ub(mm, tsk); + + if (likely(!mm_alloc_pgd(mm))) { + mm->def_flags = 0; + return mm; + } ++ ++ put_mm_ub(mm); + free_mm(mm); + return NULL; + } +@@ -347,10 +379,11 @@ struct mm_struct * mm_alloc(void) + mm = allocate_mm(); + if (mm) { + memset(mm, 0, sizeof(*mm)); +- mm = mm_init(mm); ++ mm = mm_init(mm, NULL); + } + return mm; + } ++EXPORT_SYMBOL_GPL(mm_alloc); + + /* + * Called when the last reference to the mm +@@ -362,8 +395,10 @@ void fastcall __mmdrop(struct mm_struct + BUG_ON(mm == &init_mm); + mm_free_pgd(mm); + 
destroy_context(mm); ++ put_mm_ub(mm); + free_mm(mm); + } ++EXPORT_SYMBOL_GPL(__mmdrop); + + /* + * Decrement the use count and release all resources for an mm. +@@ -466,7 +501,7 @@ static struct mm_struct *dup_mm(struct t + + memcpy(mm, oldmm, sizeof(*mm)); + +- if (!mm_init(mm)) ++ if (!mm_init(mm, tsk)) + goto fail_nomem; + + if (init_new_context(tsk, mm)) +@@ -720,7 +755,7 @@ out_release: + free_fdset (new_fdt->open_fds, new_fdt->max_fdset); + free_fd_array(new_fdt->fd, new_fdt->max_fds); + kmem_cache_free(files_cachep, newf); +- goto out; ++ return NULL; + } + + static int copy_files(unsigned long clone_flags, struct task_struct * tsk) +@@ -896,7 +931,7 @@ asmlinkage long sys_set_tid_address(int + { + current->clear_child_tid = tidptr; + +- return current->pid; ++ return virt_pid(current); + } + + /* +@@ -913,7 +948,7 @@ static task_t *copy_process(unsigned lon + unsigned long stack_size, + int __user *parent_tidptr, + int __user *child_tidptr, +- int pid) ++ int pid, long pid0) + { + int retval; + struct task_struct *p = NULL; +@@ -974,12 +1009,20 @@ static task_t *copy_process(unsigned lon + p->did_exec = 0; + copy_flags(clone_flags, p); + p->pid = pid; ++#ifdef CONFIG_VE ++ set_virt_pid(p, alloc_vpid(p->pid, pid0 ? 
: -1)); ++ if (virt_pid(p) < 0) ++ goto bad_fork_cleanup_module; ++#endif + retval = -EFAULT; + if (clone_flags & CLONE_PARENT_SETTID) +- if (put_user(p->pid, parent_tidptr)) ++ if (put_user(virt_pid(p), parent_tidptr)) + goto bad_fork_cleanup; + + p->proc_dentry = NULL; ++#ifdef CONFIG_VE ++ p->ve_task_info.glob_proc_dentry = NULL; ++#endif + + INIT_LIST_HEAD(&p->children); + INIT_LIST_HEAD(&p->sibling); +@@ -1027,8 +1070,13 @@ static task_t *copy_process(unsigned lon + #endif + + p->tgid = p->pid; +- if (clone_flags & CLONE_THREAD) ++ set_virt_tgid(p, virt_pid(p)); ++ set_virt_pgid(p, virt_pgid(current)); ++ set_virt_sid(p, virt_sid(current)); ++ if (clone_flags & CLONE_THREAD) { + p->tgid = current->tgid; ++ set_virt_tgid(p, virt_tgid(current)); ++ } + + if ((retval = security_task_alloc(p))) + goto bad_fork_cleanup_policy; +@@ -1111,8 +1159,8 @@ static task_t *copy_process(unsigned lon + */ + p->cpus_allowed = current->cpus_allowed; + if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) || +- !cpu_online(task_cpu(p)))) +- set_task_cpu(p, smp_processor_id()); ++ !vcpu_online(task_cpu(p)))) ++ set_task_cpu(p, task_cpu(current)); + + /* + * Check for pending SIGKILL! 
The new thread should not be allowed +@@ -1181,6 +1229,12 @@ static task_t *copy_process(unsigned lon + if (unlikely(p->ptrace & PT_PTRACED)) + __ptrace_link(p, current->parent); + ++#ifdef CONFIG_VE ++ SET_VE_LINKS(p); ++ atomic_inc(&p->ve_task_info.owner_env->pcounter); ++ get_ve(p->ve_task_info.owner_env); ++ seqcount_init(&p->ve_task_info.wakeup_lock); ++#endif + if (thread_group_leader(p)) { + p->signal->tty = current->signal->tty; + p->signal->pgrp = process_group(current); +@@ -1228,6 +1282,11 @@ bad_fork_cleanup_cpuset: + #endif + cpuset_exit(p); + bad_fork_cleanup: ++#ifdef CONFIG_VE ++ if (virt_pid(p) != p->pid && virt_pid(p) > 0) ++ free_vpid(virt_pid(p), get_exec_env()); ++bad_fork_cleanup_module: ++#endif + if (p->binfmt) + module_put(p->binfmt->module); + bad_fork_cleanup_put_domain: +@@ -1253,7 +1312,7 @@ task_t * __devinit fork_idle(int cpu) + task_t *task; + struct pt_regs regs; + +- task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, NULL, 0); ++ task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, NULL, 0, 0); + if (!task) + return ERR_PTR(-ENOMEM); + init_idle(task, cpu); +@@ -1283,12 +1342,13 @@ static inline int fork_traceflag (unsign + * It copies the process, and if successful kick-starts + * it and waits for it to finish using the VM if required. 
+ */ +-long do_fork(unsigned long clone_flags, ++long do_fork_pid(unsigned long clone_flags, + unsigned long stack_start, + struct pt_regs *regs, + unsigned long stack_size, + int __user *parent_tidptr, +- int __user *child_tidptr) ++ int __user *child_tidptr, ++ long pid0) + { + struct task_struct *p; + int trace = 0; +@@ -1302,7 +1362,8 @@ long do_fork(unsigned long clone_flags, + clone_flags |= CLONE_PTRACE; + } + +- p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid); ++ p = copy_process(clone_flags, stack_start, regs, stack_size, ++ parent_tidptr, child_tidptr, pid, pid0); + /* + * Do this prior waking up the new thread - the thread pointer + * might get invalid after that point, if the thread exits quickly. +@@ -1310,6 +1371,7 @@ long do_fork(unsigned long clone_flags, + if (!IS_ERR(p)) { + struct completion vfork; + ++ pid = virt_pid(p); + if (clone_flags & CLONE_VFORK) { + p->vfork_done = &vfork; + init_completion(&vfork); +@@ -1330,13 +1392,18 @@ long do_fork(unsigned long clone_flags, + + if (unlikely (trace)) { + current->ptrace_message = pid; ++ set_pn_state(current, PN_STOP_FORK); + ptrace_notify ((trace << 8) | SIGTRAP); ++ clear_pn_state(current); + } + + if (clone_flags & CLONE_VFORK) { + wait_for_completion(&vfork); +- if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) ++ if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) { ++ set_pn_state(current, PN_STOP_VFORK); + ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP); ++ clear_pn_state(current); ++ } + } + } else { + free_pidmap(pid); +@@ -1349,26 +1416,39 @@ long do_fork(unsigned long clone_flags, + #define ARCH_MIN_MMSTRUCT_ALIGN 0 + #endif + ++EXPORT_SYMBOL(do_fork_pid); ++ ++long do_fork(unsigned long clone_flags, ++ unsigned long stack_start, ++ struct pt_regs *regs, ++ unsigned long stack_size, ++ int __user *parent_tidptr, ++ int __user *child_tidptr) ++{ ++ return do_fork_pid(clone_flags, stack_start, regs, stack_size, ++ parent_tidptr, 
child_tidptr, 0); ++} ++ + void __init proc_caches_init(void) + { + sighand_cachep = kmem_cache_create("sighand_cache", + sizeof(struct sighand_struct), 0, +- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); ++ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL, NULL); + signal_cachep = kmem_cache_create("signal_cache", + sizeof(struct signal_struct), 0, +- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); ++ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL, NULL); + files_cachep = kmem_cache_create("files_cache", + sizeof(struct files_struct), 0, +- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); ++ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL, NULL); + fs_cachep = kmem_cache_create("fs_cache", + sizeof(struct fs_struct), 0, +- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); ++ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL, NULL); + vm_area_cachep = kmem_cache_create("vm_area_struct", + sizeof(struct vm_area_struct), 0, + SLAB_PANIC, NULL, NULL); + mm_cachep = kmem_cache_create("mm_struct", + sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, +- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); ++ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL, NULL); + } + + +diff -upr linux-2.6.16.orig/kernel/hrtimer.c linux-2.6.16-026test015/kernel/hrtimer.c +--- linux-2.6.16.orig/kernel/hrtimer.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/kernel/hrtimer.c 2006-07-04 14:41:39.000000000 +0400 +@@ -439,6 +439,7 @@ hrtimer_start(struct hrtimer *timer, kti + + return ret; + } ++EXPORT_SYMBOL_GPL(hrtimer_start); + + /** + * hrtimer_try_to_cancel - try to deactivate a timer +@@ -467,6 +468,7 @@ int hrtimer_try_to_cancel(struct hrtimer + return ret; + + } ++EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel); + + /** + * hrtimer_cancel - cancel a timer and wait for the handler to finish. 
+@@ -504,6 +506,7 @@ ktime_t hrtimer_get_remaining(const stru + + return rem; + } ++EXPORT_SYMBOL_GPL(hrtimer_get_remaining); + + #ifdef CONFIG_NO_IDLE_HZ + /** +@@ -670,7 +673,7 @@ void hrtimer_run_queues(void) + * The current task state is guaranteed to be TASK_RUNNING when this + * routine returns. + */ +-static ktime_t __sched ++ktime_t __sched + schedule_hrtimer(struct hrtimer *timer, const enum hrtimer_mode mode) + { + /* fn stays NULL, meaning single-shot wakeup: */ +@@ -697,7 +700,7 @@ schedule_hrtimer_interruptible(struct hr + return schedule_hrtimer(timer, mode); + } + +-static long __sched nanosleep_restart(struct restart_block *restart) ++long __sched nanosleep_restart(struct restart_block *restart) + { + struct timespec __user *rmtp; + struct timespec tu; +@@ -726,6 +729,7 @@ static long __sched nanosleep_restart(st + /* The other values in restart are already filled in */ + return -ERESTART_RESTARTBLOCK; + } ++EXPORT_SYMBOL_GPL(nanosleep_restart); + + long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, + const enum hrtimer_mode mode, const clockid_t clockid) +diff -upr linux-2.6.16.orig/kernel/irq/handle.c linux-2.6.16-026test015/kernel/irq/handle.c +--- linux-2.6.16.orig/kernel/irq/handle.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/kernel/irq/handle.c 2006-07-04 14:41:37.000000000 +0400 +@@ -14,6 +14,8 @@ + + #include "internals.h" + ++#include <ub/beancounter.h> ++ + /* + * Linux has a controller-independent interrupt architecture. 
+ * Every controller has a 'controller-template', that is used +@@ -80,10 +82,12 @@ fastcall int handle_IRQ_event(unsigned i + struct irqaction *action) + { + int ret, retval = 0, status = 0; ++ struct user_beancounter *ub; + + if (!(action->flags & SA_INTERRUPT)) + local_irq_enable(); + ++ ub = set_exec_ub(get_ub0()); + do { + ret = action->handler(irq, action->dev_id, regs); + if (ret == IRQ_HANDLED) +@@ -91,6 +95,7 @@ fastcall int handle_IRQ_event(unsigned i + retval |= ret; + action = action->next; + } while (action); ++ (void)set_exec_ub(ub); + + if (status & SA_SAMPLE_RANDOM) + add_interrupt_randomness(irq); +diff -upr linux-2.6.16.orig/kernel/kmod.c linux-2.6.16-026test015/kernel/kmod.c +--- linux-2.6.16.orig/kernel/kmod.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/kernel/kmod.c 2006-07-04 14:41:38.000000000 +0400 +@@ -78,6 +78,10 @@ int request_module(const char *fmt, ...) + #define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ + static int kmod_loop_msg; + ++ /* Don't allow request_module() inside VE. 
*/ ++ if (!ve_is_super(get_exec_env())) ++ return -EPERM; ++ + va_start(args, fmt); + ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); + va_end(args); +@@ -246,6 +250,9 @@ int call_usermodehelper_keys(char *path, + }; + DECLARE_WORK(work, __call_usermodehelper, &sub_info); + ++ if (!ve_is_super(get_exec_env())) ++ return -EPERM; ++ + if (!khelper_wq) + return -EBUSY; + +diff -upr linux-2.6.16.orig/kernel/kthread.c linux-2.6.16-026test015/kernel/kthread.c +--- linux-2.6.16.orig/kernel/kthread.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/kernel/kthread.c 2006-07-04 14:41:38.000000000 +0400 +@@ -114,7 +114,7 @@ static void keventd_create_kthread(void + create->result = ERR_PTR(pid); + } else { + wait_for_completion(&create->started); +- create->result = find_task_by_pid(pid); ++ create->result = find_task_by_pid_all(pid); + } + complete(&create->done); + } +diff -upr linux-2.6.16.orig/kernel/module.c linux-2.6.16-026test015/kernel/module.c +--- linux-2.6.16.orig/kernel/module.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/kernel/module.c 2006-07-04 14:41:38.000000000 +0400 +@@ -2130,6 +2130,8 @@ static void *m_start(struct seq_file *m, + loff_t n = 0; + + down(&module_mutex); ++ if (!ve_is_super(get_exec_env())) ++ return NULL; + list_for_each(i, &modules) { + if (n++ == *pos) + break; +diff -upr linux-2.6.16.orig/kernel/mutex-debug.c linux-2.6.16-026test015/kernel/mutex-debug.c +--- linux-2.6.16.orig/kernel/mutex-debug.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/kernel/mutex-debug.c 2006-07-04 14:41:38.000000000 +0400 +@@ -193,12 +193,12 @@ retry: + if (count != 10) + printk(" locked it.\n"); + +- do_each_thread(g, p) { ++ do_each_thread_all(g, p) { + show_task_locks(p); + if (!unlock) + if (read_trylock(&tasklist_lock)) + unlock = 1; +- } while_each_thread(g, p); ++ } while_each_thread_all(g, p); + + printk("\n"); + show_held_locks(NULL); +diff -upr linux-2.6.16.orig/kernel/panic.c 
linux-2.6.16-026test015/kernel/panic.c +--- linux-2.6.16.orig/kernel/panic.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/kernel/panic.c 2006-07-04 14:41:38.000000000 +0400 +@@ -23,6 +23,8 @@ + int panic_timeout; + int panic_on_oops; + int tainted; ++int kernel_text_csum_broken; ++EXPORT_SYMBOL(kernel_text_csum_broken); + + EXPORT_SYMBOL(panic_timeout); + +@@ -156,7 +158,8 @@ const char *print_tainted(void) + { + static char buf[20]; + if (tainted) { +- snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c", ++ snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c", ++ kernel_text_csum_broken ? 'B' : ' ', + tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', + tainted & TAINT_FORCED_MODULE ? 'F' : ' ', + tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', +diff -upr linux-2.6.16.orig/kernel/pid.c linux-2.6.16-026test015/kernel/pid.c +--- linux-2.6.16.orig/kernel/pid.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/kernel/pid.c 2006-07-04 14:41:39.000000000 +0400 +@@ -27,6 +27,10 @@ + #include <linux/bootmem.h> + #include <linux/hash.h> + ++#ifdef CONFIG_VE ++static void __free_vpid(int vpid, struct ve_struct *ve); ++#endif ++ + #define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) + static struct hlist_head *pid_hash[PIDTYPE_MAX]; + static int pidhash_shift; +@@ -57,8 +61,14 @@ typedef struct pidmap { + void *page; + } pidmap_t; + ++#ifdef CONFIG_VE ++#define PIDMAP_NRFREE (BITS_PER_PAGE/2) ++#else ++#define PIDMAP_NRFREE BITS_PER_PAGE ++#endif ++ + static pidmap_t pidmap_array[PIDMAP_ENTRIES] = +- { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } }; ++ { [ 0 ... 
PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(PIDMAP_NRFREE), NULL } }; + + static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); + +@@ -67,9 +77,13 @@ fastcall void free_pidmap(int pid) + pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE; + int offset = pid & BITS_PER_PAGE_MASK; + +- clear_bit(offset, map->page); ++ BUG_ON(__is_virtual_pid(pid) || pid == 1); ++ ++ if (test_and_clear_bit(offset, map->page) == 0) ++ BUG(); + atomic_inc(&map->nr_free); + } ++EXPORT_SYMBOL_GPL(free_pidmap); + + int alloc_pidmap(void) + { +@@ -77,6 +91,8 @@ int alloc_pidmap(void) + pidmap_t *map; + + pid = last + 1; ++ if (__is_virtual_pid(pid)) ++ pid += VPID_DIV; + if (pid >= pid_max) + pid = RESERVED_PIDS; + offset = pid & BITS_PER_PAGE_MASK; +@@ -106,6 +122,8 @@ int alloc_pidmap(void) + return pid; + } + offset = find_next_offset(map, offset); ++ if (__is_virtual_pid(offset)) ++ offset += VPID_DIV; + pid = mk_pid(map, offset); + /* + * find_next_offset() found a bit, the pid from it +@@ -130,6 +148,7 @@ int alloc_pidmap(void) + } + return -1; + } ++EXPORT_SYMBOL_GPL(alloc_pidmap); + + struct pid * fastcall find_pid(enum pid_type type, int nr) + { +@@ -143,6 +162,7 @@ struct pid * fastcall find_pid(enum pid_ + } + return NULL; + } ++EXPORT_SYMBOL(find_pid); + + int fastcall attach_pid(task_t *task, enum pid_type type, int nr) + { +@@ -162,6 +182,7 @@ int fastcall attach_pid(task_t *task, en + + return 0; + } ++EXPORT_SYMBOL_GPL(attach_pid); + + static fastcall int __detach_pid(task_t *task, enum pid_type type) + { +@@ -201,13 +222,27 @@ void fastcall detach_pid(task_t *task, e + if (tmp != type && find_pid(tmp, nr)) + return; + ++#ifdef CONFIG_VE ++ __free_vpid(task->pids[type].vnr, VE_TASK_INFO(task)->owner_env); ++#endif + free_pidmap(nr); + } ++EXPORT_SYMBOL_GPL(detach_pid); + + task_t *find_task_by_pid_type(int type, int nr) + { ++ BUG(); ++ return NULL; ++} ++ ++EXPORT_SYMBOL(find_task_by_pid_type); ++ ++task_t *find_task_by_pid_type_all(int type, int nr) ++{ + struct pid *pid; 
+ ++ BUG_ON(nr != -1 && is_virtual_pid(nr)); ++ + pid = find_pid(type, nr); + if (!pid) + return NULL; +@@ -215,7 +250,35 @@ task_t *find_task_by_pid_type(int type, + return pid_task(&pid->pid_list, type); + } + +-EXPORT_SYMBOL(find_task_by_pid_type); ++EXPORT_SYMBOL(find_task_by_pid_type_all); ++ ++#ifdef CONFIG_VE ++ ++task_t *find_task_by_pid_type_ve(int type, int nr) ++{ ++ task_t *tsk; ++ int gnr = nr; ++ struct pid *pid; ++ ++ if (is_virtual_pid(nr)) { ++ gnr = __vpid_to_pid(nr); ++ if (unlikely(gnr == -1)) ++ return NULL; ++ } ++ ++ pid = find_pid(type, gnr); ++ if (!pid) ++ return NULL; ++ ++ tsk = pid_task(&pid->pid_list, type); ++ if (!ve_accessible(VE_TASK_INFO(tsk)->owner_env, get_exec_env())) ++ return NULL; ++ return tsk; ++} ++ ++EXPORT_SYMBOL(find_task_by_pid_type_ve); ++ ++#endif + + /* + * This function switches the PIDs if a non-leader thread calls +@@ -234,12 +297,16 @@ void switch_exec_pids(task_t *leader, ta + + leader->pid = leader->tgid = thread->pid; + thread->pid = thread->tgid; ++ set_virt_tgid(leader, virt_pid(thread)); ++ set_virt_pid(leader, virt_pid(thread)); ++ set_virt_pid(thread, virt_tgid(thread)); + + attach_pid(thread, PIDTYPE_PID, thread->pid); + attach_pid(thread, PIDTYPE_TGID, thread->tgid); + attach_pid(thread, PIDTYPE_PGID, thread->signal->pgrp); + attach_pid(thread, PIDTYPE_SID, thread->signal->session); + list_add_tail(&thread->tasks, &init_task.tasks); ++ SET_VE_LINKS(thread); + + attach_pid(leader, PIDTYPE_PID, leader->pid); + attach_pid(leader, PIDTYPE_TGID, leader->tgid); +@@ -247,6 +314,362 @@ void switch_exec_pids(task_t *leader, ta + attach_pid(leader, PIDTYPE_SID, leader->signal->session); + } + ++#ifdef CONFIG_VE ++ ++/* Virtual PID bits. ++ * ++ * At the moment all internal structures in kernel store real global pid. ++ * The only place, where virtual PID is used, is at user frontend. 
We ++ * remap virtual pids obtained from user to global ones (vpid_to_pid) and ++ * map globals to virtuals before showing them to user (virt_pid_type). ++ * ++ * We hold virtual PIDs inside struct pid, so map global -> virtual is easy. ++ */ ++ ++pid_t _pid_type_to_vpid(int type, pid_t pid) ++{ ++ struct pid * p; ++ ++ if (unlikely(is_virtual_pid(pid))) ++ return -1; ++ ++ read_lock(&tasklist_lock); ++ p = find_pid(type, pid); ++ if (p) { ++ pid = p->vnr; ++ } else { ++ pid = -1; ++ } ++ read_unlock(&tasklist_lock); ++ return pid; ++} ++EXPORT_SYMBOL_GPL(_pid_type_to_vpid); ++ ++pid_t pid_type_to_vpid(int type, pid_t pid) ++{ ++ int vpid; ++ ++ if (unlikely(pid <= 0)) ++ return pid; ++ ++ BUG_ON(is_virtual_pid(pid)); ++ ++ if (ve_is_super(get_exec_env())) ++ return pid; ++ ++ vpid = _pid_type_to_vpid(type, pid); ++ if (unlikely(vpid == -1)) { ++ /* It is allowed: global pid can be used everywhere. ++ * This can happen, when kernel remembers stray pids: ++ * signal queues, locks etc. ++ */ ++ vpid = pid; ++ } ++ return vpid; ++} ++EXPORT_SYMBOL_GPL(pid_type_to_vpid); ++ ++/* To map virtual pids to global we maintain special hash table. ++ * ++ * Mapping entries are allocated when a process with non-trivial ++ * mapping is forked, which is possible only after VE migrated. ++ * Mappings are destroyed, when a global pid is removed from global ++ * pidmap, which means we do not need to refcount mappings. 
++ */ ++ ++static struct hlist_head *vpid_hash; ++ ++struct vpid_mapping ++{ ++ int vpid; ++ int veid; ++ int pid; ++ struct hlist_node link; ++ struct rcu_head rcu; ++}; ++ ++static kmem_cache_t *vpid_mapping_cachep; ++ ++static inline int vpid_hashfn(int vnr, int veid) ++{ ++ return hash_long((unsigned long)(vnr+(veid<<16)), pidhash_shift); ++} ++ ++struct vpid_mapping *__lookup_vpid_mapping(int vnr, int veid) ++{ ++ struct hlist_node *elem; ++ struct vpid_mapping *map; ++ ++ hlist_for_each_entry_rcu(map, elem, ++ &vpid_hash[vpid_hashfn(vnr, veid)], link) { ++ if (map->vpid == vnr && map->veid == veid) ++ return map; ++ } ++ return NULL; ++} ++ ++/* __vpid_to_pid() is raw version of vpid_to_pid(). It is to be used ++ * only under tasklist_lock. In some places we must use only this version ++ * (f.e. __kill_pg_info is called under write lock!) ++ * ++ * Caller should pass virtual pid. This function returns an error, when ++ * seeing a global pid. ++ */ ++int __vpid_to_pid(int pid) ++{ ++ struct vpid_mapping *map; ++ ++ if (unlikely(!is_virtual_pid(pid) || ve_is_super(get_exec_env()))) ++ return -1; ++ ++ if (!get_exec_env()->sparse_vpid) { ++ if (pid != 1) ++ return pid - VPID_DIV; ++ return get_exec_env()->init_entry->pid; ++ } ++ ++ map = __lookup_vpid_mapping(pid, VEID(get_exec_env())); ++ if (map) ++ return map->pid; ++ return -1; ++} ++EXPORT_SYMBOL_GPL(__vpid_to_pid); ++ ++int vpid_to_pid(int pid) ++{ ++ /* User gave bad pid. It is his problem. */ ++ if (unlikely(pid <= 0)) ++ return pid; ++ ++ if (!is_virtual_pid(pid)) ++ return pid; ++ ++ read_lock(&tasklist_lock); ++ pid = __vpid_to_pid(pid); ++ read_unlock(&tasklist_lock); ++ return pid; ++} ++EXPORT_SYMBOL_GPL(vpid_to_pid); ++ ++/* VEs which never migrated have trivial "arithmetic" mapping pid <-> vpid: ++ * ++ * vpid == 1 -> ve->init_task->pid ++ * else pid & ~VPID_DIV ++ * ++ * In this case VE has ve->sparse_vpid = 0 and we do not use vpid hash table. 
++ * ++ * When VE migrates and we see non-trivial mapping the first time, we ++ * scan process table and populate mapping hash table. ++ */ ++ ++static int add_mapping(int pid, int vpid, int veid, struct hlist_head *cache) ++{ ++ if (unlikely(pid <= 0 || vpid <= 0)) ++ return 0; ++ ++ /* VE can contain non-virtual (VE_ENTER'ed) processes when ++ * switching to sparse mapping. We should not create mappings ++ * for them. */ ++ if (unlikely(!__is_virtual_pid(vpid) && vpid != 1)) { ++ printk("DEBUG (do not worry, but report): non-virtual pid while switching mode %d %d\n", pid, vpid); ++ return 0; ++ } ++ ++ if (!__lookup_vpid_mapping(vpid, veid)) { ++ struct vpid_mapping *m; ++ if (hlist_empty(cache)) { ++ m = kmem_cache_alloc(vpid_mapping_cachep, GFP_ATOMIC); ++ if (unlikely(m == NULL)) ++ return -ENOMEM; ++ } else { ++ m = hlist_entry(cache->first, struct vpid_mapping, link); ++ hlist_del_rcu(&m->link); ++ } ++ m->pid = pid; ++ m->vpid = vpid; ++ m->veid = veid; ++ hlist_add_head_rcu(&m->link, ++ &vpid_hash[vpid_hashfn(vpid, veid)]); ++ } ++ return 0; ++} ++ ++static int switch_to_sparse_mapping(int pid) ++{ ++ struct ve_struct *env = get_exec_env(); ++ struct hlist_head cache; ++ task_t *g, *t; ++ int pcount; ++ int err; ++ ++ /* Transition happens under write_lock_irq, so we try to make ++ * it more reliable and fast preallocating mapping entries. ++ * pcounter may be not enough, we could have lots of orphaned ++ * process groups and sessions, which also require mappings. 
++ */ ++ INIT_HLIST_HEAD(&cache); ++ pcount = atomic_read(&env->pcounter); ++ err = -ENOMEM; ++ while (pcount > 0) { ++ struct vpid_mapping *m; ++ m = kmem_cache_alloc(vpid_mapping_cachep, GFP_KERNEL); ++ if (!m) ++ goto out; ++ hlist_add_head(&m->link, &cache); ++ pcount--; ++ } ++ ++ write_lock_irq(&tasklist_lock); ++ err = 0; ++ if (env->sparse_vpid) ++ goto out_unlock; ++ ++ err = -ENOMEM; ++ do_each_thread_ve(g, t) { ++ if (t->pid == pid) ++ continue; ++ if (add_mapping(t->pid, virt_pid(t), VEID(env), &cache)) ++ goto out_unlock; ++ } while_each_thread_ve(g, t); ++ ++ for_each_process_ve(t) { ++ if (t->pid == pid) ++ continue; ++ ++ if (add_mapping(t->tgid, virt_tgid(t), VEID(env), &cache)) ++ goto out_unlock; ++ if (add_mapping(t->signal->pgrp, virt_pgid(t), VEID(env), &cache)) ++ goto out_unlock; ++ if (add_mapping(t->signal->session, virt_sid(t), VEID(env), &cache)) ++ goto out_unlock; ++ } ++ env->sparse_vpid = 1; ++ err = 0; ++ ++out_unlock: ++ if (err) { ++ int i; ++ ++ for (i=0; i<(1<<pidhash_shift); i++) { ++ struct hlist_node *elem, *next; ++ struct vpid_mapping *map; ++ ++ hlist_for_each_entry_safe(map, elem, next, &vpid_hash[i], link) { ++ if (map->veid == VEID(env)) { ++ hlist_del(elem); ++ hlist_add_head(elem, &cache); ++ } ++ } ++ } ++ } ++ write_unlock_irq(&tasklist_lock); ++ ++out: ++ while (!hlist_empty(&cache)) { ++ struct vpid_mapping *m; ++ m = hlist_entry(cache.first, struct vpid_mapping, link); ++ hlist_del_rcu(&m->link); ++ kmem_cache_free(vpid_mapping_cachep, m); ++ } ++ return err; ++} ++ ++int alloc_vpid(int pid, int virt_pid) ++{ ++ int result; ++ struct vpid_mapping *m; ++ struct ve_struct *env = get_exec_env(); ++ ++ if (ve_is_super(env) || !env->virt_pids) ++ return pid; ++ ++ if (!env->sparse_vpid) { ++ if (virt_pid == -1) ++ return pid + VPID_DIV; ++ ++ if (virt_pid == 1 || virt_pid == pid + VPID_DIV) ++ return virt_pid; ++ ++ if ((result = switch_to_sparse_mapping(pid)) < 0) ++ return result; ++ } ++ ++ m = 
kmem_cache_alloc(vpid_mapping_cachep, GFP_KERNEL); ++ if (!m) ++ return -ENOMEM; ++ ++ m->pid = pid; ++ m->veid = VEID(env); ++ ++ result = (virt_pid == -1) ? pid + VPID_DIV : virt_pid; ++ ++ write_lock_irq(&tasklist_lock); ++ if (unlikely(__lookup_vpid_mapping(result, m->veid))) { ++ if (virt_pid > 0) { ++ result = -EEXIST; ++ goto out; ++ } ++ ++ /* No luck. Now we search for some not-existing vpid. ++ * It is weak place. We do linear search. */ ++ do { ++ result++; ++ if (!__is_virtual_pid(result)) ++ result += VPID_DIV; ++ if (result >= pid_max) ++ result = RESERVED_PIDS + VPID_DIV; ++ } while (__lookup_vpid_mapping(result, m->veid) != NULL); ++ ++ /* And set last_pid in hope future alloc_pidmap to avoid ++ * collisions after future alloc_pidmap() */ ++ last_pid = result - VPID_DIV; ++ } ++ if (result > 0) { ++ m->vpid = result; ++ hlist_add_head_rcu(&m->link, ++ &vpid_hash[vpid_hashfn(result, m->veid)]); ++ } ++out: ++ write_unlock_irq(&tasklist_lock); ++ if (result < 0) ++ kmem_cache_free(vpid_mapping_cachep, m); ++ return result; ++} ++EXPORT_SYMBOL(alloc_vpid); ++ ++static void vpid_free_cb(struct rcu_head *rhp) ++{ ++ struct vpid_mapping *m; ++ ++ m = container_of(rhp, struct vpid_mapping, rcu); ++ kmem_cache_free(vpid_mapping_cachep, m); ++} ++ ++static void __free_vpid(int vpid, struct ve_struct *ve) ++{ ++ struct vpid_mapping *m; ++ ++ if (!ve->sparse_vpid) ++ return; ++ ++ if (!__is_virtual_pid(vpid) && (vpid != 1 || ve_is_super(ve))) ++ return; ++ ++ m = __lookup_vpid_mapping(vpid, ve->veid); ++ BUG_ON(m == NULL); ++ hlist_del_rcu(&m->link); ++ call_rcu(&m->rcu, vpid_free_cb); ++} ++ ++void free_vpid(int vpid, struct ve_struct *ve) ++{ ++ write_lock_irq(&tasklist_lock); ++ __free_vpid(vpid, ve); ++ write_unlock_irq(&tasklist_lock); ++} ++EXPORT_SYMBOL(free_vpid); ++#endif ++ + /* + * The pid hash table is scaled according to the amount of memory in the + * machine. 
From a minimum of 16 slots up to 4096 slots at one gigabyte or +@@ -273,6 +696,14 @@ void __init pidhash_init(void) + for (j = 0; j < pidhash_size; j++) + INIT_HLIST_HEAD(&pid_hash[i][j]); + } ++ ++#ifdef CONFIG_VE ++ vpid_hash = alloc_bootmem(pidhash_size * sizeof(struct hlist_head)); ++ if (!vpid_hash) ++ panic("Could not alloc vpid_hash!\n"); ++ for (j = 0; j < pidhash_size; j++) ++ INIT_HLIST_HEAD(&vpid_hash[j]); ++#endif + } + + void __init pidmap_init(void) +@@ -289,4 +720,12 @@ void __init pidmap_init(void) + + for (i = 0; i < PIDTYPE_MAX; i++) + attach_pid(current, i, 0); ++ ++#ifdef CONFIG_VE ++ vpid_mapping_cachep = ++ kmem_cache_create("vpid_mapping", ++ sizeof(struct vpid_mapping), ++ __alignof__(struct vpid_mapping), ++ SLAB_PANIC|SLAB_UBC, NULL, NULL); ++#endif + } +diff -upr linux-2.6.16.orig/kernel/posix-cpu-timers.c linux-2.6.16-026test015/kernel/posix-cpu-timers.c +--- linux-2.6.16.orig/kernel/posix-cpu-timers.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/kernel/posix-cpu-timers.c 2006-07-04 14:41:38.000000000 +0400 +@@ -20,7 +20,7 @@ static int check_clock(const clockid_t w + return 0; + + read_lock(&tasklist_lock); +- p = find_task_by_pid(pid); ++ p = find_task_by_pid_ve(pid); + if (!p || (CPUCLOCK_PERTHREAD(which_clock) ? 
+ p->tgid != current->tgid : p->tgid != pid)) { + error = -EINVAL; +@@ -292,7 +292,7 @@ int posix_cpu_clock_get(const clockid_t + */ + struct task_struct *p; + read_lock(&tasklist_lock); +- p = find_task_by_pid(pid); ++ p = find_task_by_pid_ve(pid); + if (p) { + if (CPUCLOCK_PERTHREAD(which_clock)) { + if (p->tgid == current->tgid) { +@@ -336,7 +336,7 @@ int posix_cpu_timer_create(struct k_itim + if (pid == 0) { + p = current; + } else { +- p = find_task_by_pid(pid); ++ p = find_task_by_pid_ve(pid); + if (p && p->tgid != current->tgid) + p = NULL; + } +@@ -344,7 +344,7 @@ int posix_cpu_timer_create(struct k_itim + if (pid == 0) { + p = current->group_leader; + } else { +- p = find_task_by_pid(pid); ++ p = find_task_by_pid_ve(pid); + if (p && p->tgid != pid) + p = NULL; + } +@@ -1173,6 +1173,9 @@ static void check_process_timers(struct + } + t = tsk; + do { ++ if (unlikely(t->flags & PF_EXITING)) ++ continue; ++ + ticks = cputime_add(cputime_add(t->utime, t->stime), + prof_left); + if (!cputime_eq(prof_expires, cputime_zero) && +@@ -1193,11 +1196,7 @@ static void check_process_timers(struct + t->it_sched_expires > sched)) { + t->it_sched_expires = sched; + } +- +- do { +- t = next_thread(t); +- } while (unlikely(t->flags & PF_EXITING)); +- } while (t != tsk); ++ } while ((t = next_thread(t)) != tsk); + } + } + +@@ -1289,30 +1288,30 @@ void run_posix_cpu_timers(struct task_st + + #undef UNEXPIRED + +- BUG_ON(tsk->exit_state); +- + /* + * Double-check with locks held. + */ + read_lock(&tasklist_lock); +- spin_lock(&tsk->sighand->siglock); ++ if (likely(tsk->signal != NULL)) { ++ spin_lock(&tsk->sighand->siglock); + +- /* +- * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N] +- * all the timers that are firing, and put them on the firing list. 
+- */ +- check_thread_timers(tsk, &firing); +- check_process_timers(tsk, &firing); ++ /* ++ * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N] ++ * all the timers that are firing, and put them on the firing list. ++ */ ++ check_thread_timers(tsk, &firing); ++ check_process_timers(tsk, &firing); + +- /* +- * We must release these locks before taking any timer's lock. +- * There is a potential race with timer deletion here, as the +- * siglock now protects our private firing list. We have set +- * the firing flag in each timer, so that a deletion attempt +- * that gets the timer lock before we do will give it up and +- * spin until we've taken care of that timer below. +- */ +- spin_unlock(&tsk->sighand->siglock); ++ /* ++ * We must release these locks before taking any timer's lock. ++ * There is a potential race with timer deletion here, as the ++ * siglock now protects our private firing list. We have set ++ * the firing flag in each timer, so that a deletion attempt ++ * that gets the timer lock before we do will give it up and ++ * spin until we've taken care of that timer below. ++ */ ++ spin_unlock(&tsk->sighand->siglock); ++ } + read_unlock(&tasklist_lock); + + /* +diff -upr linux-2.6.16.orig/kernel/posix-timers.c linux-2.6.16-026test015/kernel/posix-timers.c +--- linux-2.6.16.orig/kernel/posix-timers.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/kernel/posix-timers.c 2006-07-04 14:41:38.000000000 +0400 +@@ -31,6 +31,7 @@ + * POSIX clocks & timers + */ + #include <linux/mm.h> ++#include <linux/module.h> + #include <linux/smp_lock.h> + #include <linux/interrupt.h> + #include <linux/slab.h> +@@ -48,6 +49,8 @@ + #include <linux/workqueue.h> + #include <linux/module.h> + ++#include <ub/beancounter.h> ++ + /* + * Management arrays for POSIX timers. 
Timers are kept in slab memory + * Timer ids are allocated by an external routine that keeps track of the +@@ -241,7 +244,8 @@ static __init int init_posix_timers(void + register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); + + posix_timers_cache = kmem_cache_create("posix_timers_cache", +- sizeof (struct k_itimer), 0, 0, NULL, NULL); ++ sizeof (struct k_itimer), 0, ++ SLAB_UBC, NULL, NULL); + idr_init(&posix_timers_id); + return 0; + } +@@ -294,6 +298,13 @@ void do_schedule_next_timer(struct sigin + + int posix_timer_event(struct k_itimer *timr,int si_private) + { ++ int ret; ++ struct ve_struct *ve; ++ struct user_beancounter *ub; ++ ++ ve = set_exec_env(timr->it_process->ve_task_info.owner_env); ++ ub = set_exec_ub(timr->it_process->task_bc.task_ub); ++ + memset(&timr->sigq->info, 0, sizeof(siginfo_t)); + timr->sigq->info.si_sys_private = si_private; + /* Send signal to the process that owns this timer.*/ +@@ -306,11 +317,11 @@ int posix_timer_event(struct k_itimer *t + + if (timr->it_sigev_notify & SIGEV_THREAD_ID) { + struct task_struct *leader; +- int ret = send_sigqueue(timr->it_sigev_signo, timr->sigq, ++ ret = send_sigqueue(timr->it_sigev_signo, timr->sigq, + timr->it_process); + + if (likely(ret >= 0)) +- return ret; ++ goto out; + + timr->it_sigev_notify = SIGEV_SIGNAL; + leader = timr->it_process->group_leader; +@@ -318,8 +329,12 @@ int posix_timer_event(struct k_itimer *t + timr->it_process = leader; + } + +- return send_group_sigqueue(timr->it_sigev_signo, timr->sigq, ++ ret = send_group_sigqueue(timr->it_sigev_signo, timr->sigq, + timr->it_process); ++out: ++ (void)set_exec_ub(ub); ++ (void)set_exec_env(ve); ++ return ret; + } + EXPORT_SYMBOL_GPL(posix_timer_event); + +@@ -366,7 +381,7 @@ static struct task_struct * good_sigeven + struct task_struct *rtn = current->group_leader; + + if ((event->sigev_notify & SIGEV_THREAD_ID ) && +- (!(rtn = find_task_by_pid(event->sigev_notify_thread_id)) || ++ (!(rtn = 
find_task_by_pid_ve(event->sigev_notify_thread_id)) || + rtn->tgid != current->tgid || + (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL)) + return NULL; +diff -upr linux-2.6.16.orig/kernel/power/Kconfig linux-2.6.16-026test015/kernel/power/Kconfig +--- linux-2.6.16.orig/kernel/power/Kconfig 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/kernel/power/Kconfig 2006-07-04 14:41:39.000000000 +0400 +@@ -38,7 +38,7 @@ config PM_DEBUG + + config SOFTWARE_SUSPEND + bool "Software Suspend" +- depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP) ++ depends on PM && SWAP && X86 || ((FRV || PPC32) && !SMP) + ---help--- + Enable the possibility of suspending the machine. + It doesn't need APM. +diff -upr linux-2.6.16.orig/kernel/power/process.c linux-2.6.16-026test015/kernel/power/process.c +--- linux-2.6.16.orig/kernel/power/process.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/kernel/power/process.c 2006-07-04 14:41:39.000000000 +0400 +@@ -38,18 +38,23 @@ void refrigerator(void) + processes around? */ + long save; + save = current->state; ++ current->state = TASK_UNINTERRUPTIBLE; + pr_debug("%s entered refrigerator\n", current->comm); +- printk("="); ++ /* printk("="); */ + +- frozen_process(current); + spin_lock_irq(&current->sighand->siglock); +- recalc_sigpending(); /* We sent fake signal, clean it up */ ++ if (test_and_clear_thread_flag(TIF_FREEZE)) { ++ recalc_sigpending(); /* We sent fake signal, clean it up */ ++ current->flags |= PF_FROZEN; ++ } else { ++ /* Freeze request could be canceled before we entered ++ * refrigerator(). In this case we do nothing. 
*/ ++ current->state = save; ++ } + spin_unlock_irq(&current->sighand->siglock); + +- while (frozen(current)) { +- current->state = TASK_UNINTERRUPTIBLE; ++ while (current->flags & PF_FROZEN) + schedule(); +- } + pr_debug("%s left refrigerator\n", current->comm); + current->state = save; + } +@@ -67,7 +72,7 @@ int freeze_processes(void) + do { + todo = 0; + read_lock(&tasklist_lock); +- do_each_thread(g, p) { ++ do_each_thread_all(g, p) { + if (!freezeable(p)) + continue; + if (frozen(p)) +@@ -78,7 +83,7 @@ int freeze_processes(void) + signal_wake_up(p, 0); + spin_unlock_irqrestore(&p->sighand->siglock, flags); + todo++; +- } while_each_thread(g, p); ++ } while_each_thread_all(g, p); + read_unlock(&tasklist_lock); + yield(); /* Yield is okay here */ + if (todo && time_after(jiffies, start_time + TIMEOUT)) { +@@ -95,15 +100,15 @@ int freeze_processes(void) + */ + if (todo) { + read_lock(&tasklist_lock); +- do_each_thread(g, p) ++ do_each_thread_all(g, p) + if (freezing(p)) { + pr_debug(" clean up: %s\n", p->comm); +- p->flags &= ~PF_FREEZE; + spin_lock_irqsave(&p->sighand->siglock, flags); ++ clear_tsk_thread_flag(p, TIF_FREEZE); + recalc_sigpending_tsk(p); + spin_unlock_irqrestore(&p->sighand->siglock, flags); + } +- while_each_thread(g, p); ++ while_each_thread_all(g, p); + read_unlock(&tasklist_lock); + return todo; + } +@@ -119,12 +124,12 @@ void thaw_processes(void) + + printk( "Restarting tasks..." 
); + read_lock(&tasklist_lock); +- do_each_thread(g, p) { ++ do_each_thread_all(g, p) { + if (!freezeable(p)) + continue; + if (!thaw_process(p)) + printk(KERN_INFO " Strange, %s not stopped\n", p->comm ); +- } while_each_thread(g, p); ++ } while_each_thread_all(g, p); + + read_unlock(&tasklist_lock); + schedule(); +diff -upr linux-2.6.16.orig/kernel/printk.c linux-2.6.16-026test015/kernel/printk.c +--- linux-2.6.16.orig/kernel/printk.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/kernel/printk.c 2006-07-04 14:41:38.000000000 +0400 +@@ -30,7 +30,9 @@ + #include <linux/smp.h> + #include <linux/security.h> + #include <linux/bootmem.h> ++#include <linux/vzratelimit.h> + #include <linux/syscalls.h> ++#include <linux/veprintk.h> + + #include <asm/uaccess.h> + +@@ -83,7 +85,7 @@ static int console_locked; + * It is also used in interesting ways to provide interlocking in + * release_console_sem(). + */ +-static DEFINE_SPINLOCK(logbuf_lock); ++DEFINE_SPINLOCK(logbuf_lock); + + #define LOG_BUF_MASK (log_buf_len-1) + #define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK]) +@@ -114,6 +116,7 @@ static int preferred_console = -1; + + /* Flag: console code may call schedule() */ + static int console_may_schedule; ++int console_silence_loglevel; + + #ifdef CONFIG_PRINTK + +@@ -160,6 +163,19 @@ static int __init console_setup(char *st + + __setup("console=", console_setup); + ++static int __init setup_console_silencelevel(char *str) ++{ ++ int level; ++ ++ if (get_option(&str, &level) != 1) ++ return 0; ++ ++ console_silence_loglevel = level; ++ return 1; ++} ++ ++__setup("silencelevel=", setup_console_silencelevel); ++ + static int __init log_buf_len_setup(char *str) + { + unsigned long size = memparse(str, &str); +@@ -223,6 +239,10 @@ int do_syslog(int type, char __user *buf + char c; + int error = 0; + ++ if (!ve_is_super(get_exec_env()) && ++ (type == 6 || type == 7 || type == 8)) ++ goto out; ++ + error = security_syslog(type); + if (error) + return error; 
+@@ -243,15 +263,15 @@ int do_syslog(int type, char __user *buf + error = -EFAULT; + goto out; + } +- error = wait_event_interruptible(log_wait, +- (log_start - log_end)); ++ error = wait_event_interruptible(ve_log_wait, ++ (ve_log_start - ve_log_end)); + if (error) + goto out; + i = 0; + spin_lock_irq(&logbuf_lock); +- while (!error && (log_start != log_end) && i < len) { +- c = LOG_BUF(log_start); +- log_start++; ++ while (!error && (ve_log_start != ve_log_end) && i < len) { ++ c = VE_LOG_BUF(ve_log_start); ++ ve_log_start++; + spin_unlock_irq(&logbuf_lock); + error = __put_user(c,buf); + buf++; +@@ -277,15 +297,17 @@ int do_syslog(int type, char __user *buf + error = -EFAULT; + goto out; + } ++ if (ve_log_buf == NULL) ++ goto out; + count = len; +- if (count > log_buf_len) +- count = log_buf_len; ++ if (count > ve_log_buf_len) ++ count = ve_log_buf_len; + spin_lock_irq(&logbuf_lock); +- if (count > logged_chars) +- count = logged_chars; ++ if (count > ve_logged_chars) ++ count = ve_logged_chars; + if (do_clear) +- logged_chars = 0; +- limit = log_end; ++ ve_logged_chars = 0; ++ limit = ve_log_end; + /* + * __put_user() could sleep, and while we sleep + * printk() could overwrite the messages +@@ -294,9 +316,9 @@ int do_syslog(int type, char __user *buf + */ + for (i = 0; i < count && !error; i++) { + j = limit-1-i; +- if (j + log_buf_len < log_end) ++ if (j + ve_log_buf_len < ve_log_end) + break; +- c = LOG_BUF(j); ++ c = VE_LOG_BUF(j); + spin_unlock_irq(&logbuf_lock); + error = __put_user(c,&buf[count-1-i]); + cond_resched(); +@@ -320,7 +342,7 @@ int do_syslog(int type, char __user *buf + } + break; + case 5: /* Clear ring buffer */ +- logged_chars = 0; ++ ve_logged_chars = 0; + break; + case 6: /* Disable logging to console */ + console_loglevel = minimum_console_loglevel; +@@ -338,10 +360,10 @@ int do_syslog(int type, char __user *buf + error = 0; + break; + case 9: /* Number of chars in the log buffer */ +- error = log_end - log_start; ++ error = ve_log_end 
- ve_log_start; + break; + case 10: /* Size of the log buffer */ +- error = log_buf_len; ++ error = ve_log_buf_len; + break; + default: + error = -EINVAL; +@@ -439,14 +461,14 @@ static void call_console_drivers(unsigne + + static void emit_log_char(char c) + { +- LOG_BUF(log_end) = c; +- log_end++; +- if (log_end - log_start > log_buf_len) +- log_start = log_end - log_buf_len; +- if (log_end - con_start > log_buf_len) +- con_start = log_end - log_buf_len; +- if (logged_chars < log_buf_len) +- logged_chars++; ++ VE_LOG_BUF(ve_log_end) = c; ++ ve_log_end++; ++ if (ve_log_end - ve_log_start > ve_log_buf_len) ++ ve_log_start = ve_log_end - ve_log_buf_len; ++ if (ve_is_super(get_exec_env()) && ve_log_end - con_start > ve_log_buf_len) ++ con_start = ve_log_end - ve_log_buf_len; ++ if (ve_logged_chars < ve_log_buf_len) ++ ve_logged_chars++; + } + + /* +@@ -511,6 +533,30 @@ __attribute__((weak)) unsigned long long + * printf(3) + */ + ++static inline int ve_log_init(void) ++{ ++#ifdef CONFIG_VE ++ if (ve_log_buf != NULL) ++ return 0; ++ ++ if (ve_is_super(get_exec_env())) { ++ ve0._log_wait = &log_wait; ++ ve0._log_start = &log_start; ++ ve0._log_end = &log_end; ++ ve0._logged_chars = &logged_chars; ++ ve0.log_buf = log_buf; ++ return 0; ++ } ++ ++ ve_log_buf = kmalloc(ve_log_buf_len, GFP_ATOMIC); ++ if (!ve_log_buf) ++ return -ENOMEM; ++ ++ memset(ve_log_buf, 0, ve_log_buf_len); ++#endif ++ return 0; ++} ++ + asmlinkage int printk(const char *fmt, ...) + { + va_list args; +@@ -526,13 +572,14 @@ asmlinkage int printk(const char *fmt, . 
+ /* cpu currently holding logbuf_lock */ + static volatile unsigned int printk_cpu = UINT_MAX; + +-asmlinkage int vprintk(const char *fmt, va_list args) ++asmlinkage int __vprintk(const char *fmt, va_list args) + { + unsigned long flags; + int printed_len; + char *p; + static char printk_buf[1024]; + static int log_level_unknown = 1; ++ int err, need_wake; + + preempt_disable(); + if (unlikely(oops_in_progress) && printk_cpu == smp_processor_id()) +@@ -544,6 +591,12 @@ asmlinkage int vprintk(const char *fmt, + spin_lock_irqsave(&logbuf_lock, flags); + printk_cpu = smp_processor_id(); + ++ err = ve_log_init(); ++ if (err) { ++ spin_unlock_irqrestore(&logbuf_lock, flags); ++ return err; ++ } ++ + /* Emit the output into the temporary buffer */ + printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args); + +@@ -615,7 +668,12 @@ asmlinkage int vprintk(const char *fmt, + spin_unlock_irqrestore(&logbuf_lock, flags); + goto out; + } +- if (!down_trylock(&console_sem)) { ++ if (!ve_is_super(get_exec_env())) { ++ need_wake = (ve_log_start != ve_log_end); ++ spin_unlock_irqrestore(&logbuf_lock, flags); ++ if (!oops_in_progress && need_wake) ++ wake_up_interruptible(&ve_log_wait); ++ } else if (!down_trylock(&console_sem)) { + console_locked = 1; + /* + * We own the drivers. We can drop the spinlock and let +@@ -641,6 +699,38 @@ out: + EXPORT_SYMBOL(printk); + EXPORT_SYMBOL(vprintk); + ++asmlinkage int vprintk(const char *fmt, va_list args) ++{ ++ int i; ++ struct ve_struct *env; ++ ++ env = set_exec_env(get_ve0()); ++ i = __vprintk(fmt, args); ++ set_exec_env(env); ++ return i; ++} ++ ++asmlinkage int ve_printk(int dst, const char *fmt, ...) 
++{ ++ va_list args; ++ int printed_len; ++ ++ printed_len = 0; ++ if (ve_is_super(get_exec_env()) || (dst & VE0_LOG)) { ++ va_start(args, fmt); ++ printed_len = vprintk(fmt, args); ++ va_end(args); ++ } ++ if (!ve_is_super(get_exec_env()) && (dst & VE_LOG)) { ++ va_start(args, fmt); ++ printed_len = __vprintk(fmt, args); ++ va_end(args); ++ } ++ return printed_len; ++} ++EXPORT_SYMBOL(ve_printk); ++ ++ + #else + + asmlinkage long sys_syslog(int type, char __user *buf, int len) +@@ -732,6 +822,12 @@ int is_console_locked(void) + } + EXPORT_SYMBOL(is_console_locked); + ++void wake_up_klogd(void) ++{ ++ if (!oops_in_progress && waitqueue_active(&log_wait)) ++ wake_up_interruptible(&log_wait); ++} ++ + /** + * release_console_sem - unlock the console system + * +@@ -768,8 +864,8 @@ void release_console_sem(void) + console_may_schedule = 0; + up(&console_sem); + spin_unlock_irqrestore(&logbuf_lock, flags); +- if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait)) +- wake_up_interruptible(&log_wait); ++ if (wake_klogd) ++ wake_up_klogd(); + } + EXPORT_SYMBOL(release_console_sem); + +@@ -1049,3 +1145,33 @@ int printk_ratelimit(void) + printk_ratelimit_burst); + } + EXPORT_SYMBOL(printk_ratelimit); ++ ++/* ++ * Rate limiting stuff. 
++ */ ++int vz_ratelimit(struct vz_rate_info *p) ++{ ++ unsigned long cjif, djif; ++ unsigned long flags; ++ static spinlock_t ratelimit_lock = SPIN_LOCK_UNLOCKED; ++ long new_bucket; ++ ++ spin_lock_irqsave(&ratelimit_lock, flags); ++ cjif = jiffies; ++ djif = cjif - p->last; ++ if (djif < p->interval) { ++ if (p->bucket >= p->burst) { ++ spin_unlock_irqrestore(&ratelimit_lock, flags); ++ return 0; ++ } ++ p->bucket++; ++ } else { ++ new_bucket = p->bucket - (djif / (unsigned)p->interval); ++ if (new_bucket < 0) ++ new_bucket = 0; ++ p->bucket = new_bucket + 1; ++ } ++ p->last = cjif; ++ spin_unlock_irqrestore(&ratelimit_lock, flags); ++ return 1; ++} +diff -upr linux-2.6.16.orig/kernel/ptrace.c linux-2.6.16-026test015/kernel/ptrace.c +--- linux-2.6.16.orig/kernel/ptrace.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/kernel/ptrace.c 2006-07-04 14:41:39.000000000 +0400 +@@ -57,10 +57,6 @@ void ptrace_untrace(task_t *child) + signal_wake_up(child, 1); + } + } +- if (child->signal->flags & SIGNAL_GROUP_EXIT) { +- sigaddset(&child->pending.signal, SIGKILL); +- signal_wake_up(child, 1); +- } + spin_unlock(&child->sighand->siglock); + } + +@@ -82,7 +78,8 @@ void __ptrace_unlink(task_t *child) + SET_LINKS(child); + } + +- ptrace_untrace(child); ++ if (child->state == TASK_TRACED) ++ ptrace_untrace(child); + } + + /* +@@ -136,7 +133,10 @@ static int may_attach(struct task_struct + smp_rmb(); + if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE)) + return -EPERM; +- ++ if (!task->mm->vps_dumpable && !ve_is_super(get_exec_env())) ++ return -EPERM; ++ if (!ve_accessible(VE_TASK_INFO(task)->owner_env, get_exec_env())) ++ return -EPERM; + return security_ptrace(current, task); + } + +@@ -152,12 +152,34 @@ int ptrace_may_attach(struct task_struct + int ptrace_attach(struct task_struct *task) + { + int retval; +- task_lock(task); ++ + retval = -EPERM; + if (task->pid <= 1) +- goto bad; ++ goto out; + if (task->tgid == current->tgid) +- goto bad; ++ goto out; 
++ ++repeat: ++ /* ++ * Nasty, nasty. ++ * ++ * We want to hold both the task-lock and the ++ * tasklist_lock for writing at the same time. ++ * But that's against the rules (tasklist_lock ++ * is taken for reading by interrupts on other ++ * cpu's that may have task_lock). ++ */ ++ task_lock(task); ++ local_irq_disable(); ++ if (!write_trylock(&tasklist_lock)) { ++ local_irq_enable(); ++ task_unlock(task); ++ do { ++ cpu_relax(); ++ } while (!write_can_lock(&tasklist_lock)); ++ goto repeat; ++ } ++ + /* the same process cannot be attached many times */ + if (task->ptrace & PT_PTRACED) + goto bad; +@@ -170,17 +192,15 @@ int ptrace_attach(struct task_struct *ta + ? PT_ATTACHED : 0); + if (capable(CAP_SYS_PTRACE)) + task->ptrace |= PT_PTRACE_CAP; +- task_unlock(task); + +- write_lock_irq(&tasklist_lock); + __ptrace_link(task, current); +- write_unlock_irq(&tasklist_lock); + + force_sig_specific(SIGSTOP, task); +- return 0; + + bad: ++ write_unlock_irq(&tasklist_lock); + task_unlock(task); ++out: + return retval; + } + +@@ -263,6 +283,7 @@ int access_process_vm(struct task_struct + + return buf - old_buf; + } ++EXPORT_SYMBOL_GPL(access_process_vm); + + int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) + { +@@ -421,21 +442,22 @@ int ptrace_request(struct task_struct *c + */ + int ptrace_traceme(void) + { +- int ret; ++ int ret = -EPERM; + + /* + * Are we already being traced? + */ +- if (current->ptrace & PT_PTRACED) +- return -EPERM; +- ret = security_ptrace(current->parent, current); +- if (ret) +- return -EPERM; +- /* +- * Set the ptrace bit in the process ptrace flags. +- */ +- current->ptrace |= PT_PTRACED; +- return 0; ++ task_lock(current); ++ if (!(current->ptrace & PT_PTRACED)) { ++ ret = security_ptrace(current->parent, current); ++ /* ++ * Set the ptrace bit in the process ptrace flags. 
++ */ ++ if (!ret) ++ current->ptrace |= PT_PTRACED; ++ } ++ task_unlock(current); ++ return ret; + } + + /** +@@ -459,7 +481,7 @@ struct task_struct *ptrace_get_task_stru + return ERR_PTR(-EPERM); + + read_lock(&tasklist_lock); +- child = find_task_by_pid(pid); ++ child = find_task_by_pid_ve(pid); + if (child) + get_task_struct(child); + read_unlock(&tasklist_lock); +diff -upr linux-2.6.16.orig/kernel/sched.c linux-2.6.16-026test015/kernel/sched.c +--- linux-2.6.16.orig/kernel/sched.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/kernel/sched.c 2006-07-04 14:41:39.000000000 +0400 +@@ -49,6 +49,8 @@ + #include <linux/syscalls.h> + #include <linux/times.h> + #include <linux/acct.h> ++#include <linux/vsched.h> ++#include <linux/fairsched.h> + #include <asm/tlb.h> + + #include <asm/unistd.h> +@@ -134,7 +136,7 @@ + #ifdef CONFIG_SMP + #define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ + (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ +- num_online_cpus()) ++ vsched_num_online_vcpus(task_vsched(p))) + #else + #define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ + (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) +@@ -199,6 +201,7 @@ struct prio_array { + * (such as the load balancing or the thread migration code), lock + * acquire operations must be ordered by ascending &runqueue. 
+ */ ++typedef struct vcpu_info *vcpu_t; + struct runqueue { + spinlock_t lock; + +@@ -220,9 +223,12 @@ struct runqueue { + */ + unsigned long nr_uninterruptible; + ++ unsigned long nr_sleeping; ++ unsigned long nr_stopped; ++ + unsigned long expired_timestamp; + unsigned long long timestamp_last_tick; +- task_t *curr, *idle; ++ task_t *curr; + struct mm_struct *prev_mm; + prio_array_t *active, *expired, arrays[2]; + int best_expired_prio; +@@ -233,11 +239,12 @@ struct runqueue { + + /* For active balancing */ + int active_balance; +- int push_cpu; ++#endif ++ vcpu_t push_cpu; + + task_t *migration_thread; + struct list_head migration_queue; +-#endif ++ int cpu; + + #ifdef CONFIG_SCHEDSTATS + /* latency stats */ +@@ -260,7 +267,51 @@ struct runqueue { + #endif + }; + +-static DEFINE_PER_CPU(struct runqueue, runqueues); ++/* VCPU scheduler state description */ ++struct vcpu_info; ++struct vcpu_scheduler { ++ struct list_head idle_list; ++ struct list_head active_list; ++ struct list_head running_list; ++#ifdef CONFIG_FAIRSCHED ++ struct fairsched_node *node; ++#endif ++ struct vcpu_info *vcpu[NR_CPUS]; ++ int id; ++ cpumask_t vcpu_online_map, vcpu_running_map; ++ cpumask_t pcpu_running_map; ++ int num_online_vcpus; ++} ____cacheline_internodealigned_in_smp; ++ ++/* virtual CPU description */ ++struct vcpu_info { ++ struct runqueue rq; ++#ifdef CONFIG_SCHED_VCPU ++ unsigned active : 1, ++ running : 1; ++ struct list_head list; ++ struct vcpu_scheduler *vsched; ++ int last_pcpu; ++ u32 start_time; ++#endif ++ int id; ++} ____cacheline_internodealigned_in_smp; ++ ++/* physical CPU description */ ++struct pcpu_info { ++ struct vcpu_scheduler *vsched; ++ struct vcpu_info *vcpu; ++ task_t *idle; ++#ifdef CONFIG_SMP ++ struct sched_domain *sd; ++#endif ++ int id; ++} ____cacheline_internodealigned_in_smp; ++ ++struct pcpu_info pcpu_info[NR_CPUS]; ++ ++#define pcpu(nr) (&pcpu_info[nr]) ++#define this_pcpu() (pcpu(smp_processor_id())) + + /* + * The domain tree (rq->sd) is 
protected by RCU's quiescent state transition. +@@ -269,13 +320,399 @@ static DEFINE_PER_CPU(struct runqueue, r + * The domain tree of any CPU may only be accessed from within + * preempt-disabled sections. + */ ++#define for_each_pdomain(sd, domain) \ ++for (domain = rcu_dereference(sd); domain; domain = domain->parent) ++ + #define for_each_domain(cpu, domain) \ +-for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent) ++ for_each_pdomain(vcpu_rq(cpu)->sd, domain) ++ ++#ifdef CONFIG_SCHED_VCPU ++ ++u32 vcpu_sched_timeslice = 5; ++u32 vcpu_timeslice = 0; ++EXPORT_SYMBOL(vcpu_sched_timeslice); ++EXPORT_SYMBOL(vcpu_timeslice); ++ ++extern spinlock_t fairsched_lock; ++static struct vcpu_scheduler default_vsched, idle_vsched; ++static struct vcpu_info boot_vcpu, boot_idle_vcpu; ++ ++#define vsched_default_vsched() (&default_vsched) ++#define vsched_default_vcpu(id) (default_vsched.vcpu[id]) ++ ++/* ++ * All macroses below could be used without locks, if there is no ++ * strict ordering requirements, because we assume, that: ++ * ++ * 1. VCPU could not disappear "on the fly" (FIXME) ++ * ++ * 2. p->vsched access is atomic. 
++ */ ++ ++#define task_vsched(tsk) ((tsk)->vsched) ++#define this_vsched() (task_vsched(current)) ++ ++#define vsched_vcpu(vsched, id) ((vsched)->vcpu[id]) ++#define this_vcpu() (task_vcpu(current)) ++#define task_vcpu(p) ((p)->vcpu) ++ ++#define vsched_id(vsched) ((vsched)->id) ++#define vsched_vcpu_online_map(vsched) ((vsched)->vcpu_online_map) ++#define vsched_num_online_vcpus(vsched) ((vsched)->num_online_vcpus) ++#define vsched_pcpu_running_map(vsched) ((vsched)->pcpu_running_map) ++ ++#define vcpu_vsched(vcpu) ((vcpu)->vsched) ++#define vcpu_last_pcpu(vcpu) ((vcpu)->last_pcpu) ++#define vcpu_isset(vcpu, mask) (cpu_isset((vcpu)->id, mask)) ++#define vcpu_is_offline(vcpu) (!vcpu_isset(vcpu, \ ++ vcpu_vsched(vcpu)->vcpu_online_map)) ++ ++static int __add_vcpu(struct vcpu_scheduler *vsched, int id); ++ ++#else /* CONFIG_SCHED_VCPU */ ++ ++static DEFINE_PER_CPU(struct vcpu_info, vcpu_info); ++ ++#define task_vsched(p) NULL ++#define this_vcpu() (task_vcpu(current)) ++#define task_vcpu(p) (vcpu(task_cpu(p))) ++ ++#define vsched_vcpu(sched, id) (vcpu(id)) ++#define vsched_id(vsched) 0 ++#define vsched_default_vsched() NULL ++#define vsched_default_vcpu(id) (vcpu(id)) ++ ++#define vsched_vcpu_online_map(vsched) (cpu_online_map) ++#define vsched_num_online_vcpus(vsched) (num_online_cpus()) ++#define vsched_pcpu_running_map(vsched) (cpu_online_map) ++ ++#define vcpu(id) (&per_cpu(vcpu_info, id)) ++ ++#define vcpu_vsched(vcpu) NULL ++#define vcpu_last_pcpu(vcpu) ((vcpu)->id) ++#define vcpu_isset(vcpu, mask) (cpu_isset((vcpu)->id, mask)) ++#define vcpu_is_offline(vcpu) (cpu_is_offline((vcpu)->id)) ++ ++#endif /* CONFIG_SCHED_VCPU */ ++ ++#define this_rq() (vcpu_rq(this_vcpu())) ++#define task_rq(p) (vcpu_rq(task_vcpu(p))) ++#define vcpu_rq(vcpu) (&(vcpu)->rq) ++#define get_vcpu() ({ preempt_disable(); this_vcpu(); }) ++#define put_vcpu() ({ put_cpu(); }) ++#define rq_vcpu(__rq) (container_of((__rq), struct vcpu_info, rq)) ++ ++/** ++ * idle_task - return the idle task 
for a given cpu. ++ * @cpu: the processor in question. ++ */ ++task_t *idle_task(int cpu) ++{ ++ return pcpu(cpu)->idle; ++} ++ ++#ifdef CONFIG_SMP ++static inline void update_rq_cpu_load(runqueue_t *this_rq) ++{ ++ unsigned long old_load, this_load; ++ int i; ++ ++ if (unlikely(this_rq->nr_running == 0)) { ++ for (i = 0; i < 3; i++) ++ this_rq->cpu_load[i] = 0; ++ return; ++ } ++ ++ this_load = this_rq->nr_running * SCHED_LOAD_SCALE; ++ for (i = 0; i < 3; i++) { ++ unsigned long new_load = this_load; ++ int scale = 1 << i; ++ old_load = this_rq->cpu_load[i]; ++ /* ++ * Round up the averaging division if load is increasing. This ++ * prevents us from getting stuck on 9 if the load is 10, for ++ * example. ++ */ ++ if (new_load > old_load) ++ new_load += scale-1; ++ this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale; ++ } ++} ++#else /* CONFIG_SMP */ ++static inline void update_rq_cpu_load(runqueue_t *this_rq) ++{ ++} ++#endif /* CONFIG_SMP */ ++ ++#ifdef CONFIG_SCHED_VCPU ++ ++void fastcall vsched_cpu_online_map(struct vcpu_scheduler *vsched, ++ cpumask_t *mask) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&fairsched_lock, flags); ++ *mask = vsched->vcpu_online_map; ++ spin_unlock_irqrestore(&fairsched_lock, flags); ++} ++ ++static inline void set_task_vsched(task_t *p, struct vcpu_scheduler *vsched) ++{ ++ /* NOTE: set_task_cpu() is required after every set_task_vsched()! 
*/ ++ p->vsched = vsched; ++ p->vsched_id = vsched_id(vsched); ++} ++ ++inline void set_task_cpu(struct task_struct *p, unsigned int vcpu_id) ++{ ++ p->vcpu = vsched_vcpu(task_vsched(p), vcpu_id); ++ p->vcpu_id = vcpu_id; ++} ++ ++static inline void set_task_vcpu(struct task_struct *p, vcpu_t vcpu) ++{ ++ p->vcpu = vcpu; ++ p->vcpu_id = vcpu->id; ++} ++ ++/* this is called when rq->nr_running changes from 0 to 1 */ ++static void vcpu_attach(runqueue_t *rq) ++{ ++ struct vcpu_scheduler *vsched; ++ vcpu_t vcpu; ++ ++ vcpu = rq_vcpu(rq); ++ vsched = vcpu_vsched(vcpu); ++ ++ BUG_ON(vcpu->active); ++ spin_lock(&fairsched_lock); ++ vcpu->active = 1; ++ if (!vcpu->running) ++ list_move_tail(&vcpu->list, &vsched->active_list); ++ ++ fairsched_incrun(vsched->node); ++ spin_unlock(&fairsched_lock); ++} ++ ++/* this is called when rq->nr_running changes from 1 to 0 */ ++static void vcpu_detach(runqueue_t *rq) ++{ ++ struct vcpu_scheduler *vsched; ++ vcpu_t vcpu; ++ ++ vcpu = rq_vcpu(rq); ++ vsched = vcpu_vsched(vcpu); ++ BUG_ON(!vcpu->active); ++ ++ spin_lock(&fairsched_lock); ++ fairsched_decrun(vsched->node); ++ ++ vcpu->active = 0; ++ if (!vcpu->running) ++ list_move_tail(&vcpu->list, &vsched->idle_list); ++ spin_unlock(&fairsched_lock); ++} ++ ++static inline void __vcpu_get(vcpu_t vcpu) ++{ ++ struct pcpu_info *pcpu; ++ struct vcpu_scheduler *vsched; ++ ++ BUG_ON(!this_vcpu()->running); ++ ++ pcpu = this_pcpu(); ++ vsched = vcpu_vsched(vcpu); ++ ++ pcpu->vcpu = vcpu; ++ pcpu->vsched = vsched; ++ ++ fairsched_inccpu(vsched->node); ++ ++ list_move_tail(&vcpu->list, &vsched->running_list); ++ vcpu->start_time = jiffies; ++ vcpu->last_pcpu = pcpu->id; ++ vcpu->running = 1; ++ __set_bit(vcpu->id, vsched->vcpu_running_map.bits); ++ __set_bit(pcpu->id, vsched->pcpu_running_map.bits); ++#ifdef CONFIG_SMP ++ vcpu_rq(vcpu)->sd = pcpu->sd; ++#endif ++} ++ ++static void vcpu_put(vcpu_t vcpu) ++{ ++ struct vcpu_scheduler *vsched; ++ struct pcpu_info *cur_pcpu; ++ runqueue_t *rq; ++ 
++ vsched = vcpu_vsched(vcpu); ++ rq = vcpu_rq(vcpu); ++ cur_pcpu = this_pcpu(); ++ ++ BUG_ON(!vcpu->running); ++ ++ spin_lock(&fairsched_lock); ++ vcpu->running = 0; ++ list_move_tail(&vcpu->list, ++ vcpu->active ? &vsched->active_list : &vsched->idle_list); ++ fairsched_deccpu(vsched->node); ++ __clear_bit(vcpu->id, vsched->vcpu_running_map.bits); ++ if (vsched != this_vsched()) ++ __clear_bit(cur_pcpu->id, vsched->pcpu_running_map.bits); ++ ++ if (!rq->nr_running) ++ rq->expired_timestamp = 0; ++ /* from this point task_running(prev_rq, prev) will be 0 */ ++ rq->curr = cur_pcpu->idle; ++ update_rq_cpu_load(rq); ++ spin_unlock(&fairsched_lock); ++} ++ ++static vcpu_t schedule_vcpu(vcpu_t cur_vcpu, cycles_t cycles) ++{ ++ struct vcpu_scheduler *vsched; ++ vcpu_t vcpu; ++ runqueue_t *rq; ++#ifdef CONFIG_FAIRSCHED ++ struct fairsched_node *node, *nodec; ++ ++ nodec = vcpu_vsched(cur_vcpu)->node; ++ node = nodec; ++#endif ++ ++ BUG_ON(!cur_vcpu->running); ++restart: ++ if (unlikely(system_state == SYSTEM_BOOTING)) ++ goto affine; ++ ++ spin_lock(&fairsched_lock); ++#ifdef CONFIG_FAIRSCHED ++ node = fairsched_schedule(node, nodec, ++ cur_vcpu->active, ++ cycles); ++ if (unlikely(node == NULL)) ++ goto idle; ++ ++ vsched = node->vsched; ++#else ++ vsched = &default_vsched; ++#endif ++ /* FIXME: optimize vcpu switching, maybe we do not need to call ++ fairsched_schedule() at all if vcpu is still active and too ++ little time have passed so far */ ++ if (cur_vcpu->vsched == vsched && cur_vcpu->active && ++ jiffies - cur_vcpu->start_time < msecs_to_jiffies(vcpu_sched_timeslice)) { ++ vcpu = cur_vcpu; ++ goto done; ++ } ++ ++ if (list_empty(&vsched->active_list)) { ++ /* nothing except for this cpu can be scheduled */ ++ if (likely(cur_vcpu->vsched == vsched && cur_vcpu->active)) { ++ /* ++ * Current vcpu is the one we need. We have not ++ * put it yet, so it's not on the active_list. 
++ */ ++ vcpu = cur_vcpu; ++ goto done; ++ } else ++ goto none; ++ } ++ ++ /* select vcpu and add to running list */ ++ vcpu = list_entry(vsched->active_list.next, struct vcpu_info, list); ++ __vcpu_get(vcpu); ++done: ++ spin_unlock(&fairsched_lock); ++ ++ rq = vcpu_rq(vcpu); ++ if (unlikely(vcpu != cur_vcpu)) { ++ spin_unlock(&vcpu_rq(cur_vcpu)->lock); ++ spin_lock(&rq->lock); ++ if (unlikely(!rq->nr_running)) { ++ /* race with balancing? */ ++ spin_unlock(&rq->lock); ++ vcpu_put(vcpu); ++ spin_lock(&vcpu_rq(cur_vcpu)->lock); ++ goto restart; ++ } ++ } ++ BUG_ON(!rq->nr_running); ++ return vcpu; ++ ++none: ++#ifdef CONFIG_FAIRSCHED ++ spin_unlock(&fairsched_lock); ++ ++ /* fairsched doesn't schedule more CPUs than we have active */ ++ BUG_ON(1); ++#else ++ goto idle; ++#endif ++ ++idle: ++ vcpu = task_vcpu(this_pcpu()->idle); ++ __vcpu_get(vcpu); ++ spin_unlock(&fairsched_lock); ++ spin_unlock(&vcpu_rq(cur_vcpu)->lock); ++ ++ spin_lock(&vcpu_rq(vcpu)->lock); ++ return vcpu; ++ ++affine: ++ vcpu = vsched_vcpu(&default_vsched, raw_smp_processor_id()); ++ /* current VCPU busy, continue */ ++ if (cur_vcpu == vcpu && vcpu->active) ++ return cur_vcpu; ++ /* current is idle and nothing to run, keep idle */ ++ if (vcpu_vsched(cur_vcpu) == &idle_vsched && !vcpu->active) ++ return cur_vcpu; ++ ++ /* need to switch to idle... */ ++ if (cur_vcpu == vcpu) { ++ spin_lock(&fairsched_lock); ++ goto idle; ++ } ++ ++ /* ... 
and from idle */ ++ spin_lock(&fairsched_lock); ++ __vcpu_get(vcpu); ++ goto done; ++} ++ ++#else /* CONFIG_SCHED_VCPU */ ++ ++#define set_task_vsched(task, vsched) do { } while (0) ++ ++static inline void vcpu_attach(runqueue_t *rq) ++{ ++} ++ ++static inline void vcpu_detach(runqueue_t *rq) ++{ ++} ++ ++static inline void vcpu_put(vcpu_t vcpu) ++{ ++} ++ ++static inline vcpu_t schedule_vcpu(vcpu_t prev_vcpu, cycles_t cycles) ++{ ++ return prev_vcpu; ++} ++ ++static inline void set_task_vcpu(struct task_struct *p, vcpu_t vcpu) ++{ ++ set_task_pcpu(p, vcpu->id); ++} ++ ++#endif /* CONFIG_SCHED_VCPU */ ++ ++int vcpu_online(int cpu) ++{ ++ return cpu_isset(cpu, vsched_vcpu_online_map(this_vsched())); ++} + +-#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) +-#define this_rq() (&__get_cpu_var(runqueues)) +-#define task_rq(p) cpu_rq(task_cpu(p)) +-#define cpu_curr(cpu) (cpu_rq(cpu)->curr) + + #ifndef prepare_arch_switch + # define prepare_arch_switch(next) do { } while (0) +@@ -284,6 +721,11 @@ for (domain = rcu_dereference(cpu_rq(cpu + # define finish_arch_switch(prev) do { } while (0) + #endif + ++struct kernel_stat_glob kstat_glob; ++spinlock_t kstat_glb_lock = SPIN_LOCK_UNLOCKED; ++EXPORT_SYMBOL(kstat_glob); ++EXPORT_SYMBOL(kstat_glb_lock); ++ + #ifndef __ARCH_WANT_UNLOCKED_CTXSW + static inline int task_running(runqueue_t *rq, task_t *p) + { +@@ -300,7 +742,7 @@ static inline void finish_lock_switch(ru + /* this is a valid case when another task releases the spinlock */ + rq->lock.owner = current; + #endif +- spin_unlock_irq(&rq->lock); ++ spin_unlock(&rq->lock); + } + + #else /* __ARCH_WANT_UNLOCKED_CTXSW */ +@@ -374,6 +816,208 @@ static inline void task_rq_unlock(runque + spin_unlock_irqrestore(&rq->lock, *flags); + } + ++#ifdef CONFIG_VE ++#define ve_nr_iowait_inc(env, cpu) \ ++ do { \ ++ VE_CPU_STATS((env), (cpu))->nr_iowait++; \ ++ } while(0) ++#define ve_nr_iowait_dec(env, cpu) \ ++ do { \ ++ VE_CPU_STATS((env), (cpu))->nr_iowait--; \ ++ } while(0) ++#define 
ve_nr_unint_inc(env, cpu) \ ++ do { \ ++ VE_CPU_STATS((env), (cpu))->nr_unint++; \ ++ } while(0) ++#define ve_nr_unint_dec(env, cpu) \ ++ do { \ ++ VE_CPU_STATS((env), (cpu))->nr_unint--; \ ++ } while(0) ++ ++#define cycles_after(a, b) ((long long)(b) - (long long)(a) < 0) ++ ++cycles_t ve_sched_get_idle_time(struct ve_struct *ve, int cpu) ++{ ++ struct ve_cpu_stats *ve_stat; ++ unsigned v; ++ cycles_t strt, ret, cycles; ++ ++ ve_stat = VE_CPU_STATS(ve, cpu); ++ do { ++ v = read_seqcount_begin(&ve_stat->stat_lock); ++ ret = ve_stat->idle_time; ++ strt = ve_stat->strt_idle_time; ++ if (strt && nr_uninterruptible_ve(ve) == 0) { ++ cycles = get_cycles(); ++ if (cycles_after(cycles, strt)) ++ ret += cycles - strt; ++ } ++ } while (read_seqcount_retry(&ve_stat->stat_lock, v)); ++ return ret; ++} ++EXPORT_SYMBOL(ve_sched_get_idle_time); ++ ++cycles_t ve_sched_get_iowait_time(struct ve_struct *ve, int cpu) ++{ ++ struct ve_cpu_stats *ve_stat; ++ unsigned v; ++ cycles_t strt, ret, cycles; ++ ++ ve_stat = VE_CPU_STATS(ve, cpu); ++ do { ++ v = read_seqcount_begin(&ve_stat->stat_lock); ++ ret = ve_stat->iowait_time; ++ strt = ve_stat->strt_idle_time; ++ if (strt && nr_iowait_ve(ve) > 0) { ++ cycles = get_cycles(); ++ if (cycles_after(cycles, strt)) ++ ret += cycles - strt; ++ } ++ } while (read_seqcount_retry(&ve_stat->stat_lock, v)); ++ return ret; ++} ++ ++EXPORT_SYMBOL(ve_sched_get_iowait_time); ++ ++static inline void ve_stop_idle(struct ve_struct *ve, ++ unsigned int cpu, cycles_t cycles) ++{ ++ struct ve_cpu_stats *ve_stat; ++ ++ ve_stat = VE_CPU_STATS(ve, cpu); ++ ++ write_seqcount_begin(&ve_stat->stat_lock); ++ if (ve_stat->strt_idle_time) { ++ if (cycles_after(cycles, ve_stat->strt_idle_time)) { ++ if (nr_iowait_ve(ve) == 0) ++ ve_stat->idle_time += cycles - ++ ve_stat->strt_idle_time; ++ else ++ ve_stat->iowait_time += cycles - ++ ve_stat->strt_idle_time; ++ } ++ ve_stat->strt_idle_time = 0; ++ } ++ write_seqcount_end(&ve_stat->stat_lock); ++} ++ ++static inline 
void ve_strt_idle(struct ve_struct *ve, ++ unsigned int cpu, cycles_t cycles) ++{ ++ struct ve_cpu_stats *ve_stat; ++ ++ ve_stat = VE_CPU_STATS(ve, cpu); ++ ++ write_seqcount_begin(&ve_stat->stat_lock); ++ ve_stat->strt_idle_time = cycles; ++ write_seqcount_end(&ve_stat->stat_lock); ++} ++ ++#define ve_nr_running_inc(env, cpu, cycles) do { \ ++ if (++VE_CPU_STATS((env), (cpu))->nr_running == 1) \ ++ ve_stop_idle(env, cpu, cycles); \ ++ } while (0) ++#define ve_nr_running_dec(env, cpu, cyclses) do { \ ++ if (--VE_CPU_STATS((env), (cpu))->nr_running == 0) \ ++ ve_strt_idle(env, cpu, cycles); \ ++ } while (0) ++ ++void ve_sched_attach(struct ve_struct *envid) ++{ ++ struct task_struct *tsk; ++ unsigned int cpu; ++ cycles_t cycles; ++ ++ tsk = current; ++ preempt_disable(); ++ cycles = get_cycles(); ++ cpu = task_cpu(tsk); ++ ve_nr_running_dec(VE_TASK_INFO(tsk)->owner_env, cpu, cycles); ++ ve_nr_running_inc(envid, cpu, cycles); ++ preempt_enable(); ++} ++EXPORT_SYMBOL(ve_sched_attach); ++ ++static inline void write_wakeup_stamp(struct task_struct *p, cycles_t cyc) ++{ ++ struct ve_task_info *ti; ++ ++ ti = VE_TASK_INFO(p); ++ write_seqcount_begin(&ti->wakeup_lock); ++ ti->wakeup_stamp = cyc; ++ write_seqcount_end(&ti->wakeup_lock); ++} ++ ++static inline void update_sched_lat(struct task_struct *t, cycles_t cycles) ++{ ++ int cpu; ++ cycles_t ve_wstamp; ++ ++ /* safe due to runqueue lock */ ++ cpu = smp_processor_id(); ++ ve_wstamp = t->ve_task_info.wakeup_stamp; ++ ++ if (ve_wstamp && cycles > ve_wstamp) { ++ KSTAT_LAT_PCPU_ADD(&kstat_glob.sched_lat, ++ cpu, cycles - ve_wstamp); ++ KSTAT_LAT_PCPU_ADD(&t->ve_task_info.exec_env->sched_lat_ve, ++ cpu, cycles - ve_wstamp); ++ } ++} ++ ++static inline void update_ve_task_info(task_t *prev, cycles_t cycles) ++{ ++#ifdef CONFIG_FAIRSCHED ++ if (prev != this_pcpu()->idle) { ++#else ++ if (prev != this_rq()->idle) { ++#endif ++ VE_CPU_STATS(prev->ve_task_info.owner_env, ++ smp_processor_id())->used_time += ++ cycles - 
prev->ve_task_info.sched_time; ++ ++ prev->ve_task_info.sched_time = cycles; ++ } ++} ++ ++#else ++#define ve_nr_running_inc(env, cpu, cycles) do { } while(0) ++#define ve_nr_running_dec(env, cpu, cycles) do { } while(0) ++#define ve_nr_iowait_inc(env, cpu) do { } while(0) ++#define ve_nr_iowait_dec(env, cpu) do { } while(0) ++#define ve_nr_unint_inc(env, cpu) do { } while(0) ++#define ve_nr_unint_dec(env, cpu) do { } while(0) ++#define update_ve_task_info(prev, cycles) do { } while (0) ++#endif ++ ++struct task_nrs_struct { ++ long nr_running; ++ long nr_unint; ++ long nr_stopped; ++ long nr_sleeping; ++ long nr_iowait; ++ long long nr_switches; ++} ____cacheline_aligned_in_smp; ++ ++static struct task_nrs_struct glob_task_nrs[NR_CPUS]; ++#define nr_running_inc(cpu) do { glob_task_nrs[cpu].nr_running++; } while (0) ++#define nr_running_dec(cpu) do { glob_task_nrs[cpu].nr_running--; } while (0) ++#define nr_unint_inc(cpu) do { glob_task_nrs[cpu].nr_unint++; } while (0) ++#define nr_unint_dec(cpu) do { glob_task_nrs[cpu].nr_unint--; } while (0) ++#define nr_stopped_inc(cpu) do { glob_task_nrs[cpu].nr_stopped++; } while (0) ++#define nr_stopped_dec(cpu) do { glob_task_nrs[cpu].nr_stopped--; } while (0) ++#define nr_sleeping_inc(cpu) do { glob_task_nrs[cpu].nr_sleeping++; } while (0) ++#define nr_sleeping_dec(cpu) do { glob_task_nrs[cpu].nr_sleeping--; } while (0) ++#define nr_iowait_inc(cpu) do { glob_task_nrs[cpu].nr_iowait++; } while (0) ++#define nr_iowait_dec(cpu) do { glob_task_nrs[cpu].nr_iowait--; } while (0) ++ ++ ++unsigned long nr_zombie = 0; /* protected by tasklist_lock */ ++EXPORT_SYMBOL(nr_zombie); ++ ++atomic_t nr_dead = ATOMIC_INIT(0); ++EXPORT_SYMBOL(nr_dead); ++ + #ifdef CONFIG_SCHEDSTATS + /* + * bump this up when changing the output format or the meaning of an existing +@@ -666,8 +1310,19 @@ static int effective_prio(task_t *p) + */ + static inline void __activate_task(task_t *p, runqueue_t *rq) + { ++ cycles_t cycles; ++ ++#ifdef CONFIG_VE ++ 
cycles = get_cycles(); ++ write_wakeup_stamp(p, cycles); ++ p->ve_task_info.sleep_time += cycles; ++#endif + enqueue_task(p, rq->active); + rq->nr_running++; ++ ve_nr_running_inc(VE_TASK_INFO(p)->owner_env, task_cpu(p), cycles); ++ nr_running_inc(smp_processor_id()); ++ if (rq->nr_running == 1) ++ vcpu_attach(rq); + } + + /* +@@ -800,9 +1455,38 @@ static void activate_task(task_t *p, run + */ + static void deactivate_task(struct task_struct *p, runqueue_t *rq) + { ++ cycles_t cycles; ++#ifdef CONFIG_VE ++ unsigned int cpu, pcpu; ++ struct ve_struct *ve; ++ ++ cycles = get_cycles(); ++ cpu = task_cpu(p); ++ pcpu = smp_processor_id(); ++ ve = p->ve_task_info.owner_env; ++ ++ p->ve_task_info.sleep_time -= cycles; ++#endif ++ if (p->state == TASK_UNINTERRUPTIBLE) { ++ ve_nr_unint_inc(ve, cpu); ++ nr_unint_inc(pcpu); ++ } ++ if (p->state == TASK_INTERRUPTIBLE) { ++ rq->nr_sleeping++; ++ nr_sleeping_inc(pcpu); ++ } ++ if (p->state == TASK_STOPPED) { ++ rq->nr_stopped++; ++ nr_stopped_inc(pcpu); ++ } ++ ++ ve_nr_running_dec(VE_TASK_INFO(p)->owner_env, cpu, cycles); ++ nr_running_dec(pcpu); + rq->nr_running--; + dequeue_task(p, p->array); + p->array = NULL; ++ if (rq->nr_running == 0) ++ vcpu_detach(rq); + } + + /* +@@ -813,18 +1497,22 @@ static void deactivate_task(struct task_ + * the target CPU. 
+ */ + #ifdef CONFIG_SMP ++/* FIXME: need to add vsched arg */ + static void resched_task(task_t *p) + { + int cpu; + ++#if 0 ++ /* FIXME: this fails due to idle rq->curre == idle */ + assert_spin_locked(&task_rq(p)->lock); ++#endif + + if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) + return; + + set_tsk_thread_flag(p, TIF_NEED_RESCHED); + +- cpu = task_cpu(p); ++ cpu = task_pcpu(p); + if (cpu == smp_processor_id()) + return; + +@@ -847,15 +1535,35 @@ static inline void resched_task(task_t * + */ + inline int task_curr(const task_t *p) + { +- return cpu_curr(task_cpu(p)) == p; ++ return task_rq(p)->curr == p; + } + +-#ifdef CONFIG_SMP ++/** ++ * idle_cpu - is a given cpu idle currently? ++ * @cpu: the processor in question. ++ */ ++inline int idle_cpu(int cpu) ++{ ++ return pcpu(cpu)->vsched == &idle_vsched; ++} ++ ++EXPORT_SYMBOL_GPL(idle_cpu); ++ ++static inline int idle_vcpu(vcpu_t cpu) ++{ ++#ifdef CONFIG_SCHED_VCPU ++ return !cpu->active; ++#else ++ return idle_cpu(cpu->id); ++#endif ++} ++ ++#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_VCPU) + typedef struct { + struct list_head list; + + task_t *task; +- int dest_cpu; ++ vcpu_t dest_cpu; + + struct completion done; + } migration_req_t; +@@ -864,7 +1572,7 @@ typedef struct { + * The task's runqueue lock must be held. + * Returns true if you have to wait for migration thread. + */ +-static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) ++static int migrate_task(task_t *p, vcpu_t dest_cpu, migration_req_t *req) + { + runqueue_t *rq = task_rq(p); + +@@ -872,8 +1580,13 @@ static int migrate_task(task_t *p, int d + * If the task is not on a runqueue (and not running), then + * it is sufficient to simply update the task's cpu field. 
+ */ ++#ifdef CONFIG_SCHED_VCPU ++ BUG_ON(task_vsched(p) == &idle_vsched); ++ BUG_ON(vcpu_vsched(dest_cpu) == &idle_vsched); ++#endif + if (!p->array && !task_running(rq, p)) { +- set_task_cpu(p, dest_cpu); ++ set_task_vsched(p, vcpu_vsched(dest_cpu)); ++ set_task_vcpu(p, dest_cpu); + return 0; + } + +@@ -913,6 +1626,7 @@ repeat: + } + task_rq_unlock(rq, &flags); + } ++EXPORT_SYMBOL_GPL(wait_task_inactive); + + /*** + * kick_process - kick a running thread to enter/exit the kernel +@@ -932,21 +1646,26 @@ void kick_process(task_t *p) + int cpu; + + preempt_disable(); +- cpu = task_cpu(p); ++ cpu = task_pcpu(p); + if ((cpu != smp_processor_id()) && task_curr(p)) ++ /* FIXME: ??? think over */ ++ /* should add something like get_pcpu(cpu)->vcpu->id == task_cpu(p), ++ but with serialization of vcpu access... */ + smp_send_reschedule(cpu); + preempt_enable(); + } ++#endif + ++#ifdef CONFIG_SMP + /* + * Return a low guess at the load of a migration-source cpu. + * + * We want to under-estimate the load of migration sources, to + * balance conservatively. + */ +-static inline unsigned long source_load(int cpu, int type) ++static inline unsigned long source_load(vcpu_t cpu, int type) + { +- runqueue_t *rq = cpu_rq(cpu); ++ runqueue_t *rq = vcpu_rq(cpu); + unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; + if (type == 0) + return load_now; +@@ -957,9 +1676,9 @@ static inline unsigned long source_load( + /* + * Return a high guess at the load of a migration-target cpu + */ +-static inline unsigned long target_load(int cpu, int type) ++static inline unsigned long target_load(vcpu_t cpu, int type) + { +- runqueue_t *rq = cpu_rq(cpu); ++ runqueue_t *rq = vcpu_rq(cpu); + unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; + if (type == 0) + return load_now; +@@ -972,33 +1691,35 @@ static inline unsigned long target_load( + * domain. 
+ */ + static struct sched_group * +-find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) ++find_idlest_group(struct sched_domain *sd, struct task_struct *p, vcpu_t this_cpu) + { + struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; + unsigned long min_load = ULONG_MAX, this_load = 0; + int load_idx = sd->forkexec_idx; + int imbalance = 100 + (sd->imbalance_pct-100)/2; ++ struct vcpu_scheduler *vsched; ++ vcpu_t vcpu; ++ int this_pcpu; + ++ vsched = vcpu_vsched(this_cpu); ++ this_pcpu = vcpu_last_pcpu(this_cpu); + do { + unsigned long load, avg_load; + int local_group; + int i; + +- /* Skip over this group if it has no CPUs allowed */ +- if (!cpus_intersects(group->cpumask, p->cpus_allowed)) +- goto nextgroup; +- +- local_group = cpu_isset(this_cpu, group->cpumask); ++ local_group = cpu_isset(this_pcpu, group->cpumask); + + /* Tally up the load of all CPUs in the group */ + avg_load = 0; + + for_each_cpu_mask(i, group->cpumask) { ++ vcpu = pcpu(i)->vcpu; + /* Bias balancing toward cpus of our domain */ + if (local_group) +- load = source_load(i, load_idx); ++ load = source_load(vcpu, load_idx); + else +- load = target_load(i, load_idx); ++ load = target_load(vcpu, load_idx); + + avg_load += load; + } +@@ -1013,7 +1734,6 @@ find_idlest_group(struct sched_domain *s + min_load = avg_load; + idlest = group; + } +-nextgroup: + group = group->next; + } while (group != sd->groups); + +@@ -1025,23 +1745,31 @@ nextgroup: + /* + * find_idlest_queue - find the idlest runqueue among the cpus in group. 
+ */ +-static int +-find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) ++static vcpu_t ++find_idlest_cpu(struct sched_group *group, struct task_struct *p, vcpu_t this_cpu) + { +- cpumask_t tmp; + unsigned long load, min_load = ULONG_MAX; +- int idlest = -1; ++ cpumask_t vmask; ++ struct vcpu_scheduler *vsched; ++ vcpu_t idlest = (vcpu_t)-1; ++ vcpu_t vcpu; + int i; + +- /* Traverse only the allowed CPUs */ +- cpus_and(tmp, group->cpumask, p->cpus_allowed); ++ vsched = vcpu_vsched(this_cpu); ++ BUG_ON(vsched != task_vsched(p)); + +- for_each_cpu_mask(i, tmp) { +- load = source_load(i, 0); ++ cpus_and(vmask, vsched_vcpu_online_map(vsched), p->cpus_allowed); ++ for_each_cpu_mask(i, vmask) { ++ vcpu = vsched_vcpu(vsched, i); + +- if (load < min_load || (load == min_load && i == this_cpu)) { ++ if (!cpu_isset(vcpu_last_pcpu(vcpu), group->cpumask)) ++ continue; ++ ++ load = source_load(vcpu, 0); ++ ++ if (load < min_load || (load == min_load && vcpu == this_cpu)) { + min_load = load; +- idlest = i; ++ idlest = vcpu; + } + } + +@@ -1059,7 +1787,7 @@ find_idlest_cpu(struct sched_group *grou + * + * preempt must be disabled. + */ +-static int sched_balance_self(int cpu, int flag) ++static vcpu_t sched_balance_self(vcpu_t cpu, int flag) + { + struct task_struct *t = current; + struct sched_domain *tmp, *sd = NULL; +@@ -1071,7 +1799,7 @@ static int sched_balance_self(int cpu, i + while (sd) { + cpumask_t span; + struct sched_group *group; +- int new_cpu; ++ vcpu_t new_cpu; + int weight; + + span = sd->span; +@@ -1080,7 +1808,7 @@ static int sched_balance_self(int cpu, i + goto nextlevel; + + new_cpu = find_idlest_cpu(group, t, cpu); +- if (new_cpu == -1 || new_cpu == cpu) ++ if (new_cpu == (vcpu_t)(-1) || new_cpu == cpu) + goto nextlevel; + + /* Now try balancing at a lower domain level */ +@@ -1111,21 +1839,27 @@ nextlevel: + * Returns the CPU we should wake onto. 
+ */ + #if defined(ARCH_HAS_SCHED_WAKE_IDLE) +-static int wake_idle(int cpu, task_t *p) ++static vcpu_t wake_idle(vcpu_t cpu, task_t *p) + { +- cpumask_t tmp; ++ cpumask_t vtmp; + struct sched_domain *sd; ++ struct vcpu_scheduler *vsched; + int i; + +- if (idle_cpu(cpu)) ++ if (idle_vcpu(cpu)) + return cpu; + ++ vsched = vcpu_vsched(cpu); ++ cpus_and(vtmp, vsched_vcpu_online_map(vsched), p->cpus_allowed); + for_each_domain(cpu, sd) { + if (sd->flags & SD_WAKE_IDLE) { +- cpus_and(tmp, sd->span, p->cpus_allowed); +- for_each_cpu_mask(i, tmp) { +- if (idle_cpu(i)) +- return i; ++ for_each_cpu_mask(i, vtmp) { ++ vcpu_t vcpu; ++ vcpu = vsched_vcpu(vsched, i); ++ if (!cpu_isset(vcpu_last_pcpu(vcpu), sd->span)) ++ continue; ++ if (idle_vcpu(vcpu)) ++ return vcpu; + } + } + else +@@ -1134,7 +1868,7 @@ static int wake_idle(int cpu, task_t *p) + return cpu; + } + #else +-static inline int wake_idle(int cpu, task_t *p) ++static inline vcpu_t wake_idle(vcpu_t cpu, task_t *p) + { + return cpu; + } +@@ -1156,15 +1890,17 @@ static inline int wake_idle(int cpu, tas + */ + static int try_to_wake_up(task_t *p, unsigned int state, int sync) + { +- int cpu, this_cpu, success = 0; ++ vcpu_t cpu, this_cpu; ++ int success = 0; + unsigned long flags; + long old_state; + runqueue_t *rq; + #ifdef CONFIG_SMP + unsigned long load, this_load; + struct sched_domain *sd, *this_sd = NULL; +- int new_cpu; ++ vcpu_t new_cpu; + #endif ++ cpu = NULL; + + rq = task_rq_lock(p, &flags); + old_state = p->state; +@@ -1174,8 +1910,8 @@ static int try_to_wake_up(task_t *p, uns + if (p->array) + goto out_running; + +- cpu = task_cpu(p); +- this_cpu = smp_processor_id(); ++ cpu = task_vcpu(p); ++ this_cpu = this_vcpu(); + + #ifdef CONFIG_SMP + if (unlikely(task_running(rq, p))) +@@ -1184,20 +1920,23 @@ static int try_to_wake_up(task_t *p, uns + new_cpu = cpu; + + schedstat_inc(rq, ttwu_cnt); ++ /* FIXME: add vsched->last_vcpu array to optimize wakeups in different vsched */ ++ if (vcpu_vsched(cpu) != 
vcpu_vsched(this_cpu)) ++ goto out_set_cpu; + if (cpu == this_cpu) { + schedstat_inc(rq, ttwu_local); + goto out_set_cpu; + } + + for_each_domain(this_cpu, sd) { +- if (cpu_isset(cpu, sd->span)) { ++ if (cpu_isset(vcpu_last_pcpu(cpu), sd->span)) { + schedstat_inc(sd, ttwu_wake_remote); + this_sd = sd; + break; + } + } + +- if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) ++ if (unlikely(!vcpu_isset(this_cpu, p->cpus_allowed))) + goto out_set_cpu; + + /* +@@ -1253,7 +1992,7 @@ static int try_to_wake_up(task_t *p, uns + out_set_cpu: + new_cpu = wake_idle(new_cpu, p); + if (new_cpu != cpu) { +- set_task_cpu(p, new_cpu); ++ set_task_vcpu(p, new_cpu); + task_rq_unlock(rq, &flags); + /* might preempt at this point */ + rq = task_rq_lock(p, &flags); +@@ -1263,13 +2002,21 @@ out_set_cpu: + if (p->array) + goto out_running; + +- this_cpu = smp_processor_id(); +- cpu = task_cpu(p); ++ this_cpu = this_vcpu(); ++ cpu = task_vcpu(p); + } + + out_activate: + #endif /* CONFIG_SMP */ +- if (old_state == TASK_UNINTERRUPTIBLE) { ++ if (old_state == TASK_INTERRUPTIBLE) { ++ nr_sleeping_dec(smp_processor_id()); ++ rq->nr_sleeping--; ++ } else if (old_state == TASK_STOPPED) { ++ nr_stopped_dec(smp_processor_id()); ++ rq->nr_stopped--; ++ } else if (old_state == TASK_UNINTERRUPTIBLE) { ++ nr_unint_dec(smp_processor_id()); ++ ve_nr_unint_dec(p->ve_task_info.owner_env, task_cpu(p)); + rq->nr_uninterruptible--; + /* + * Tasks on involuntary sleep don't earn +@@ -1324,17 +2071,45 @@ int fastcall wake_up_state(task_t *p, un + } + + /* ++ * init is special, it is forked from swapper (idle_vsched) and should ++ * belong to default_vsched, so we have to change it's vsched/fairsched manually ++ */ ++static void wake_up_init(task_t *p) ++{ ++ runqueue_t *rq; ++ unsigned long flags; ++ ++ /* we should change both fairsched node and vsched here */ ++ set_task_vsched(p, &default_vsched); ++ set_task_cpu(p, 0); ++ ++ /* ++ * can't call wake_up_new_task() directly here, ++ * since it assumes that 
a child belongs to the same vsched ++ */ ++ p->state = TASK_RUNNING; ++ p->sleep_avg = 0; ++ p->prio = effective_prio(p); ++ ++ rq = task_rq_lock(p, &flags); ++ __activate_task(p, rq); ++ task_rq_unlock(rq, &flags); ++} ++ ++/* + * Perform scheduler related setup for a newly forked process p. + * p is forked by current. + */ + void fastcall sched_fork(task_t *p, int clone_flags) + { +- int cpu = get_cpu(); +- ++ vcpu_t cpu; ++ ++ preempt_disable(); ++ cpu = this_vcpu(); + #ifdef CONFIG_SMP + cpu = sched_balance_self(cpu, SD_BALANCE_FORK); + #endif +- set_task_cpu(p, cpu); ++ set_task_vcpu(p, cpu); + + /* + * We mark the process as running here, but have not actually +@@ -1369,6 +2144,10 @@ void fastcall sched_fork(task_t *p, int + p->first_time_slice = 1; + current->time_slice >>= 1; + p->timestamp = sched_clock(); ++#ifdef CONFIG_VE ++ /*cosmetic: sleep till wakeup below*/ ++ p->ve_task_info.sleep_time -= get_cycles(); ++#endif + if (unlikely(!current->time_slice)) { + /* + * This case is rare, it happens when the parent has only +@@ -1379,7 +2158,7 @@ void fastcall sched_fork(task_t *p, int + scheduler_tick(); + } + local_irq_enable(); +- put_cpu(); ++ preempt_enable(); + } + + /* +@@ -1392,13 +2171,19 @@ void fastcall sched_fork(task_t *p, int + void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags) + { + unsigned long flags; +- int this_cpu, cpu; ++ vcpu_t this_cpu, cpu; + runqueue_t *rq, *this_rq; + ++ if (unlikely(p->pid == 1)) { ++ wake_up_init(p); ++ return; ++ } ++ + rq = task_rq_lock(p, &flags); + BUG_ON(p->state != TASK_RUNNING); +- this_cpu = smp_processor_id(); +- cpu = task_cpu(p); ++ BUG_ON(task_vsched(current) != task_vsched(p)); ++ this_cpu = this_vcpu(); ++ cpu = task_vcpu(p); + + /* + * We decrease the sleep average of forking parents +@@ -1426,6 +2211,9 @@ void fastcall wake_up_new_task(task_t *p + p->array = current->array; + p->array->nr_active++; + rq->nr_running++; ++ ve_nr_running_inc(VE_TASK_INFO(p)->owner_env, ++ 
task_cpu(p), get_cycles()); ++ nr_running_inc(smp_processor_id()); + } + set_need_resched(); + } else +@@ -1439,7 +2227,7 @@ void fastcall wake_up_new_task(task_t *p + */ + this_rq = rq; + } else { +- this_rq = cpu_rq(this_cpu); ++ this_rq = vcpu_rq(this_cpu); + + /* + * Not the local CPU - must adjust timestamp. This should +@@ -1482,7 +2270,7 @@ void fastcall sched_exit(task_t *p) + * the sleep_avg of the parent as well. + */ + rq = task_rq_lock(p->parent, &flags); +- if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) { ++ if (p->first_time_slice && task_vcpu(p) == task_vcpu(p->parent)) { + p->parent->time_slice += p->time_slice; + if (unlikely(p->parent->time_slice > task_timeslice(p))) + p->parent->time_slice = task_timeslice(p); +@@ -1532,7 +2320,10 @@ static inline void finish_task_switch(ru + { + struct mm_struct *mm = rq->prev_mm; + unsigned long prev_task_flags; ++ vcpu_t prev_vcpu, vcpu; + ++ prev_vcpu = task_vcpu(prev); ++ vcpu = rq_vcpu(rq); + rq->prev_mm = NULL; + + /* +@@ -1549,6 +2340,10 @@ static inline void finish_task_switch(ru + prev_task_flags = prev->flags; + finish_arch_switch(prev); + finish_lock_switch(rq, prev); ++ if (prev_vcpu != vcpu) ++ vcpu_put(prev_vcpu); ++ local_irq_enable(); ++ + if (mm) + mmdrop(mm); + if (unlikely(prev_task_flags & PF_DEAD)) +@@ -1569,8 +2364,9 @@ asmlinkage void schedule_tail(task_t *pr + preempt_enable(); + #endif + if (current->set_child_tid) +- put_user(current->pid, current->set_child_tid); ++ put_user(virt_pid(current), current->set_child_tid); + } ++EXPORT_SYMBOL_GPL(schedule_tail); + + /* + * context_switch - switch to the new MM and the new +@@ -1610,20 +2406,26 @@ task_t * context_switch(runqueue_t *rq, + */ + unsigned long nr_running(void) + { +- unsigned long i, sum = 0; ++ unsigned long i, sum; + ++ sum = 0; + for_each_online_cpu(i) +- sum += cpu_rq(i)->nr_running; ++ sum += glob_task_nrs[i].nr_running; ++ ++ if (unlikely((long)sum < 0)) ++ sum = 0; + + return sum; + } 
++EXPORT_SYMBOL(nr_running); + + unsigned long nr_uninterruptible(void) + { +- unsigned long i, sum = 0; ++ unsigned long i, sum; + ++ sum = 0; + for_each_cpu(i) +- sum += cpu_rq(i)->nr_uninterruptible; ++ sum += glob_task_nrs[i].nr_unint; + + /* + * Since we read the counters lockless, it might be slightly +@@ -1635,31 +2437,133 @@ unsigned long nr_uninterruptible(void) + return sum; + } + ++EXPORT_SYMBOL(nr_uninterruptible); ++ + unsigned long long nr_context_switches(void) + { +- unsigned long long i, sum = 0; ++ unsigned long long i, sum; + ++ sum = 0; + for_each_cpu(i) +- sum += cpu_rq(i)->nr_switches; ++ sum += glob_task_nrs[i].nr_switches; ++ ++ if (unlikely((long)sum < 0)) ++ sum = 0; + + return sum; + } + ++EXPORT_SYMBOL(nr_context_switches); ++ + unsigned long nr_iowait(void) + { +- unsigned long i, sum = 0; ++ unsigned long i, sum; + ++ sum = 0; + for_each_cpu(i) +- sum += atomic_read(&cpu_rq(i)->nr_iowait); ++ sum += glob_task_nrs[i].nr_iowait; ++ ++ if (unlikely((long)sum < 0)) ++ sum = 0; + + return sum; + } + +-#ifdef CONFIG_SMP ++EXPORT_SYMBOL(nr_iowait); ++ ++unsigned long nr_stopped(void) ++{ ++ unsigned long i, sum; ++ ++ sum = 0; ++ for_each_cpu(i) ++ sum += glob_task_nrs[i].nr_stopped; ++ ++ if (unlikely((long)sum < 0)) ++ sum = 0; ++ ++ return sum; ++} ++ ++EXPORT_SYMBOL(nr_stopped); ++ ++unsigned long nr_sleeping(void) ++{ ++ unsigned long i, sum; ++ ++ sum = 0; ++ for_each_cpu(i) ++ sum += glob_task_nrs[i].nr_sleeping; ++ ++ if (unlikely((long)sum < 0)) ++ sum = 0; ++ ++ return sum; ++} ++ ++EXPORT_SYMBOL(nr_sleeping); ++ ++#ifdef CONFIG_VE ++unsigned long nr_running_ve(struct ve_struct *ve) ++{ ++ int i; ++ long sum; ++ cpumask_t ve_cpus; ++ ++ sum = 0; ++ ve_cpu_online_map(ve, &ve_cpus); ++ for_each_cpu_mask(i, ve_cpus) ++ sum += VE_CPU_STATS(ve, i)->nr_running; ++ return (unsigned long)(sum < 0 ? 
0 : sum); ++} ++ ++EXPORT_SYMBOL(nr_running_ve); ++ ++unsigned long nr_uninterruptible_ve(struct ve_struct *ve) ++{ ++ int i; ++ long sum; ++ cpumask_t ve_cpus; ++ ++ sum = 0; ++ ve_cpu_online_map(ve, &ve_cpus); ++ for_each_cpu_mask(i, ve_cpus) ++ sum += VE_CPU_STATS(ve, i)->nr_unint; ++ return (unsigned long)(sum < 0 ? 0 : sum); ++} ++ ++EXPORT_SYMBOL(nr_uninterruptible_ve); ++ ++unsigned long nr_iowait_ve(struct ve_struct *ve) ++{ ++ int i; ++ long sum; ++ cpumask_t ve_cpus; ++ ++ sum = 0; ++ ve_cpu_online_map(ve, &ve_cpus); ++ for_each_cpu_mask(i, ve_cpus) ++ sum += VE_CPU_STATS(ve, i)->nr_iowait; ++ return (unsigned long)(sum < 0 ? 0 : sum); ++} ++ ++EXPORT_SYMBOL(nr_iowait_ve); ++#endif ++ ++#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_VCPU) ++ ++#ifdef CONFIG_SCHED_VCPU ++#define rq_compare(rq1, rq2) (rq1 < rq2) ++#else ++#define rq_compare(rq1, rq2) (rq1->cpu < rq2->cpu) ++#endif + + /* + * double_rq_lock - safely lock two runqueues + * ++ * We must take them in cpu order to match code in ++ * dependent_sleeper and wake_dependent_sleeper. ++ * + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. + */ +@@ -1671,7 +2575,7 @@ static void double_rq_lock(runqueue_t *r + spin_lock(&rq1->lock); + __acquire(rq2->lock); /* Fake it out ;) */ + } else { +- if (rq1 < rq2) { ++ if (rq_compare(rq1, rq2)) { + spin_lock(&rq1->lock); + spin_lock(&rq2->lock); + } else { +@@ -1699,38 +2603,20 @@ static void double_rq_unlock(runqueue_t + } + + /* +- * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 
+- */ +-static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) +- __releases(this_rq->lock) +- __acquires(busiest->lock) +- __acquires(this_rq->lock) +-{ +- if (unlikely(!spin_trylock(&busiest->lock))) { +- if (busiest < this_rq) { +- spin_unlock(&this_rq->lock); +- spin_lock(&busiest->lock); +- spin_lock(&this_rq->lock); +- } else +- spin_lock(&busiest->lock); +- } +-} +- +-/* + * If dest_cpu is allowed for this process, migrate the task to it. + * This is accomplished by forcing the cpu_allowed mask to only + * allow dest_cpu, which will force the cpu onto dest_cpu. Then + * the cpu_allowed mask is restored. + */ +-static void sched_migrate_task(task_t *p, int dest_cpu) ++static void sched_migrate_task(task_t *p, vcpu_t dest_cpu) + { + migration_req_t req; + runqueue_t *rq; + unsigned long flags; + + rq = task_rq_lock(p, &flags); +- if (!cpu_isset(dest_cpu, p->cpus_allowed) +- || unlikely(cpu_is_offline(dest_cpu))) ++ if (!vcpu_isset(dest_cpu, p->cpus_allowed) ++ || unlikely(vcpu_is_offline(dest_cpu))) + goto out; + + /* force the process onto the specified CPU */ +@@ -1747,6 +2633,26 @@ static void sched_migrate_task(task_t *p + out: + task_rq_unlock(rq, &flags); + } ++#endif ++ ++#ifdef CONFIG_SMP ++/* ++ * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 
++ */ ++static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) ++ __releases(this_rq->lock) ++ __acquires(busiest->lock) ++ __acquires(this_rq->lock) ++{ ++ if (unlikely(!spin_trylock(&busiest->lock))) { ++ if (rq_compare(busiest, this_rq)) { ++ spin_unlock(&this_rq->lock); ++ spin_lock(&busiest->lock); ++ spin_lock(&this_rq->lock); ++ } else ++ spin_lock(&busiest->lock); ++ } ++} + + /* + * sched_exec - execve() is a valuable balancing opportunity, because at +@@ -1754,9 +2660,12 @@ out: + */ + void sched_exec(void) + { +- int new_cpu, this_cpu = get_cpu(); ++ vcpu_t new_cpu, this_cpu; ++ ++ preempt_disable(); ++ this_cpu = this_vcpu(); + new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); +- put_cpu(); ++ preempt_enable(); + if (new_cpu != this_cpu) + sched_migrate_task(current, new_cpu); + } +@@ -1767,12 +2676,24 @@ void sched_exec(void) + */ + static + void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, +- runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) ++ runqueue_t *this_rq, prio_array_t *this_array, vcpu_t this_cpu) + { ++ struct ve_struct *ve; ++ cycles_t cycles; ++ ++ cycles = get_cycles(); ++ ve = VE_TASK_INFO(p)->owner_env; ++ + dequeue_task(p, src_array); + src_rq->nr_running--; +- set_task_cpu(p, this_cpu); ++ ve_nr_running_dec(ve, task_cpu(p), cycles); ++ if (src_rq->nr_running == 0) ++ vcpu_detach(src_rq); ++ set_task_vcpu(p, this_cpu); ++ if (this_rq->nr_running == 0) ++ vcpu_attach(this_rq); + this_rq->nr_running++; ++ ve_nr_running_inc(ve, task_cpu(p), cycles); + enqueue_task(p, this_array); + p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) + + this_rq->timestamp_last_tick; +@@ -1788,7 +2709,7 @@ void pull_task(runqueue_t *src_rq, prio_ + * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 
+ */ + static +-int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, ++int can_migrate_task(task_t *p, runqueue_t *rq, vcpu_t this_cpu, + struct sched_domain *sd, enum idle_type idle, + int *all_pinned) + { +@@ -1798,7 +2719,7 @@ int can_migrate_task(task_t *p, runqueue + * 2) cannot be migrated to this CPU due to cpus_allowed, or + * 3) are cache-hot on their current CPU. + */ +- if (!cpu_isset(this_cpu, p->cpus_allowed)) ++ if (!vcpu_isset(this_cpu, p->cpus_allowed)) + return 0; + *all_pinned = 0; + +@@ -1826,7 +2747,7 @@ int can_migrate_task(task_t *p, runqueue + * + * Called with both runqueues locked. + */ +-static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, ++static int move_tasks(runqueue_t *this_rq, vcpu_t this_cpu, runqueue_t *busiest, + unsigned long max_nr_move, struct sched_domain *sd, + enum idle_type idle, int *all_pinned) + { +@@ -1919,13 +2840,19 @@ out: + * moved to restore balance via the imbalance parameter. + */ + static struct sched_group * +-find_busiest_group(struct sched_domain *sd, int this_cpu, ++find_busiest_group(struct sched_domain *sd, vcpu_t this_cpu, + unsigned long *imbalance, enum idle_type idle, int *sd_idle) + { + struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; + unsigned long max_load, avg_load, total_load, this_load, total_pwr; + unsigned long max_pull; + int load_idx; ++ struct vcpu_scheduler *vsched; ++ vcpu_t vcpu; ++ int this_pcpu; ++ ++ vsched = vcpu_vsched(this_cpu); ++ this_pcpu = vcpu_last_pcpu(this_cpu); + + max_load = this_load = total_load = total_pwr = 0; + if (idle == NOT_IDLE) +@@ -1936,24 +2863,27 @@ find_busiest_group(struct sched_domain * + load_idx = sd->idle_idx; + + do { ++ cpumask_t tmp; + unsigned long load; + int local_group; + int i; + +- local_group = cpu_isset(this_cpu, group->cpumask); ++ local_group = cpu_isset(this_pcpu, group->cpumask); + + /* Tally up the load of all CPUs in the group */ + avg_load = 0; ++ cpus_and(tmp, group->cpumask, 
vsched_pcpu_running_map(vsched)); + +- for_each_cpu_mask(i, group->cpumask) { ++ for_each_cpu_mask(i, tmp) { ++ vcpu = pcpu(i)->vcpu; + if (*sd_idle && !idle_cpu(i)) + *sd_idle = 0; + + /* Bias balancing toward cpus of our domain */ + if (local_group) +- load = target_load(i, load_idx); ++ load = target_load(vcpu, load_idx); + else +- load = source_load(i, load_idx); ++ load = source_load(vcpu, load_idx); + + avg_load += load; + } +@@ -1976,6 +2906,8 @@ find_busiest_group(struct sched_domain * + + if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE) + goto out_balanced; ++ if (!this) ++ this = busiest; /* this->cpu_power is needed below */ + + avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; + +@@ -2058,25 +2990,57 @@ out_balanced: + /* + * find_busiest_queue - find the busiest runqueue among the cpus in group. + */ +-static runqueue_t *find_busiest_queue(struct sched_group *group, ++static vcpu_t find_busiest_queue(vcpu_t this_cpu, struct sched_group *group, + enum idle_type idle) + { + unsigned long load, max_load = 0; +- runqueue_t *busiest = NULL; ++ struct vcpu_scheduler *vsched; ++ vcpu_t vcpu, busiest = NULL; ++ cpumask_t tmp; + int i; + ++ vsched = vcpu_vsched(this_cpu); + for_each_cpu_mask(i, group->cpumask) { +- load = source_load(i, 0); ++ vcpu = pcpu(i)->vcpu; ++ if (vcpu_vsched(vcpu) != vsched && idle != SCHED_IDLE) ++ continue; ++ load = source_load(vcpu, 0); ++ if (load > max_load) { ++ max_load = load; ++ busiest = vcpu; ++ } ++ } + ++#ifdef CONFIG_SCHED_VCPU ++ cpus_andnot(tmp, vsched->vcpu_online_map, vsched->vcpu_running_map); ++ for_each_cpu_mask(i, tmp) { ++ vcpu = vsched_vcpu(vsched, i); ++ load = source_load(vcpu, 0); + if (load > max_load) { + max_load = load; +- busiest = cpu_rq(i); ++ busiest = vcpu; + } + } ++#endif + + return busiest; + } + ++#ifdef CONFIG_SCHED_VCPU ++vcpu_t find_idle_vcpu(struct vcpu_scheduler *vsched) ++{ ++ vcpu_t vcpu; ++ ++ vcpu = NULL; ++ spin_lock(&fairsched_lock); ++ if 
(!list_empty(&vsched->idle_list)) ++ vcpu = list_entry(vsched->idle_list.next, ++ struct vcpu_info, list); ++ spin_unlock(&fairsched_lock); ++ return vcpu; ++} ++#endif ++ + /* + * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but + * so long as it is large enough. +@@ -2089,10 +3053,11 @@ static runqueue_t *find_busiest_queue(st + * + * Called with this_rq unlocked. + */ +-static int load_balance(int this_cpu, runqueue_t *this_rq, ++static int load_balance(vcpu_t this_cpu, runqueue_t *this_rq, + struct sched_domain *sd, enum idle_type idle) + { + struct sched_group *group; ++ vcpu_t busiest_vcpu; + runqueue_t *busiest; + unsigned long imbalance; + int nr_moved, all_pinned = 0; +@@ -2110,13 +3075,24 @@ static int load_balance(int this_cpu, ru + goto out_balanced; + } + +- busiest = find_busiest_queue(group, idle); +- if (!busiest) { ++ busiest_vcpu = find_busiest_queue(this_cpu, group, idle); ++ if (!busiest_vcpu) { + schedstat_inc(sd, lb_nobusyq[idle]); + goto out_balanced; + } + +- BUG_ON(busiest == this_rq); ++#ifdef CONFIG_SCHED_VCPU ++ if (vcpu_vsched(this_cpu) != vcpu_vsched(busiest_vcpu)) { ++ this_cpu = find_idle_vcpu(vcpu_vsched(busiest_vcpu)); ++ if (!this_cpu) ++ goto out_one_pinned; ++ this_rq = vcpu_rq(this_cpu); ++ } ++#endif ++ busiest = vcpu_rq(busiest_vcpu); ++ ++ if (unlikely(busiest == this_rq)) ++ goto out_balanced; + + schedstat_add(sd, lb_imbalance[idle], imbalance); + +@@ -2149,7 +3125,7 @@ static int load_balance(int this_cpu, ru + /* don't kick the migration_thread, if the curr + * task on busiest cpu can't be moved to this_cpu + */ +- if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { ++ if (!vcpu_isset(this_cpu, busiest->curr->cpus_allowed)) { + spin_unlock(&busiest->lock); + all_pinned = 1; + goto out_one_pinned; +@@ -2214,11 +3190,12 @@ out_one_pinned: + * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). + * this_rq is locked. 
+ */ +-static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, ++static int load_balance_newidle(vcpu_t this_cpu, runqueue_t *this_rq, + struct sched_domain *sd) + { + struct sched_group *group; +- runqueue_t *busiest = NULL; ++ runqueue_t *busiest; ++ vcpu_t busiest_vcpu; + unsigned long imbalance; + int nr_moved = 0; + int sd_idle = 0; +@@ -2233,13 +3210,12 @@ static int load_balance_newidle(int this + goto out_balanced; + } + +- busiest = find_busiest_queue(group, NEWLY_IDLE); +- if (!busiest) { ++ busiest_vcpu = find_busiest_queue(this_cpu, group, NEWLY_IDLE); ++ if (!busiest_vcpu || busiest_vcpu == this_cpu) { + schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); + goto out_balanced; + } +- +- BUG_ON(busiest == this_rq); ++ busiest = vcpu_rq(busiest_vcpu); + + schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); + +@@ -2272,8 +3248,11 @@ out_balanced: + /* + * idle_balance is called by schedule() if this_cpu is about to become + * idle. Attempts to pull tasks from other CPUs. ++ * ++ * Returns whether to continue with another runqueue ++ * instead of switching to idle. + */ +-static void idle_balance(int this_cpu, runqueue_t *this_rq) ++static int idle_balance(vcpu_t this_cpu, runqueue_t *this_rq) + { + struct sched_domain *sd; + +@@ -2281,10 +3260,11 @@ static void idle_balance(int this_cpu, r + if (sd->flags & SD_BALANCE_NEWIDLE) { + if (load_balance_newidle(this_cpu, this_rq, sd)) { + /* We've pulled tasks over so stop searching */ +- break; ++ return 1; + } + } + } ++ return 0; + } + + /* +@@ -2294,18 +3274,26 @@ static void idle_balance(int this_cpu, r + * logical imbalances. + * + * Called with busiest_rq locked. ++ * ++ * In human terms: balancing of CPU load by moving tasks between CPUs is ++ * performed by 2 methods, push and pull. ++ * In certain places when CPU is found to be idle, it performs pull from busy ++ * CPU to current (idle) CPU. 
++ * active_load_balance implements push method, with migration thread getting ++ * scheduled on a busy CPU (hence, making all running processes on this CPU sit ++ * in the queue) and selecting where to push and which task. + */ +-static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) ++static void active_load_balance(runqueue_t *busiest_rq, vcpu_t busiest_cpu) + { + struct sched_domain *sd; + runqueue_t *target_rq; +- int target_cpu = busiest_rq->push_cpu; ++ vcpu_t target_cpu = busiest_rq->push_cpu; + + if (busiest_rq->nr_running <= 1) + /* no task to move */ + return; + +- target_rq = cpu_rq(target_cpu); ++ target_rq = vcpu_rq(target_cpu); + + /* + * This condition is "impossible", if it occurs +@@ -2317,10 +3305,17 @@ static void active_load_balance(runqueue + /* move a task from busiest_rq to target_rq */ + double_lock_balance(busiest_rq, target_rq); + ++ /* ++ * Our main candidate where to push our tasks is busiest->push_cpu. ++ * First, find the domain that spans over both that candidate CPU and ++ * the current one. ++ * ++ * FIXME: make sure that push_cpu doesn't disappear before we get here. ++ */ + /* Search for an sd spanning us and the target CPU. 
*/ + for_each_domain(target_cpu, sd) + if ((sd->flags & SD_LOAD_BALANCE) && +- cpu_isset(busiest_cpu, sd->span)) ++ cpu_isset(vcpu_last_pcpu(busiest_cpu), sd->span)) + break; + + if (unlikely(sd == NULL)) +@@ -2346,31 +3341,17 @@ out: + */ + + /* Don't have all balancing operations going off at once */ +-#define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS) ++#define CPU_OFFSET(cpu) (HZ * (cpu) / NR_CPUS) + +-static void rebalance_tick(int this_cpu, runqueue_t *this_rq, ++static void rebalance_tick(vcpu_t this_cpu, runqueue_t *this_rq, + enum idle_type idle) + { +- unsigned long old_load, this_load; +- unsigned long j = jiffies + CPU_OFFSET(this_cpu); ++ unsigned long j; + struct sched_domain *sd; +- int i; + +- this_load = this_rq->nr_running * SCHED_LOAD_SCALE; + /* Update our load */ +- for (i = 0; i < 3; i++) { +- unsigned long new_load = this_load; +- int scale = 1 << i; +- old_load = this_rq->cpu_load[i]; +- /* +- * Round up the averaging division if load is increasing. This +- * prevents us from getting stuck on 9 if the load is 10, for +- * example. 
+- */ +- if (new_load > old_load) +- new_load += scale-1; +- this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale; +- } ++ update_rq_cpu_load(this_rq); ++ j = jiffies + CPU_OFFSET(smp_processor_id()); + + for_each_domain(this_cpu, sd) { + unsigned long interval; +@@ -2404,17 +3385,19 @@ static void rebalance_tick(int this_cpu, + /* + * on UP we do not need to balance between CPUs: + */ +-static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle) ++static inline void rebalance_tick(vcpu_t cpu, runqueue_t *rq, enum idle_type idle) + { + } +-static inline void idle_balance(int cpu, runqueue_t *rq) ++static inline void idle_balance(vcpu_t cpu, runqueue_t *rq) + { + } + #endif + +-static inline int wake_priority_sleeper(runqueue_t *rq) ++static inline int wake_priority_sleeper(runqueue_t *rq, task_t *idle) + { + int ret = 0; ++#ifndef CONFIG_SCHED_VCPU ++ /* FIXME: can we implement SMT priority sleeping for this? */ + #ifdef CONFIG_SCHED_SMT + spin_lock(&rq->lock); + /* +@@ -2422,11 +3405,13 @@ static inline int wake_priority_sleeper( + * reasons reschedule the idle task to see if it can now run. + */ + if (rq->nr_running) { +- resched_task(rq->idle); ++ /* FIXME */ ++ resched_task(idle); + ret = 1; + } + spin_unlock(&rq->lock); + #endif ++#endif + return ret; + } + +@@ -2476,6 +3461,15 @@ unsigned long long current_sched_time(co + STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ + ((rq)->curr->static_prio > (rq)->best_expired_prio)) + ++#ifdef CONFIG_VE ++#define update_ve_cpu_time(p, time, tick) do { \ ++ VE_CPU_STATS((p)->ve_task_info.owner_env, \ ++ task_cpu(p))->time += tick; \ ++ } while (0) ++#else ++#define update_ve_cpu_time(p, time, tick) do { } while (0) ++#endif ++ + /* + * Account user cpu time to a process. + * @p: the process that the cpu time gets accounted to +@@ -2491,10 +3485,13 @@ void account_user_time(struct task_struc + + /* Add user time to cpustat. 
*/ + tmp = cputime_to_cputime64(cputime); +- if (TASK_NICE(p) > 0) ++ if (TASK_NICE(p) > 0) { + cpustat->nice = cputime64_add(cpustat->nice, tmp); +- else ++ update_ve_cpu_time(p, nice, tmp); ++ } else { + cpustat->user = cputime64_add(cpustat->user, tmp); ++ update_ve_cpu_time(p, user, tmp); ++ } + } + + /* +@@ -2511,14 +3508,16 @@ void account_system_time(struct task_str + cputime64_t tmp; + + p->stime = cputime_add(p->stime, cputime); ++ tmp = cputime_to_cputime64(cputime); ++ ++ update_ve_cpu_time(p, system, tmp); + + /* Add system time to cpustat. */ +- tmp = cputime_to_cputime64(cputime); + if (hardirq_count() - hardirq_offset) + cpustat->irq = cputime64_add(cpustat->irq, tmp); + else if (softirq_count()) + cpustat->softirq = cputime64_add(cpustat->softirq, tmp); +- else if (p != rq->idle) ++ else if (p != this_pcpu()->idle) + cpustat->system = cputime64_add(cpustat->system, tmp); + else if (atomic_read(&rq->nr_iowait) > 0) + cpustat->iowait = cputime64_add(cpustat->iowait, tmp); +@@ -2539,7 +3538,7 @@ void account_steal_time(struct task_stru + cputime64_t tmp = cputime_to_cputime64(steal); + runqueue_t *rq = this_rq(); + +- if (p == rq->idle) { ++ if (p == this_pcpu()->idle) { + p->stime = cputime_add(p->stime, steal); + if (atomic_read(&rq->nr_iowait) > 0) + cpustat->iowait = cputime64_add(cpustat->iowait, tmp); +@@ -2559,18 +3558,23 @@ void account_steal_time(struct task_stru + void scheduler_tick(void) + { + int cpu = smp_processor_id(); +- runqueue_t *rq = this_rq(); ++ vcpu_t vcpu; ++ runqueue_t *rq; + task_t *p = current; + unsigned long long now = sched_clock(); + ++ vcpu = this_vcpu(); ++ rq = vcpu_rq(vcpu); + update_cpu_clock(p, rq, now); + + rq->timestamp_last_tick = now; + +- if (p == rq->idle) { +- if (wake_priority_sleeper(rq)) ++ set_tsk_need_resched(p); //FIXME ++ ++ if (p == pcpu(cpu)->idle) { ++ if (wake_priority_sleeper(rq, pcpu(cpu)->idle)) + goto out; +- rebalance_tick(cpu, rq, SCHED_IDLE); ++ rebalance_tick(vcpu, rq, SCHED_IDLE); + 
return; + } + +@@ -2646,10 +3650,14 @@ void scheduler_tick(void) + out_unlock: + spin_unlock(&rq->lock); + out: +- rebalance_tick(cpu, rq, NOT_IDLE); ++ rebalance_tick(vcpu, rq, NOT_IDLE); + } + +-#ifdef CONFIG_SCHED_SMT ++#if defined(CONFIG_SCHED_SMT) && !defined(CONFIG_SCHED_VCPU) ++/* FIXME: SMT scheduling ++ * rq->cpu is initialized with rq address if FAIRSCED is on ++ * this is not correct for SMT case ++ */ + static inline void wakeup_busy_runqueue(runqueue_t *rq) + { + /* If an SMT runqueue is sleeping due to priority reasons wake it up */ +@@ -2657,7 +3665,7 @@ static inline void wakeup_busy_runqueue( + resched_task(rq->idle); + } + +-static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) ++static void wake_sleeping_dependent(vcpu_t this_cpu) + { + struct sched_domain *tmp, *sd = NULL; + cpumask_t sibling_map; +@@ -2711,7 +3719,7 @@ static inline unsigned long smt_slice(ta + return p->time_slice * (100 - sd->per_cpu_gain) / 100; + } + +-static int dependent_sleeper(int this_cpu, runqueue_t *this_rq) ++static int dependent_sleeper(vcpu_t this_cpu) + { + struct sched_domain *tmp, *sd = NULL; + cpumask_t sibling_map; +@@ -2812,11 +3820,11 @@ out_unlock: + return ret; + } + #else +-static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) ++static inline void wake_sleeping_dependent(vcpu_t this_cpu) + { + } + +-static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) ++static inline int dependent_sleeper(vcpu_t this_cpu) + { + return 0; + } +@@ -2866,7 +3874,9 @@ asmlinkage void __sched schedule(void) + struct list_head *queue; + unsigned long long now; + unsigned long run_time; +- int cpu, idx, new_prio; ++ int idx, new_prio; ++ vcpu_t vcpu; ++ cycles_t cycles; + + /* + * Test if we are atomic. 
Since do_exit() needs to call into +@@ -2888,13 +3898,14 @@ need_resched: + prev = current; + release_kernel_lock(prev); + need_resched_nonpreemptible: ++ cycles = get_cycles(); + rq = this_rq(); + + /* + * The idle thread is not allowed to schedule! + * Remove this check after it has been exercised a bit. + */ +- if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) { ++ if (unlikely(prev == this_pcpu()->idle) && prev->state != TASK_RUNNING) { + printk(KERN_ERR "bad: scheduling from the idle thread!\n"); + dump_stack(); + } +@@ -2932,25 +3943,35 @@ need_resched_nonpreemptible: + } + } + +- cpu = smp_processor_id(); ++ prev->sleep_avg -= run_time; ++ if ((long)prev->sleep_avg <= 0) ++ prev->sleep_avg = 0; ++ ++ vcpu = rq_vcpu(rq); ++ if (rq->nr_running && ++ jiffies - vcpu->start_time < msecs_to_jiffies(vcpu_timeslice)) ++ goto same_vcpu; ++ ++ if (unlikely(!rq->nr_running)) ++ idle_balance(vcpu, rq); ++ vcpu = schedule_vcpu(vcpu, cycles); ++ rq = vcpu_rq(vcpu); ++ + if (unlikely(!rq->nr_running)) { + go_idle: +- idle_balance(cpu, rq); +- if (!rq->nr_running) { +- next = rq->idle; +- rq->expired_timestamp = 0; +- wake_sleeping_dependent(cpu, rq); +- /* +- * wake_sleeping_dependent() might have released +- * the runqueue, so break out if we got new +- * tasks meanwhile: +- */ +- if (!rq->nr_running) +- goto switch_tasks; +- } ++ next = this_pcpu()->idle; ++ rq->expired_timestamp = 0; ++ wake_sleeping_dependent(vcpu); ++ /* ++ * wake_sleeping_dependent() might have released ++ * the runqueue, so break out if we got new ++ * tasks meanwhile: ++ */ ++ if (!rq->nr_running) ++ goto switch_tasks; + } else { +- if (dependent_sleeper(cpu, rq)) { +- next = rq->idle; ++ if (dependent_sleeper(vcpu)) { ++ next = this_pcpu()->idle; + goto switch_tasks; + } + /* +@@ -2962,6 +3983,7 @@ go_idle: + goto go_idle; + } + ++same_vcpu: + array = rq->active; + if (unlikely(!array->nr_active)) { + /* +@@ -2998,28 +4020,50 @@ go_idle: + requeue_task(next, array); + } + 
next->activated = 0; ++ + switch_tasks: +- if (next == rq->idle) ++ if (next == this_pcpu()->idle) + schedstat_inc(rq, sched_goidle); + prefetch(next); + prefetch_stack(next); + clear_tsk_need_resched(prev); +- rcu_qsctr_inc(task_cpu(prev)); ++ rcu_qsctr_inc(task_pcpu(prev)); + + update_cpu_clock(prev, rq, now); + +- prev->sleep_avg -= run_time; +- if ((long)prev->sleep_avg <= 0) +- prev->sleep_avg = 0; ++ /* updated w/o rq->lock, which is ok due to after-read-checks */ + prev->timestamp = prev->last_ran = now; + + sched_info_switch(prev, next); + if (likely(prev != next)) { ++ cycles_t cycles; ++ ++ /* current physical CPU id should be valid after switch */ ++ set_task_vcpu(next, vcpu); ++ set_task_pcpu(next, task_pcpu(prev)); ++ cycles = get_cycles(); + next->timestamp = now; + rq->nr_switches++; ++ glob_task_nrs[smp_processor_id()].nr_switches++; + rq->curr = next; + ++*switch_count; + ++#ifdef CONFIG_VE ++ prev->ve_task_info.sleep_stamp = cycles; ++ if (prev->state == TASK_RUNNING && prev != this_pcpu()->idle) ++ write_wakeup_stamp(prev, cycles); ++ update_sched_lat(next, cycles); ++ ++ /* because next & prev are protected with ++ * runqueue lock we may not worry about ++ * wakeup_stamp and sched_time protection ++ * (same thing in 'else' branch below) ++ */ ++ update_ve_task_info(prev, cycles); ++ next->ve_task_info.sched_time = cycles; ++ write_wakeup_stamp(next, 0); ++#endif ++ + prepare_task_switch(rq, next); + prev = context_switch(rq, prev, next); + barrier(); +@@ -3029,8 +4073,10 @@ switch_tasks: + * frame will be invalid. + */ + finish_task_switch(this_rq(), prev); +- } else ++ } else { ++ update_ve_task_info(prev, get_cycles()); + spin_unlock_irq(&rq->lock); ++ } + + prev = current; + if (unlikely(reacquire_kernel_lock(prev) < 0)) +@@ -3565,27 +4611,9 @@ int task_prio(const task_t *p) + */ + int task_nice(const task_t *p) + { +- return TASK_NICE(p); +-} +-EXPORT_SYMBOL_GPL(task_nice); +- +-/** +- * idle_cpu - is a given cpu idle currently? 
+- * @cpu: the processor in question. +- */ +-int idle_cpu(int cpu) +-{ +- return cpu_curr(cpu) == cpu_rq(cpu)->idle; +-} +- +-/** +- * idle_task - return the idle task for a given cpu. +- * @cpu: the processor in question. +- */ +-task_t *idle_task(int cpu) +-{ +- return cpu_rq(cpu)->idle; ++ return TASK_NICE(p); + } ++EXPORT_SYMBOL_GPL(task_nice); + + /** + * find_process_by_pid - find a process with a matching PID value. +@@ -3593,7 +4621,7 @@ task_t *idle_task(int cpu) + */ + static inline task_t *find_process_by_pid(pid_t pid) + { +- return pid ? find_task_by_pid(pid) : current; ++ return pid ? find_task_by_pid_ve(pid) : current; + } + + /* Actually do priority change: must hold rq lock. */ +@@ -3653,7 +4681,7 @@ recheck: + /* + * Allow unprivileged RT tasks to decrease priority: + */ +- if (!capable(CAP_SYS_NICE)) { ++ if (!capable(CAP_SYS_ADMIN)) { + /* + * can't change policy, except between SCHED_NORMAL + * and SCHED_BATCH: +@@ -4110,10 +5138,19 @@ EXPORT_SYMBOL(yield); + */ + void __sched io_schedule(void) + { +- struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); ++ struct runqueue *rq = this_rq(); ++ ++#ifdef CONFIG_VE ++ struct ve_struct *ve; ++ ve = current->ve_task_info.owner_env; ++#endif + + atomic_inc(&rq->nr_iowait); ++ ve_nr_iowait_inc(ve, task_cpu(current)); ++ nr_iowait_inc(smp_processor_id()); + schedule(); ++ nr_iowait_dec(smp_processor_id()); ++ ve_nr_iowait_dec(ve, task_cpu(current)); + atomic_dec(&rq->nr_iowait); + } + +@@ -4121,11 +5158,20 @@ EXPORT_SYMBOL(io_schedule); + + long __sched io_schedule_timeout(long timeout) + { +- struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); ++ struct runqueue *rq = this_rq(); + long ret; + ++#ifdef CONFIG_VE ++ struct ve_struct *ve; ++ ve = current->ve_task_info.owner_env; ++#endif ++ + atomic_inc(&rq->nr_iowait); ++ ve_nr_iowait_inc(ve, task_cpu(current)); ++ nr_iowait_inc(smp_processor_id()); + ret = schedule_timeout(timeout); ++ nr_iowait_dec(smp_processor_id()); ++ 
ve_nr_iowait_dec(ve, task_cpu(current)); + atomic_dec(&rq->nr_iowait); + return ret; + } +@@ -4248,15 +5294,9 @@ static void show_task(task_t *p) + else + printk("?"); + #if (BITS_PER_LONG == 32) +- if (state == TASK_RUNNING) +- printk(" running "); +- else +- printk(" %08lX ", thread_saved_pc(p)); ++ printk(" %08lX ", (unsigned long)p); + #else +- if (state == TASK_RUNNING) +- printk(" running task "); +- else +- printk(" %016lx ", thread_saved_pc(p)); ++ printk(" %016lx ", (unsigned long)p); + #endif + #ifdef CONFIG_DEBUG_STACK_USAGE + { +@@ -4295,26 +5335,41 @@ void show_state(void) + #if (BITS_PER_LONG == 32) + printk("\n" + " sibling\n"); +- printk(" task PC pid father child younger older\n"); ++ printk(" task taskaddr pid father child younger older\n"); + #else + printk("\n" + " sibling\n"); +- printk(" task PC pid father child younger older\n"); ++ printk(" task taskaddr pid father child younger older\n"); + #endif + read_lock(&tasklist_lock); +- do_each_thread(g, p) { ++ do_each_thread_all(g, p) { + /* + * reset the NMI-timeout, listing all files on a slow + * console might take alot of time: + */ + touch_nmi_watchdog(); + show_task(p); +- } while_each_thread(g, p); ++ } while_each_thread_all(g, p); + + read_unlock(&tasklist_lock); + mutex_debug_show_all_locks(); + } + ++static void init_boot_vcpus(long cpu) ++{ ++ if (vsched_vcpu(&idle_vsched, cpu) != NULL) ++ return; ++ ++ if (__add_vcpu(&idle_vsched, cpu) != 0) ++ panic("Can't create idle vcpu %ld\n", cpu); ++ ++ /* Also create vcpu for default_vsched */ ++ if (__add_vcpu(&default_vsched, cpu) != 0) ++ panic("Can't create default vcpu %ld\n", cpu); ++ ++ cpu_set(cpu, idle_vsched.pcpu_running_map); ++} ++ + /** + * init_idle - set up an idle thread for a given CPU + * @idle: task in question +@@ -4325,22 +5380,47 @@ void show_state(void) + */ + void __devinit init_idle(task_t *idle, int cpu) + { +- runqueue_t *rq = cpu_rq(cpu); ++ struct vcpu_scheduler *vsched; ++ vcpu_t vcpu; ++ runqueue_t *rq; + 
unsigned long flags; + ++#ifdef CONFIG_SCHED_VCPU ++ init_boot_vcpus(cpu); ++#endif ++ vsched = &idle_vsched; ++ vcpu = vsched_vcpu(vsched, cpu); ++ rq = vcpu_rq(vcpu); ++ + idle->timestamp = sched_clock(); + idle->sleep_avg = 0; + idle->array = NULL; + idle->prio = MAX_PRIO; + idle->state = TASK_RUNNING; + idle->cpus_allowed = cpumask_of_cpu(cpu); ++ set_task_vsched(idle, &idle_vsched); + set_task_cpu(idle, cpu); + + spin_lock_irqsave(&rq->lock, flags); +- rq->curr = rq->idle = idle; ++ pcpu(cpu)->idle = idle; ++ rq->curr = idle; + #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) + idle->oncpu = 1; + #endif ++ set_task_pcpu(idle, cpu); ++ set_task_vsched(idle, vsched); ++ set_task_vcpu(idle, vcpu); ++#ifdef CONFIG_SCHED_VCPU ++ /* the following code is very close to vcpu_get */ ++ spin_lock(&fairsched_lock); ++ pcpu(cpu)->vcpu = vcpu; ++ pcpu(cpu)->vsched = vcpu->vsched; ++ list_move_tail(&vcpu->list, &vsched->running_list); ++ __set_bit(cpu, vsched->vcpu_running_map.bits); ++ __set_bit(cpu, vsched->pcpu_running_map.bits); ++ vcpu->running = 1; ++ spin_unlock(&fairsched_lock); ++#endif + spin_unlock_irqrestore(&rq->lock, flags); + + /* Set the preempt count _outside_ the spinlocks! */ +@@ -4360,7 +5440,6 @@ void __devinit init_idle(task_t *idle, i + */ + cpumask_t nohz_cpu_mask = CPU_MASK_NONE; + +-#ifdef CONFIG_SMP + /* + * This is how migration works: + * +@@ -4377,6 +5456,7 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE; + * 7) we wake up and the migration is done. + */ + ++#ifdef CONFIG_SMP + /* + * Change a given task's CPU affinity. 
Migrate the thread to a + * proper CPU and schedule it away if the CPU it's executing on +@@ -4392,9 +5472,11 @@ int set_cpus_allowed(task_t *p, cpumask_ + int ret = 0; + migration_req_t req; + runqueue_t *rq; ++ struct vcpu_scheduler *vsched; + ++ vsched = task_vsched(p); + rq = task_rq_lock(p, &flags); +- if (!cpus_intersects(new_mask, cpu_online_map)) { ++ if (!cpus_intersects(new_mask, vsched_vcpu_online_map(vsched))) { + ret = -EINVAL; + goto out; + } +@@ -4404,7 +5486,8 @@ int set_cpus_allowed(task_t *p, cpumask_ + if (cpu_isset(task_cpu(p), new_mask)) + goto out; + +- if (migrate_task(p, any_online_cpu(new_mask), &req)) { ++ if (migrate_task(p, vsched_vcpu(vsched, any_online_cpu(new_mask)), ++ &req)) { + /* Need help from migration thread: drop lock and wait. */ + task_rq_unlock(rq, &flags); + wake_up_process(rq->migration_thread); +@@ -4418,6 +5501,7 @@ out: + } + + EXPORT_SYMBOL_GPL(set_cpus_allowed); ++#endif + + /* + * Move (not current) task off this cpu, onto dest cpu. We're doing +@@ -4428,25 +5512,30 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed); + * So we race with normal scheduler movements, but that's OK, as long + * as the task is no longer on this CPU. + */ +-static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) ++static void __migrate_task(struct task_struct *p, vcpu_t src_cpu, vcpu_t dest_cpu) + { + runqueue_t *rq_dest, *rq_src; + +- if (unlikely(cpu_is_offline(dest_cpu))) ++ if (unlikely(vcpu_is_offline(dest_cpu))) + return; + +- rq_src = cpu_rq(src_cpu); +- rq_dest = cpu_rq(dest_cpu); ++#ifdef CONFIG_SCHED_VCPU ++ BUG_ON(vcpu_vsched(src_cpu) == &idle_vsched); ++#endif ++ rq_src = vcpu_rq(src_cpu); ++ rq_dest = vcpu_rq(dest_cpu); + + double_rq_lock(rq_src, rq_dest); + /* Already moved. */ +- if (task_cpu(p) != src_cpu) ++ if (task_vcpu(p) != src_cpu) + goto out; + /* Affinity changed (again). 
*/ +- if (!cpu_isset(dest_cpu, p->cpus_allowed)) ++ if (!vcpu_isset(dest_cpu, p->cpus_allowed)) + goto out; + +- set_task_cpu(p, dest_cpu); ++ BUG_ON(task_running(rq_src, p)); ++ set_task_vsched(p, vcpu_vsched(dest_cpu)); ++ set_task_vcpu(p, dest_cpu); + if (p->array) { + /* + * Sync timestamp with rq_dest's before activating. +@@ -4474,9 +5563,9 @@ out: + static int migration_thread(void *data) + { + runqueue_t *rq; +- int cpu = (long)data; ++ vcpu_t cpu = (vcpu_t)data; + +- rq = cpu_rq(cpu); ++ rq = vcpu_rq(cpu); + BUG_ON(rq->migration_thread != current); + + set_current_state(TASK_INTERRUPTIBLE); +@@ -4488,15 +5577,17 @@ static int migration_thread(void *data) + + spin_lock_irq(&rq->lock); + +- if (cpu_is_offline(cpu)) { ++ if (vcpu_is_offline(cpu)) { + spin_unlock_irq(&rq->lock); + goto wait_to_die; + } + ++#ifdef CONFIG_SMP + if (rq->active_balance) { + active_load_balance(rq, cpu); + rq->active_balance = 0; + } ++#endif + + head = &rq->migration_queue; + +@@ -4529,14 +5620,16 @@ wait_to_die: + return 0; + } + +-#ifdef CONFIG_HOTPLUG_CPU + /* Figure out where task on dead CPU should go, use force if neccessary. */ +-static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) ++static void move_task_off_dead_cpu(vcpu_t dead_cpu, struct task_struct *tsk) + { + int dest_cpu; ++ struct vcpu_scheduler *vsched; + cpumask_t mask; + ++#if defined(CONFIG_HOTPLUG_CPU) && !defined(CONFIG_SCHED_VCPU) + /* On same node? 
*/ ++#error FIXME: wrong code + mask = node_to_cpumask(cpu_to_node(dead_cpu)); + cpus_and(mask, mask, tsk->cpus_allowed); + dest_cpu = any_online_cpu(mask); +@@ -4560,9 +5653,20 @@ static void move_task_off_dead_cpu(int d + "longer affine to cpu%d\n", + tsk->pid, tsk->comm, dead_cpu); + } +- __migrate_task(tsk, dead_cpu, dest_cpu); ++#elif defined(CONFIG_SCHED_VCPU) ++ vsched = vcpu_vsched(dead_cpu); ++ mask = vsched_vcpu_online_map(vsched); ++ cpus_and(mask, mask, tsk->cpus_allowed); ++ dest_cpu = any_online_cpu(mask); ++ ++ /* On any allowed CPU? */ ++ if (dest_cpu == NR_CPUS) ++ dest_cpu = any_online_cpu(vsched_vcpu_online_map(vsched)); ++#endif ++ __migrate_task(tsk, dead_cpu, vsched_vcpu(vsched, dest_cpu)); + } + ++#ifdef CONFIG_HOTPLUG_CPU + /* + * While a dead CPU has no uninterruptible tasks queued at this point, + * it might still have a nonzero ->nr_uninterruptible counter, because +@@ -4582,25 +5686,30 @@ static void migrate_nr_uninterruptible(r + double_rq_unlock(rq_src, rq_dest); + local_irq_restore(flags); + } ++#endif + + /* Run through task list and migrate tasks from the dead cpu. */ +-static void migrate_live_tasks(int src_cpu) ++static void migrate_live_tasks(vcpu_t src_cpu) + { + struct task_struct *tsk, *t; + ++ BUG_ON(vcpu_isset(src_cpu, vsched_vcpu_online_map(vcpu_vsched(src_cpu)))); + write_lock_irq(&tasklist_lock); + +- do_each_thread(t, tsk) { ++ do_each_thread_all(t, tsk) { + if (tsk == current) + continue; ++ if (tsk == vcpu_rq(src_cpu)->migration_thread) ++ continue; + +- if (task_cpu(tsk) == src_cpu) ++ if (task_vcpu(tsk) == src_cpu) + move_task_off_dead_cpu(src_cpu, tsk); +- } while_each_thread(t, tsk); ++ } while_each_thread_all(t, tsk); + + write_unlock_irq(&tasklist_lock); + } + ++#ifdef CONFIG_HOTPLUG_CPU + /* Schedules idle task to be the next runnable task on current CPU. + * It does so by boosting its priority to highest possible and adding it to + * the _front_ of runqueue. Used by CPU offline code. 
+@@ -4622,6 +5731,9 @@ void sched_idle_next(void) + + __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); + /* Add idle task to _front_ of it's priority queue */ ++#ifdef CONFIG_SCHED_VCPU ++#error "FIXME: VCPU vs. HOTPLUG: fix the code below" ++#endif + __activate_idle_task(p, rq); + + spin_unlock_irqrestore(&rq->lock, flags); +@@ -4683,48 +5795,83 @@ static void migrate_dead_tasks(unsigned + } + #endif /* CONFIG_HOTPLUG_CPU */ + ++static void migration_thread_bind(struct task_struct *k, vcpu_t cpu) ++{ ++ BUG_ON(k->state != TASK_INTERRUPTIBLE); ++ /* Must have done schedule() in kthread() before we set_task_cpu */ ++ wait_task_inactive(k); ++ ++ set_task_vsched(k, vcpu_vsched(cpu)); ++ set_task_vcpu(k, cpu); ++ k->cpus_allowed = cpumask_of_cpu(cpu->id); ++} ++ ++static void migration_thread_stop(runqueue_t *rq) ++{ ++ struct task_struct *thread; ++ ++ thread = rq->migration_thread; ++ if (thread == NULL) ++ return; ++ ++ get_task_struct(thread); ++ kthread_stop(thread); ++ ++ /* We MUST ensure, that the do_exit of the migration thread is ++ * completed and it will never scheduled again before vsched_destroy. ++ * The task with flag PF_DEAD if unscheduled will never receive ++ * CPU again. */ ++ while (!(thread->flags & PF_DEAD) || task_running(rq, thread)) ++ yield(); ++ put_task_struct(thread); ++ ++ rq->migration_thread = NULL; ++} ++ + /* + * migration_call - callback that gets triggered when a CPU is added. + * Here we can start up the necessary migration thread for the new CPU. 
+ */ +-static int migration_call(struct notifier_block *nfb, unsigned long action, ++static int vmigration_call(struct notifier_block *nfb, unsigned long action, + void *hcpu) + { +- int cpu = (long)hcpu; ++ vcpu_t cpu = (vcpu_t)hcpu; + struct task_struct *p; + struct runqueue *rq; + unsigned long flags; + + switch (action) { + case CPU_UP_PREPARE: +- p = kthread_create(migration_thread, hcpu, "migration/%d",cpu); ++ p = kthread_create(migration_thread, hcpu, "migration/%d/%d", ++ vsched_id(vcpu_vsched(cpu)), cpu->id); + if (IS_ERR(p)) + return NOTIFY_BAD; + p->flags |= PF_NOFREEZE; +- kthread_bind(p, cpu); +- /* Must be high prio: stop_machine expects to yield to it. */ ++ ++ migration_thread_bind(p, cpu); + rq = task_rq_lock(p, &flags); ++ /* Must be high prio: stop_machine expects to yield to it. */ + __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); + task_rq_unlock(rq, &flags); +- cpu_rq(cpu)->migration_thread = p; ++ vcpu_rq(cpu)->migration_thread = p; + break; + case CPU_ONLINE: + /* Strictly unneccessary, as first user will wake it. */ +- wake_up_process(cpu_rq(cpu)->migration_thread); ++ wake_up_process(vcpu_rq(cpu)->migration_thread); + break; +-#ifdef CONFIG_HOTPLUG_CPU ++#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_SCHED_VCPU) ++#error "FIXME: CPU down code doesn't work yet with VCPUs" ++#endif + case CPU_UP_CANCELED: + /* Unbind it from offline cpu so it can run. Fall thru. 
*/ +- kthread_bind(cpu_rq(cpu)->migration_thread, +- any_online_cpu(cpu_online_map)); +- kthread_stop(cpu_rq(cpu)->migration_thread); +- cpu_rq(cpu)->migration_thread = NULL; ++ migration_thread_bind(vcpu_rq(cpu)->migration_thread, this_vcpu()); ++ migration_thread_stop(vcpu_rq(cpu)); + break; + case CPU_DEAD: + migrate_live_tasks(cpu); +- rq = cpu_rq(cpu); +- kthread_stop(rq->migration_thread); +- rq->migration_thread = NULL; ++ rq = vcpu_rq(cpu); ++ migration_thread_stop(rq); ++#ifdef CONFIG_HOTPLUG_CPU + /* Idle task back to normal (off runqueue, low prio) */ + rq = task_rq_lock(rq->idle, &flags); + deactivate_task(rq->idle, rq); +@@ -4734,6 +5881,7 @@ static int migration_call(struct notifie + task_rq_unlock(rq, &flags); + migrate_nr_uninterruptible(rq); + BUG_ON(rq->nr_running != 0); ++#endif + + /* No need to migrate the tasks: it was best-effort if + * they didn't do lock_cpu_hotplug(). Just wake up +@@ -4748,11 +5896,19 @@ static int migration_call(struct notifie + } + spin_unlock_irq(&rq->lock); + break; +-#endif + } + return NOTIFY_OK; + } + ++static int migration_call(struct notifier_block *nfb, unsigned long action, ++ void *hcpu) ++{ ++ if (action == CPU_UP_PREPARE) ++ init_boot_vcpus((long)hcpu); ++ /* we need to translate pcpu to vcpu */ ++ return vmigration_call(nfb, action, vsched_default_vcpu((long)hcpu)); ++} ++ + /* Register at highest priority so that task migration (migrate_all_tasks) + * happens before everything else. 
+ */ +@@ -4770,7 +5926,6 @@ int __init migration_init(void) + register_cpu_notifier(&migration_notifier); + return 0; + } +-#endif + + #ifdef CONFIG_SMP + #undef SCHED_DOMAIN_DEBUG +@@ -4798,7 +5953,7 @@ static void sched_domain_debug(struct sc + printk(KERN_DEBUG); + for (i = 0; i < level + 1; i++) + printk(" "); +- printk("domain %d: ", level); ++ printk("domain %d, flags %x: ", level, sd->flags); + + if (!(sd->flags & SD_LOAD_BALANCE)) { + printk("does not load-balance\n"); +@@ -4923,7 +6078,7 @@ static int sd_parent_degenerate(struct s + */ + static void cpu_attach_domain(struct sched_domain *sd, int cpu) + { +- runqueue_t *rq = cpu_rq(cpu); ++ runqueue_t *rq = vcpu_rq(vsched_default_vcpu(cpu)); + struct sched_domain *tmp; + + /* Remove the sched domains which do not contribute to scheduling. */ +@@ -4940,6 +6095,7 @@ static void cpu_attach_domain(struct sch + + sched_domain_debug(sd, cpu); + ++ rcu_assign_pointer(pcpu(cpu)->sd, sd); + rcu_assign_pointer(rq->sd, sd); + } + +@@ -5118,7 +6274,7 @@ static unsigned long domain_distance(int + unsigned long distance = 0; + struct sched_domain *sd; + +- for_each_domain(cpu1, sd) { ++ for_each_pdomain(pcpu(cpu1)->sd, sd) { + WARN_ON(!cpu_isset(cpu1, sd->span)); + if (cpu_isset(cpu2, sd->span)) + return distance; +@@ -5440,7 +6596,7 @@ static void calibrate_migration_costs(co + */ + for_each_cpu_mask(cpu, *cpu_map) { + distance = 0; +- for_each_domain(cpu, sd) { ++ for_each_pdomain(pcpu(cpu)->sd, sd) { + sd->cache_hot_time = migration_cost[distance]; + distance++; + } +@@ -6012,42 +7168,398 @@ int in_sched_functions(unsigned long add + && addr < (unsigned long)__sched_text_end); + } + +-void __init sched_init(void) ++static void init_rq(struct runqueue *rq, int cpu) ++{ ++ int j, k; ++ prio_array_t *array; ++ ++ spin_lock_init(&rq->lock); ++ rq->nr_running = 0; ++ rq->active = rq->arrays; ++ rq->expired = rq->arrays + 1; ++ rq->best_expired_prio = MAX_PRIO; ++ ++#ifdef CONFIG_SMP ++ rq->sd = NULL; ++ for (j = 0; j < 3; 
j++) ++ rq->cpu_load[j] = 0; ++ rq->active_balance = 0; ++#endif ++ rq->push_cpu = 0; ++ rq->migration_thread = NULL; ++ INIT_LIST_HEAD(&rq->migration_queue); ++ rq->cpu = cpu; ++ atomic_set(&rq->nr_iowait, 0); ++ ++ for (j = 0; j < 2; j++) { ++ array = rq->arrays + j; ++ for (k = 0; k < MAX_PRIO; k++) { ++ INIT_LIST_HEAD(array->queue + k); ++ __clear_bit(k, array->bitmap); ++ } ++ // delimiter for bitsearch ++ __set_bit(MAX_PRIO, array->bitmap); ++ } ++} ++ ++#if defined(CONFIG_SCHED_VCPU) || defined(CONFIG_FAIRSCHED) ++static void init_vcpu(vcpu_t vcpu, int id) ++{ ++ memset(vcpu, 0, sizeof(struct vcpu_info)); ++ vcpu->id = id; ++#ifdef CONFIG_SCHED_VCPU ++ vcpu->last_pcpu = id; ++#endif ++ init_rq(vcpu_rq(vcpu), id); ++} ++ ++/* both rq and vsched lock should be taken */ ++static void __install_vcpu(struct vcpu_scheduler *vsched, vcpu_t vcpu) ++{ ++ int id; ++ ++ id = vcpu->id; ++ vcpu->vsched = vsched; ++ vsched->vcpu[id] = vcpu; ++ vcpu->last_pcpu = id; ++ wmb(); ++ /* FIXME: probably locking should be reworked, e.g. 
++ we don't have corresponding rmb(), so we need to update mask ++ only after quiscent state */ ++ /* init_boot_vcpu() should be remade if RCU is used here */ ++ list_add(&vcpu->list, &vsched->idle_list); ++ cpu_set(id, vsched->vcpu_online_map); ++ vsched->num_online_vcpus++; ++} ++ ++static int install_vcpu(vcpu_t vcpu, struct vcpu_scheduler *vsched) + { + runqueue_t *rq; +- int i, j, k; ++ unsigned long flags; ++ int res = 0; + +- for_each_cpu(i) { +- prio_array_t *array; ++ rq = vcpu_rq(vcpu); ++ spin_lock_irqsave(&rq->lock, flags); ++ spin_lock(&fairsched_lock); + +- rq = cpu_rq(i); +- spin_lock_init(&rq->lock); +- rq->nr_running = 0; +- rq->active = rq->arrays; +- rq->expired = rq->arrays + 1; +- rq->best_expired_prio = MAX_PRIO; ++ if (vsched->vcpu[vcpu->id] != NULL) ++ res = -EBUSY; ++ else ++ __install_vcpu(vsched, vcpu); + +-#ifdef CONFIG_SMP +- rq->sd = NULL; +- for (j = 1; j < 3; j++) +- rq->cpu_load[j] = 0; +- rq->active_balance = 0; +- rq->push_cpu = 0; +- rq->migration_thread = NULL; +- INIT_LIST_HEAD(&rq->migration_queue); +-#endif +- atomic_set(&rq->nr_iowait, 0); +- +- for (j = 0; j < 2; j++) { +- array = rq->arrays + j; +- for (k = 0; k < MAX_PRIO; k++) { +- INIT_LIST_HEAD(array->queue + k); +- __clear_bit(k, array->bitmap); +- } +- // delimiter for bitsearch +- __set_bit(MAX_PRIO, array->bitmap); ++ spin_unlock(&fairsched_lock); ++ spin_unlock_irqrestore(&rq->lock, flags); ++ return res; ++} ++ ++static int __add_vcpu(struct vcpu_scheduler *vsched, int id) ++{ ++ vcpu_t vcpu; ++ int res; ++ ++ res = -ENOMEM; ++ vcpu = kmalloc(sizeof(struct vcpu_info), GFP_KERNEL); ++ if (vcpu == NULL) ++ goto out; ++ ++ init_vcpu(vcpu, id); ++ vcpu_rq(vcpu)->curr = this_pcpu()->idle; ++ res = install_vcpu(vcpu, vsched); ++ if (res < 0) ++ goto out_free; ++ return 0; ++ ++out_free: ++ kfree(vcpu); ++out: ++ return res; ++} ++ ++void vsched_init(struct vcpu_scheduler *vsched, int id) ++{ ++ memset(vsched, 0, sizeof(*vsched)); ++ ++ 
INIT_LIST_HEAD(&vsched->idle_list); ++ INIT_LIST_HEAD(&vsched->active_list); ++ INIT_LIST_HEAD(&vsched->running_list); ++ vsched->num_online_vcpus = 0; ++ vsched->vcpu_online_map = CPU_MASK_NONE; ++ vsched->vcpu_running_map = CPU_MASK_NONE; ++ vsched->pcpu_running_map = CPU_MASK_NONE; ++ vsched->id = id; ++} ++ ++#ifdef CONFIG_FAIRSCHED ++ ++/* No locks supposed to be held */ ++static void vsched_del_vcpu(vcpu_t vcpu); ++static int vsched_add_vcpu(struct vcpu_scheduler *vsched) ++{ ++ int res, err; ++ vcpu_t vcpu; ++ int id; ++ static DECLARE_MUTEX(id_mutex); ++ ++ down(&id_mutex); ++ id = find_first_zero_bit(vsched->vcpu_online_map.bits, NR_CPUS); ++ if (id >= NR_CPUS) { ++ err = -EBUSY; ++ goto out_up; ++ } ++ ++ err = __add_vcpu(vsched, id); ++ if (err < 0) ++ goto out_up; ++ ++ vcpu = vsched_vcpu(vsched, id); ++ err = -ENOMEM; ++ ++ res = vmigration_call(&migration_notifier, CPU_UP_PREPARE, vcpu); ++ if (res != NOTIFY_OK) ++ goto out_del_up; ++ ++ res = vmigration_call(&migration_notifier, CPU_ONLINE, vcpu); ++ if (res != NOTIFY_OK) ++ goto out_cancel_del_up; ++ ++ err = 0; ++ ++out_up: ++ up(&id_mutex); ++ return err; ++ ++out_cancel_del_up: ++ vmigration_call(&migration_notifier, CPU_UP_CANCELED, vcpu); ++out_del_up: ++ vsched_del_vcpu(vcpu); ++ goto out_up; ++} ++ ++static void vsched_del_vcpu(vcpu_t vcpu) ++{ ++ struct vcpu_scheduler *vsched; ++ runqueue_t *rq; ++ ++ vsched = vcpu_vsched(vcpu); ++ rq = vcpu_rq(vcpu); ++ ++ spin_lock_irq(&rq->lock); ++ spin_lock(&fairsched_lock); ++ cpu_clear(vcpu->id, vsched->vcpu_online_map); ++ vsched->num_online_vcpus--; ++ spin_unlock(&fairsched_lock); ++ spin_unlock_irq(&rq->lock); ++ ++ /* ++ * FIXME: ideas for VCPU hotplug: ++ * ++ * - push_cpu should be checked/cleanuped ++ * - serialization ++ */ ++ ++ /* ++ * all tasks should migrate from this VCPU somewhere, ++ * also, since this moment VCPU is offline, so migration_thread ++ * won't accept any new tasks... 
++ */ ++ vmigration_call(&migration_notifier, CPU_DEAD, vcpu); ++ BUG_ON(rq->nr_running != 0); ++ ++ /* vcpu_put() is called after deactivate_task. This loop makes sure ++ * that vcpu_put() was finished and vcpu can be freed */ ++ while ((volatile int)vcpu->running) ++ yield(); ++ ++ BUG_ON(vcpu->active); /* should be in idle_list */ ++ BUG_ON(vcpu_rq(vcpu)->prev_mm != NULL); ++ ++ spin_lock_irq(&fairsched_lock); ++ list_del(&vcpu->list); ++ vsched_vcpu(vsched, vcpu->id) = NULL; ++ spin_unlock_irq(&fairsched_lock); ++ ++ kfree(vcpu); ++} ++ ++int vsched_mvpr(struct task_struct *p, struct vcpu_scheduler *vsched) ++{ ++ vcpu_t dest_vcpu; ++ int id; ++ int res; ++ ++ res = 0; ++ while(1) { ++ /* FIXME: we suppose here that vcpu can't dissapear on the fly */ ++ for(id = first_cpu(vsched->vcpu_online_map); id < NR_CPUS; ++ id++) { ++ if ((vsched->vcpu[id] != NULL) && ++ !vcpu_isset(vsched->vcpu[id], p->cpus_allowed)) ++ continue; ++ else ++ break; ++ } ++ if (id >= NR_CPUS) { ++ res = -EINVAL; ++ goto out; ++ } ++ ++ dest_vcpu = vsched_vcpu(vsched, id); ++ while(1) { ++ sched_migrate_task(p, dest_vcpu); ++ if (task_vsched_id(p) == vsched_id(vsched)) ++ goto out; ++ if (!vcpu_isset(vsched->vcpu[id], p->cpus_allowed)) ++ break; + } + } ++out: ++ return res; ++} ++ ++void vsched_fairsched_link(struct vcpu_scheduler *vsched, ++ struct fairsched_node *node) ++{ ++ vsched->node = node; ++ node->vsched = vsched; ++} ++ ++void vsched_fairsched_unlink(struct vcpu_scheduler *vsched, ++ struct fairsched_node *node) ++{ ++ vsched->node = NULL; ++ node->vsched = NULL; ++} ++ ++int vsched_create(int id, struct fairsched_node *node) ++{ ++ struct vcpu_scheduler *vsched; ++ int i, res; ++ ++ vsched = kmalloc(sizeof(*vsched), GFP_KERNEL); ++ if (vsched == NULL) ++ return -ENOMEM; ++ ++ vsched_init(vsched, node->id); ++ vsched_fairsched_link(vsched, node); ++ ++ for(i = 0; i < num_online_cpus(); i++) { ++ res = vsched_add_vcpu(vsched); ++ if (res < 0) ++ goto err_add; ++ } ++ return 0; 
++ ++err_add: ++ vsched_destroy(vsched); ++ return res; ++} ++ ++int vsched_destroy(struct vcpu_scheduler *vsched) ++{ ++ vcpu_t vcpu; ++ ++ if (vsched == NULL) ++ return 0; ++ ++ spin_lock_irq(&fairsched_lock); ++ while(1) { ++ if (!list_empty(&vsched->running_list)) ++ vcpu = list_entry(vsched->running_list.next, ++ struct vcpu_info, list); ++ else if (!list_empty(&vsched->active_list)) ++ vcpu = list_entry(vsched->active_list.next, ++ struct vcpu_info, list); ++ else if (!list_empty(&vsched->idle_list)) ++ vcpu = list_entry(vsched->idle_list.next, ++ struct vcpu_info, list); ++ else ++ break; ++ spin_unlock_irq(&fairsched_lock); ++ vsched_del_vcpu(vcpu); ++ spin_lock_irq(&fairsched_lock); ++ } ++ if (vsched->num_online_vcpus) ++ goto err_busy; ++ spin_unlock_irq(&fairsched_lock); ++ ++ vsched_fairsched_unlink(vsched, vsched->node); ++ kfree(vsched); ++ return 0; ++ ++err_busy: ++ printk(KERN_ERR "BUG in vsched_destroy, vsched id %d\n", ++ vsched->id); ++ spin_unlock_irq(&fairsched_lock); ++ return -EBUSY; ++ ++} ++#endif /* defined(CONFIG_FAIRSCHED) */ ++#endif /* defined(CONFIG_SCHED_VCPU) || defined(CONFIG_FAIRSCHED) */ ++ ++static void init_boot_vcpu(void) ++{ ++ int res; ++ ++ /* ++ * We setup boot_vcpu and it's runqueue until init_idle() happens ++ * on cpu0. This is required since timer interrupts can happen ++ * between sched_init() and init_idle(). 
++ */ ++ init_vcpu(&boot_idle_vcpu, 0); ++ vcpu_rq(&boot_idle_vcpu)->curr = current; ++ res = install_vcpu(&boot_idle_vcpu, &idle_vsched); ++ if (res < 0) ++ panic("Can't install boot idle vcpu"); ++ ++ init_vcpu(&boot_vcpu, 0); ++ vcpu_rq(&boot_vcpu)->curr = current; ++ res = install_vcpu(&boot_vcpu, &default_vsched); ++ if (res < 0) ++ panic("Can't install boot vcpu"); ++ ++ this_pcpu()->vcpu = &boot_idle_vcpu; ++ this_pcpu()->vsched = &idle_vsched; ++} ++ ++static void init_pcpu(int id) ++{ ++ struct pcpu_info *pcpu; ++ ++ pcpu = pcpu(id); ++ pcpu->id = id; ++#ifdef CONFIG_SMP ++ pcpu->sd = NULL; ++#endif ++ ++#ifndef CONFIG_SCHED_VCPU ++ init_vcpu(vcpu(id), id); ++#endif ++} ++ ++static void init_pcpus(void) ++{ ++ int i; ++ for (i = 0; i < NR_CPUS; i++) ++ init_pcpu(i); ++} ++ ++void __init sched_init(void) ++{ ++ init_pcpus(); ++#if defined(CONFIG_SCHED_VCPU) ++ vsched_init(&idle_vsched, -1); ++ vsched_init(&default_vsched, 0); ++#if defined(CONFIG_FAIRSCHED) ++ fairsched_init_early(); ++ vsched_fairsched_link(&idle_vsched, &fairsched_idle_node); ++ vsched_fairsched_link(&default_vsched, &fairsched_init_node); ++#endif ++ init_boot_vcpu(); ++#else ++#if defined(CONFIG_FAIRSCHED) ++ fairsched_init_early(); ++#endif ++#endif + + /* + * The boot idle thread does lazy MMU switching as well: +@@ -6064,6 +7576,149 @@ void __init sched_init(void) + init_idle(current, smp_processor_id()); + } + ++#ifdef CONFIG_SCHED_VCPU ++static void show_vcpu_list(struct vcpu_scheduler *vsched, struct list_head *lh) ++{ ++ cpumask_t m; ++ vcpu_t vcpu; ++ int i; ++ ++ cpus_clear(m); ++ list_for_each_entry(vcpu, lh, list) ++ cpu_set(vcpu->id, m); ++ ++ for (i = 0; i < NR_CPUS; i++) ++ if (cpu_isset(i, m)) ++ printk("%d ", i); ++} ++ ++#define PRINT(s, sz, fmt...) 
\ ++ do { \ ++ int __out; \ ++ __out = scnprintf(*s, *sz, fmt); \ ++ *s += __out; \ ++ *sz -= __out; \ ++ } while(0) ++ ++static void show_rq_array(prio_array_t *array, char *header, char **s, int *sz) ++{ ++ struct list_head *list; ++ task_t *p; ++ int k, h; ++ ++ h = 0; ++ for (k = 0; k < MAX_PRIO; k++) { ++ list = array->queue + k; ++ if (list_empty(list)) ++ continue; ++ ++ if (!h) { ++ PRINT(s, sz, header); ++ h = 1; ++ } ++ ++ PRINT(s, sz, " prio %d (", k); ++ list_for_each_entry(p, list, run_list) ++ PRINT(s, sz, "%s[%d] ", p->comm, p->pid); ++ PRINT(s, sz, ")"); ++ } ++ if (h) ++ PRINT(s, sz, "\n"); ++} ++ ++static void show_vcpu(vcpu_t vcpu) ++{ ++ runqueue_t *rq; ++ char buf[1024], *s; ++ unsigned long flags; ++ int sz; ++ ++ if (vcpu == NULL) ++ return; ++ ++ rq = vcpu_rq(vcpu); ++ spin_lock_irqsave(&rq->lock, flags); ++ printk(" vcpu %d: last_pcpu %d, state %s%s\n", ++ vcpu->id, vcpu->last_pcpu, ++ vcpu->active ? "A" : "", ++ vcpu->running ? "R" : ""); ++ ++ printk(" rq: running %lu, load {%lu,%lu,%lu}, sw %Lu, sd %p, curr %p\n", ++ rq->nr_running, ++#ifdef CONFIG_SMP ++ rq->cpu_load[0], rq->cpu_load[1], rq->cpu_load[2], ++#else ++ 0LU, 0LU, 0LU, ++#endif ++ rq->nr_switches, ++#ifdef CONFIG_SMP ++ rq->sd, ++#else ++ NULL, ++#endif ++ rq->curr ++ ); ++ ++ s = buf; ++ sz = sizeof(buf) - 1; ++ ++ show_rq_array(rq->active, " active:", &s, &sz); ++ show_rq_array(rq->expired, " expired:", &s, &sz); ++ spin_unlock_irqrestore(&rq->lock, flags); ++ ++ *s = 0; ++ printk(buf); ++} ++ ++static inline void fairsched_show_node(struct vcpu_scheduler *vsched) ++{ ++#ifdef CONFIG_FAIRSCHED ++ struct fairsched_node *node; ++ ++ node = vsched->node; ++ printk("fsnode: ready %d run %d cpu %d vsched %p, pcpu %d\n", ++ node->nr_ready, node->nr_runnable, node->nr_pcpu, ++ node->vsched, smp_processor_id()); ++#endif ++} ++ ++static void __show_vsched(struct vcpu_scheduler *vsched) ++{ ++ char mask[NR_CPUS + 1]; ++ int i; ++ unsigned long flags; ++ ++ 
spin_lock_irqsave(&fairsched_lock, flags); ++ printk("vsched id=%d\n", vsched_id(vsched)); ++ fairsched_show_node(vsched); ++ ++ printk(" idle cpus "); ++ show_vcpu_list(vsched, &vsched->idle_list); ++ printk("; active cpus "); ++ show_vcpu_list(vsched, &vsched->active_list); ++ printk("; running cpus "); ++ show_vcpu_list(vsched, &vsched->running_list); ++ printk("\n"); ++ ++ cpumask_scnprintf(mask, NR_CPUS, vsched->vcpu_online_map); ++ printk(" num_online_cpus=%d, mask=%s (w=%d)\n", ++ vsched->num_online_vcpus, mask, ++ cpus_weight(vsched->vcpu_online_map)); ++ spin_unlock_irqrestore(&fairsched_lock, flags); ++ ++ for (i = 0; i < NR_CPUS; i++) ++ show_vcpu(vsched->vcpu[i]); ++} ++ ++void show_vsched(void) ++{ ++ oops_in_progress = 1; ++ __show_vsched(&idle_vsched); ++ __show_vsched(&default_vsched); ++ oops_in_progress = 0; ++} ++#endif /* CONFIG_SCHED_VCPU */ ++ + #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP + void __might_sleep(char *file, int line) + { +@@ -6095,7 +7750,7 @@ void normalize_rt_tasks(void) + runqueue_t *rq; + + read_lock_irq(&tasklist_lock); +- for_each_process (p) { ++ for_each_process_all (p) { + if (!rt_task(p)) + continue; + +@@ -6136,7 +7791,7 @@ void normalize_rt_tasks(void) + */ + task_t *curr_task(int cpu) + { +- return cpu_curr(cpu); ++ return vcpu_rq(pcpu(cpu)->vcpu)->curr; + } + + /** +@@ -6156,7 +7811,7 @@ task_t *curr_task(int cpu) + */ + void set_curr_task(int cpu, task_t *p) + { +- cpu_curr(cpu) = p; ++ vcpu_rq(pcpu(cpu)->vcpu)->curr = p; + } + + #endif +diff -upr linux-2.6.16.orig/kernel/signal.c linux-2.6.16-026test015/kernel/signal.c +--- linux-2.6.16.orig/kernel/signal.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/kernel/signal.c 2006-07-04 14:41:39.000000000 +0400 +@@ -25,17 +25,20 @@ + #include <linux/posix-timers.h> + #include <linux/signal.h> + #include <linux/audit.h> ++#include <linux/kmem_cache.h> + #include <linux/capability.h> + #include <asm/param.h> + #include <asm/uaccess.h> + #include <asm/unistd.h> + 
#include <asm/siginfo.h> ++#include <ub/ub_misc.h> + + /* + * SLAB caches for signal bits. + */ + +-static kmem_cache_t *sigqueue_cachep; ++kmem_cache_t *sigqueue_cachep; ++EXPORT_SYMBOL_GPL(sigqueue_cachep); + + /* + * In POSIX a signal is sent either to a specific thread (Linux task) +@@ -221,6 +224,7 @@ fastcall void recalc_sigpending_tsk(stru + else + clear_tsk_thread_flag(t, TIF_SIGPENDING); + } ++EXPORT_SYMBOL_GPL(recalc_sigpending_tsk); + + void recalc_sigpending(void) + { +@@ -271,8 +275,13 @@ static struct sigqueue *__sigqueue_alloc + atomic_inc(&t->user->sigpending); + if (override_rlimit || + atomic_read(&t->user->sigpending) <= +- t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) ++ t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) { + q = kmem_cache_alloc(sigqueue_cachep, flags); ++ if (q && ub_siginfo_charge(q, get_task_ub(t))) { ++ kmem_cache_free(sigqueue_cachep, q); ++ q = NULL; ++ } ++ } + if (unlikely(q == NULL)) { + atomic_dec(&t->user->sigpending); + } else { +@@ -289,6 +298,7 @@ static void __sigqueue_free(struct sigqu + return; + atomic_dec(&q->user->sigpending); + free_uid(q->user); ++ ub_siginfo_uncharge(q); + kmem_cache_free(sigqueue_cachep, q); + } + +@@ -378,8 +388,11 @@ void __exit_signal(struct task_struct *t + wake_up_process(sig->group_exit_task); + sig->group_exit_task = NULL; + } +- if (tsk == sig->curr_target) ++ if (tsk == sig->curr_target) { + sig->curr_target = next_thread(tsk); ++ if (tsk == sig->curr_target) ++ sig->curr_target = NULL; ++ } + tsk->signal = NULL; + /* + * Accumulate here the counters for all threads but the +@@ -524,7 +537,16 @@ static int __dequeue_signal(struct sigpe + { + int sig = 0; + +- sig = next_signal(pending, mask); ++ /* SIGKILL must have priority, otherwise it is quite easy ++ * to create an unkillable process, sending sig < SIGKILL ++ * to self */ ++ if (unlikely(sigismember(&pending->signal, SIGKILL))) { ++ if (!sigismember(mask, SIGKILL)) ++ sig = SIGKILL; ++ } ++ ++ if (likely(!sig)) ++ sig = 
next_signal(pending, mask); + if (sig) { + if (current->notifier) { + if (sigismember(current->notifier_mask, sig)) { +@@ -618,6 +640,7 @@ void signal_wake_up(struct task_struct * + if (!wake_up_state(t, mask)) + kick_process(t); + } ++EXPORT_SYMBOL_GPL(signal_wake_up); + + /* + * Remove signals in mask from the pending set and queue. +@@ -838,7 +861,7 @@ static int send_signal(int sig, struct s + q->info.si_signo = sig; + q->info.si_errno = 0; + q->info.si_code = SI_USER; +- q->info.si_pid = current->pid; ++ q->info.si_pid = virt_pid(current); + q->info.si_uid = current->uid; + break; + case (unsigned long) SEND_SIG_PRIV: +@@ -975,7 +998,6 @@ __group_complete_signal(int sig, struct + if (t == NULL) + /* restart balancing at this thread */ + t = p->signal->curr_target = p; +- BUG_ON(t->tgid != p->tgid); + + while (!wants_signal(sig, t)) { + t = next_thread(t); +@@ -1159,13 +1181,18 @@ int __kill_pg_info(int sig, struct sigin + if (pgrp <= 0) + return -EINVAL; + ++ /* Use __vpid_to_pid(). This function is used under write_lock ++ * tasklist_lock. */ ++ if (is_virtual_pid(pgrp)) ++ pgrp = __vpid_to_pid(pgrp); ++ + success = 0; + retval = -ESRCH; +- do_each_task_pid(pgrp, PIDTYPE_PGID, p) { ++ do_each_task_pid_ve(pgrp, PIDTYPE_PGID, p) { + int err = group_send_sig_info(sig, info, p); + success |= !err; + retval = err; +- } while_each_task_pid(pgrp, PIDTYPE_PGID, p); ++ } while_each_task_pid_ve(pgrp, PIDTYPE_PGID, p); + return success ? 
0 : retval; + } + +@@ -1193,7 +1220,7 @@ kill_proc_info(int sig, struct siginfo * + read_lock(&tasklist_lock); + acquired_tasklist_lock = 1; + } +- p = find_task_by_pid(pid); ++ p = find_task_by_pid_ve(pid); + error = -ESRCH; + if (p) + error = group_send_sig_info(sig, info, p); +@@ -1214,7 +1241,7 @@ int kill_proc_info_as_uid(int sig, struc + return ret; + + read_lock(&tasklist_lock); +- p = find_task_by_pid(pid); ++ p = find_task_by_pid_ve(pid); + if (!p) { + ret = -ESRCH; + goto out_unlock; +@@ -1253,8 +1280,8 @@ static int kill_something_info(int sig, + struct task_struct * p; + + read_lock(&tasklist_lock); +- for_each_process(p) { +- if (p->pid > 1 && p->tgid != current->tgid) { ++ for_each_process_ve(p) { ++ if (virt_pid(p) > 1 && p->tgid != current->tgid) { + int err = group_send_sig_info(sig, info, p); + ++count; + if (err != -EPERM) +@@ -1562,9 +1589,17 @@ void do_notify_parent(struct task_struct + BUG_ON(!tsk->ptrace && + (tsk->group_leader != tsk || !thread_group_empty(tsk))); + ++#ifdef CONFIG_VE ++ /* Allow to send only SIGCHLD from VE */ ++ if (sig != SIGCHLD && ++ tsk->ve_task_info.owner_env != ++ tsk->parent->ve_task_info.owner_env) ++ sig = SIGCHLD; ++#endif ++ + info.si_signo = sig; + info.si_errno = 0; +- info.si_pid = tsk->pid; ++ info.si_pid = get_task_pid_ve(tsk, tsk->parent->ve_task_info.owner_env); + info.si_uid = tsk->uid; + + /* FIXME: find out whether or not this is supposed to be c*time. */ +@@ -1629,7 +1664,7 @@ static void do_notify_parent_cldstop(str + + info.si_signo = SIGCHLD; + info.si_errno = 0; +- info.si_pid = tsk->pid; ++ info.si_pid = get_task_pid_ve(tsk, VE_TASK_INFO(parent)->owner_env); + info.si_uid = tsk->uid; + + /* FIXME: find out whether or not this is supposed to be c*time. */ +@@ -1763,7 +1798,9 @@ finish_stop(int stop_count) + read_unlock(&tasklist_lock); + + out: ++ set_stop_state(current); + schedule(); ++ clear_stop_state(current); + /* + * Now we don't run again until continued. 
+ */ +@@ -1940,11 +1977,13 @@ relock: + ptrace_signal_deliver(regs, cookie); + + /* Let the debugger run. */ ++ set_pn_state(current, PN_STOP_SIGNAL); + ptrace_stop(signr, signr, info); ++ clear_pn_state(current); + +- /* We're back. Did the debugger cancel the sig or group_exit? */ ++ /* We're back. Did the debugger cancel the sig? */ + signr = current->exit_code; +- if (signr == 0 || current->signal->flags & SIGNAL_GROUP_EXIT) ++ if (signr == 0) + continue; + + current->exit_code = 0; +@@ -1957,7 +1996,7 @@ relock: + info->si_signo = signr; + info->si_errno = 0; + info->si_code = SI_USER; +- info->si_pid = current->parent->pid; ++ info->si_pid = virt_pid(current->parent); + info->si_uid = current->parent->uid; + } + +@@ -1988,8 +2027,14 @@ relock: + continue; + + /* Init gets no signals it doesn't want. */ +- if (current->pid == 1) ++ if (virt_pid(current) == 1) { ++ /* Allow SIGKILL for non-root VE */ ++#ifdef CONFIG_VE ++ if (current->pid == 1 || ++ signr != SIGKILL) ++#endif + continue; ++ } + + if (sig_kernel_stop(signr)) { + /* +@@ -2307,7 +2352,6 @@ sys_rt_sigtimedwait(const sigset_t __use + + timeout = schedule_timeout_interruptible(timeout); + +- try_to_freeze(); + spin_lock_irq(¤t->sighand->siglock); + sig = dequeue_signal(current, &these, &info); + current->blocked = current->real_blocked; +@@ -2340,7 +2384,7 @@ sys_kill(int pid, int sig) + info.si_signo = sig; + info.si_errno = 0; + info.si_code = SI_USER; +- info.si_pid = current->tgid; ++ info.si_pid = virt_tgid(current); + info.si_uid = current->uid; + + return kill_something_info(sig, &info, pid); +@@ -2356,12 +2400,12 @@ static int do_tkill(int tgid, int pid, i + info.si_signo = sig; + info.si_errno = 0; + info.si_code = SI_TKILL; +- info.si_pid = current->tgid; ++ info.si_pid = virt_tgid(current); + info.si_uid = current->uid; + + read_lock(&tasklist_lock); +- p = find_task_by_pid(pid); +- if (p && (tgid <= 0 || p->tgid == tgid)) { ++ p = find_task_by_pid_ve(pid); ++ if (p && (tgid <= 0 || 
virt_tgid(p) == tgid)) { + error = check_kill_permission(sig, &info, p); + /* + * The null signal is a permissions and process existence +diff -upr linux-2.6.16.orig/kernel/softirq.c linux-2.6.16-026test015/kernel/softirq.c +--- linux-2.6.16.orig/kernel/softirq.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/kernel/softirq.c 2006-07-04 14:41:38.000000000 +0400 +@@ -13,10 +13,13 @@ + #include <linux/mm.h> + #include <linux/notifier.h> + #include <linux/percpu.h> ++#include <linux/sysctl.h> + #include <linux/cpu.h> + #include <linux/kthread.h> + #include <linux/rcupdate.h> + ++#include <ub/beancounter.h> ++ + #include <asm/irq.h> + /* + - No shared variables, all the data are CPU local. +@@ -44,6 +47,8 @@ EXPORT_SYMBOL(irq_stat); + static struct softirq_action softirq_vec[32] __cacheline_aligned_in_smp; + + static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); ++static DEFINE_PER_CPU(struct task_struct *, ksoftirqd_wakeup); ++static int ksoftirqd_stat[NR_CPUS]; + + /* + * we cannot loop indefinitely here to avoid userspace starvation, +@@ -54,7 +59,7 @@ static DEFINE_PER_CPU(struct task_struct + static inline void wakeup_softirqd(void) + { + /* Interrupts are disabled: no need to stop preemption */ +- struct task_struct *tsk = __get_cpu_var(ksoftirqd); ++ struct task_struct *tsk = __get_cpu_var(ksoftirqd_wakeup); + + if (tsk && tsk->state != TASK_RUNNING) + wake_up_process(tsk); +@@ -73,10 +78,14 @@ static inline void wakeup_softirqd(void) + + asmlinkage void __do_softirq(void) + { ++ struct user_beancounter *ub; + struct softirq_action *h; + __u32 pending; + int max_restart = MAX_SOFTIRQ_RESTART; + int cpu; ++ struct ve_struct *envid; ++ ++ envid = set_exec_env(get_ve0()); + + pending = local_softirq_pending(); + +@@ -90,6 +99,7 @@ restart: + + h = softirq_vec; + ++ ub = set_exec_ub(get_ub0()); + do { + if (pending & 1) { + h->action(h); +@@ -98,6 +108,7 @@ restart: + h++; + pending >>= 1; + } while (pending); ++ (void)set_exec_ub(ub); + + 
local_irq_disable(); + +@@ -108,6 +119,7 @@ restart: + if (pending) + wakeup_softirqd(); + ++ (void)set_exec_env(envid); + __local_bh_enable(); + } + +@@ -483,6 +495,52 @@ static int __devinit cpu_callback(struct + return NOTIFY_OK; + } + ++static int proc_ksoftirqd(ctl_table *ctl, int write, struct file *filp, ++ void __user *buffer, size_t *lenp, loff_t *ppos) ++{ ++ int ret, cpu; ++ ++ ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); ++ if (!write) ++ return ret; ++ ++ for_each_online_cpu(cpu) { ++ per_cpu(ksoftirqd_wakeup, cpu) = ++ ksoftirqd_stat[cpu] ? per_cpu(ksoftirqd, cpu) : NULL; ++ } ++ return ret; ++} ++ ++static int sysctl_ksoftirqd(ctl_table *table, int *name, int nlen, ++ void *oldval, size_t *oldlenp, void *newval, size_t newlen, ++ void **context) ++{ ++ return -EINVAL; ++} ++ ++static ctl_table debug_table[] = { ++ { ++ .ctl_name = 1246, ++ .procname = "ksoftirqd", ++ .data = ksoftirqd_stat, ++ .maxlen = sizeof(ksoftirqd_stat), ++ .mode = 0644, ++ .proc_handler = &proc_ksoftirqd, ++ .strategy = &sysctl_ksoftirqd ++ }, ++ {0} ++}; ++ ++static ctl_table root_table[] = { ++ { ++ .ctl_name = CTL_DEBUG, ++ .procname = "debug", ++ .mode = 0555, ++ .child = debug_table ++ }, ++ {0} ++}; ++ + static struct notifier_block __devinitdata cpu_nfb = { + .notifier_call = cpu_callback + }; +@@ -493,5 +551,6 @@ __init int spawn_ksoftirqd(void) + cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); + cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); + register_cpu_notifier(&cpu_nfb); ++ register_sysctl_table(root_table, 0); + return 0; + } +diff -upr linux-2.6.16.orig/kernel/stop_machine.c linux-2.6.16-026test015/kernel/stop_machine.c +--- linux-2.6.16.orig/kernel/stop_machine.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/kernel/stop_machine.c 2006-07-04 14:41:39.000000000 +0400 +@@ -96,7 +96,7 @@ static int stop_machine(void) + stopmachine_state = STOPMACHINE_WAIT; + + for_each_online_cpu(i) { +- if (i == raw_smp_processor_id()) ++ if (i == 
task_cpu(current)) + continue; + ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL); + if (ret < 0) +@@ -178,7 +178,7 @@ struct task_struct *__stop_machine_run(i + + /* If they don't care which CPU fn runs on, bind to any online one. */ + if (cpu == NR_CPUS) +- cpu = raw_smp_processor_id(); ++ cpu = task_cpu(current); + + p = kthread_create(do_stop, &smdata, "kstopmachine"); + if (!IS_ERR(p)) { +diff -upr linux-2.6.16.orig/kernel/sys.c linux-2.6.16-026test015/kernel/sys.c +--- linux-2.6.16.orig/kernel/sys.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/kernel/sys.c 2006-07-04 14:41:39.000000000 +0400 +@@ -11,6 +11,7 @@ + #include <linux/mman.h> + #include <linux/smp_lock.h> + #include <linux/notifier.h> ++#include <linux/virtinfo.h> + #include <linux/reboot.h> + #include <linux/prctl.h> + #include <linux/init.h> +@@ -236,6 +237,94 @@ int capable(int cap) + EXPORT_SYMBOL(capable); + #endif + ++static DECLARE_MUTEX(virtinfo_sem); ++static struct vnotifier_block *virtinfo_chain[VIRT_TYPES]; ++ ++void virtinfo_notifier_register(int type, struct vnotifier_block *nb) ++{ ++ struct vnotifier_block **p; ++ ++ down(&virtinfo_sem); ++ for (p = &virtinfo_chain[type]; ++ *p != NULL && nb->priority < (*p)->priority; ++ p = &(*p)->next); ++ nb->next = *p; ++ smp_wmb(); ++ *p = nb; ++ up(&virtinfo_sem); ++} ++ ++EXPORT_SYMBOL(virtinfo_notifier_register); ++ ++struct virtinfo_cnt_struct { ++ volatile unsigned long exit[NR_CPUS]; ++ volatile unsigned long entry; ++}; ++static DEFINE_PER_CPU(struct virtinfo_cnt_struct, virtcnt); ++ ++void virtinfo_notifier_unregister(int type, struct vnotifier_block *nb) ++{ ++ struct vnotifier_block **p; ++ int entry_cpu, exit_cpu; ++ unsigned long cnt, ent; ++ ++ down(&virtinfo_sem); ++ for (p = &virtinfo_chain[type]; *p != nb; p = &(*p)->next); ++ *p = nb->next; ++ smp_mb(); ++ ++ for_each_cpu_mask(entry_cpu, cpu_possible_map) { ++ while (1) { ++ cnt = 0; ++ for_each_cpu_mask(exit_cpu, cpu_possible_map) ++ cnt += ++ 
per_cpu(virtcnt, entry_cpu).exit[exit_cpu]; ++ smp_rmb(); ++ ent = per_cpu(virtcnt, entry_cpu).entry; ++ if (cnt == ent) ++ break; ++ __set_current_state(TASK_UNINTERRUPTIBLE); ++ schedule_timeout(HZ / 100); ++ } ++ } ++ up(&virtinfo_sem); ++} ++ ++EXPORT_SYMBOL(virtinfo_notifier_unregister); ++ ++int virtinfo_notifier_call(int type, unsigned long n, void *data) ++{ ++ int ret; ++ int entry_cpu, exit_cpu; ++ struct vnotifier_block *nb; ++ ++ entry_cpu = get_cpu(); ++ per_cpu(virtcnt, entry_cpu).entry++; ++ smp_wmb(); ++ put_cpu(); ++ ++ nb = virtinfo_chain[type]; ++ ret = NOTIFY_DONE; ++ while (nb) ++ { ++ ret = nb->notifier_call(nb, n, data, ret); ++ if(ret & NOTIFY_STOP_MASK) { ++ ret &= ~NOTIFY_STOP_MASK; ++ break; ++ } ++ nb = nb->next; ++ } ++ ++ exit_cpu = get_cpu(); ++ smp_wmb(); ++ per_cpu(virtcnt, entry_cpu).exit[exit_cpu]++; ++ put_cpu(); ++ ++ return ret; ++} ++ ++EXPORT_SYMBOL(virtinfo_notifier_call); ++ + static int set_one_prio(struct task_struct *p, int niceval, int error) + { + int no_nice; +@@ -281,17 +370,19 @@ asmlinkage long sys_setpriority(int whic + switch (which) { + case PRIO_PROCESS: + if (!who) +- who = current->pid; +- p = find_task_by_pid(who); ++ who = virt_pid(current); ++ p = find_task_by_pid_ve(who); + if (p) + error = set_one_prio(p, niceval, error); + break; + case PRIO_PGRP: + if (!who) + who = process_group(current); +- do_each_task_pid(who, PIDTYPE_PGID, p) { ++ else ++ who = vpid_to_pid(who); ++ do_each_task_pid_ve(who, PIDTYPE_PGID, p) { + error = set_one_prio(p, niceval, error); +- } while_each_task_pid(who, PIDTYPE_PGID, p); ++ } while_each_task_pid_ve(who, PIDTYPE_PGID, p); + break; + case PRIO_USER: + user = current->user; +@@ -301,10 +392,10 @@ asmlinkage long sys_setpriority(int whic + if ((who != current->uid) && !(user = find_user(who))) + goto out_unlock; /* No processes for this user */ + +- do_each_thread(g, p) ++ do_each_thread_ve(g, p) + if (p->uid == who) + error = set_one_prio(p, niceval, error); +- 
while_each_thread(g, p); ++ while_each_thread_ve(g, p); + if (who != current->uid) + free_uid(user); /* For find_user() */ + break; +@@ -334,8 +425,8 @@ asmlinkage long sys_getpriority(int whic + switch (which) { + case PRIO_PROCESS: + if (!who) +- who = current->pid; +- p = find_task_by_pid(who); ++ who = virt_pid(current); ++ p = find_task_by_pid_ve(who); + if (p) { + niceval = 20 - task_nice(p); + if (niceval > retval) +@@ -345,11 +436,13 @@ asmlinkage long sys_getpriority(int whic + case PRIO_PGRP: + if (!who) + who = process_group(current); +- do_each_task_pid(who, PIDTYPE_PGID, p) { ++ else ++ who = vpid_to_pid(who); ++ do_each_task_pid_ve(who, PIDTYPE_PGID, p) { + niceval = 20 - task_nice(p); + if (niceval > retval) + retval = niceval; +- } while_each_task_pid(who, PIDTYPE_PGID, p); ++ } while_each_task_pid_ve(who, PIDTYPE_PGID, p); + break; + case PRIO_USER: + user = current->user; +@@ -359,13 +452,13 @@ asmlinkage long sys_getpriority(int whic + if ((who != current->uid) && !(user = find_user(who))) + goto out_unlock; /* No processes for this user */ + +- do_each_thread(g, p) ++ do_each_thread_ve(g, p) + if (p->uid == who) { + niceval = 20 - task_nice(p); + if (niceval > retval) + retval = niceval; + } +- while_each_thread(g, p); ++ while_each_thread_ve(g, p); + if (who != current->uid) + free_uid(user); /* for find_user() */ + break; +@@ -497,6 +590,35 @@ asmlinkage long sys_reboot(int magic1, i + magic2 != LINUX_REBOOT_MAGIC2C)) + return -EINVAL; + ++#ifdef CONFIG_VE ++ if (!ve_is_super(get_exec_env())) ++ switch (cmd) { ++ case LINUX_REBOOT_CMD_RESTART: ++ case LINUX_REBOOT_CMD_HALT: ++ case LINUX_REBOOT_CMD_POWER_OFF: ++ case LINUX_REBOOT_CMD_RESTART2: { ++ struct siginfo info; ++ ++ info.si_errno = 0; ++ info.si_code = SI_KERNEL; ++ info.si_pid = virt_pid(current); ++ info.si_uid = current->uid; ++ info.si_signo = SIGKILL; ++ ++ /* Sending to real init is safe */ ++ send_sig_info(SIGKILL, &info, ++ get_exec_env()->init_entry); ++ } ++ ++ case 
LINUX_REBOOT_CMD_CAD_ON: ++ case LINUX_REBOOT_CMD_CAD_OFF: ++ return 0; ++ ++ default: ++ return -EINVAL; ++ } ++#endif ++ + /* Instead of trying to make the power_off code look like + * halt when pm_power_off is not set do it the easy way. + */ +@@ -686,7 +808,7 @@ asmlinkage long sys_setgid(gid_t gid) + return 0; + } + +-static int set_user(uid_t new_ruid, int dumpclear) ++int set_user(uid_t new_ruid, int dumpclear) + { + struct user_struct *new_user; + +@@ -711,6 +833,7 @@ static int set_user(uid_t new_ruid, int + current->uid = new_ruid; + return 0; + } ++EXPORT_SYMBOL(set_user); + + /* + * Unprivileged users may change the real uid to the effective uid +@@ -1079,7 +1202,12 @@ asmlinkage long sys_times(struct tms __u + if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) + return -EFAULT; + } ++#ifndef CONFIG_VE + return (long) jiffies_64_to_clock_t(get_jiffies_64()); ++#else ++ return (long) jiffies_64_to_clock_t(get_jiffies_64() - ++ get_exec_env()->start_jiffies); ++#endif + } + + /* +@@ -1100,21 +1228,24 @@ asmlinkage long sys_setpgid(pid_t pid, p + struct task_struct *p; + struct task_struct *group_leader = current->group_leader; + int err = -EINVAL; ++ int _pgid; + + if (!pid) +- pid = group_leader->pid; ++ pid = virt_pid(group_leader); + if (!pgid) + pgid = pid; + if (pgid < 0) + return -EINVAL; + ++ _pgid = vpid_to_pid(pgid); ++ + /* From this point forward we keep holding onto the tasklist lock + * so that our parent does not change from under us. 
-DaveM + */ + write_lock_irq(&tasklist_lock); + + err = -ESRCH; +- p = find_task_by_pid(pid); ++ p = find_task_by_pid_ve(pid); + if (!p) + goto out; + +@@ -1139,25 +1270,35 @@ asmlinkage long sys_setpgid(pid_t pid, p + if (p->signal->leader) + goto out; + +- if (pgid != pid) { ++ pgid = virt_pid(p); ++ if (_pgid != p->pid) { + struct task_struct *p; + +- do_each_task_pid(pgid, PIDTYPE_PGID, p) { +- if (p->signal->session == group_leader->signal->session) ++ do_each_task_pid_ve(_pgid, PIDTYPE_PGID, p) { ++ if (p->signal->session == group_leader->signal->session) { ++ pgid = virt_pgid(p); + goto ok_pgid; +- } while_each_task_pid(pgid, PIDTYPE_PGID, p); ++ } ++ } while_each_task_pid_ve(_pgid, PIDTYPE_PGID, p); + goto out; + } + + ok_pgid: +- err = security_task_setpgid(p, pgid); ++ err = security_task_setpgid(p, _pgid); + if (err) + goto out; + +- if (process_group(p) != pgid) { ++ if (process_group(p) != _pgid) { + detach_pid(p, PIDTYPE_PGID); +- p->signal->pgrp = pgid; +- attach_pid(p, PIDTYPE_PGID, pgid); ++ p->signal->pgrp = _pgid; ++ set_virt_pgid(p, pgid); ++ attach_pid(p, PIDTYPE_PGID, _pgid); ++ if (atomic_read(&p->signal->count) != 1) { ++ task_t *t; ++ for (t = next_thread(p); t != p; t = next_thread(t)) { ++ set_virt_pgid(t, pgid); ++ } ++ } + } + + err = 0; +@@ -1170,19 +1311,19 @@ out: + asmlinkage long sys_getpgid(pid_t pid) + { + if (!pid) { +- return process_group(current); ++ return virt_pgid(current); + } else { + int retval; + struct task_struct *p; + + read_lock(&tasklist_lock); +- p = find_task_by_pid(pid); ++ p = find_task_by_pid_ve(pid); + + retval = -ESRCH; + if (p) { + retval = security_task_getpgid(p); + if (!retval) +- retval = process_group(p); ++ retval = virt_pgid(p); + } + read_unlock(&tasklist_lock); + return retval; +@@ -1194,7 +1335,7 @@ asmlinkage long sys_getpgid(pid_t pid) + asmlinkage long sys_getpgrp(void) + { + /* SMP - assuming writes are word atomic this is fine */ +- return process_group(current); ++ return 
virt_pgid(current); + } + + #endif +@@ -1202,19 +1343,19 @@ asmlinkage long sys_getpgrp(void) + asmlinkage long sys_getsid(pid_t pid) + { + if (!pid) { +- return current->signal->session; ++ return virt_sid(current); + } else { + int retval; + struct task_struct *p; + + read_lock(&tasklist_lock); +- p = find_task_by_pid(pid); ++ p = find_task_by_pid_ve(pid); + + retval = -ESRCH; + if(p) { + retval = security_task_getsid(p); + if (!retval) +- retval = p->signal->session; ++ retval = virt_sid(p); + } + read_unlock(&tasklist_lock); + return retval; +@@ -1236,9 +1377,20 @@ asmlinkage long sys_setsid(void) + + group_leader->signal->leader = 1; + __set_special_pids(group_leader->pid, group_leader->pid); ++ set_virt_pgid(group_leader, virt_pid(group_leader)); ++ set_virt_sid(group_leader, virt_pid(group_leader)); + group_leader->signal->tty = NULL; + group_leader->signal->tty_old_pgrp = 0; +- err = process_group(group_leader); ++ if (atomic_read(&group_leader->signal->count) != 1) { ++ task_t *t; ++ for (t = next_thread(group_leader); t != group_leader; ++ t = next_thread(t)) { ++ set_virt_pgid(t, virt_pid(group_leader)); ++ set_virt_sid(t, virt_pid(group_leader)); ++ } ++ } ++ ++ err = virt_pgid(group_leader); + out: + write_unlock_irq(&tasklist_lock); + up(&tty_sem); +@@ -1518,7 +1670,7 @@ asmlinkage long sys_newuname(struct new_ + int errno = 0; + + down_read(&uts_sem); +- if (copy_to_user(name,&system_utsname,sizeof *name)) ++ if (copy_to_user(name,&ve_utsname,sizeof *name)) + errno = -EFAULT; + up_read(&uts_sem); + return errno; +@@ -1529,15 +1681,15 @@ asmlinkage long sys_sethostname(char __u + int errno; + char tmp[__NEW_UTS_LEN]; + +- if (!capable(CAP_SYS_ADMIN)) ++ if (!capable(CAP_VE_SYS_ADMIN)) + return -EPERM; + if (len < 0 || len > __NEW_UTS_LEN) + return -EINVAL; + down_write(&uts_sem); + errno = -EFAULT; + if (!copy_from_user(tmp, name, len)) { +- memcpy(system_utsname.nodename, tmp, len); +- system_utsname.nodename[len] = 0; ++ memcpy(ve_utsname.nodename, 
tmp, len); ++ ve_utsname.nodename[len] = 0; + errno = 0; + } + up_write(&uts_sem); +@@ -1553,11 +1705,11 @@ asmlinkage long sys_gethostname(char __u + if (len < 0) + return -EINVAL; + down_read(&uts_sem); +- i = 1 + strlen(system_utsname.nodename); ++ i = 1 + strlen(ve_utsname.nodename); + if (i > len) + i = len; + errno = 0; +- if (copy_to_user(name, system_utsname.nodename, i)) ++ if (copy_to_user(name, ve_utsname.nodename, i)) + errno = -EFAULT; + up_read(&uts_sem); + return errno; +@@ -1574,7 +1726,7 @@ asmlinkage long sys_setdomainname(char _ + int errno; + char tmp[__NEW_UTS_LEN]; + +- if (!capable(CAP_SYS_ADMIN)) ++ if (!capable(CAP_VE_SYS_ADMIN)) + return -EPERM; + if (len < 0 || len > __NEW_UTS_LEN) + return -EINVAL; +@@ -1582,8 +1734,8 @@ asmlinkage long sys_setdomainname(char _ + down_write(&uts_sem); + errno = -EFAULT; + if (!copy_from_user(tmp, name, len)) { +- memcpy(system_utsname.domainname, tmp, len); +- system_utsname.domainname[len] = 0; ++ memcpy(ve_utsname.domainname, tmp, len); ++ ve_utsname.domainname[len] = 0; + errno = 0; + } + up_write(&uts_sem); +@@ -1657,7 +1809,19 @@ asmlinkage long sys_setrlimit(unsigned i + (cputime_eq(current->signal->it_prof_expires, cputime_zero) || + new_rlim.rlim_cur <= cputime_to_secs( + current->signal->it_prof_expires))) { +- cputime_t cputime = secs_to_cputime(new_rlim.rlim_cur); ++ unsigned long rlim_cur = new_rlim.rlim_cur; ++ cputime_t cputime; ++ ++ if (rlim_cur == 0) { ++ /* ++ * The caller is asking for an immediate RLIMIT_CPU ++ * expiry. But we use the zero value to mean "it was ++ * never set". 
So let's cheat and make it one second ++ * instead ++ */ ++ rlim_cur = 1; ++ } ++ cputime = secs_to_cputime(rlim_cur); + read_lock(&tasklist_lock); + spin_lock_irq(¤t->sighand->siglock); + set_process_cpu_timer(current, CPUCLOCK_PROF, +diff -upr linux-2.6.16.orig/kernel/sysctl.c linux-2.6.16-026test015/kernel/sysctl.c +--- linux-2.6.16.orig/kernel/sysctl.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/kernel/sysctl.c 2006-07-04 14:41:39.000000000 +0400 +@@ -25,6 +25,8 @@ + #include <linux/slab.h> + #include <linux/sysctl.h> + #include <linux/proc_fs.h> ++#include <linux/ve_owner.h> ++#include <linux/ve.h> + #include <linux/capability.h> + #include <linux/ctype.h> + #include <linux/utsname.h> +@@ -63,6 +65,7 @@ extern int max_threads; + extern int sysrq_enabled; + extern int core_uses_pid; + extern int suid_dumpable; ++extern int sysctl_at_vsyscall; + extern char core_pattern[]; + extern int cad_pid; + extern int pid_max; +@@ -72,6 +75,12 @@ extern int printk_ratelimit_burst; + extern int pid_max_min, pid_max_max; + extern int sysctl_drop_caches; + extern int percpu_pagelist_fraction; ++#ifdef CONFIG_VE ++int glob_virt_pids = 1; ++EXPORT_SYMBOL(glob_virt_pids); ++#endif ++ ++extern int ve_area_access_check; /* fs/namei.c */ + + #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) + int unknown_nmi_panic; +@@ -101,6 +110,10 @@ extern int msg_ctlmnb; + extern int msg_ctlmni; + extern int sem_ctls[]; + #endif ++#ifdef CONFIG_SCHED_VCPU ++extern u32 vcpu_sched_timeslice; ++extern u32 vcpu_timeslice; ++#endif + + #ifdef __sparc__ + extern char reboot_command []; +@@ -108,6 +121,8 @@ extern int stop_a_enabled; + extern int scons_pwroff; + #endif + ++extern int alloc_fail_warn; ++ + #ifdef __hppa__ + extern int pwrsw_enabled; + extern int unaligned_enabled; +@@ -122,6 +137,7 @@ extern int spin_retry; + #endif + + extern int sysctl_hz_timer; ++int decode_call_traces = 1; + + #ifdef CONFIG_BSD_PROCESS_ACCT + extern int acct_parm[]; +@@ -131,10 +147,14 
@@ extern int acct_parm[]; + extern int no_unaligned_warning; + #endif + ++#ifdef CONFIG_FAIRSCHED ++extern int fairsched_max_latency; ++int fsch_sysctl_latency(ctl_table *ctl, int write, struct file *filp, ++ void __user *buffer, size_t *lenp, loff_t *ppos); ++#endif ++ + static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, + ctl_table *, void **); +-static int proc_doutsstring(ctl_table *table, int write, struct file *filp, +- void __user *buffer, size_t *lenp, loff_t *ppos); + + static ctl_table root_table[]; + static struct ctl_table_header root_table_header = +@@ -178,6 +198,8 @@ static void register_proc_table(ctl_tabl + static void unregister_proc_table(ctl_table *, struct proc_dir_entry *); + #endif + ++extern struct new_utsname virt_utsname; ++ + /* The default sysctl tables: */ + + static ctl_table root_table[] = { +@@ -276,6 +298,15 @@ static ctl_table kern_table[] = { + .strategy = &sysctl_string, + }, + { ++ .ctl_name = KERN_VIRT_OSRELEASE, ++ .procname = "virt_osrelease", ++ .data = virt_utsname.release, ++ .maxlen = sizeof(virt_utsname.release), ++ .mode = 0644, ++ .proc_handler = &proc_doutsstring, ++ .strategy = &sysctl_string, ++ }, ++ { + .ctl_name = KERN_PANIC, + .procname = "panic", + .data = &panic_timeout, +@@ -353,6 +384,22 @@ static ctl_table kern_table[] = { + .proc_handler = &proc_dointvec, + }, + #endif ++ { ++ .ctl_name = KERN_SILENCE_LEVEL, ++ .procname = "silence-level", ++ .data = &console_silence_loglevel, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec ++ }, ++ { ++ .ctl_name = KERN_ALLOC_FAIL_WARN, ++ .procname = "alloc_fail_warn", ++ .data = &alloc_fail_warn, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec ++ }, + #ifdef __hppa__ + { + .ctl_name = KERN_HPPA_PWRSW, +@@ -579,6 +626,24 @@ static ctl_table kern_table[] = { + .proc_handler = &proc_dointvec, + }, + #endif ++#ifdef CONFIG_SCHED_VCPU ++ { ++ .ctl_name = 
KERN_VCPU_SCHED_TIMESLICE, ++ .procname = "vcpu_sched_timeslice", ++ .data = &vcpu_sched_timeslice, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { ++ .ctl_name = KERN_VCPU_TIMESLICE, ++ .procname = "vcpu_timeslice", ++ .data = &vcpu_timeslice, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++#endif + { + .ctl_name = KERN_PIDMAX, + .procname = "pid_max", +@@ -590,6 +655,16 @@ static ctl_table kern_table[] = { + .extra1 = &pid_max_min, + .extra2 = &pid_max_max, + }, ++#ifdef CONFIG_VE ++ { ++ .ctl_name = KERN_VIRT_PIDS, ++ .procname = "virt_pids", ++ .data = &glob_virt_pids, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++#endif + { + .ctl_name = KERN_PANIC_ON_OOPS, + .procname = "panic_on_oops", +@@ -683,6 +758,16 @@ static ctl_table kern_table[] = { + .proc_handler = &proc_dointvec, + }, + #endif ++#ifdef CONFIG_FAIRSCHED ++ { ++ .ctl_name = KERN_FAIRSCHED_MAX_LATENCY, ++ .procname = "fairsched-max-latency", ++ .data = &fairsched_max_latency, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &fsch_sysctl_latency ++ }, ++#endif + { .ctl_name = 0 } + }; + +@@ -1046,10 +1131,26 @@ static ctl_table fs_table[] = { + .mode = 0644, + .proc_handler = &proc_dointvec, + }, ++ { ++ .ctl_name = FS_AT_VSYSCALL, ++ .procname = "vsyscall", ++ .data = &sysctl_at_vsyscall, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec ++ }, + { .ctl_name = 0 } + }; + + static ctl_table debug_table[] = { ++ { ++ .ctl_name = DBG_DECODE_CALLTRACES, ++ .procname = "decode_call_traces", ++ .data = &decode_call_traces, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec ++ }, + { .ctl_name = 0 } + }; + +@@ -1113,6 +1214,7 @@ int do_sysctl(int __user *name, int nlen + { + struct list_head *tmp; + int error = -ENOTDIR; ++ struct ve_struct *ve; + + if (nlen <= 0 || nlen >= CTL_MAXNAME) + return -ENOTDIR; +@@ -1121,13 +1223,24 
@@ int do_sysctl(int __user *name, int nlen + if (!oldlenp || get_user(old_len, oldlenp)) + return -EFAULT; + } ++ ve = get_exec_env(); + spin_lock(&sysctl_lock); ++#ifdef CONFIG_VE ++ tmp = ve->sysctl_lh.next; ++#else + tmp = &root_table_header.ctl_entry; ++#endif + do { +- struct ctl_table_header *head = +- list_entry(tmp, struct ctl_table_header, ctl_entry); ++ struct ctl_table_header *head; + void *context = NULL; + ++#ifdef CONFIG_VE ++ if (tmp == &ve->sysctl_lh) ++ /* second pass over global variables */ ++ tmp = &root_table_header.ctl_entry; ++#endif ++ ++ head = list_entry(tmp, struct ctl_table_header, ctl_entry); + if (!use_table(head)) + continue; + +@@ -1181,10 +1294,14 @@ static int test_perm(int mode, int op) + static inline int ctl_perm(ctl_table *table, int op) + { + int error; ++ int mode = table->mode; ++ + error = security_sysctl(table, op); + if (error) + return error; +- return test_perm(table->mode, op); ++ if (!ve_accessible(table->owner_env, get_exec_env())) ++ mode &= ~0222; /* disable write access */ ++ return test_perm(mode, op); + } + + static int parse_table(int __user *name, int nlen, +@@ -1350,6 +1467,8 @@ struct ctl_table_header *register_sysctl + int insert_at_head) + { + struct ctl_table_header *tmp; ++ struct list_head *lh; ++ + tmp = kmalloc(sizeof(struct ctl_table_header), GFP_KERNEL); + if (!tmp) + return NULL; +@@ -1358,17 +1477,52 @@ struct ctl_table_header *register_sysctl + tmp->used = 0; + tmp->unregistering = NULL; + spin_lock(&sysctl_lock); ++#ifdef CONFIG_VE ++ lh = &get_exec_env()->sysctl_lh; ++#else ++ lh = &root_table_header.ctl_entry; ++#endif + if (insert_at_head) +- list_add(&tmp->ctl_entry, &root_table_header.ctl_entry); ++ list_add(&tmp->ctl_entry, lh); + else +- list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry); ++ list_add_tail(&tmp->ctl_entry, lh); + spin_unlock(&sysctl_lock); + #ifdef CONFIG_PROC_FS ++#ifdef CONFIG_VE ++ register_proc_table(table, get_exec_env()->proc_sys_root, tmp); ++#else + 
register_proc_table(table, proc_sys_root, tmp); + #endif ++#endif + return tmp; + } + ++void free_sysctl_clone(ctl_table *clone) ++{ ++ kfree(clone); ++} ++ ++ctl_table *clone_sysctl_template(ctl_table *tmpl, int nr) ++{ ++ int i; ++ ctl_table *clone; ++ ++ clone = kmalloc(nr * sizeof(ctl_table), GFP_KERNEL); ++ if (clone == NULL) ++ return NULL; ++ ++ memcpy(clone, tmpl, nr * sizeof(ctl_table)); ++ for (i = 0; i < nr; i++) { ++ if (tmpl[i].ctl_name == 0) ++ continue; ++ clone[i].owner_env = get_exec_env(); ++ if (tmpl[i].child == NULL) ++ continue; ++ clone[i].child = clone + (tmpl[i].child - tmpl); ++ } ++ return clone; ++} ++ + /** + * unregister_sysctl_table - unregister a sysctl table hierarchy + * @header: the header returned from register_sysctl_table +@@ -1382,8 +1536,12 @@ void unregister_sysctl_table(struct ctl_ + spin_lock(&sysctl_lock); + start_unregistering(header); + #ifdef CONFIG_PROC_FS ++#ifdef CONFIG_VE ++ unregister_proc_table(header->ctl_table, get_exec_env()->proc_sys_root); ++#else + unregister_proc_table(header->ctl_table, proc_sys_root); + #endif ++#endif + spin_unlock(&sysctl_lock); + kfree(header); + } +@@ -1469,11 +1627,6 @@ static void unregister_proc_table(ctl_ta + * its fields. We are under sysctl_lock here. + */ + de->data = NULL; +- +- /* Don't unregister proc entries that are still being used.. */ +- if (atomic_read(&de->count)) +- continue; +- + table->de = NULL; + remove_proc_entry(table->procname, root); + } +@@ -1615,7 +1768,7 @@ int proc_dostring(ctl_table *table, int + * to observe. Should this be in kernel/sys.c ???? 
+ */ + +-static int proc_doutsstring(ctl_table *table, int write, struct file *filp, ++int proc_doutsstring(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) + { + int r; +@@ -2190,7 +2343,7 @@ int proc_dostring(ctl_table *table, int + return -ENOSYS; + } + +-static int proc_doutsstring(ctl_table *table, int write, struct file *filp, ++int proc_doutsstring(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) + { + return -ENOSYS; +@@ -2494,6 +2647,14 @@ void unregister_sysctl_table(struct ctl_ + { + } + ++ctl_table * clone_sysctl_template(ctl_table *tmpl, int nr) ++{ ++ return NULL; ++} ++ ++void free_sysctl_clone(ctl_table *tmpl) ++{ ++} + #endif /* CONFIG_SYSCTL */ + + /* +@@ -2506,6 +2667,7 @@ EXPORT_SYMBOL(proc_dointvec_minmax); + EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); + EXPORT_SYMBOL(proc_dointvec_ms_jiffies); + EXPORT_SYMBOL(proc_dostring); ++EXPORT_SYMBOL(proc_doutsstring); + EXPORT_SYMBOL(proc_doulongvec_minmax); + EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); + EXPORT_SYMBOL(register_sysctl_table); +@@ -2514,3 +2676,5 @@ EXPORT_SYMBOL(sysctl_jiffies); + EXPORT_SYMBOL(sysctl_ms_jiffies); + EXPORT_SYMBOL(sysctl_string); + EXPORT_SYMBOL(unregister_sysctl_table); ++EXPORT_SYMBOL(clone_sysctl_template); ++EXPORT_SYMBOL(free_sysctl_clone); +diff -upr linux-2.6.16.orig/kernel/timer.c linux-2.6.16-026test015/kernel/timer.c +--- linux-2.6.16.orig/kernel/timer.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/kernel/timer.c 2006-07-04 14:41:38.000000000 +0400 +@@ -460,7 +460,11 @@ static inline void __run_timers(tvec_bas + spin_unlock_irq(&base->t_base.lock); + { + int preempt_count = preempt_count(); ++ struct ve_struct *ve; ++ ++ ve = set_exec_env(get_ve0()); + fn(data); ++ (void)set_exec_env(ve); + if (preempt_count != preempt_count()) { + printk(KERN_WARNING "huh, entered %p " + "with preempt_count %08x, exited" +@@ -868,6 +872,23 @@ 
EXPORT_SYMBOL(avenrun); + * calc_load - given tick count, update the avenrun load estimates. + * This is called while holding a write_lock on xtime_lock. + */ ++ ++static void calc_load_ve(void) ++{ ++ unsigned long flags, nr_unint; ++ ++ nr_unint = nr_uninterruptible() * FIXED_1; ++ spin_lock_irqsave(&kstat_glb_lock, flags); ++ CALC_LOAD(kstat_glob.nr_unint_avg[0], EXP_1, nr_unint); ++ CALC_LOAD(kstat_glob.nr_unint_avg[1], EXP_5, nr_unint); ++ CALC_LOAD(kstat_glob.nr_unint_avg[2], EXP_15, nr_unint); ++ spin_unlock_irqrestore(&kstat_glb_lock, flags); ++ ++#ifdef CONFIG_VE ++ do_update_load_avg_ve(); ++#endif ++} ++ + static inline void calc_load(unsigned long ticks) + { + unsigned long active_tasks; /* fixed-point */ +@@ -880,6 +901,7 @@ static inline void calc_load(unsigned lo + CALC_LOAD(avenrun[0], EXP_1, active_tasks); + CALC_LOAD(avenrun[1], EXP_5, active_tasks); + CALC_LOAD(avenrun[2], EXP_15, active_tasks); ++ calc_load_ve(); + } + } + +@@ -990,7 +1012,7 @@ asmlinkage unsigned long sys_alarm(unsig + */ + asmlinkage long sys_getpid(void) + { +- return current->tgid; ++ return virt_tgid(current); + } + + /* +@@ -1012,12 +1034,13 @@ asmlinkage long sys_getpid(void) + asmlinkage long sys_getppid(void) + { + int pid; ++#ifndef CONFIG_DEBUG_SLAB + struct task_struct *me = current; + struct task_struct *parent; + + parent = me->group_leader->real_parent; + for (;;) { +- pid = parent->tgid; ++ pid = virt_tgid(parent); + #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) + { + struct task_struct *old = parent; +@@ -1034,6 +1057,16 @@ asmlinkage long sys_getppid(void) + #endif + break; + } ++#else ++ /* ++ * ->real_parent could be released before dereference and ++ * we accessed freed kernel memory, which faults with debugging on. ++ * Keep it simple and stupid. 
++ */ ++ read_lock(&tasklist_lock); ++ pid = virt_tgid(current->group_leader->real_parent); ++ read_unlock(&tasklist_lock); ++#endif + return pid; + } + +@@ -1164,7 +1197,7 @@ EXPORT_SYMBOL(schedule_timeout_uninterru + /* Thread ID - the internal kernel "pid" */ + asmlinkage long sys_gettid(void) + { +- return current->pid; ++ return virt_pid(current); + } + + /* +@@ -1176,11 +1209,12 @@ asmlinkage long sys_sysinfo(struct sysin + unsigned long mem_total, sav_total; + unsigned int mem_unit, bitcount; + unsigned long seq; ++ unsigned long *__avenrun; ++ struct timespec tp; + + memset((char *)&val, 0, sizeof(struct sysinfo)); + + do { +- struct timespec tp; + seq = read_seqbegin(&xtime_lock); + + /* +@@ -1197,14 +1231,25 @@ asmlinkage long sys_sysinfo(struct sysin + tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC; + tp.tv_sec++; + } +- val.uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); +- +- val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); +- val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); +- val.loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); ++ } while (read_seqretry(&xtime_lock, seq)); + ++ if (ve_is_super(get_exec_env())) { ++ val.uptime = tp.tv_sec + (tp.tv_nsec ? 
1 : 0); ++ __avenrun = &avenrun[0]; + val.procs = nr_threads; +- } while (read_seqretry(&xtime_lock, seq)); ++ } ++#ifdef CONFIG_VE ++ else { ++ struct ve_struct *ve; ++ ve = get_exec_env(); ++ __avenrun = &ve->avenrun[0]; ++ val.procs = atomic_read(&ve->pcounter); ++ val.uptime = tp.tv_sec - ve->start_timespec.tv_sec; ++ } ++#endif ++ val.loads[0] = __avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); ++ val.loads[1] = __avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); ++ val.loads[2] = __avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); + + si_meminfo(&val); + si_swapinfo(&val); +diff -upr linux-2.6.16.orig/kernel/ub/Kconfig linux-2.6.16-026test015/kernel/ub/Kconfig +--- linux-2.6.16.orig/kernel/ub/Kconfig 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/ub/Kconfig 2006-07-04 14:41:37.000000000 +0400 +@@ -0,0 +1,89 @@ ++# ++# User resources part (UBC) ++# ++# Copyright (C) 2005 SWsoft ++# All rights reserved. ++# ++# Licensing governed by "linux/COPYING.SWsoft" file. ++ ++menu "User resources" ++ ++config USER_RESOURCE ++ bool "Enable user resource accounting" ++ default y ++ help ++ This patch provides accounting and allows to configure ++ limits for user's consumption of exhaustible system resources. ++ The most important resource controlled by this patch is unswappable ++ memory (either mlock'ed or used by internal kernel structures and ++ buffers). The main goal of this patch is to protect processes ++ from running short of important resources because of an accidental ++ misbehavior of processes or malicious activity aiming to ``kill'' ++ the system. It's worth to mention that resource limits configured ++ by setrlimit(2) do not give an acceptable level of protection ++ because they cover only small fraction of resources and work on a ++ per-process basis. Per-process accounting doesn't prevent malicious ++ users from spawning a lot of resource-consuming processes. 
++ ++config USER_RSS_ACCOUNTING ++ bool "Account physical memory usage" ++ default y ++ depends on USER_RESOURCE ++ help ++ This allows to estimate per beancounter physical memory usage. ++ Implemented alghorithm accounts shared pages of memory as well, ++ dividing them by number of beancounter which use the page. ++ ++config USER_SWAP_ACCOUNTING ++ bool "Account swap usage" ++ default y ++ depends on USER_RESOURCE ++ help ++ This allows accounting of swap usage. ++ ++config USER_RESOURCE_PROC ++ bool "Report resource usage in /proc" ++ default y ++ depends on USER_RESOURCE ++ help ++ Allows a system administrator to inspect resource accounts and limits. ++ ++config UBC_DEBUG ++ bool "User resources debug features" ++ default n ++ depends on USER_RESOURCE ++ help ++ Enables to setup debug features for user resource accounting ++ ++config UBC_DEBUG_KMEM ++ bool "Debug kmemsize with cache counters" ++ default n ++ depends on UBC_DEBUG ++ help ++ Adds /proc/user_beancounters_debug entry to get statistics ++ about cache usage of each beancounter ++ ++config UBC_KEEP_UNUSED ++ bool "Keep unused beancounter alive" ++ default y ++ depends on UBC_DEBUG ++ help ++ If on, unused beancounters are kept on the hash and maxheld value ++ can be looked through. ++ ++config UBC_DEBUG_ITEMS ++ bool "Account resources in items rather than in bytes" ++ default y ++ depends on UBC_DEBUG ++ help ++ When true some of the resources (e.g. kmemsize) are accounted ++ in items instead of bytes. ++ ++config UBC_UNLIMITED ++ bool "Use unlimited ubc settings" ++ default y ++ depends on UBC_DEBUG ++ help ++ When ON all limits and barriers are set to max values. 
++ ++endmenu +diff -upr linux-2.6.16.orig/kernel/ub/Makefile linux-2.6.16-026test015/kernel/ub/Makefile +--- linux-2.6.16.orig/kernel/ub/Makefile 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/ub/Makefile 2006-07-04 14:41:37.000000000 +0400 +@@ -0,0 +1,20 @@ ++# ++# User resources part (UBC) ++# ++# Copyright (C) 2005 SWsoft ++# All rights reserved. ++# ++# Licensing governed by "linux/COPYING.SWsoft" file. ++ ++obj-y := ub_sys.o ++obj-$(CONFIG_USER_RESOURCE) += beancounter.o ++obj-$(CONFIG_USER_RESOURCE) += ub_dcache.o ++obj-$(CONFIG_USER_RESOURCE) += ub_mem.o ++obj-$(CONFIG_USER_RESOURCE) += ub_misc.o ++obj-$(CONFIG_USER_RESOURCE) += ub_net.o ++obj-$(CONFIG_USER_RESOURCE) += ub_pages.o ++obj-$(CONFIG_USER_RESOURCE) += ub_stat.o ++# obj-$(CONFIG_USER_RESOURCE) += ub_oom.o ++ ++obj-$(CONFIG_USER_RSS_ACCOUNTING) += ub_page_bc.o ++obj-$(CONFIG_USER_RESOURCE_PROC) += ub_proc.o +diff -upr linux-2.6.16.orig/kernel/ub/beancounter.c linux-2.6.16-026test015/kernel/ub/beancounter.c +--- linux-2.6.16.orig/kernel/ub/beancounter.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/ub/beancounter.c 2006-07-04 14:41:37.000000000 +0400 +@@ -0,0 +1,675 @@ ++/* ++ * linux/kernel/ub/beancounter.c ++ * ++ * Copyright (C) 1998 Alan Cox ++ * 1998-2000 Andrey V. Savochkin <saw@saw.sw.com.sg> ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ * TODO: ++ * - more intelligent limit check in mremap(): currently the new size is ++ * charged and _then_ old size is uncharged ++ * (almost done: !move_vma case is completely done, ++ * move_vma in its current implementation requires too many conditions to ++ * do things right, because it may be not only expansion, but shrinking ++ * also, plus do_munmap will require an additional parameter...) 
++ * - problem: bad pmd page handling ++ * - consider /proc redesign ++ * - TCP/UDP ports ++ * + consider whether __charge_beancounter_locked should be inline ++ * ++ * Changes: ++ * 1999/08/17 Marcelo Tosatti <marcelo@conectiva.com.br> ++ * - Set "barrier" and "limit" parts of limits atomically. ++ * 1999/10/06 Marcelo Tosatti <marcelo@conectiva.com.br> ++ * - setublimit system call. ++ */ ++ ++#include <linux/slab.h> ++#include <linux/module.h> ++ ++#include <ub/beancounter.h> ++#include <ub/ub_hash.h> ++#include <ub/ub_vmpages.h> ++ ++static kmem_cache_t *ub_cachep; ++static struct user_beancounter default_beancounter; ++struct user_beancounter ub0; ++ ++const char *ub_rnames[] = { ++ "kmemsize", /* 0 */ ++ "lockedpages", ++ "privvmpages", ++ "shmpages", ++ "dummy", ++ "numproc", /* 5 */ ++ "physpages", ++ "vmguarpages", ++ "oomguarpages", ++ "numtcpsock", ++ "numflock", /* 10 */ ++ "numpty", ++ "numsiginfo", ++ "tcpsndbuf", ++ "tcprcvbuf", ++ "othersockbuf", /* 15 */ ++ "dgramrcvbuf", ++ "numothersock", ++ "dcachesize", ++ "numfile", ++ "dummy", /* 20 */ ++ "dummy", ++ "dummy", ++ "numiptent", ++ "unused_privvmpages", /* UB_RESOURCES */ ++ "tmpfs_respages", ++ "swap_pages", ++ "held_pages", ++}; ++ ++static void init_beancounter_struct(struct user_beancounter *ub); ++static void init_beancounter_store(struct user_beancounter *ub); ++static void init_beancounter_nolimits(struct user_beancounter *ub); ++ ++void print_ub_uid(struct user_beancounter *ub, char *buf, int size) ++{ ++ if (ub->parent != NULL) ++ snprintf(buf, size, "%u.%u", ub->parent->ub_uid, ub->ub_uid); ++ else ++ snprintf(buf, size, "%u", ub->ub_uid); ++} ++EXPORT_SYMBOL(print_ub_uid); ++ ++#define ub_hash_fun(x) ((((x) >> 8) ^ (x)) & (UB_HASH_SIZE - 1)) ++#define ub_subhash_fun(p, id) ub_hash_fun((p)->ub_uid + (id) * 17) ++struct ub_hash_slot ub_hash[UB_HASH_SIZE]; ++spinlock_t ub_hash_lock; ++EXPORT_SYMBOL(ub_hash); ++EXPORT_SYMBOL(ub_hash_lock); ++ ++/* ++ * Per user resource beancounting. 
Resources are tied to their luid. ++ * The resource structure itself is tagged both to the process and ++ * the charging resources (a socket doesn't want to have to search for ++ * things at irq time for example). Reference counters keep things in ++ * hand. ++ * ++ * The case where a user creates resource, kills all his processes and ++ * then starts new ones is correctly handled this way. The refcounters ++ * will mean the old entry is still around with resource tied to it. ++ */ ++struct user_beancounter *get_beancounter_byuid(uid_t uid, int create) ++{ ++ struct user_beancounter *new_ub, *ub; ++ unsigned long flags; ++ struct ub_hash_slot *slot; ++ ++ slot = &ub_hash[ub_hash_fun(uid)]; ++ new_ub = NULL; ++ ++retry: ++ spin_lock_irqsave(&ub_hash_lock, flags); ++ ub = slot->ubh_beans; ++ while (ub != NULL && (ub->ub_uid != uid || ub->parent != NULL)) ++ ub = ub->ub_next; ++ ++ if (ub != NULL) { ++ /* found */ ++ get_beancounter(ub); ++ spin_unlock_irqrestore(&ub_hash_lock, flags); ++ if (new_ub != NULL) ++ kmem_cache_free(ub_cachep, new_ub); ++ return ub; ++ } ++ ++ if (!create) { ++ /* no ub found */ ++ spin_unlock_irqrestore(&ub_hash_lock, flags); ++ return NULL; ++ } ++ ++ if (new_ub != NULL) { ++ /* install new ub */ ++ new_ub->ub_next = slot->ubh_beans; ++ slot->ubh_beans = new_ub; ++ spin_unlock_irqrestore(&ub_hash_lock, flags); ++ return new_ub; ++ } ++ spin_unlock_irqrestore(&ub_hash_lock, flags); ++ ++ /* alloc new ub */ ++ new_ub = (struct user_beancounter *)kmem_cache_alloc(ub_cachep, ++ GFP_KERNEL); ++ if (new_ub == NULL) ++ return NULL; ++ ++ ub_debug(UBD_ALLOC, "Creating ub %p in slot %p\n", new_ub, slot); ++ memcpy(new_ub, &default_beancounter, sizeof(*new_ub)); ++ init_beancounter_struct(new_ub); ++ new_ub->ub_uid = uid; ++ goto retry; ++} ++EXPORT_SYMBOL(get_beancounter_byuid); ++ ++struct user_beancounter *get_subbeancounter_byid(struct user_beancounter *p, ++ int id, int create) ++{ ++ struct user_beancounter *new_ub, *ub; ++ unsigned long 
flags; ++ struct ub_hash_slot *slot; ++ ++ slot = &ub_hash[ub_subhash_fun(p, id)]; ++ new_ub = NULL; ++ ++retry: ++ spin_lock_irqsave(&ub_hash_lock, flags); ++ ub = slot->ubh_beans; ++ while (ub != NULL && (ub->parent != p || ub->ub_uid != id)) ++ ub = ub->ub_next; ++ ++ if (ub != NULL) { ++ /* found */ ++ get_beancounter(ub); ++ spin_unlock_irqrestore(&ub_hash_lock, flags); ++ if (new_ub != NULL) { ++ put_beancounter(new_ub->parent); ++ kmem_cache_free(ub_cachep, new_ub); ++ } ++ return ub; ++ } ++ ++ if (!create) { ++ /* no ub found */ ++ spin_unlock_irqrestore(&ub_hash_lock, flags); ++ return NULL; ++ } ++ ++ if (new_ub != NULL) { ++ /* install new ub */ ++ get_beancounter(new_ub); ++ new_ub->ub_next = slot->ubh_beans; ++ slot->ubh_beans = new_ub; ++ spin_unlock_irqrestore(&ub_hash_lock, flags); ++ return new_ub; ++ } ++ spin_unlock_irqrestore(&ub_hash_lock, flags); ++ ++ /* alloc new ub */ ++ new_ub = (struct user_beancounter *)kmem_cache_alloc(ub_cachep, ++ GFP_KERNEL); ++ if (new_ub == NULL) ++ return NULL; ++ ++ ub_debug(UBD_ALLOC, "Creating sub %p in slot %p\n", new_ub, slot); ++ memset(new_ub, 0, sizeof(*new_ub)); ++ init_beancounter_nolimits(new_ub); ++ init_beancounter_store(new_ub); ++ init_beancounter_struct(new_ub); ++ atomic_set(&new_ub->ub_refcount, 0); ++ new_ub->ub_uid = id; ++ new_ub->parent = get_beancounter(p); ++ goto retry; ++} ++EXPORT_SYMBOL(get_subbeancounter_byid); ++ ++struct user_beancounter *subbeancounter_findcreate(struct user_beancounter *p, ++ int id) ++{ ++ struct user_beancounter *ub; ++ unsigned long flags; ++ struct ub_hash_slot *slot; ++ ++ slot = &ub_hash[ub_subhash_fun(p, id)]; ++ ++ spin_lock_irqsave(&ub_hash_lock, flags); ++ ub = slot->ubh_beans; ++ while (ub != NULL && (ub->parent != p || ub->ub_uid != id)) ++ ub = ub->ub_next; ++ ++ if (ub != NULL) { ++ /* found */ ++ get_beancounter(ub); ++ goto done; ++ } ++ ++ /* alloc new ub */ ++ /* Can be called from non-atomic contexts. 
Den */ ++ ub = (struct user_beancounter *)kmem_cache_alloc(ub_cachep, GFP_ATOMIC); ++ if (ub == NULL) ++ goto done; ++ ++ ub_debug(UBD_ALLOC, "Creating sub %p in slot %p\n", ub, slot); ++ memset(ub, 0, sizeof(*ub)); ++ init_beancounter_nolimits(ub); ++ init_beancounter_store(ub); ++ init_beancounter_struct(ub); ++ atomic_set(&ub->ub_refcount, 0); ++ ub->ub_uid = id; ++ ub->parent = get_beancounter(p); ++ ++ /* install new ub */ ++ get_beancounter(ub); ++ ub->ub_next = slot->ubh_beans; ++ slot->ubh_beans = ub; ++ ++done: ++ spin_unlock_irqrestore(&ub_hash_lock, flags); ++ return ub; ++} ++EXPORT_SYMBOL(subbeancounter_findcreate); ++#ifndef CONFIG_UBC_KEEP_UNUSED ++ ++static int verify_res(struct user_beancounter *ub, int resource, ++ unsigned long held) ++{ ++ char id[64]; ++ ++ if (likely(held == 0)) ++ return 1; ++ ++ print_ub_uid(ub, id, sizeof(id)); ++ printk(KERN_WARNING "Ub %s helds %lu in %s on put\n", ++ id, held, ub_rnames[resource]); ++ return 0; ++} ++ ++static inline void verify_held(struct user_beancounter *ub) ++{ ++ int i, clean; ++ ++ clean = 1; ++ for (i = 0; i < UB_RESOURCES; i++) ++ clean &= verify_res(ub, i, ub->ub_parms[i].held); ++ ++ clean &= verify_res(ub, UB_UNUSEDPRIVVM, ub->ub_unused_privvmpages); ++ clean &= verify_res(ub, UB_TMPFSPAGES, ub->ub_tmpfs_respages); ++ clean &= verify_res(ub, UB_SWAPPAGES, ub->ub_swap_pages); ++ clean &= verify_res(ub, UB_HELDPAGES, (unsigned long)ub->ub_held_pages); ++ ++ ub_debug_trace(!clean, 5, 60*HZ); ++} ++ ++static void __unhash_beancounter(struct user_beancounter *ub) ++{ ++ struct user_beancounter **ubptr; ++ struct ub_hash_slot *slot; ++ ++ if (ub->parent != NULL) ++ slot = &ub_hash[ub_subhash_fun(ub->parent, ub->ub_uid)]; ++ else ++ slot = &ub_hash[ub_hash_fun(ub->ub_uid)]; ++ ubptr = &slot->ubh_beans; ++ ++ while (*ubptr != NULL) { ++ if (*ubptr == ub) { ++ verify_held(ub); ++ *ubptr = ub->ub_next; ++ return; ++ } ++ ubptr = &((*ubptr)->ub_next); ++ } ++ printk(KERN_ERR "Invalid beancounter %p, 
luid=%d on free, slot %p\n", ++ ub, ub->ub_uid, slot); ++} ++#endif ++ ++void __put_beancounter(struct user_beancounter *ub) ++{ ++ unsigned long flags; ++ struct user_beancounter *parent; ++ ++again: ++ parent = ub->parent; ++ ub_debug(UBD_ALLOC, "__put bc %p (cnt %d) for %.20s pid %d " ++ "cur %08lx cpu %d.\n", ++ ub, atomic_read(&ub->ub_refcount), ++ current->comm, current->pid, ++ (unsigned long)current, smp_processor_id()); ++ ++ /* equevalent to atomic_dec_and_lock_irqsave() */ ++ local_irq_save(flags); ++ if (likely(!atomic_dec_and_lock(&ub->ub_refcount, &ub_hash_lock))) { ++ if (unlikely(atomic_read(&ub->ub_refcount) < 0)) ++ printk(KERN_ERR "UB: Bad ub refcount: ub=%p, " ++ "luid=%d, ref=%d\n", ++ ub, ub->ub_uid, ++ atomic_read(&ub->ub_refcount)); ++ local_irq_restore(flags); ++ return; ++ } ++ ++ if (unlikely(ub == get_ub0())) { ++ printk(KERN_ERR "Trying to put ub0\n"); ++ spin_unlock_irqrestore(&ub_hash_lock, flags); ++ return; ++ } ++ ++#ifndef CONFIG_UBC_KEEP_UNUSED ++ __unhash_beancounter(ub); ++ spin_unlock_irqrestore(&ub_hash_lock, flags); ++ ub_free_counters(ub); ++ kmem_cache_free(ub_cachep, ub); ++#else ++ spin_unlock_irqrestore(&ub_hash_lock, flags); ++#endif ++ ub = parent; ++ if (ub != NULL) ++ goto again; ++} ++EXPORT_SYMBOL(__put_beancounter); ++ ++/* ++ * Generic resource charging stuff ++ */ ++ ++int __charge_beancounter_locked(struct user_beancounter *ub, ++ int resource, unsigned long val, enum severity strict) ++{ ++ ub_debug_resource(resource, "Charging %lu for %d of %p with %lu\n", ++ val, resource, ub, ub->ub_parms[resource].held); ++ /* ++ * ub_value <= UB_MAXVALUE, value <= UB_MAXVALUE, and only one addition ++ * at the moment is possible so an overflow is impossible. 
++ */ ++ ub->ub_parms[resource].held += val; ++ ++ switch (strict) { ++ case UB_HARD: ++ if (ub->ub_parms[resource].held > ++ ub->ub_parms[resource].barrier) ++ break; ++ case UB_SOFT: ++ if (ub->ub_parms[resource].held > ++ ub->ub_parms[resource].limit) ++ break; ++ case UB_FORCE: ++ ub_adjust_maxheld(ub, resource); ++ return 0; ++ default: ++ BUG(); ++ } ++ ++ if (strict == UB_SOFT && ub_ratelimit(&ub->ub_limit_rl)) ++ printk(KERN_INFO "Fatal resource shortage: %s, UB %d.\n", ++ ub_rnames[resource], ub->ub_uid); ++ ub->ub_parms[resource].failcnt++; ++ ub->ub_parms[resource].held -= val; ++ return -ENOMEM; ++} ++ ++int charge_beancounter(struct user_beancounter *ub, ++ int resource, unsigned long val, enum severity strict) ++{ ++ int retval; ++ struct user_beancounter *p, *q; ++ unsigned long flags; ++ ++ retval = -EINVAL; ++ if (val > UB_MAXVALUE) ++ goto out; ++ ++ local_irq_save(flags); ++ for (p = ub; p != NULL; p = p->parent) { ++ spin_lock(&p->ub_lock); ++ retval = __charge_beancounter_locked(p, resource, val, strict); ++ spin_unlock(&p->ub_lock); ++ if (retval) ++ goto unroll; ++ } ++out_restore: ++ local_irq_restore(flags); ++out: ++ return retval; ++ ++unroll: ++ for (q = ub; q != p; q = q->parent) { ++ spin_lock(&q->ub_lock); ++ __uncharge_beancounter_locked(q, resource, val); ++ spin_unlock(&q->ub_lock); ++ } ++ goto out_restore; ++} ++ ++EXPORT_SYMBOL(charge_beancounter); ++ ++void charge_beancounter_notop(struct user_beancounter *ub, ++ int resource, unsigned long val) ++{ ++ struct user_beancounter *p; ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ for (p = ub; p->parent != NULL; p = p->parent) { ++ spin_lock(&p->ub_lock); ++ __charge_beancounter_locked(p, resource, val, UB_FORCE); ++ spin_unlock(&p->ub_lock); ++ } ++ local_irq_restore(flags); ++} ++ ++EXPORT_SYMBOL(charge_beancounter_notop); ++ ++void uncharge_warn(struct user_beancounter *ub, int resource, ++ unsigned long val, unsigned long held) ++{ ++ char id[64]; ++ ++ print_ub_uid(ub, 
id, sizeof(id)); ++ printk(KERN_ERR "Uncharging too much %lu h %lu, res %s ub %s\n", ++ val, held, ub_rnames[resource], id); ++ ub_debug_trace(1, 10, 10*HZ); ++} ++ ++void __uncharge_beancounter_locked(struct user_beancounter *ub, ++ int resource, unsigned long val) ++{ ++ ub_debug_resource(resource, "Uncharging %lu for %d of %p with %lu\n", ++ val, resource, ub, ub->ub_parms[resource].held); ++ if (ub->ub_parms[resource].held < val) { ++ uncharge_warn(ub, resource, ++ val, ub->ub_parms[resource].held); ++ val = ub->ub_parms[resource].held; ++ } ++ ub->ub_parms[resource].held -= val; ++} ++ ++void uncharge_beancounter(struct user_beancounter *ub, ++ int resource, unsigned long val) ++{ ++ unsigned long flags; ++ struct user_beancounter *p; ++ ++ for (p = ub; p != NULL; p = p->parent) { ++ spin_lock_irqsave(&p->ub_lock, flags); ++ __uncharge_beancounter_locked(p, resource, val); ++ spin_unlock_irqrestore(&p->ub_lock, flags); ++ } ++} ++ ++EXPORT_SYMBOL(uncharge_beancounter); ++ ++void uncharge_beancounter_notop(struct user_beancounter *ub, ++ int resource, unsigned long val) ++{ ++ struct user_beancounter *p; ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ for (p = ub; p->parent != NULL; p = p->parent) { ++ spin_lock(&p->ub_lock); ++ __uncharge_beancounter_locked(p, resource, val); ++ spin_unlock(&p->ub_lock); ++ } ++ local_irq_restore(flags); ++} ++ ++EXPORT_SYMBOL(uncharge_beancounter_notop); ++ ++ ++/* ++ * Rate limiting stuff. 
++ */ ++int ub_ratelimit(struct ub_rate_info *p) ++{ ++ unsigned long cjif, djif; ++ unsigned long flags; ++ static spinlock_t ratelimit_lock = SPIN_LOCK_UNLOCKED; ++ long new_bucket; ++ ++ spin_lock_irqsave(&ratelimit_lock, flags); ++ cjif = jiffies; ++ djif = cjif - p->last; ++ if (djif < p->interval) { ++ if (p->bucket >= p->burst) { ++ spin_unlock_irqrestore(&ratelimit_lock, flags); ++ return 0; ++ } ++ p->bucket++; ++ } else { ++ new_bucket = p->bucket - (djif / (unsigned)p->interval); ++ if (new_bucket < 0) ++ new_bucket = 0; ++ p->bucket = new_bucket + 1; ++ } ++ p->last = cjif; ++ spin_unlock_irqrestore(&ratelimit_lock, flags); ++ return 1; ++} ++EXPORT_SYMBOL(ub_ratelimit); ++ ++ ++/* ++ * Initialization ++ * ++ * struct user_beancounter contains ++ * - limits and other configuration settings, ++ * with a copy stored for accounting purposes, ++ * - structural fields: lists, spinlocks and so on. ++ * ++ * Before these parts are initialized, the structure should be memset ++ * to 0 or copied from a known clean structure. That takes care of a lot ++ * of fields not initialized explicitly. ++ */ ++ ++static void init_beancounter_struct(struct user_beancounter *ub) ++{ ++ ub->ub_magic = UB_MAGIC; ++ atomic_set(&ub->ub_refcount, 1); ++ spin_lock_init(&ub->ub_lock); ++ INIT_LIST_HEAD(&ub->ub_tcp_sk_list); ++ INIT_LIST_HEAD(&ub->ub_other_sk_list); ++#ifdef CONFIG_UBC_DEBUG_KMEM ++ INIT_LIST_HEAD(&ub->ub_cclist); ++#endif ++} ++ ++static void init_beancounter_store(struct user_beancounter *ub) ++{ ++ int k; ++ ++ for (k = 0; k < UB_RESOURCES; k++) { ++ memcpy(&ub->ub_store[k], &ub->ub_parms[k], ++ sizeof(struct ubparm)); ++ } ++} ++ ++static void init_beancounter_nolimits(struct user_beancounter *ub) ++{ ++ int k; ++ ++ for (k = 0; k < UB_RESOURCES; k++) { ++ ub->ub_parms[k].limit = UB_MAXVALUE; ++ /* FIXME: whether this is right for physpages and guarantees? */ ++ ub->ub_parms[k].barrier = UB_MAXVALUE; ++ } ++ ++ /* FIXME: set unlimited rate? 
*/ ++ ub->ub_limit_rl.burst = 4; ++ ub->ub_limit_rl.interval = 300*HZ; ++} ++ ++static void init_beancounter_syslimits(struct user_beancounter *ub, ++ unsigned long mp) ++{ ++ extern int max_threads; ++ int k; ++ ++ ub->ub_parms[UB_KMEMSIZE].limit = ++ mp > (192*1024*1024 >> PAGE_SHIFT) ? ++ 32*1024*1024 : (mp << PAGE_SHIFT) / 6; ++ ub->ub_parms[UB_LOCKEDPAGES].limit = 8; ++ ub->ub_parms[UB_PRIVVMPAGES].limit = UB_MAXVALUE; ++ ub->ub_parms[UB_SHMPAGES].limit = 64; ++ ub->ub_parms[UB_NUMPROC].limit = max_threads / 2; ++ ub->ub_parms[UB_NUMTCPSOCK].limit = 1024; ++ ub->ub_parms[UB_TCPSNDBUF].limit = 1024*4*1024; /* 4k per socket */ ++ ub->ub_parms[UB_TCPRCVBUF].limit = 1024*6*1024; /* 6k per socket */ ++ ub->ub_parms[UB_NUMOTHERSOCK].limit = 256; ++ ub->ub_parms[UB_DGRAMRCVBUF].limit = 256*4*1024; /* 4k per socket */ ++ ub->ub_parms[UB_OTHERSOCKBUF].limit = 256*8*1024; /* 8k per socket */ ++ ub->ub_parms[UB_NUMFLOCK].limit = 1024; ++ ub->ub_parms[UB_NUMPTY].limit = 16; ++ ub->ub_parms[UB_NUMSIGINFO].limit = 1024; ++ ub->ub_parms[UB_DCACHESIZE].limit = 1024*1024; ++ ub->ub_parms[UB_NUMFILE].limit = 1024; ++ ++ for (k = 0; k < UB_RESOURCES; k++) ++ ub->ub_parms[k].barrier = ub->ub_parms[k].limit; ++ ++ ub->ub_limit_rl.burst = 4; ++ ub->ub_limit_rl.interval = 300*HZ; ++} ++ ++void __init ub_init_ub0(void) ++{ ++ struct user_beancounter *ub; ++ ++ init_cache_counters(); ++ ub = get_ub0(); ++ memset(ub, 0, sizeof(*ub)); ++ ub->ub_uid = 0; ++ init_beancounter_nolimits(ub); ++ init_beancounter_store(ub); ++ init_beancounter_struct(ub); ++ ++ memset(¤t->task_bc, 0, sizeof(struct task_beancounter)); ++ (void)set_exec_ub(get_ub0()); ++ current->task_bc.fork_sub = get_beancounter(get_ub0()); ++ init_mm.mm_ub = get_beancounter(ub); ++} ++ ++void __init ub_hash_init(void) ++{ ++ struct ub_hash_slot *slot; ++ ++ spin_lock_init(&ub_hash_lock); ++ /* insert ub0 into the hash */ ++ slot = &ub_hash[ub_hash_fun(get_ub0()->ub_uid)]; ++ slot->ubh_beans = get_ub0(); ++} ++ ++void __init 
ub_init_cache(unsigned long mempages) ++{ ++ extern int skbc_cache_init(void); ++ int res; ++ ++ res = 0; /* skbc_cache_init(); */ ++ ub_cachep = kmem_cache_create("user_beancounters", ++ sizeof(struct user_beancounter), ++ 0, SLAB_HWCACHE_ALIGN, NULL, NULL); ++ if (res < 0 || ub_cachep == NULL) ++ panic("Can't create ubc caches\n"); ++ ++ memset(&default_beancounter, 0, sizeof(default_beancounter)); ++#ifdef CONFIG_UBC_UNLIMITED ++ init_beancounter_nolimits(&default_beancounter); ++#else ++ init_beancounter_syslimits(&default_beancounter, mempages); ++#endif ++ init_beancounter_store(&default_beancounter); ++ init_beancounter_struct(&default_beancounter); ++ ++ ub_hash_init(); ++} +diff -upr linux-2.6.16.orig/kernel/ub/ub_dcache.c linux-2.6.16-026test015/kernel/ub/ub_dcache.c +--- linux-2.6.16.orig/kernel/ub/ub_dcache.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/ub/ub_dcache.c 2006-07-04 14:41:37.000000000 +0400 +@@ -0,0 +1,325 @@ ++/* ++ * kernel/ub/ub_dcache.c ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include <linux/config.h> ++#include <linux/dcache.h> ++#include <linux/slab.h> ++#include <linux/kmem_cache.h> ++#include <linux/fs.h> ++#include <linux/err.h> ++ ++#include <ub/beancounter.h> ++#include <ub/ub_mem.h> ++#include <ub/ub_dcache.h> ++ ++/* ++ * Locking ++ * traverse dcache_lock d_lock ++ * ub_dentry_charge + + + ++ * ub_dentry_uncharge + - + ++ * ub_dentry_charge_nofail + + - ++ * ++ * d_inuse is atomic so that we can inc dentry's parent d_inuse in ++ * ub_dentry_charhe with the only dentry's d_lock held. ++ * ++ * Race in uncharge vs charge_nofail is handled with dcache_lock. ++ * Race in charge vs charge_nofail is inessential since they both inc d_inuse. ++ * Race in uncharge vs charge is handled by altering d_inuse under d_lock. 
++ * ++ * Race with d_move is handled this way: ++ * - charge_nofail and uncharge are protected by dcache_lock; ++ * - charge works only with dentry and dentry->d_parent->d_inuse, so ++ * it's enough to lock only the dentry. ++ */ ++ ++/* ++ * Beancounting ++ * UB argument must NOT be NULL ++ */ ++ ++static int do_charge_dcache(struct user_beancounter *ub, unsigned long size, ++ enum severity sv) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ if (__charge_beancounter_locked(ub, UB_KMEMSIZE, CHARGE_SIZE(size), sv)) ++ goto out_mem; ++ if (__charge_beancounter_locked(ub, UB_DCACHESIZE, size, sv)) ++ goto out_dcache; ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ return 0; ++ ++out_dcache: ++ __uncharge_beancounter_locked(ub, UB_KMEMSIZE, CHARGE_SIZE(size)); ++out_mem: ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ return -ENOMEM; ++} ++ ++static void do_uncharge_dcache(struct user_beancounter *ub, ++ unsigned long size) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ __uncharge_beancounter_locked(ub, UB_KMEMSIZE, CHARGE_SIZE(size)); ++ __uncharge_beancounter_locked(ub, UB_DCACHESIZE, size); ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++} ++ ++static int charge_dcache(struct user_beancounter *ub, unsigned long size, ++ enum severity sv) ++{ ++ struct user_beancounter *p, *q; ++ ++ for (p = ub; p != NULL; p = p->parent) { ++ if (do_charge_dcache(p, size, sv)) ++ goto unroll; ++ } ++ return 0; ++ ++unroll: ++ for (q = ub; q != p; q = q->parent) ++ do_uncharge_dcache(q, size); ++ return -ENOMEM; ++} ++ ++void uncharge_dcache(struct user_beancounter *ub, unsigned long size) ++{ ++ for (; ub != NULL; ub = ub->parent) ++ do_uncharge_dcache(ub, size); ++} ++ ++static inline void charge_dcache_forced(struct user_beancounter *ub, ++ unsigned long size) ++{ ++ charge_dcache(ub, size, UB_FORCE); ++} ++ ++static inline void d_forced_charge(struct dentry_beancounter *d_bc) ++{ ++ d_bc->d_ub = 
get_beancounter(get_exec_ub()); ++ if (d_bc->d_ub == NULL) ++ return; ++ ++ charge_dcache_forced(d_bc->d_ub, d_bc->d_ubsize); ++} ++ ++static inline void d_uncharge(struct dentry_beancounter *d_bc) ++{ ++ if (d_bc->d_ub == NULL) ++ return; ++ ++ uncharge_dcache(d_bc->d_ub, d_bc->d_ubsize); ++ put_beancounter(d_bc->d_ub); ++ d_bc->d_ub = NULL; ++} ++ ++/* ++ * Alloc / free dentry_beancounter ++ */ ++ ++static inline int d_alloc_beancounter(struct dentry *d) ++{ ++ return 0; ++} ++ ++static inline void d_free_beancounter(struct dentry_beancounter *d_bc) ++{ ++} ++ ++static inline unsigned long d_charge_size(struct dentry *dentry) ++{ ++ /* dentry's d_name is already set to appropriate value (see d_alloc) */ ++ return inode_cachep->objuse + dentry_cache->objuse + ++ (dname_external(dentry) ? ++ kmem_obj_memusage((void *)dentry->d_name.name) : 0); ++} ++ ++/* ++ * dentry mark in use operation ++ * d_lock is held ++ */ ++ ++static int d_inc_inuse(struct dentry *dentry) ++{ ++ struct user_beancounter *ub; ++ struct dentry_beancounter *d_bc; ++ ++ if (dentry != dentry->d_parent) { ++ struct dentry *parent; ++ ++ /* ++ * Increment d_inuse of parent. ++ * It can't change since dentry->d_lock is held. 
++ */ ++ parent = dentry->d_parent; ++ if (ub_dget_testone(parent)) ++ BUG(); ++ } ++ ++ d_bc = &dentry->dentry_bc; ++ ub = get_beancounter(get_exec_ub()); ++ ++ if (ub != NULL && charge_dcache(ub, d_bc->d_ubsize, UB_SOFT)) ++ goto out_err; ++ ++ d_bc->d_ub = ub; ++ return 0; ++ ++out_err: ++ put_beancounter(ub); ++ d_bc->d_ub = NULL; ++ return -ENOMEM; ++} ++ ++/* ++ * no locks ++ */ ++int ub_dentry_alloc(struct dentry *dentry) ++{ ++ int err; ++ struct dentry_beancounter *d_bc; ++ ++ err = d_alloc_beancounter(dentry); ++ if (err < 0) ++ return err; ++ ++ d_bc = &dentry->dentry_bc; ++ d_bc->d_ub = get_beancounter(get_exec_ub()); ++ atomic_set(&d_bc->d_inuse, INUSE_INIT); /* see comment in ub_dcache.h */ ++ d_bc->d_ubsize = d_charge_size(dentry); ++ ++ err = 0; ++ if (d_bc->d_ub != NULL && ++ charge_dcache(d_bc->d_ub, d_bc->d_ubsize, UB_HARD)) { ++ put_beancounter(d_bc->d_ub); ++ d_free_beancounter(d_bc); ++ err = -ENOMEM; ++ } ++ ++ return err; ++} ++ ++/* ++ * Charge / uncharge functions. ++ * ++ * We take d_lock to protect dentry_bc from concurrent acces ++ * when simultaneous __d_lookup and d_put happens on one dentry. ++ */ ++ ++/* ++ * no dcache_lock, d_lock and rcu_read_lock are held ++ * drops d_lock, rcu_read_lock and returns error if any ++ */ ++int ub_dentry_charge(struct dentry *dentry) ++{ ++ int err; ++ ++ err = 0; ++ if (ub_dget_testone(dentry)) ++ err = d_inc_inuse(dentry); ++ ++ /* ++ * d_lock and rcu_read_lock are dropped here ++ * (see also __d_lookup) ++ */ ++ spin_unlock(&dentry->d_lock); ++ rcu_read_unlock(); ++ ++ if (!err) ++ return 0; ++ ++ /* ++ * d_invlaidate is required for real_lookup ++ * since it tries to create new dentry on ++ * d_lookup failure. 
++ */ ++ if (!d_invalidate(dentry)) ++ return err; ++ ++ /* didn't succeeded, force dentry to be charged */ ++ d_forced_charge(&dentry->dentry_bc); ++ return 0; ++} ++ ++/* ++ * dcache_lock is held ++ * no d_locks, sequentaly takes and drops from dentry upward ++ */ ++void ub_dentry_uncharge(struct dentry *dentry) ++{ ++ struct dentry *parent; ++ ++ /* go up until status is changed and root is not reached */ ++ while (1) { ++ /* ++ * We need d_lock here to handle ++ * the race with ub_dentry_charge ++ */ ++ spin_lock(&dentry->d_lock); ++ if (!ub_dput_testzero(dentry)) { ++ spin_unlock(&dentry->d_lock); ++ break; ++ } ++ ++ /* state transition 0 => -1 */ ++ d_uncharge(&dentry->dentry_bc); ++ parent = dentry->d_parent; ++ spin_unlock(&dentry->d_lock); ++ ++ /* ++ * dcache_lock is held (see comment in __dget_locked) ++ * so we can safely move upwards. ++ */ ++ if (dentry == parent) ++ break; ++ dentry = parent; ++ } ++} ++ ++/* ++ * forced version. for dget in clean cache, when error is not an option ++ * ++ * dcache_lock is held ++ * no d_locks ++ */ ++void ub_dentry_charge_nofail(struct dentry *dentry) ++{ ++ struct dentry *parent; ++ ++ /* go up until status is changed and root is not reached */ ++ while (1) { ++ if (!ub_dget_testone(dentry)) ++ break; ++ ++ /* ++ * state transition -1 => 0 ++ * ++ * No need to lock dentry before atomic_inc ++ * like we do in ub_dentry_uncharge. ++ * We can't race with ub_dentry_uncharge due ++ * to dcache_lock. The only possible race with ++ * ub_dentry_charge is OK since they both ++ * do atomic_inc. ++ */ ++ d_forced_charge(&dentry->dentry_bc); ++ /* ++ * dcache_lock is held (see comment in __dget_locked) ++ * so we can safely move upwards. 
++ */ ++ parent = dentry->d_parent; ++ ++ if (dentry == parent) ++ break; ++ dentry = parent; ++ } ++} +diff -upr linux-2.6.16.orig/kernel/ub/ub_mem.c linux-2.6.16-026test015/kernel/ub/ub_mem.c +--- linux-2.6.16.orig/kernel/ub/ub_mem.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/ub/ub_mem.c 2006-07-04 14:41:37.000000000 +0400 +@@ -0,0 +1,384 @@ ++/* ++ * kernel/ub/ub_mem.c ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include <linux/slab.h> ++#include <linux/kmem_cache.h> ++#include <linux/kmem_slab.h> ++#include <linux/highmem.h> ++#include <linux/vmalloc.h> ++#include <linux/mm.h> ++#include <linux/gfp.h> ++#include <linux/swap.h> ++#include <linux/spinlock.h> ++#include <linux/sched.h> ++#include <linux/module.h> ++#include <ub/beancounter.h> ++#include <ub/ub_mem.h> ++#include <ub/ub_hash.h> ++ ++/* ++ * Initialization ++ */ ++ ++/* ++ * Slab accounting ++ */ ++ ++#ifdef CONFIG_UBC_DEBUG_KMEM ++ ++#define CC_HASH_SIZE 1024 ++static struct ub_cache_counter *cc_hash[CC_HASH_SIZE]; ++spinlock_t cc_lock; ++ ++static void __free_cache_counters(struct user_beancounter *ub, ++ kmem_cache_t *cachep) ++{ ++ struct ub_cache_counter *cc, **pprev, *del; ++ int i; ++ unsigned long flags; ++ ++ del = NULL; ++ spin_lock_irqsave(&cc_lock, flags); ++ for (i = 0; i < CC_HASH_SIZE; i++) { ++ pprev = &cc_hash[i]; ++ cc = cc_hash[i]; ++ while (cc != NULL) { ++ if (cc->ub != ub && cc->cachep != cachep) { ++ pprev = &cc->next; ++ cc = cc->next; ++ continue; ++ } ++ ++ list_del(&cc->ulist); ++ *pprev = cc->next; ++ cc->next = del; ++ del = cc; ++ cc = *pprev; ++ } ++ } ++ spin_unlock_irqrestore(&cc_lock, flags); ++ ++ while (del != NULL) { ++ cc = del->next; ++ kfree(del); ++ del = cc; ++ } ++} ++ ++void ub_free_counters(struct user_beancounter *ub) ++{ ++ __free_cache_counters(ub, NULL); ++} ++ ++void ub_kmemcache_free(kmem_cache_t *cachep) ++{ ++ 
__free_cache_counters(NULL, cachep); ++} ++ ++void __init init_cache_counters(void) ++{ ++ memset(cc_hash, 0, CC_HASH_SIZE * sizeof(cc_hash[0])); ++ spin_lock_init(&cc_lock); ++} ++ ++#define cc_hash_fun(ub, cachep) ( \ ++ (((unsigned long)(ub) >> L1_CACHE_SHIFT) ^ \ ++ ((unsigned long)(ub) >> (BITS_PER_LONG / 2)) ^ \ ++ ((unsigned long)(cachep) >> L1_CACHE_SHIFT) ^ \ ++ ((unsigned long)(cachep) >> (BITS_PER_LONG / 2)) \ ++ ) & (CC_HASH_SIZE - 1)) ++ ++static int change_slab_charged(struct user_beancounter *ub, void *objp, ++ unsigned long val, int mask) ++{ ++ struct ub_cache_counter *cc, *new_cnt, **pprev; ++ kmem_cache_t *cachep; ++ unsigned long flags; ++ ++ cachep = virt_to_cache(objp); ++ new_cnt = NULL; ++ ++again: ++ spin_lock_irqsave(&cc_lock, flags); ++ cc = cc_hash[cc_hash_fun(ub, cachep)]; ++ while (cc) { ++ if (cc->ub == ub && cc->cachep == cachep) ++ goto found; ++ cc = cc->next; ++ } ++ ++ if (new_cnt != NULL) ++ goto insert; ++ ++ spin_unlock_irqrestore(&cc_lock, flags); ++ ++ new_cnt = kmalloc(sizeof(*new_cnt), mask & ~__GFP_UBC); ++ if (new_cnt == NULL) ++ return -ENOMEM; ++ ++ new_cnt->counter = 0; ++ new_cnt->ub = ub; ++ new_cnt->cachep = cachep; ++ goto again; ++ ++insert: ++ pprev = &cc_hash[cc_hash_fun(ub, cachep)]; ++ new_cnt->next = *pprev; ++ *pprev = new_cnt; ++ list_add(&new_cnt->ulist, &ub->ub_cclist); ++ cc = new_cnt; ++ new_cnt = NULL; ++ ++found: ++ cc->counter += val; ++ spin_unlock_irqrestore(&cc_lock, flags); ++ if (new_cnt) ++ kfree(new_cnt); ++ return 0; ++} ++ ++static inline int inc_slab_charged(struct user_beancounter *ub, ++ void *objp, int mask) ++{ ++ return change_slab_charged(ub, objp, 1, mask); ++} ++ ++static inline void dec_slab_charged(struct user_beancounter *ub, void *objp) ++{ ++ if (change_slab_charged(ub, objp, -1, 0) < 0) ++ BUG(); ++} ++ ++#include <linux/vmalloc.h> ++ ++static inline int inc_pages_charged(struct user_beancounter *ub, ++ struct page *pg, int order) ++{ ++ int cpu; ++ ++ cpu = get_cpu(); ++ 
ub->ub_stat[cpu].pages_charged += (1 << order); ++ put_cpu(); ++ return 0; ++} ++ ++static inline void dec_pages_charged(struct user_beancounter *ub, ++ struct page *pg, int order) ++{ ++ int cpu; ++ ++ cpu = get_cpu(); ++ ub->ub_stat[cpu].pages_charged -= (1 << order); ++ put_cpu(); ++} ++ ++void inc_vmalloc_charged(struct vm_struct *vm, int flags) ++{ ++ int cpu; ++ struct user_beancounter *ub; ++ ++ if (!(flags & __GFP_UBC)) ++ return; ++ ++ ub = get_exec_ub(); ++ if (ub == NULL) ++ return; ++ ++ cpu = get_cpu(); ++ ub->ub_stat[cpu].vmalloc_charged += vm->nr_pages; ++ put_cpu(); ++} ++ ++void dec_vmalloc_charged(struct vm_struct *vm) ++{ ++ int cpu; ++ struct user_beancounter *ub; ++ ++ ub = page_ub(vm->pages[0]); ++ if (ub == NULL) ++ return; ++ ++ cpu = get_cpu(); ++ ub->ub_stat[cpu].vmalloc_charged -= vm->nr_pages; ++ put_cpu(); ++} ++ ++#else ++#define inc_slab_charged(ub, o, m) (0) ++#define dec_slab_charged(ub, o) do { } while (0) ++#define inc_pages_charged(ub, pg, o) (0) ++#define dec_pages_charged(ub, pg, o) do { } while (0) ++#endif ++ ++static inline struct user_beancounter **slab_ub_ref(void *objp) ++{ ++ kmem_cache_t *cachep; ++ struct slab *slabp; ++ int objnr; ++ ++ cachep = virt_to_cache(objp); ++ BUG_ON(!(cachep->flags & SLAB_UBC)); ++ slabp = virt_to_slab(objp); ++ objnr = (objp - slabp->s_mem) / cachep->buffer_size; ++ return slab_ubcs(cachep, slabp) + objnr; ++} ++ ++struct user_beancounter *slab_ub(void *objp) ++{ ++ struct user_beancounter **ub_ref; ++ ++ ub_ref = slab_ub_ref(objp); ++ return *ub_ref; ++} ++ ++EXPORT_SYMBOL(slab_ub); ++ ++static inline int should_charge(void *objp, int flags) ++{ ++ kmem_cache_t *cachep; ++ ++ cachep = virt_to_cache(objp); ++ if (!(cachep->flags & SLAB_UBC)) ++ return 0; ++ if ((cachep->flags & SLAB_NO_CHARGE) && !(flags & __GFP_UBC)) ++ return 0; ++ return 1; ++} ++ ++#define should_uncharge(objp) should_charge(objp, __GFP_UBC) ++ ++int ub_slab_charge(void *objp, int flags) ++{ ++ unsigned int size; ++ 
struct user_beancounter *ub; ++ ++ if (!should_charge(objp, flags)) ++ return 0; ++ ++ ub = get_beancounter(get_exec_ub()); ++ if (ub == NULL) ++ return 0; ++ ++ size = CHARGE_SIZE(kmem_obj_memusage(objp)); ++ if (charge_beancounter(ub, UB_KMEMSIZE, size, ++ (flags & __GFP_SOFT_UBC ? UB_SOFT : UB_HARD))) ++ goto out_err; ++ ++ if (inc_slab_charged(ub, objp, flags) < 0) { ++ uncharge_beancounter(ub, UB_KMEMSIZE, size); ++ goto out_err; ++ } ++ *slab_ub_ref(objp) = ub; ++ return 0; ++ ++out_err: ++ put_beancounter(ub); ++ return -ENOMEM; ++} ++ ++void ub_slab_uncharge(void *objp) ++{ ++ unsigned int size; ++ struct user_beancounter **ub_ref; ++ ++ if (!should_uncharge(objp)) ++ return; ++ ++ ub_ref = slab_ub_ref(objp); ++ if (*ub_ref == NULL) ++ return; ++ ++ dec_slab_charged(*ub_ref, objp); ++ size = CHARGE_SIZE(kmem_obj_memusage(objp)); ++ uncharge_beancounter(*ub_ref, UB_KMEMSIZE, size); ++ put_beancounter(*ub_ref); ++ *ub_ref = NULL; ++} ++ ++/* ++ * Pages accounting ++ */ ++ ++inline int ub_page_charge(struct page *page, int order, int mask) ++{ ++ struct user_beancounter *ub; ++ ++ ub = NULL; ++ if (!(mask & __GFP_UBC)) ++ goto out; ++ ++ ub = get_beancounter(get_exec_ub()); ++ if (ub == NULL) ++ goto out; ++ ++ if (charge_beancounter(ub, UB_KMEMSIZE, CHARGE_ORDER(order), ++ (mask & __GFP_SOFT_UBC ? 
UB_SOFT : UB_HARD))) ++ goto err; ++ if (inc_pages_charged(ub, page, order) < 0) { ++ uncharge_beancounter(ub, UB_KMEMSIZE, CHARGE_ORDER(order)); ++ goto err; ++ } ++out: ++ BUG_ON(page_ub(page) != NULL); ++ page_ub(page) = ub; ++ return 0; ++ ++err: ++ BUG_ON(page_ub(page) != NULL); ++ put_beancounter(ub); ++ return -ENOMEM; ++} ++ ++inline void ub_page_uncharge(struct page *page, int order) ++{ ++ struct user_beancounter *ub; ++ ++ ub = page_ub(page); ++ if (ub == NULL) ++ return; ++ ++ dec_pages_charged(ub, page, order); ++ BUG_ON(ub->ub_magic != UB_MAGIC); ++ uncharge_beancounter(ub, UB_KMEMSIZE, CHARGE_ORDER(order)); ++ put_beancounter(ub); ++ page_ub(page) = NULL; ++} ++ ++/* ++ * takes init_mm.page_table_lock ++ * some outer lock to protect pages from vmalloced area must be held ++ */ ++struct user_beancounter *vmalloc_ub(void *obj) ++{ ++ struct page *pg; ++ ++ pg = vmalloc_to_page(obj); ++ if (pg == NULL) ++ return NULL; ++ ++ return page_ub(pg); ++} ++ ++EXPORT_SYMBOL(vmalloc_ub); ++ ++struct user_beancounter *mem_ub(void *obj) ++{ ++ struct user_beancounter *ub; ++ ++ if ((unsigned long)obj >= VMALLOC_START && ++ (unsigned long)obj < VMALLOC_END) ++ ub = vmalloc_ub(obj); ++ else ++ ub = slab_ub(obj); ++ ++ return ub; ++} ++ ++EXPORT_SYMBOL(mem_ub); +diff -upr linux-2.6.16.orig/kernel/ub/ub_misc.c linux-2.6.16-026test015/kernel/ub/ub_misc.c +--- linux-2.6.16.orig/kernel/ub/ub_misc.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/ub/ub_misc.c 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,244 @@ ++/* ++ * kernel/ub/ub_misc.c ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#include <linux/tty.h> ++#include <linux/tty_driver.h> ++#include <linux/signal.h> ++#include <linux/slab.h> ++#include <linux/fs.h> ++#include <linux/sched.h> ++#include <linux/kmem_cache.h> ++#include <linux/module.h> ++ ++#include <ub/beancounter.h> ++#include <ub/ub_mem.h> ++ ++/* ++ * Task staff ++ */ ++ ++static void init_task_sub(struct task_struct *tsk, ++ struct task_beancounter *old_bc) ++{ ++ struct task_beancounter *new_bc; ++ struct user_beancounter *sub; ++ ++ new_bc = &tsk->task_bc; ++ sub = old_bc->fork_sub; ++ new_bc->fork_sub = get_beancounter(sub); ++ new_bc->task_fnode = NULL; ++ new_bc->task_freserv = old_bc->task_freserv; ++ old_bc->task_freserv = NULL; ++ memset(&new_bc->task_data, 0, sizeof(new_bc->task_data)); ++} ++ ++int ub_task_charge(struct task_struct *parent, struct task_struct *task) ++{ ++ struct task_beancounter *old_bc; ++ struct task_beancounter *new_bc; ++ struct user_beancounter *ub; ++ ++ old_bc = &parent->task_bc; ++#if 0 ++ if (old_bc->exec_ub == NULL) { ++ /* FIXME: this won't work if task_bc is outside task_struct */ ++ init_task_sub(task, old_bc); ++ return 0; ++ } ++#endif ++ ub = old_bc->fork_sub; ++ ++ if (charge_beancounter(ub, UB_NUMPROC, 1, UB_HARD) < 0) ++ return -ENOMEM; ++ ++ new_bc = &task->task_bc; ++ new_bc->task_ub = get_beancounter(ub); ++ new_bc->exec_ub = get_beancounter(ub); ++ init_task_sub(task, old_bc); ++ return 0; ++} ++ ++void ub_task_uncharge(struct task_struct *task) ++{ ++ struct task_beancounter *task_bc; ++ ++ task_bc = &task->task_bc; ++ if (task_bc->task_ub != NULL) ++ uncharge_beancounter(task_bc->task_ub, UB_NUMPROC, 1); ++ ++ put_beancounter(task_bc->exec_ub); ++ put_beancounter(task_bc->task_ub); ++ put_beancounter(task_bc->fork_sub); ++ /* can't be freed elsewhere, failures possible in the middle of fork */ ++ if (task_bc->task_freserv != NULL) ++ kfree(task_bc->task_freserv); ++ ++ task_bc->exec_ub = (struct user_beancounter *)0xdeadbcbc; ++} ++ ++/* ++ * Files and file 
locks. ++ */ ++ ++int ub_file_charge(struct file *f) ++{ ++ struct user_beancounter *ub; ++ ++ /* No need to get_beancounter here since it's already got in slab */ ++ ub = slab_ub(f); ++ if (ub == NULL) ++ return 0; ++ ++ return charge_beancounter(ub, UB_NUMFILE, 1, UB_HARD); ++} ++ ++void ub_file_uncharge(struct file *f) ++{ ++ struct user_beancounter *ub; ++ ++ /* Ub will be put in slab */ ++ ub = slab_ub(f); ++ if (ub == NULL) ++ return; ++ ++ uncharge_beancounter(ub, UB_NUMFILE, 1); ++} ++ ++int ub_flock_charge(struct file_lock *fl, int hard) ++{ ++ struct user_beancounter *ub; ++ int err; ++ ++ /* No need to get_beancounter here since it's already got in slab */ ++ ub = slab_ub(fl); ++ if (ub == NULL) ++ return 0; ++ ++ err = charge_beancounter(ub, UB_NUMFLOCK, 1, hard ? UB_HARD : UB_SOFT); ++ if (!err) ++ fl->fl_charged = 1; ++ return err; ++} ++ ++void ub_flock_uncharge(struct file_lock *fl) ++{ ++ struct user_beancounter *ub; ++ ++ /* Ub will be put in slab */ ++ ub = slab_ub(fl); ++ if (ub == NULL || !fl->fl_charged) ++ return; ++ ++ uncharge_beancounter(ub, UB_NUMFLOCK, 1); ++ fl->fl_charged = 0; ++} ++ ++/* ++ * Signal handling ++ */ ++ ++static int do_ub_siginfo_charge(struct user_beancounter *ub, ++ unsigned long size) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ if (__charge_beancounter_locked(ub, UB_KMEMSIZE, size, UB_HARD)) ++ goto out_kmem; ++ ++ if (__charge_beancounter_locked(ub, UB_NUMSIGINFO, 1, UB_HARD)) ++ goto out_num; ++ ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ return 0; ++ ++out_num: ++ __uncharge_beancounter_locked(ub, UB_KMEMSIZE, size); ++out_kmem: ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ return -ENOMEM; ++} ++ ++static void do_ub_siginfo_uncharge(struct user_beancounter *ub, ++ unsigned long size) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ __uncharge_beancounter_locked(ub, UB_KMEMSIZE, size); ++ __uncharge_beancounter_locked(ub, UB_NUMSIGINFO, 1); ++ 
spin_unlock_irqrestore(&ub->ub_lock, flags); ++} ++ ++int ub_siginfo_charge(struct sigqueue *sq, struct user_beancounter *ub) ++{ ++ unsigned long size; ++ struct user_beancounter *p, *q; ++ ++ size = CHARGE_SIZE(kmem_obj_memusage(sq)); ++ for (p = ub; p != NULL; p = p->parent) { ++ if (do_ub_siginfo_charge(p, size)) ++ goto unroll; ++ } ++ ++ sq->sig_ub = get_beancounter(ub); ++ return 0; ++ ++unroll: ++ for (q = ub; q != p; q = q->parent) ++ do_ub_siginfo_uncharge(q, size); ++ return -ENOMEM; ++} ++EXPORT_SYMBOL(ub_siginfo_charge); ++ ++void ub_siginfo_uncharge(struct sigqueue *sq) ++{ ++ unsigned long size; ++ struct user_beancounter *ub, *p; ++ ++ p = ub = sq->sig_ub; ++ sq->sig_ub = NULL; ++ size = CHARGE_SIZE(kmem_obj_memusage(sq)); ++ for (; ub != NULL; ub = ub->parent) ++ do_ub_siginfo_uncharge(ub, size); ++ put_beancounter(p); ++} ++ ++/* ++ * PTYs ++ */ ++ ++int ub_pty_charge(struct tty_struct *tty) ++{ ++ struct user_beancounter *ub; ++ int retval; ++ ++ ub = slab_ub(tty); ++ retval = 0; ++ if (ub && tty->driver->subtype == PTY_TYPE_MASTER && ++ !test_bit(TTY_CHARGED, &tty->flags)) { ++ retval = charge_beancounter(ub, UB_NUMPTY, 1, UB_HARD); ++ if (!retval) ++ set_bit(TTY_CHARGED, &tty->flags); ++ } ++ return retval; ++} ++ ++void ub_pty_uncharge(struct tty_struct *tty) ++{ ++ struct user_beancounter *ub; ++ ++ ub = slab_ub(tty); ++ if (ub && tty->driver->subtype == PTY_TYPE_MASTER && ++ test_bit(TTY_CHARGED, &tty->flags)) { ++ uncharge_beancounter(ub, UB_NUMPTY, 1); ++ clear_bit(TTY_CHARGED, &tty->flags); ++ } ++} +diff -upr linux-2.6.16.orig/kernel/ub/ub_net.c linux-2.6.16-026test015/kernel/ub/ub_net.c +--- linux-2.6.16.orig/kernel/ub/ub_net.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/ub/ub_net.c 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,1044 @@ ++/* ++ * linux/kernel/ub/ub_net.c ++ * ++ * Copyright (C) 1998-2004 Andrey V. Savochkin <saw@saw.sw.com.sg> ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. 
++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ * TODO: ++ * - sizeof(struct inode) charge ++ * = tcp_mem_schedule() feedback based on ub limits ++ * + measures so that one socket won't exhaust all send buffers, ++ * see bug in bugzilla ++ * = sk->socket check for NULL in snd_wakeups ++ * (tcp_write_space checks for NULL itself) ++ * + in tcp_close(), orphaned socket abortion should be based on ubc ++ * resources (same in tcp_out_of_resources) ++ * Beancounter should also have separate orphaned socket counter... ++ * + for rcv, in-order segment should be accepted ++ * if only barrier is exceeded ++ * = tcp_rmem_schedule() feedback based on ub limits ++ * - repair forward_alloc mechanism for receive buffers ++ * It's idea is that some buffer space is pre-charged so that receive fast ++ * path doesn't need to take spinlocks and do other heavy stuff ++ * + tcp_prune_queue actions based on ub limits ++ * + window adjustments depending on available buffers for receive ++ * - window adjustments depending on available buffers for send ++ * + race around usewreserv ++ * + avoid allocating new page for each tiny-gram, see letter from ANK ++ * + rename ub_sock_lock ++ * + sk->sleep wait queue probably can be used for all wakeups, and ++ * sk->ub_wait is unnecessary ++ * + for UNIX sockets, the current algorithm will lead to ++ * UB_UNIX_MINBUF-sized messages only for non-blocking case ++ * - charge for af_packet sockets ++ * + all datagram sockets should be charged to NUMUNIXSOCK ++ * - we do not charge for skb copies and clones staying in device queues ++ * + live-lock if number of sockets is big and buffer limits are small ++ * [diff-ubc-dbllim3] ++ * - check that multiple readers/writers on the same socket won't cause fatal ++ * consequences ++ * - check allocation/charge orders ++ * + There is potential problem with callback_lock. In *snd_wakeup we take ++ * beancounter first, in sock_def_error_report - callback_lock first. ++ * then beancounter. 
This is not a problem if callback_lock taken ++ * readonly, but anyway... ++ * - SKB_CHARGE_SIZE doesn't include the space wasted by slab allocator ++ * General kernel problems: ++ * - in tcp_sendmsg(), if allocation fails, non-blocking sockets with ASYNC ++ * notification won't get signals ++ * - datagram_poll looks racy ++ * ++ */ ++ ++#include <linux/net.h> ++#include <linux/slab.h> ++#include <linux/kmem_cache.h> ++#include <linux/gfp.h> ++#include <linux/err.h> ++#include <linux/socket.h> ++#include <linux/module.h> ++#include <linux/sched.h> ++ ++#include <net/sock.h> ++ ++#include <ub/beancounter.h> ++#include <ub/ub_net.h> ++#include <ub/ub_debug.h> ++ ++ ++/* Skb truesize definition. Bad place. Den */ ++ ++static inline int skb_chargesize_head(struct sk_buff *skb) ++{ ++ return skb_charge_size(skb->end - skb->head + ++ sizeof(struct skb_shared_info)); ++} ++ ++int skb_charge_fullsize(struct sk_buff *skb) ++{ ++ int chargesize; ++ struct sk_buff *skbfrag; ++ ++ chargesize = skb_chargesize_head(skb) + ++ PAGE_SIZE * skb_shinfo(skb)->nr_frags; ++ if (likely(skb_shinfo(skb)->frag_list == NULL)) ++ return chargesize; ++ for (skbfrag = skb_shinfo(skb)->frag_list; ++ skbfrag != NULL; ++ skbfrag = skbfrag->next) { ++ chargesize += skb_charge_fullsize(skbfrag); ++ } ++ return chargesize; ++} ++EXPORT_SYMBOL(skb_charge_fullsize); ++ ++static int ub_sock_makewreserv_locked(struct sock *sk, ++ int bufid, int sockid, unsigned long size); ++ ++int __ub_too_many_orphans(struct sock *sk, int count) ++{ ++ struct user_beancounter *ub; ++ ++ if (sock_has_ubc(sk)) { ++ for (ub = sock_bc(sk)->ub; ub->parent != NULL; ub = ub->parent); ++ if (count >= ub->ub_parms[UB_NUMTCPSOCK].barrier >> 2) ++ return 1; ++ } ++ return 0; ++} ++ ++/* ++ * Queueing ++ */ ++ ++static void ub_sock_snd_wakeup(struct user_beancounter *ub) ++{ ++ struct list_head *p; ++ struct sock_beancounter *skbc; ++ struct sock *sk; ++ struct user_beancounter *cub; ++ unsigned long added; ++ ++ while 
(!list_empty(&ub->ub_other_sk_list)) { ++ p = ub->ub_other_sk_list.next; ++ skbc = list_entry(p, struct sock_beancounter, ub_sock_list); ++ sk = skbc_sock(skbc); ++ ub_debug(UBD_NET_SLEEP, "Found sock to wake up\n"); ++ added = -skbc->poll_reserv; ++ if (ub_sock_makewreserv_locked(sk, UB_OTHERSOCKBUF, ++ UB_NUMOTHERSOCK, skbc->ub_waitspc)) ++ break; ++ added += skbc->poll_reserv; ++ ++ /* ++ * See comments in ub_tcp_snd_wakeup. ++ * Locking note: both unix_write_space and ++ * sock_def_write_space take callback_lock themselves. ++ * We take it here just to be on the safe side and to ++ * act the same way as ub_tcp_snd_wakeup does. ++ */ ++ sk->sk_write_space(sk); ++ ++ list_del_init(&skbc->ub_sock_list); ++ ++ if (skbc->ub != ub && added) { ++ cub = get_beancounter(skbc->ub); ++ spin_unlock(&ub->ub_lock); ++ charge_beancounter_notop(cub, UB_OTHERSOCKBUF, added); ++ put_beancounter(cub); ++ spin_lock(&ub->ub_lock); ++ } ++ } ++} ++ ++static void ub_tcp_snd_wakeup(struct user_beancounter *ub) ++{ ++ struct list_head *p; ++ struct sock *sk; ++ struct sock_beancounter *skbc; ++ struct socket *sock; ++ struct user_beancounter *cub; ++ unsigned long added; ++ ++ while (!list_empty(&ub->ub_tcp_sk_list)) { ++ p = ub->ub_tcp_sk_list.next; ++ skbc = list_entry(p, struct sock_beancounter, ub_sock_list); ++ sk = skbc_sock(skbc); ++ ++ added = 0; ++ sock = sk->sk_socket; ++ if (sock == NULL) ++ /* sk being destroyed */ ++ goto cont; ++ ++ ub_debug(UBD_NET_SLEEP, ++ "Checking queue, waiting %lu, reserv %lu\n", ++ skbc->ub_waitspc, skbc->poll_reserv); ++ added = -skbc->poll_reserv; ++ if (ub_sock_makewreserv_locked(sk, UB_TCPSNDBUF, ++ UB_NUMTCPSOCK, skbc->ub_waitspc)) ++ break; ++ added += skbc->poll_reserv; ++ ++ /* ++ * Send async notifications and wake up. ++ * Locking note: we get callback_lock here because ++ * tcp_write_space is over-optimistic about calling context ++ * (socket lock is presumed). So we get the lock here although ++ * it belongs to the callback. 
++ */ ++ sk->sk_write_space(sk); ++ ++cont: ++ list_del_init(&skbc->ub_sock_list); ++ ++ if (skbc->ub != ub && added) { ++ cub = get_beancounter(skbc->ub); ++ spin_unlock(&ub->ub_lock); ++ charge_beancounter_notop(cub, UB_TCPSNDBUF, added); ++ put_beancounter(cub); ++ spin_lock(&ub->ub_lock); ++ } ++ } ++} ++ ++void ub_sock_snd_queue_add(struct sock *sk, int res, unsigned long size) ++{ ++ unsigned long flags; ++ struct sock_beancounter *skbc; ++ struct user_beancounter *ub; ++ unsigned long added_reserv; ++ ++ if (!sock_has_ubc(sk)) ++ return; ++ ++ skbc = sock_bc(sk); ++ for (ub = skbc->ub; ub->parent != NULL; ub = ub->parent); ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ ub_debug(UBD_NET_SLEEP, "attempt to charge for %lu\n", size); ++ added_reserv = -skbc->poll_reserv; ++ if (!ub_sock_makewreserv_locked(sk, res, bid2sid(res), size)) { ++ /* ++ * It looks a bit hackish, but it is compatible with both ++ * wait_for_xx_ubspace and poll. ++ * This __set_current_state is equivalent to a wakeup event ++ * right after spin_unlock_irqrestore. 
++ */ ++ __set_current_state(TASK_RUNNING); ++ added_reserv += skbc->poll_reserv; ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ if (added_reserv) ++ charge_beancounter_notop(skbc->ub, res, added_reserv); ++ return; ++ } ++ ++ ub_debug(UBD_NET_SLEEP, "Adding sk to queue\n"); ++ skbc->ub_waitspc = size; ++ if (!list_empty(&skbc->ub_sock_list)) { ++ ub_debug(UBD_NET_SOCKET, ++ "re-adding socket to beancounter %p.\n", ub); ++ goto out; ++ } ++ ++ switch (res) { ++ case UB_TCPSNDBUF: ++ list_add_tail(&skbc->ub_sock_list, ++ &ub->ub_tcp_sk_list); ++ break; ++ case UB_OTHERSOCKBUF: ++ list_add_tail(&skbc->ub_sock_list, ++ &ub->ub_other_sk_list); ++ break; ++ default: ++ BUG(); ++ } ++out: ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++} ++ ++ ++/* ++ * Helpers ++ */ ++ ++void ub_skb_set_charge(struct sk_buff *skb, struct sock *sk, ++ unsigned long size, int resource) ++{ ++ if (!sock_has_ubc(sk)) ++ return; ++ ++ if (sock_bc(sk)->ub == NULL) ++ BUG(); ++ skb_bc(skb)->ub = sock_bc(sk)->ub; ++ skb_bc(skb)->charged = size; ++ skb_bc(skb)->resource = resource; ++ ++ /* Ugly. Ugly. 
Skb in sk writequeue can live without ref to sk */ ++ if (skb->sk == NULL) ++ skb->sk = sk; ++} ++ ++static inline void ub_skb_set_uncharge(struct sk_buff *skb) ++{ ++ skb_bc(skb)->ub = NULL; ++ skb_bc(skb)->charged = 0; ++ skb_bc(skb)->resource = 0; ++} ++ ++static inline void __uncharge_sockbuf(struct sock_beancounter *skbc, ++ struct user_beancounter *ub, int resource, unsigned long size) ++{ ++ if (ub != NULL) ++ __uncharge_beancounter_locked(ub, resource, size); ++ ++ if (skbc != NULL) { ++ if (skbc->ub_wcharged > size) ++ skbc->ub_wcharged -= size; ++ else ++ skbc->ub_wcharged = 0; ++ } ++} ++ ++static void ub_update_rmem_thres(struct sock_beancounter *skub) ++{ ++ struct user_beancounter *ub; ++ ++ if (skub && skub->ub) { ++ for (ub = skub->ub; ub->parent != NULL; ub = ub->parent); ++ ub->ub_rmem_thres = ub->ub_parms[UB_TCPRCVBUF].barrier / ++ (ub->ub_parms[UB_NUMTCPSOCK].held + 1); ++ } ++} ++inline int ub_skb_alloc_bc(struct sk_buff *skb, int gfp_mask) ++{ ++ memset(skb_bc(skb), 0, sizeof(struct skb_beancounter)); ++ return 0; ++} ++ ++inline void ub_skb_free_bc(struct sk_buff *skb) ++{ ++} ++ ++ ++/* ++ * Charge socket number ++ */ ++ ++static inline int sk_alloc_beancounter(struct sock *sk) ++{ ++ struct sock_beancounter *skbc; ++ ++ skbc = sock_bc(sk); ++ memset(skbc, 0, sizeof(struct sock_beancounter)); ++ return 0; ++} ++ ++static inline void sk_free_beancounter(struct sock *sk) ++{ ++} ++ ++static int __sock_charge(struct sock *sk, int res) ++{ ++ struct sock_beancounter *skbc; ++ struct user_beancounter *ub; ++ ++ ub = get_exec_ub(); ++ if (ub == NULL) ++ return 0; ++ if (sk_alloc_beancounter(sk) < 0) ++ return -ENOMEM; ++ ++ skbc = sock_bc(sk); ++ INIT_LIST_HEAD(&skbc->ub_sock_list); ++ ++ if (charge_beancounter(ub, res, 1, UB_HARD) < 0) ++ goto out_limit; ++ ++ /* TCP listen sock or process keeps referrence to UB */ ++ skbc->ub = get_beancounter(ub); ++ return 0; ++ ++out_limit: ++ sk_free_beancounter(sk); ++ return -ENOMEM; ++} ++ ++int 
ub_tcp_sock_charge(struct sock *sk) ++{ ++ int ret; ++ ++ ret = __sock_charge(sk, UB_NUMTCPSOCK); ++ ub_update_rmem_thres(sock_bc(sk)); ++ ++ return ret; ++} ++ ++int ub_other_sock_charge(struct sock *sk) ++{ ++ return __sock_charge(sk, UB_NUMOTHERSOCK); ++} ++ ++EXPORT_SYMBOL(ub_other_sock_charge); ++ ++int ub_sock_charge(struct sock *sk, int family, int type) ++{ ++ return (IS_TCP_SOCK(family, type) ? ++ ub_tcp_sock_charge(sk) : ub_other_sock_charge(sk)); ++} ++EXPORT_SYMBOL(ub_sock_charge); ++ ++/* ++ * Uncharge socket number ++ */ ++ ++void ub_sock_uncharge(struct sock *sk) ++{ ++ int is_tcp_sock; ++ unsigned long flags; ++ struct sock_beancounter *skbc; ++ struct user_beancounter *ub; ++ unsigned long reserv; ++ ++ if (!sock_has_ubc(sk)) ++ return; ++ ++ is_tcp_sock = IS_TCP_SOCK(sk->sk_family, sk->sk_type); ++ skbc = sock_bc(sk); ++ ub_debug(UBD_NET_SOCKET, "Calling ub_sock_uncharge on %p\n", sk); ++ ++ for (ub = skbc->ub; ub->parent != NULL; ub = ub->parent); ++ ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ if (!list_empty(&skbc->ub_sock_list)) { ++ ub_debug(UBD_NET_SOCKET, ++ "ub_sock_uncharge: removing from ub(%p) queue.\n", ++ skbc); ++ list_del_init(&skbc->ub_sock_list); ++ } ++ ++ reserv = skbc->poll_reserv; ++ __uncharge_beancounter_locked(ub, ++ (is_tcp_sock ? UB_TCPSNDBUF : UB_OTHERSOCKBUF), ++ reserv); ++ __uncharge_beancounter_locked(ub, ++ (is_tcp_sock ? UB_NUMTCPSOCK : UB_NUMOTHERSOCK), 1); ++ ++ /* The check sk->sk_family != PF_NETLINK is made as the skb is ++ * queued to the kernel end of socket while changed to the user one. ++ * Den */ ++ if (skbc->ub_wcharged > reserv && ++ sk->sk_family != PF_NETLINK) { ++ skbc->ub_wcharged -= reserv; ++ printk(KERN_WARNING ++ "ub_sock_uncharge: wch=%lu for ub %p (%d).\n", ++ skbc->ub_wcharged, skbc->ub, skbc->ub->ub_uid); ++ } else ++ skbc->ub_wcharged = 0; ++ skbc->poll_reserv = 0; ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ ++ uncharge_beancounter_notop(skbc->ub, ++ (is_tcp_sock ? 
UB_TCPSNDBUF : UB_OTHERSOCKBUF), ++ reserv); ++ uncharge_beancounter_notop(skbc->ub, ++ (is_tcp_sock ? UB_NUMTCPSOCK : UB_NUMOTHERSOCK), 1); ++ ++ put_beancounter(skbc->ub); ++ sk_free_beancounter(sk); ++} ++ ++/* ++ * Send - receive buffers ++ */ ++ ++/* Special case for netlink_dump - (un)charges precalculated size */ ++int ub_nlrcvbuf_charge(struct sk_buff *skb, struct sock *sk) ++{ ++ int ret; ++ unsigned long chargesize; ++ ++ if (!sock_has_ubc(sk)) ++ return 0; ++ ++ chargesize = skb_charge_fullsize(skb); ++ ret = charge_beancounter(sock_bc(sk)->ub, ++ UB_DGRAMRCVBUF, chargesize, UB_HARD); ++ if (ret < 0) ++ return ret; ++ ub_skb_set_charge(skb, sk, chargesize, UB_DGRAMRCVBUF); ++ return ret; ++} ++ ++/* ++ * Poll reserv accounting ++ */ ++static int ub_sock_makewreserv_locked(struct sock *sk, ++ int bufid, int sockid, unsigned long size) ++{ ++ unsigned long wcharge_added; ++ struct sock_beancounter *skbc; ++ struct user_beancounter *ub; ++ ++ if (!sock_has_ubc(sk)) ++ goto out; ++ ++ skbc = sock_bc(sk); ++ if (skbc->poll_reserv >= size) /* no work to be done */ ++ goto out; ++ ++ for (ub = skbc->ub; ub->parent != NULL; ub = ub->parent); ++ ub->ub_parms[bufid].held += size - skbc->poll_reserv; ++ ++ wcharge_added = 0; ++ /* ++ * Logic: ++ * 1) when used memory hits barrier, we set wmem_pressure; ++ * wmem_pressure is reset under barrier/2; ++ * between barrier/2 and barrier we limit per-socket buffer growth; ++ * 2) each socket is guaranteed to get (limit-barrier)/maxsockets ++ * calculated on the base of memory eaten after the barrier is hit ++ */ ++ skbc = sock_bc(sk); ++ if (!ub_hfbarrier_hit(ub, bufid)) { ++ if (ub->ub_wmem_pressure) ++ ub_debug(UBD_NET_SEND, "makewres: pressure -> 0 " ++ "sk %p sz %lu pr %lu hd %lu wc %lu sb %d.\n", ++ sk, size, skbc->poll_reserv, ++ ub->ub_parms[bufid].held, ++ skbc->ub_wcharged, sk->sk_sndbuf); ++ ub->ub_wmem_pressure = 0; ++ } ++ if (ub_barrier_hit(ub, bufid)) { ++ if (!ub->ub_wmem_pressure) ++ ub_debug(UBD_NET_SEND, 
"makewres: pressure -> 1 " ++ "sk %p sz %lu pr %lu hd %lu wc %lu sb %d.\n", ++ sk, size, skbc->poll_reserv, ++ ub->ub_parms[bufid].held, ++ skbc->ub_wcharged, sk->sk_sndbuf); ++ ub->ub_wmem_pressure = 1; ++ wcharge_added = size - skbc->poll_reserv; ++ skbc->ub_wcharged += wcharge_added; ++ if (skbc->ub_wcharged * ub->ub_parms[sockid].limit + ++ ub->ub_parms[bufid].barrier > ++ ub->ub_parms[bufid].limit) ++ goto unroll; ++ } ++ if (ub->ub_parms[bufid].held > ub->ub_parms[bufid].limit) ++ goto unroll; ++ ++ ub_adjust_maxheld(ub, bufid); ++ skbc->poll_reserv = size; ++out: ++ return 0; ++ ++unroll: ++ ub_debug(UBD_NET_SEND, ++ "makewres: deny " ++ "sk %p sz %lu pr %lu hd %lu wc %lu sb %d.\n", ++ sk, size, skbc->poll_reserv, ub->ub_parms[bufid].held, ++ skbc->ub_wcharged, sk->sk_sndbuf); ++ skbc->ub_wcharged -= wcharge_added; ++ ub->ub_parms[bufid].failcnt++; ++ ub->ub_parms[bufid].held -= size - skbc->poll_reserv; ++ return -ENOMEM; ++} ++ ++int ub_sock_make_wreserv(struct sock *sk, int bufid, unsigned long size) ++{ ++ struct sock_beancounter *skbc; ++ struct user_beancounter *ub; ++ unsigned long flags; ++ unsigned long added_reserv; ++ int err; ++ ++ skbc = sock_bc(sk); ++ ++ /* ++ * This function provides that there is sufficient reserve upon return ++ * only if sk has only one user. We can check poll_reserv without ++ * serialization and avoid locking if the reserve already exists. 
++ */ ++ if (!sock_has_ubc(sk) || skbc->poll_reserv >= size) ++ return 0; ++ ++ for (ub = skbc->ub; ub->parent != NULL; ub = ub->parent); ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ added_reserv = -skbc->poll_reserv; ++ err = ub_sock_makewreserv_locked(sk, bufid, bid2sid(bufid), size); ++ added_reserv += skbc->poll_reserv; ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ ++ if (added_reserv) ++ charge_beancounter_notop(skbc->ub, bufid, added_reserv); ++ ++ return err; ++} ++ ++int ub_sock_get_wreserv(struct sock *sk, int bufid, unsigned long size) ++{ ++ struct sock_beancounter *skbc; ++ struct user_beancounter *ub; ++ unsigned long flags; ++ unsigned long added_reserv; ++ int err; ++ ++ if (!sock_has_ubc(sk)) ++ return 0; ++ ++ skbc = sock_bc(sk); ++ for (ub = skbc->ub; ub->parent != NULL; ub = ub->parent); ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ added_reserv = -skbc->poll_reserv; ++ err = ub_sock_makewreserv_locked(sk, bufid, bid2sid(bufid), size); ++ added_reserv += skbc->poll_reserv; ++ if (!err) ++ skbc->poll_reserv -= size; ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ ++ if (added_reserv) ++ charge_beancounter_notop(skbc->ub, bufid, added_reserv); ++ ++ return err; ++} ++ ++void ub_sock_ret_wreserv(struct sock *sk, int bufid, ++ unsigned long size, unsigned long ressize) ++{ ++ struct sock_beancounter *skbc; ++ struct user_beancounter *ub; ++ unsigned long extra; ++ unsigned long flags; ++ ++ if (!sock_has_ubc(sk)) ++ return; ++ ++ extra = 0; ++ skbc = sock_bc(sk); ++ for (ub = skbc->ub; ub->parent != NULL; ub = ub->parent); ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ skbc->poll_reserv += size; ++ if (skbc->poll_reserv > ressize) { ++ extra = skbc->poll_reserv - ressize; ++ __uncharge_beancounter_locked(ub, bufid, extra); ++ ++ if (skbc->ub_wcharged > skbc->poll_reserv - ressize) ++ skbc->ub_wcharged -= skbc->poll_reserv - ressize; ++ else ++ skbc->ub_wcharged = 0; ++ skbc->poll_reserv = ressize; ++ } ++ ++ ub_tcp_snd_wakeup(ub); ++ 
spin_unlock_irqrestore(&ub->ub_lock, flags); ++ ++ if (extra) ++ uncharge_beancounter_notop(skbc->ub, bufid, extra); ++} ++ ++long ub_sock_wait_for_space(struct sock *sk, long timeo, unsigned long size) ++{ ++ DECLARE_WAITQUEUE(wait, current); ++ ++ add_wait_queue(sk->sk_sleep, &wait); ++ for (;;) { ++ if (signal_pending(current)) ++ break; ++ set_current_state(TASK_INTERRUPTIBLE); ++ if (!ub_sock_make_wreserv(sk, UB_OTHERSOCKBUF, size)) ++ break; ++ ++ if (sk->sk_shutdown & SEND_SHUTDOWN) ++ break; ++ if (sk->sk_err) ++ break; ++ ub_sock_snd_queue_add(sk, UB_OTHERSOCKBUF, size); ++ timeo = schedule_timeout(timeo); ++ } ++ __set_current_state(TASK_RUNNING); ++ remove_wait_queue(sk->sk_sleep, &wait); ++ return timeo; ++} ++ ++int ub_sock_makewres_other(struct sock *sk, unsigned long size) ++{ ++ return ub_sock_make_wreserv(sk, UB_OTHERSOCKBUF, size); ++} ++ ++int ub_sock_makewres_tcp(struct sock *sk, unsigned long size) ++{ ++ return ub_sock_make_wreserv(sk, UB_TCPSNDBUF, size); ++} ++ ++int ub_sock_getwres_other(struct sock *sk, unsigned long size) ++{ ++ return ub_sock_get_wreserv(sk, UB_OTHERSOCKBUF, size); ++} ++ ++int ub_sock_getwres_tcp(struct sock *sk, unsigned long size) ++{ ++ return ub_sock_get_wreserv(sk, UB_TCPSNDBUF, size); ++} ++ ++void ub_sock_retwres_other(struct sock *sk, unsigned long size, ++ unsigned long ressize) ++{ ++ ub_sock_ret_wreserv(sk, UB_OTHERSOCKBUF, size, ressize); ++} ++ ++void ub_sock_retwres_tcp(struct sock *sk, unsigned long size, ++ unsigned long ressize) ++{ ++ ub_sock_ret_wreserv(sk, UB_TCPSNDBUF, size, ressize); ++} ++ ++void ub_sock_sndqueueadd_other(struct sock *sk, unsigned long sz) ++{ ++ ub_sock_snd_queue_add(sk, UB_OTHERSOCKBUF, sz); ++} ++ ++void ub_sock_sndqueueadd_tcp(struct sock *sk, unsigned long sz) ++{ ++ ub_sock_snd_queue_add(sk, UB_TCPSNDBUF, sz); ++} ++ ++void ub_sock_sndqueuedel(struct sock *sk) ++{ ++ struct sock_beancounter *skbc; ++ unsigned long flags; ++ ++ if (!sock_has_ubc(sk)) ++ return; ++ skbc = 
sock_bc(sk); ++ ++ /* race with write_space callback of other socket */ ++ spin_lock_irqsave(&skbc->ub->ub_lock, flags); ++ list_del_init(&skbc->ub_sock_list); ++ spin_unlock_irqrestore(&skbc->ub->ub_lock, flags); ++} ++ ++/* ++ * UB_DGRAMRCVBUF ++ */ ++ ++int ub_sockrcvbuf_charge(struct sock *sk, struct sk_buff *skb) ++{ ++ unsigned long chargesize; ++ ++ if (!sock_has_ubc(sk)) ++ return 0; ++ ++ chargesize = skb_charge_fullsize(skb); ++ if (charge_beancounter(sock_bc(sk)->ub, UB_DGRAMRCVBUF, ++ chargesize, UB_HARD)) ++ return -ENOMEM; ++ ++ ub_skb_set_charge(skb, sk, chargesize, UB_DGRAMRCVBUF); ++ return 0; ++} ++ ++EXPORT_SYMBOL(ub_sockrcvbuf_charge); ++ ++static void ub_sockrcvbuf_uncharge(struct sk_buff *skb) ++{ ++ uncharge_beancounter(skb_bc(skb)->ub, UB_DGRAMRCVBUF, ++ skb_bc(skb)->charged); ++ ub_skb_set_uncharge(skb); ++} ++ ++/* ++ * UB_TCPRCVBUF ++ */ ++static int charge_tcprcvbuf(struct sock *sk, struct sk_buff *skb, ++ enum severity strict) ++{ ++ int retval; ++ unsigned long flags; ++ struct user_beancounter *ub; ++ unsigned long chargesize; ++ ++ if (!sock_has_ubc(sk)) ++ return 0; ++ ++ /* ++ * Memory pressure reactions: ++ * 1) set UB_RMEM_KEEP (clearing UB_RMEM_EXPAND) ++ * 2) set UB_RMEM_SHRINK and tcp_clamp_window() ++ * tcp_collapse_queues() if rmem_alloc > rcvbuf ++ * 3) drop OFO, tcp_purge_ofo() ++ * 4) drop all. ++ * Currently, we do #2 and #3 at once (which means that current ++ * collapsing of OFO queue in tcp_collapse_queues() is a waste of time, ++ * for example...) ++ * On memory pressure we jump from #0 to #3, and when the pressure ++ * subsides, to #1. 
++ */ ++ retval = 0; ++ chargesize = skb_charge_fullsize(skb); ++ ++ for (ub = sock_bc(sk)->ub; ub->parent != NULL; ub = ub->parent); ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ ub->ub_parms[UB_TCPRCVBUF].held += chargesize; ++ if (ub->ub_parms[UB_TCPRCVBUF].held > ++ ub->ub_parms[UB_TCPRCVBUF].barrier && ++ strict != UB_FORCE) ++ goto excess; ++ ub_adjust_maxheld(ub, UB_TCPRCVBUF); ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ ++out: ++ if (retval == 0) { ++ charge_beancounter_notop(sock_bc(sk)->ub, UB_TCPRCVBUF, ++ chargesize); ++ ub_skb_set_charge(skb, sk, chargesize, UB_TCPRCVBUF); ++ } ++ return retval; ++ ++excess: ++ ub->ub_rmem_pressure = UB_RMEM_SHRINK; ++ if (strict == UB_HARD) ++ retval = -ENOMEM; ++ if (ub->ub_parms[UB_TCPRCVBUF].held > ub->ub_parms[UB_TCPRCVBUF].limit) ++ retval = -ENOMEM; ++ /* ++ * We try to leave numsock*maxadvmss as a reserve for sockets not ++ * queueing any data yet (if the difference between the barrier and the ++ * limit is enough for this reserve). 
++ */ ++ if (ub->ub_parms[UB_TCPRCVBUF].held + ++ ub->ub_parms[UB_NUMTCPSOCK].limit * ub->ub_maxadvmss ++ > ub->ub_parms[UB_TCPRCVBUF].limit && ++ atomic_read(&sk->sk_rmem_alloc)) ++ retval = -ENOMEM; ++ if (retval) { ++ ub->ub_parms[UB_TCPRCVBUF].held -= chargesize; ++ ub->ub_parms[UB_TCPRCVBUF].failcnt++; ++ } ++ ub_adjust_maxheld(ub, UB_TCPRCVBUF); ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ goto out; ++} ++ ++int ub_tcprcvbuf_charge(struct sock *sk, struct sk_buff *skb) ++{ ++ return charge_tcprcvbuf(sk, skb, UB_HARD); ++} ++ ++int ub_tcprcvbuf_charge_forced(struct sock *sk, struct sk_buff *skb) ++{ ++ return charge_tcprcvbuf(sk, skb, UB_FORCE); ++} ++EXPORT_SYMBOL(ub_tcprcvbuf_charge_forced); ++ ++static void ub_tcprcvbuf_uncharge(struct sk_buff *skb) ++{ ++ unsigned long flags; ++ unsigned long held, bar; ++ int prev_pres; ++ struct user_beancounter *ub; ++ ++ for (ub = skb_bc(skb)->ub; ub->parent != NULL; ub = ub->parent); ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ if (ub->ub_parms[UB_TCPRCVBUF].held < skb_bc(skb)->charged) { ++ printk(KERN_ERR "Uncharging %d for tcprcvbuf of %p with %lu\n", ++ skb_bc(skb)->charged, ++ ub, ub->ub_parms[UB_TCPRCVBUF].held); ++ /* ass-saving bung */ ++ skb_bc(skb)->charged = ub->ub_parms[UB_TCPRCVBUF].held; ++ } ++ ub->ub_parms[UB_TCPRCVBUF].held -= skb_bc(skb)->charged; ++ held = ub->ub_parms[UB_TCPRCVBUF].held; ++ bar = ub->ub_parms[UB_TCPRCVBUF].barrier; ++ prev_pres = ub->ub_rmem_pressure; ++ if (held <= bar - (bar >> 2)) ++ ub->ub_rmem_pressure = UB_RMEM_EXPAND; ++ else if (held <= bar) ++ ub->ub_rmem_pressure = UB_RMEM_KEEP; ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ ++ uncharge_beancounter_notop(skb_bc(skb)->ub, UB_TCPRCVBUF, ++ skb_bc(skb)->charged); ++ ub_skb_set_uncharge(skb); ++} ++ ++ ++/* ++ * UB_OTHERSOCKBUF ++ */ ++ ++static void ub_socksndbuf_uncharge(struct sk_buff *skb) ++{ ++ unsigned long flags; ++ struct user_beancounter *ub, *cub; ++ struct sock_beancounter *sk_bc; ++ ++ /* resource was 
set. no check for ub required */ ++ cub = skb_bc(skb)->ub; ++ for (ub = cub; ub->parent != NULL; ub = ub->parent); ++ skb_bc(skb)->ub = NULL; ++ if (skb->sk != NULL) ++ sk_bc = sock_bc(skb->sk); ++ else ++ sk_bc = NULL; ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ __uncharge_sockbuf(sk_bc, ub, UB_OTHERSOCKBUF, ++ skb_bc(skb)->charged); ++ ub_sock_snd_wakeup(ub); ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ ++ uncharge_beancounter_notop(cub, UB_OTHERSOCKBUF, skb_bc(skb)->charged); ++ ub_skb_set_uncharge(skb); ++} ++ ++static void ub_tcpsndbuf_uncharge(struct sk_buff *skb) ++{ ++ unsigned long flags; ++ struct user_beancounter *ub, *cub; ++ ++ /* resource can be not set, called manually */ ++ cub = skb_bc(skb)->ub; ++ if (cub == NULL) ++ return; ++ for (ub = cub; ub->parent != NULL; ub = ub->parent); ++ skb_bc(skb)->ub = NULL; ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ __uncharge_sockbuf(sock_bc(skb->sk), ub, UB_TCPSNDBUF, ++ skb_bc(skb)->charged); ++ ub_tcp_snd_wakeup(ub); ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ ++ uncharge_beancounter_notop(cub, UB_TCPSNDBUF, skb_bc(skb)->charged); ++ ub_skb_set_uncharge(skb); ++} ++ ++void ub_skb_uncharge(struct sk_buff *skb) ++{ ++ switch (skb_bc(skb)->resource) { ++ case UB_TCPSNDBUF: ++ ub_tcpsndbuf_uncharge(skb); ++ break; ++ case UB_TCPRCVBUF: ++ ub_tcprcvbuf_uncharge(skb); ++ break; ++ case UB_DGRAMRCVBUF: ++ ub_sockrcvbuf_uncharge(skb); ++ break; ++ case UB_OTHERSOCKBUF: ++ ub_socksndbuf_uncharge(skb); ++ break; ++ } ++} ++ ++EXPORT_SYMBOL(ub_skb_uncharge); /* due to skb_orphan()/conntracks */ ++ ++/* ++ * TCP send buffers accouting. 
Paged part ++ */ ++int ub_sock_tcp_chargepage(struct sock *sk) ++{ ++ struct sock_beancounter *skbc; ++ struct user_beancounter *ub; ++ unsigned long added; ++ unsigned long flags; ++ int err; ++ ++ if (!sock_has_ubc(sk)) ++ return 0; ++ ++ skbc = sock_bc(sk); ++ ++ for (ub = skbc->ub; ub->parent != NULL; ub = ub->parent); ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ /* Try to charge full page */ ++ err = ub_sock_makewreserv_locked(sk, UB_TCPSNDBUF, UB_NUMTCPSOCK, ++ PAGE_SIZE); ++ if (err == 0) { ++ skbc->poll_reserv -= PAGE_SIZE; ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ charge_beancounter_notop(skbc->ub, UB_TCPSNDBUF, PAGE_SIZE); ++ return 0; ++ } ++ ++ /* Try to charge page enough to satisfy sys_select. The possible ++ overdraft for the rest of the page is generally better then ++ requesting full page in tcp_poll. This should not happen ++ frequently. Den */ ++ added = -skbc->poll_reserv; ++ err = ub_sock_makewreserv_locked(sk, UB_TCPSNDBUF, UB_NUMTCPSOCK, ++ SOCK_MIN_UBCSPACE); ++ if (err < 0) { ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ return err; ++ } ++ __charge_beancounter_locked(ub, UB_TCPSNDBUF, ++ PAGE_SIZE - skbc->poll_reserv, ++ UB_FORCE); ++ added += PAGE_SIZE; ++ skbc->poll_reserv = 0; ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ ++ charge_beancounter_notop(skbc->ub, UB_TCPSNDBUF, added); ++ ++ return 0; ++ ++} ++ ++void ub_sock_tcp_detachpage(struct sock *sk) ++{ ++ struct sk_buff *skb; ++ ++ if (!sock_has_ubc(sk)) ++ return; ++ ++ /* The page is just detached from socket. 
The last skb in queue ++ with paged part holds referrence to it */ ++ skb = skb_peek_tail(&sk->sk_write_queue); ++ if (skb == NULL) { ++ /* If the queue is empty - all data is sent and page is about ++ to be freed */ ++ uncharge_beancounter(sock_bc(sk)->ub, UB_TCPSNDBUF, PAGE_SIZE); ++ return; ++ } ++ /* Last skb is a good aproximation for a last skb with paged part */ ++ skb_bc(skb)->charged += PAGE_SIZE; ++} ++ ++static int charge_tcpsndbuf(struct sock *sk, struct sk_buff *skb, ++ enum severity strict) ++{ ++ int ret; ++ unsigned long chargesize; ++ ++ if (!sock_has_ubc(sk)) ++ return 0; ++ ++ chargesize = skb_charge_fullsize(skb); ++ ret = charge_beancounter(sock_bc(sk)->ub, UB_TCPSNDBUF, chargesize, ++ strict); ++ if (ret < 0) ++ return ret; ++ ub_skb_set_charge(skb, sk, chargesize, UB_TCPSNDBUF); ++ sock_bc(sk)->ub_wcharged += chargesize; ++ return ret; ++} ++ ++int ub_tcpsndbuf_charge(struct sock *sk, struct sk_buff *skb) ++{ ++ return charge_tcpsndbuf(sk, skb, UB_HARD); ++} ++ ++int ub_tcpsndbuf_charge_forced(struct sock *sk, struct sk_buff *skb) ++{ ++ return charge_tcpsndbuf(sk, skb, UB_FORCE); ++} ++EXPORT_SYMBOL(ub_tcpsndbuf_charge_forced); ++ ++/* ++ * Initialization staff ++ */ ++int __init skbc_cache_init(void) ++{ ++ return 0; ++} +diff -upr linux-2.6.16.orig/kernel/ub/ub_page_bc.c linux-2.6.16-026test015/kernel/ub/ub_page_bc.c +--- linux-2.6.16.orig/kernel/ub/ub_page_bc.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/ub/ub_page_bc.c 2006-07-04 14:41:37.000000000 +0400 +@@ -0,0 +1,439 @@ ++/* ++ * kernel/ub/ub_page_bc.c ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#include <linux/spinlock.h> ++#include <linux/slab.h> ++#include <linux/mm.h> ++#include <linux/gfp.h> ++#include <linux/vmalloc.h> ++ ++#include <ub/beancounter.h> ++#include <ub/ub_hash.h> ++#include <ub/ub_vmpages.h> ++#include <ub/ub_page.h> ++ ++static kmem_cache_t *pb_cachep; ++static spinlock_t pb_lock = SPIN_LOCK_UNLOCKED; ++static struct page_beancounter **pb_hash_table; ++static unsigned int pb_hash_mask; ++ ++/* ++ * Auxiliary staff ++ */ ++ ++static inline struct page_beancounter *next_page_pb(struct page_beancounter *p) ++{ ++ return list_entry(p->page_list.next, struct page_beancounter, ++ page_list); ++} ++ ++static inline struct page_beancounter *prev_page_pb(struct page_beancounter *p) ++{ ++ return list_entry(p->page_list.prev, struct page_beancounter, ++ page_list); ++} ++ ++/* ++ * Held pages manipulation ++ */ ++static inline void set_held_pages(struct user_beancounter *bc) ++{ ++ /* all three depend on ub_held_pages */ ++ __ub_update_physpages(bc); ++ __ub_update_oomguarpages(bc); ++ __ub_update_privvm(bc); ++} ++ ++static inline void do_dec_held_pages(struct user_beancounter *ub, int value) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ ub->ub_held_pages -= value; ++ set_held_pages(ub); ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++} ++ ++static void dec_held_pages(struct user_beancounter *ub, int value) ++{ ++ for (; ub != NULL; ub = ub->parent) ++ do_dec_held_pages(ub, value); ++} ++ ++static inline void do_inc_held_pages(struct user_beancounter *ub, int value) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ ub->ub_held_pages += value; ++ set_held_pages(ub); ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++} ++ ++static void inc_held_pages(struct user_beancounter *ub, int value) ++{ ++ for (; ub != NULL; ub = ub->parent) ++ do_inc_held_pages(ub, value); ++} ++ ++/* ++ * Alloc - free ++ */ ++ ++inline int pb_alloc(struct page_beancounter **pbc) ++{ ++ *pbc = 
kmem_cache_alloc(pb_cachep, GFP_KERNEL); ++ if (*pbc != NULL) { ++ (*pbc)->next_hash = NULL; ++ (*pbc)->pb_magic = PB_MAGIC; ++ } ++ return (*pbc == NULL); ++} ++ ++inline void pb_free(struct page_beancounter **pb) ++{ ++ if (*pb != NULL) { ++ kmem_cache_free(pb_cachep, *pb); ++ *pb = NULL; ++ } ++} ++ ++void pb_free_list(struct page_beancounter **p_pb) ++{ ++ struct page_beancounter *list, *pb; ++ ++ list = *p_pb; ++ if (list == PBC_COPY_SAME) ++ return; ++ ++ while (list) { ++ pb = list; ++ list = list->next_hash; ++ pb_free(&pb); ++ } ++ *p_pb = NULL; ++} ++ ++/* ++ * head -> <new objs> -> <old objs> -> ... ++ */ ++static int __alloc_list(struct page_beancounter **head, int num) ++{ ++ struct page_beancounter *pb; ++ ++ while (num > 0) { ++ if (pb_alloc(&pb)) ++ return -1; ++ pb->next_hash = *head; ++ *head = pb; ++ num--; ++ } ++ ++ return num; ++} ++ ++/* ++ * Ensure that the list contains at least num elements. ++ * p_pb points to an initialized list, may be of the zero length. ++ * ++ * mm->page_table_lock should be held ++ */ ++int pb_alloc_list(struct page_beancounter **p_pb, int num) ++{ ++ struct page_beancounter *list; ++ ++ for (list = *p_pb; list != NULL && num; list = list->next_hash, num--); ++ if (!num) ++ return 0; ++ ++ /* ++ * *p_pb(after) *p_pb (before) ++ * \ \ ++ * <new objs> -...-> <old objs> -> ... 
++ */ ++ if (__alloc_list(p_pb, num) < 0) ++ goto nomem; ++ return 0; ++ ++nomem: ++ pb_free_list(p_pb); ++ return -ENOMEM; ++} ++ ++/* ++ * Allocates a page_beancounter for each ++ * user_beancounter in a hash ++ */ ++int pb_alloc_all(struct page_beancounter **pbs) ++{ ++ int i, need_alloc; ++ unsigned long flags; ++ struct user_beancounter *ub; ++ ++ spin_lock_irqsave(&ub_hash_lock, flags); ++ need_alloc = 0; ++ for_each_beancounter(i, ub) ++ need_alloc++; ++ spin_unlock_irqrestore(&ub_hash_lock, flags); ++ ++ if (!__alloc_list(pbs, need_alloc)) ++ return 0; ++ ++ pb_free_list(pbs); ++ return -ENOMEM; ++} ++ ++/* ++ * Hash routines ++ */ ++ ++static inline int pb_hash(struct user_beancounter *ub, struct page *page) ++{ ++ return (page_to_pfn(page) + (ub->ub_uid << 10)) & pb_hash_mask; ++} ++ ++/* pb_lock should be held */ ++static inline void insert_pb(struct page_beancounter *p, struct page *page, ++ struct user_beancounter *ub, int hash) ++{ ++ p->page = page; ++ p->ub = get_beancounter(ub); ++ p->next_hash = pb_hash_table[hash]; ++ pb_hash_table[hash] = p; ++#ifdef CONFIG_UBC_DEBUG_KMEM ++ ub->ub_stat[smp_processor_id()].pbcs++; ++#endif ++} ++ ++/* ++ * Heart ++ */ ++ ++static int __pb_dup_ref(struct page *page, struct user_beancounter *bc, ++ int hash) ++{ ++ struct page_beancounter *p; ++ ++ for (p = pb_hash_table[hash]; ++ p != NULL && (p->page != page || p->ub != bc); ++ p = p->next_hash); ++ if (p == NULL) ++ return -1; ++ ++ PB_COUNT_INC(p->refcount); ++ return 0; ++} ++ ++static void __pb_add_ref(struct page *page, struct user_beancounter *bc, ++ struct page_beancounter **ppb, int hash) ++{ ++ struct page_beancounter *head, *p; ++ int shift; ++ ++ p = *ppb; ++ *ppb = p->next_hash; ++ ++ insert_pb(p, page, bc, hash); ++ head = page_pbc(page); ++ ++ if (head != NULL) { ++ /* ++ * Move the first element to the end of the list. ++ * List head (pb_head) is set to the next entry. 
++ * Note that this code works even if head is the only element ++ * on the list (because it's cyclic). ++ */ ++ BUG_ON(head->pb_magic != PB_MAGIC); ++ page_pbc(page) = next_page_pb(head); ++ PB_SHIFT_INC(head->refcount); ++ shift = PB_SHIFT_GET(head->refcount); ++ /* ++ * Update user beancounter, the share of head has been changed. ++ * Note that the shift counter is taken after increment. ++ */ ++ dec_held_pages(head->ub, UB_PAGE_WEIGHT >> shift); ++ /* add the new page beancounter to the end of the list */ ++ list_add_tail(&p->page_list, &page_pbc(page)->page_list); ++ } else { ++ page_pbc(page) = p; ++ shift = 0; ++ INIT_LIST_HEAD(&p->page_list); ++ } ++ ++ p->refcount = PB_REFCOUNT_MAKE(shift, 1); ++ /* update user beancounter for the new page beancounter */ ++ inc_held_pages(bc, UB_PAGE_WEIGHT >> shift); ++} ++ ++void pb_add_ref(struct page *page, struct mm_struct *mm, ++ struct page_beancounter **p_pb) ++{ ++ int hash; ++ struct user_beancounter *bc; ++ ++ bc = mm->mm_ub; ++ if (bc == NULL) ++ return; ++ ++ if (!PageAnon(page) && is_shmem_mapping(page->mapping)) ++ return; ++ ++ hash = pb_hash(bc, page); ++ ++ spin_lock(&pb_lock); ++ if (__pb_dup_ref(page, bc, hash)) ++ __pb_add_ref(page, bc, p_pb, hash); ++ spin_unlock(&pb_lock); ++} ++ ++void pb_dup_ref(struct page *page, struct mm_struct *mm, ++ struct page_beancounter **p_pb) ++{ ++ int hash; ++ struct user_beancounter *bc; ++ ++ bc = mm->mm_ub; ++ if (bc == NULL) ++ return; ++ ++ if (!PageAnon(page) && is_shmem_mapping(page->mapping)) ++ return; ++ ++ hash = pb_hash(bc, page); ++ ++ spin_lock(&pb_lock); ++ if (page_pbc(page) == NULL) ++ /* ++ * pages like ZERO_PAGE must not be accounted in pbc ++ * so on fork we just skip them ++ */ ++ goto out_unlock; ++ ++ if (unlikely(*p_pb != PBC_COPY_SAME)) ++ __pb_add_ref(page, bc, p_pb, hash); ++ else if (unlikely(__pb_dup_ref(page, bc, hash))) ++ WARN_ON(1); ++out_unlock: ++ spin_unlock(&pb_lock); ++} ++ ++void pb_remove_ref(struct page *page, struct mm_struct 
*mm) ++{ ++ int hash; ++ struct user_beancounter *bc; ++ struct page_beancounter *p, **q; ++ int shift, shiftt; ++ ++ bc = mm->mm_ub; ++ if (bc == NULL) ++ return; ++ ++ if (!PageAnon(page) && is_shmem_mapping(page->mapping)) ++ return; ++ ++ hash = pb_hash(bc, page); ++ ++ spin_lock(&pb_lock); ++ BUG_ON(page_pbc(page) != NULL && page_pbc(page)->pb_magic != PB_MAGIC); ++ for (q = pb_hash_table + hash, p = *q; ++ p != NULL && (p->page != page || p->ub != bc); ++ q = &p->next_hash, p = *q); ++ if (p == NULL) ++ goto out_unlock; ++ ++ PB_COUNT_DEC(p->refcount); ++ if (PB_COUNT_GET(p->refcount)) ++ /* ++ * More references from the same user beancounter exist. ++ * Nothing needs to be done. ++ */ ++ goto out_unlock; ++ ++ /* remove from the hash list */ ++ *q = p->next_hash; ++ ++ shift = PB_SHIFT_GET(p->refcount); ++ ++ dec_held_pages(p->ub, UB_PAGE_WEIGHT >> shift); ++ ++ if (page_pbc(page) == p) { ++ if (list_empty(&p->page_list)) ++ goto out_free; ++ page_pbc(page) = next_page_pb(p); ++ } ++ list_del(&p->page_list); ++ put_beancounter(p->ub); ++#ifdef CONFIG_UBC_DEBUG_KMEM ++ p->ub->ub_stat[smp_processor_id()].pbcs--; ++#endif ++ pb_free(&p); ++ ++ /* Now balance the list. Move the tail and adjust its shift counter. 
*/ ++ p = prev_page_pb(page_pbc(page)); ++ shiftt = PB_SHIFT_GET(p->refcount); ++ page_pbc(page) = p; ++ PB_SHIFT_DEC(p->refcount); ++ ++ inc_held_pages(p->ub, UB_PAGE_WEIGHT >> shiftt); ++ ++ /* ++ * If the shift counter of the moved beancounter is different from the ++ * removed one's, repeat the procedure for one more tail beancounter ++ */ ++ if (shiftt > shift) { ++ p = prev_page_pb(page_pbc(page)); ++ page_pbc(page) = p; ++ PB_SHIFT_DEC(p->refcount); ++ inc_held_pages(p->ub, UB_PAGE_WEIGHT >> shiftt); ++ } ++ spin_unlock(&pb_lock); ++ return; ++ ++out_free: ++ page_pbc(page) = NULL; ++#ifdef CONFIG_UBC_DEBUG_KMEM ++ p->ub->ub_stat[smp_processor_id()].pbcs--; ++#endif ++ put_beancounter(p->ub); ++ pb_free(&p); ++out_unlock: ++ spin_unlock(&pb_lock); ++ return; ++} ++ ++struct user_beancounter *pb_grab_page_ub(struct page *page) ++{ ++ struct page_beancounter *pb; ++ struct user_beancounter *ub; ++ ++ spin_lock(&pb_lock); ++ pb = page_pbc(page); ++ ub = (pb == NULL ? ERR_PTR(-EINVAL) : ++ get_beancounter(pb->ub)); ++ spin_unlock(&pb_lock); ++ return ub; ++} ++ ++void __init ub_init_pbc(void) ++{ ++ unsigned long hash_size; ++ ++ pb_cachep = kmem_cache_create("page_beancounter", ++ sizeof(struct page_beancounter), 0, ++ SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL, NULL); ++ hash_size = num_physpages >> 2; ++ for (pb_hash_mask = 1; ++ (hash_size & pb_hash_mask) != hash_size; ++ pb_hash_mask = (pb_hash_mask << 1) + 1); ++ hash_size = pb_hash_mask + 1; ++ printk(KERN_INFO "Page beancounter hash is %lu entries.\n", hash_size); ++ pb_hash_table = vmalloc(hash_size * sizeof(struct page_beancounter *)); ++ memset(pb_hash_table, 0, hash_size * sizeof(struct page_beancounter *)); ++} +diff -upr linux-2.6.16.orig/kernel/ub/ub_pages.c linux-2.6.16-026test015/kernel/ub/ub_pages.c +--- linux-2.6.16.orig/kernel/ub/ub_pages.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/ub/ub_pages.c 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,530 @@ ++/* ++ * 
kernel/ub/ub_pages.c ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include <linux/mm.h> ++#include <linux/highmem.h> ++#include <linux/virtinfo.h> ++#include <linux/module.h> ++#include <linux/shmem_fs.h> ++#include <linux/vmalloc.h> ++ ++#include <asm/pgtable.h> ++#include <asm/page.h> ++ ++#include <ub/beancounter.h> ++#include <ub/ub_vmpages.h> ++ ++void warn_bad_rss(struct vm_area_struct *vma, unsigned long freed) ++{ ++ static struct ub_rate_info ri = { ++ .burst = 10, ++ .interval = 40 * HZ, ++ }; ++ struct user_beancounter *ub; ++ char ubuid[64] = "No UB"; ++ unsigned long vmrss; ++ ++ if (!ub_ratelimit(&ri)) ++ return; ++ ++ ub = vma->vm_mm->mm_ub; ++ if (ub) ++ print_ub_uid(ub, ubuid, sizeof(ubuid)); ++ ++ vmrss = get_vma_rss(vma) + freed; ++ printk(KERN_WARNING ++ "%s vm_rss: process pid %d comm %.20s flags %lx\n" ++ "vma %p/%p rss %lu/%lu freed %lu\n" ++ "flags %lx, ub %s\n", ++ vmrss > freed ? 
"Positive" : "Negative", ++ current->pid, current->comm, current->flags, ++ vma, vma->vm_mm, vmrss, vma_pages(vma), freed, ++ vma->vm_flags, ubuid); ++ dump_stack(); ++} ++ ++static inline unsigned long pages_in_pte_range(struct vm_area_struct *vma, ++ pmd_t *pmd, unsigned long addr, unsigned long end, ++ unsigned long *ret) ++{ ++ pte_t *pte; ++ spinlock_t *ptl; ++ ++ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); ++ do { ++ if (!pte_none(*pte) && pte_present(*pte)) ++ (*ret)++; ++ } while (pte++, addr += PAGE_SIZE, (addr != end)); ++ pte_unmap_unlock(pte - 1, ptl); ++ ++ return addr; ++} ++ ++static inline unsigned long pages_in_pmd_range(struct vm_area_struct *vma, ++ pud_t *pud, unsigned long addr, unsigned long end, ++ unsigned long *ret) ++{ ++ pmd_t *pmd; ++ unsigned long next; ++ ++ pmd = pmd_offset(pud, addr); ++ do { ++ next = pmd_addr_end(addr, end); ++ if (pmd_none_or_clear_bad(pmd)) ++ continue; ++ next = pages_in_pte_range(vma, pmd, addr, next, ret); ++ } while (pmd++, addr = next, (addr != end)); ++ ++ return addr; ++} ++ ++static inline unsigned long pages_in_pud_range(struct vm_area_struct *vma, ++ pgd_t *pgd, unsigned long addr, unsigned long end, ++ unsigned long *ret) ++{ ++ pud_t *pud; ++ unsigned long next; ++ ++ pud = pud_offset(pgd, addr); ++ do { ++ next = pud_addr_end(addr, end); ++ if (pud_none_or_clear_bad(pud)) ++ continue; ++ next = pages_in_pmd_range(vma, pud, addr, next, ret); ++ } while (pud++, addr = next, (addr != end)); ++ ++ return addr; ++} ++ ++unsigned long pages_in_vma_range(struct vm_area_struct *vma, ++ unsigned long addr, unsigned long end) ++{ ++ pgd_t *pgd; ++ unsigned long next; ++ unsigned long ret; ++ ++ ret = 0; ++ BUG_ON(addr >= end); ++ pgd = pgd_offset(vma->vm_mm, addr); ++ do { ++ next = pgd_addr_end(addr, end); ++ if (pgd_none_or_clear_bad(pgd)) ++ continue; ++ next = pages_in_pud_range(vma, pgd, addr, next, &ret); ++ } while (pgd++, addr = next, (addr != end)); ++ return ret; ++} ++ ++void fastcall 
__ub_update_physpages(struct user_beancounter *ub) ++{ ++ ub->ub_parms[UB_PHYSPAGES].held = ub->ub_tmpfs_respages ++ + (ub->ub_held_pages >> UB_PAGE_WEIGHT_SHIFT); ++ ub_adjust_maxheld(ub, UB_PHYSPAGES); ++} ++ ++void fastcall __ub_update_oomguarpages(struct user_beancounter *ub) ++{ ++ ub->ub_parms[UB_OOMGUARPAGES].held = ++ ub->ub_parms[UB_PHYSPAGES].held + ub->ub_swap_pages; ++ ub_adjust_maxheld(ub, UB_OOMGUARPAGES); ++} ++ ++void fastcall __ub_update_privvm(struct user_beancounter *ub) ++{ ++ ub->ub_parms[UB_PRIVVMPAGES].held = ++ (ub->ub_held_pages >> UB_PAGE_WEIGHT_SHIFT) ++ + ub->ub_unused_privvmpages ++ + ub->ub_parms[UB_SHMPAGES].held; ++ ub_adjust_maxheld(ub, UB_PRIVVMPAGES); ++} ++ ++static inline int __charge_privvm_locked(struct user_beancounter *ub, ++ unsigned long s, enum severity strict) ++{ ++ if (__charge_beancounter_locked(ub, UB_PRIVVMPAGES, s, strict) < 0) ++ return -ENOMEM; ++ ++ ub->ub_unused_privvmpages += s; ++ return 0; ++} ++ ++static void __unused_privvm_dec_locked(struct user_beancounter *ub, ++ long size) ++{ ++ /* catch possible overflow */ ++ if (ub->ub_unused_privvmpages < size) { ++ uncharge_warn(ub, UB_UNUSEDPRIVVM, ++ size, ub->ub_unused_privvmpages); ++ size = ub->ub_unused_privvmpages; ++ } ++ ub->ub_unused_privvmpages -= size; ++ __ub_update_privvm(ub); ++} ++ ++void __ub_unused_privvm_dec(struct mm_struct *mm, long size) ++{ ++ unsigned long flags; ++ struct user_beancounter *ub; ++ ++ ub = mm->mm_ub; ++ if (ub == NULL) ++ return; ++ ++ for (; ub->parent != NULL; ub = ub->parent); ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ __unused_privvm_dec_locked(ub, size); ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++} ++ ++void ub_unused_privvm_sub(struct mm_struct *mm, ++ struct vm_area_struct *vma, unsigned long count) ++{ ++ if (VM_UB_PRIVATE(vma->vm_flags, vma->vm_file)) ++ __ub_unused_privvm_dec(mm, count); ++} ++ ++void ub_unused_privvm_add(struct mm_struct *mm, ++ struct vm_area_struct *vma, unsigned long size) ++{ ++ 
unsigned long flags; ++ struct user_beancounter *ub; ++ ++ ub = mm->mm_ub; ++ if (ub == NULL || !VM_UB_PRIVATE(vma->vm_flags, vma->vm_file)) ++ return; ++ ++ for (; ub->parent != NULL; ub = ub->parent); ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ ub->ub_unused_privvmpages += size; ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++} ++ ++int ub_protected_charge(struct mm_struct *mm, unsigned long size, ++ unsigned long newflags, struct vm_area_struct *vma) ++{ ++ unsigned long flags; ++ struct file *file; ++ struct user_beancounter *ub; ++ ++ ub = mm->mm_ub; ++ if (ub == NULL) ++ return PRIVVM_NO_CHARGE; ++ ++ flags = vma->vm_flags; ++ if (!((newflags ^ flags) & VM_WRITE)) ++ return PRIVVM_NO_CHARGE; ++ ++ file = vma->vm_file; ++ if (!VM_UB_PRIVATE(newflags | VM_WRITE, file)) ++ return PRIVVM_NO_CHARGE; ++ ++ if (flags & VM_WRITE) ++ return PRIVVM_TO_SHARED; ++ ++ for (; ub->parent != NULL; ub = ub->parent); ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ if (__charge_privvm_locked(ub, size, UB_SOFT) < 0) ++ goto err; ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ return PRIVVM_TO_PRIVATE; ++ ++err: ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ return PRIVVM_ERROR; ++} ++ ++int ub_memory_charge(struct mm_struct *mm, unsigned long size, ++ unsigned vm_flags, struct file *vm_file, int sv) ++{ ++ struct user_beancounter *ub, *ubl; ++ unsigned long flags; ++ ++ ub = mm->mm_ub; ++ if (ub == NULL) ++ return 0; ++ ++ size >>= PAGE_SHIFT; ++ if (size > UB_MAXVALUE) ++ return -EINVAL; ++ ++ BUG_ON(sv != UB_SOFT && sv != UB_HARD); ++ ++ if (vm_flags & VM_LOCKED) { ++ if (charge_beancounter(ub, UB_LOCKEDPAGES, size, sv)) ++ goto out_err; ++ } ++ if (VM_UB_PRIVATE(vm_flags, vm_file)) { ++ for (ubl = ub; ubl->parent != NULL; ubl = ubl->parent); ++ spin_lock_irqsave(&ubl->ub_lock, flags); ++ if (__charge_privvm_locked(ubl, size, sv)) ++ goto out_private; ++ spin_unlock_irqrestore(&ubl->ub_lock, flags); ++ } ++ return 0; ++ ++out_private: ++ 
spin_unlock_irqrestore(&ubl->ub_lock, flags); ++ if (vm_flags & VM_LOCKED) ++ uncharge_beancounter(ub, UB_LOCKEDPAGES, size); ++out_err: ++ return -ENOMEM; ++} ++ ++void ub_memory_uncharge(struct mm_struct *mm, unsigned long size, ++ unsigned vm_flags, struct file *vm_file) ++{ ++ struct user_beancounter *ub; ++ unsigned long flags; ++ ++ ub = mm->mm_ub; ++ if (ub == NULL) ++ return; ++ ++ size >>= PAGE_SHIFT; ++ ++ if (vm_flags & VM_LOCKED) ++ uncharge_beancounter(ub, UB_LOCKEDPAGES, size); ++ if (VM_UB_PRIVATE(vm_flags, vm_file)) { ++ for (; ub->parent != NULL; ub = ub->parent); ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ __unused_privvm_dec_locked(ub, size); ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ } ++} ++ ++int ub_locked_charge(struct mm_struct *mm, unsigned long size) ++{ ++ struct user_beancounter *ub; ++ ++ ub = mm->mm_ub; ++ if (ub == NULL) ++ return 0; ++ ++ return charge_beancounter(ub, UB_LOCKEDPAGES, ++ size >> PAGE_SHIFT, UB_HARD); ++} ++ ++void ub_locked_uncharge(struct mm_struct *mm, unsigned long size) ++{ ++ struct user_beancounter *ub; ++ ++ ub = mm->mm_ub; ++ if (ub == NULL) ++ return; ++ ++ uncharge_beancounter(ub, UB_LOCKEDPAGES, size >> PAGE_SHIFT); ++} ++ ++int ub_lockedshm_charge(struct shmem_inode_info *shi, unsigned long size) ++{ ++ struct user_beancounter *ub; ++ ++ ub = shi->shmi_ub; ++ if (ub == NULL) ++ return 0; ++ ++ return charge_beancounter(ub, UB_LOCKEDPAGES, ++ size >> PAGE_SHIFT, UB_HARD); ++} ++ ++void ub_lockedshm_uncharge(struct shmem_inode_info *shi, unsigned long size) ++{ ++ struct user_beancounter *ub; ++ ++ ub = shi->shmi_ub; ++ if (ub == NULL) ++ return; ++ ++ uncharge_beancounter(ub, UB_LOCKEDPAGES, size >> PAGE_SHIFT); ++} ++ ++ ++static inline void do_ub_tmpfs_respages_inc(struct user_beancounter *ub) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ ub->ub_tmpfs_respages++; ++ __ub_update_physpages(ub); ++ __ub_update_oomguarpages(ub); ++ spin_unlock_irqrestore(&ub->ub_lock, 
flags); ++} ++ ++void ub_tmpfs_respages_inc(struct shmem_inode_info *shi) ++{ ++ struct user_beancounter *ub; ++ ++ for (ub = shi->shmi_ub; ub != NULL; ub = ub->parent) ++ do_ub_tmpfs_respages_inc(ub); ++} ++ ++static inline void do_ub_tmpfs_respages_sub(struct user_beancounter *ub, ++ unsigned long size) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ /* catch possible overflow */ ++ if (ub->ub_tmpfs_respages < size) { ++ uncharge_warn(ub, UB_TMPFSPAGES, ++ size, ub->ub_tmpfs_respages); ++ size = ub->ub_tmpfs_respages; ++ } ++ ub->ub_tmpfs_respages -= size; ++ /* update values what is the most interesting */ ++ __ub_update_physpages(ub); ++ __ub_update_oomguarpages(ub); ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++} ++ ++void ub_tmpfs_respages_sub(struct shmem_inode_info *shi, ++ unsigned long size) ++{ ++ struct user_beancounter *ub; ++ ++ for (ub = shi->shmi_ub; ub != NULL; ub = ub->parent) ++ do_ub_tmpfs_respages_sub(ub, size); ++} ++ ++int ub_shmpages_charge(struct shmem_inode_info *shi, unsigned long size) ++{ ++ int ret; ++ unsigned long flags; ++ struct user_beancounter *ub; ++ ++ ub = shi->shmi_ub; ++ if (ub == NULL) ++ return 0; ++ ++ for (; ub->parent != NULL; ub = ub->parent); ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ ret = __charge_beancounter_locked(ub, UB_SHMPAGES, size, UB_HARD); ++ if (ret == 0) ++ __ub_update_privvm(ub); ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ return ret; ++} ++ ++void ub_shmpages_uncharge(struct shmem_inode_info *shi, unsigned long size) ++{ ++ unsigned long flags; ++ struct user_beancounter *ub; ++ ++ ub = shi->shmi_ub; ++ if (ub == NULL) ++ return; ++ ++ for (; ub->parent != NULL; ub = ub->parent); ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ __uncharge_beancounter_locked(ub, UB_SHMPAGES, size); ++ __ub_update_privvm(ub); ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++} ++ ++#ifdef CONFIG_USER_SWAP_ACCOUNTING ++static inline void do_ub_swapentry_inc(struct user_beancounter 
*ub) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ ub->ub_swap_pages++; ++ __ub_update_oomguarpages(ub); ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++} ++ ++void ub_swapentry_inc(struct swap_info_struct *si, pgoff_t num, ++ struct user_beancounter *ub) ++{ ++ si->swap_ubs[num] = get_beancounter(ub); ++ for (; ub != NULL; ub = ub->parent) ++ do_ub_swapentry_inc(ub); ++} ++EXPORT_SYMBOL(ub_swapentry_inc); ++ ++static inline void do_ub_swapentry_dec(struct user_beancounter *ub) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ if (ub->ub_swap_pages <= 0) ++ uncharge_warn(ub, UB_SWAPPAGES, 1, ub->ub_swap_pages); ++ else ++ ub->ub_swap_pages--; ++ __ub_update_oomguarpages(ub); ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++} ++ ++void ub_swapentry_dec(struct swap_info_struct *si, pgoff_t num) ++{ ++ struct user_beancounter *ub, *ubp; ++ ++ ub = si->swap_ubs[num]; ++ si->swap_ubs[num] = NULL; ++ for (ubp = ub; ubp != NULL; ubp = ubp->parent) ++ do_ub_swapentry_dec(ubp); ++ put_beancounter(ub); ++} ++EXPORT_SYMBOL(ub_swapentry_dec); ++ ++int ub_swap_init(struct swap_info_struct *si, pgoff_t num) ++{ ++ struct user_beancounter **ubs; ++ ++ ubs = vmalloc(num * sizeof(struct user_beancounter *)); ++ if (ubs == NULL) ++ return -ENOMEM; ++ ++ memset(ubs, 0, num * sizeof(struct user_beancounter *)); ++ si->swap_ubs = ubs; ++ return 0; ++} ++ ++void ub_swap_fini(struct swap_info_struct *si) ++{ ++ if (si->swap_ubs) { ++ vfree(si->swap_ubs); ++ si->swap_ubs = NULL; ++ } ++} ++#endif ++ ++static int vmguar_enough_memory(struct vnotifier_block *self, ++ unsigned long event, void *arg, int old_ret) ++{ ++ struct user_beancounter *ub; ++ ++ if (event != VIRTINFO_ENOUGHMEM) ++ return old_ret; ++ ++ for (ub = current->mm->mm_ub; ub->parent != NULL; ub = ub->parent); ++ if (ub->ub_parms[UB_PRIVVMPAGES].held > ++ ub->ub_parms[UB_VMGUARPAGES].barrier) ++ return old_ret; ++ ++ return NOTIFY_OK; ++} ++ ++static struct 
vnotifier_block vmguar_notifier_block = { ++ .notifier_call = vmguar_enough_memory ++}; ++ ++static int __init init_vmguar_notifier(void) ++{ ++ virtinfo_notifier_register(VITYPE_GENERAL, &vmguar_notifier_block); ++ return 0; ++} ++ ++static void __exit fini_vmguar_notifier(void) ++{ ++ virtinfo_notifier_unregister(VITYPE_GENERAL, &vmguar_notifier_block); ++} ++ ++module_init(init_vmguar_notifier); ++module_exit(fini_vmguar_notifier); +diff -upr linux-2.6.16.orig/kernel/ub/ub_proc.c linux-2.6.16-026test015/kernel/ub/ub_proc.c +--- linux-2.6.16.orig/kernel/ub/ub_proc.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/ub/ub_proc.c 2006-07-04 14:41:37.000000000 +0400 +@@ -0,0 +1,389 @@ ++/* ++ * linux/fs/proc/proc_ub.c ++ * ++ * Copyright (C) 1998-2000 Andrey V. Savochkin <saw@saw.sw.com.sg> ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ * TODO: ++ * ++ * Changes: ++ */ ++ ++#include <linux/errno.h> ++#include <linux/sched.h> ++#include <linux/kernel.h> ++#include <linux/mm.h> ++#include <linux/proc_fs.h> ++ ++#include <ub/beancounter.h> ++#include <ub/ub_hash.h> ++#include <ub/ub_debug.h> ++#include <ub/ub_page.h> ++ ++#include <asm/page.h> ++#include <asm/uaccess.h> ++ ++/* ++ * we have 8 format strings depending on: ++ * 1. BITS_PER_LONG ++ * 2. CONFIG_UBC_KEEP_UNUSED ++ * 3. 
resource number (see out_proc_beancounter) ++ */ ++ ++#ifdef CONFIG_UBC_KEEP_UNUSED ++#define REF_FORMAT "%5.5s %4i: %-12s " ++#define UID_HEAD_STR "uid ref" ++#else ++#define REF_FORMAT "%10.10s: %-12s " ++#define UID_HEAD_STR "uid" ++#endif ++#define REF2_FORMAT "%10s %-12s " ++ ++#if BITS_PER_LONG == 32 ++#define RES_FORMAT "%10lu %10lu %10lu %10lu %10lu" ++#define HEAD_FORMAT "%10s %10s %10s %10s %10s" ++#define UB_PROC_LINE_TEXT (10+2+12+1+10+1+10+1+10+1+10+1+10) ++#else ++#define RES_FORMAT "%20lu %20lu %20lu %20lu %20lu" ++#define HEAD_FORMAT "%20s %20s %20s %20s %20s" ++#define UB_PROC_LINE_TEXT (10+2+12+1+20+1+20+1+20+1+20+1+20) ++#endif ++ ++#define UB_PROC_LINE_LEN (UB_PROC_LINE_TEXT + 1) ++ ++static void out_proc_version(char *buf) ++{ ++ int len; ++ ++ len = sprintf(buf, "Version: 2.5"); ++ memset(buf + len, ' ', UB_PROC_LINE_TEXT - len); ++ buf[UB_PROC_LINE_TEXT] = '\n'; ++} ++ ++static void out_proc_head(char *buf) ++{ ++ sprintf(buf, REF2_FORMAT HEAD_FORMAT, ++ UID_HEAD_STR, "resource", "held", "maxheld", ++ "barrier", "limit", "failcnt"); ++ buf[UB_PROC_LINE_TEXT] = '\n'; ++} ++ ++static void out_proc_beancounter(char *buf, struct user_beancounter *ub, int r) ++{ ++ if (r == 0) { ++ char tmpbuf[64]; ++ print_ub_uid(ub, tmpbuf, sizeof(tmpbuf)); ++ sprintf(buf, REF_FORMAT RES_FORMAT, ++ tmpbuf, ++#ifdef CONFIG_UBC_KEEP_UNUSED ++ atomic_read(&ub->ub_refcount), ++#endif ++ ub_rnames[r], ub->ub_parms[r].held, ++ ub->ub_parms[r].maxheld, ub->ub_parms[r].barrier, ++ ub->ub_parms[r].limit, ub->ub_parms[r].failcnt); ++ } else ++ sprintf(buf, REF2_FORMAT RES_FORMAT, ++ "", ub_rnames[r], ++ ub->ub_parms[r].held, ub->ub_parms[r].maxheld, ++ ub->ub_parms[r].barrier, ub->ub_parms[r].limit, ++ ub->ub_parms[r].failcnt); ++ ++ buf[UB_PROC_LINE_TEXT] = '\n'; ++} ++ ++static int ub_accessible(struct user_beancounter *ub, ++ struct user_beancounter *exec_ub, ++ struct file *file) ++{ ++ struct user_beancounter *p, *q; ++ ++ for (p = exec_ub; p->parent != NULL; p = 
p->parent); ++ for (q = ub; q->parent != NULL; q = q->parent); ++ if (p != get_ub0() && q != p) ++ return 0; ++ if (ub->parent == NULL) ++ return 1; ++ return file->private_data == NULL ? 0 : 1; ++} ++ ++static ssize_t ub_proc_read(struct file *file, char *usrbuf, size_t len, ++ loff_t *poff) ++{ ++ ssize_t retval; ++ char *buf; ++ unsigned long flags; ++ int i, resource; ++ struct ub_hash_slot *slot; ++ struct user_beancounter *ub; ++ struct user_beancounter *exec_ub = get_exec_ub(); ++ loff_t n, off; ++ int rem, produced, job, tocopy; ++ const int is_capable = ++ (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH)); ++ ++ retval = -ENOBUFS; ++ buf = (char *)__get_free_page(GFP_KERNEL); ++ if (buf == NULL) ++ goto out; ++ ++ retval = 0; ++ if (!is_capable) ++ goto out_free; ++ ++ off = *poff; ++ if (off < 0) /* can't happen, just in case */ ++ goto inval; ++ ++again: ++ i = 0; ++ slot = ub_hash; ++ n = off; /* The amount of data to skip */ ++ produced = 0; ++ if (n < (UB_PROC_LINE_LEN * 2)) { ++ if (n < UB_PROC_LINE_LEN) { ++ out_proc_version(buf); ++ produced += UB_PROC_LINE_LEN; ++ n += UB_PROC_LINE_LEN; ++ } ++ out_proc_head(buf + produced); ++ produced += UB_PROC_LINE_LEN; ++ n += UB_PROC_LINE_LEN; ++ } ++ n -= (2 * UB_PROC_LINE_LEN); ++ spin_lock_irqsave(&ub_hash_lock, flags); ++ while (1) { ++ for (ub = slot->ubh_beans; ++ ub != NULL && n >= (UB_RESOURCES * UB_PROC_LINE_LEN); ++ ub = ub->ub_next) ++ if (is_capable && ub_accessible(ub, exec_ub, file)) ++ n -= (UB_RESOURCES * UB_PROC_LINE_LEN); ++ if (ub != NULL || ++i >= UB_HASH_SIZE) ++ break; ++ ++slot; ++ } ++ rem = n; /* the amount of the data in the buffer to skip */ ++ job = PAGE_SIZE - UB_PROC_LINE_LEN + 1; /* end of buffer data */ ++ if (len < job - rem) ++ job = rem + len; ++ while (ub != NULL && produced < job) { ++ if (is_capable && ub_accessible(ub, exec_ub, file)) ++ for (resource = 0; ++ produced < job && resource < UB_RESOURCES; ++ resource++, produced += UB_PROC_LINE_LEN) ++ { ++ 
out_proc_beancounter(buf + produced, ++ ub, resource); ++ } ++ if (produced >= job) ++ break; ++ /* Find the next beancounter to produce more data. */ ++ ub = ub->ub_next; ++ while (ub == NULL && ++i < UB_HASH_SIZE) { ++ ++slot; ++ ub = slot->ubh_beans; ++ } ++ } ++ ++ spin_unlock_irqrestore(&ub_hash_lock, flags); ++ ub_debug(UBD_ALLOC, KERN_DEBUG "UB_PROC: produced %d, job %d, rem %d\n", ++ produced, job, rem); ++ ++ /* ++ * Temporary buffer `buf' contains `produced' bytes. ++ * Extract no more than `len' bytes at offset `rem'. ++ */ ++ if (produced <= rem) ++ goto out_free; ++ tocopy = produced - rem; ++ if (len < tocopy) ++ tocopy = len; ++ if (!tocopy) ++ goto out_free; ++ if (copy_to_user(usrbuf, buf + rem, tocopy)) ++ goto fault; ++ off += tocopy; /* can't overflow */ ++ *poff = off; ++ len -= tocopy; ++ retval += tocopy; ++ if (!len) ++ goto out_free; ++ usrbuf += tocopy; ++ goto again; ++ ++fault: ++ retval = -EFAULT; ++out_free: ++ free_page((unsigned long)buf); ++out: ++ return retval; ++ ++inval: ++ retval = -EINVAL; ++ goto out_free; ++} ++ ++static int ub_proc_open(struct inode *inode, struct file *file) ++{ ++ file->private_data = strcmp(file->f_dentry->d_name.name, ++ "user_beancounters") ? 
++ (void *)-1 : NULL; ++ return 0; ++} ++ ++static struct file_operations ub_file_operations = { ++ .read = &ub_proc_read, ++ .open = &ub_proc_open ++}; ++ ++#ifdef CONFIG_UBC_DEBUG_KMEM ++#include <linux/seq_file.h> ++#include <linux/kmem_cache.h> ++ ++static void *ubd_start(struct seq_file *m, loff_t *pos) ++{ ++ loff_t n = *pos; ++ struct user_beancounter *ub; ++ long slot; ++ ++ spin_lock_irq(&ub_hash_lock); ++ for (slot = 0; slot < UB_HASH_SIZE; slot++) ++ for (ub = ub_hash[slot].ubh_beans; ub; ub = ub->ub_next) { ++ if (n == 0) { ++ m->private = (void *)slot; ++ return (void *)ub; ++ } ++ n--; ++ } ++ return NULL; ++} ++ ++static void *ubd_next(struct seq_file *m, void *p, loff_t *pos) ++{ ++ struct user_beancounter *ub; ++ long slot; ++ ++ ub = (struct user_beancounter *)p; ++ slot = (long)m->private; ++ ++ ++*pos; ++ ub = ub->ub_next; ++ while (1) { ++ for (; ub; ub = ub->ub_next) { ++ m->private = (void *)slot; ++ return (void *)ub; ++ } ++ slot++; ++ if (slot == UB_HASH_SIZE) ++ break; ++ ub = ub_hash[slot].ubh_beans; ++ } ++ return NULL; ++} ++ ++static void ubd_stop(struct seq_file *m, void *p) ++{ ++ spin_unlock_irq(&ub_hash_lock); ++} ++ ++#define PROC_LINE_FMT "\t%-17s\t%5lu\t%5lu\n" ++ ++static int ubd_show(struct seq_file *m, void *p) ++{ ++ struct user_beancounter *ub; ++ struct ub_cache_counter *cc; ++ long pages, vmpages, pbc, swap, unmap; ++ int i; ++ char id[64]; ++ ++ ub = (struct user_beancounter *)p; ++ print_ub_uid(ub, id, sizeof(id)); ++ seq_printf(m, "%s:%d\n", id, atomic_read(&ub->ub_refcount)); ++ ++ pages = vmpages = pbc = swap = unmap = 0; ++ for (i = 0; i < NR_CPUS; i++) { ++ pages += ub->ub_stat[i].pages_charged; ++ vmpages += ub->ub_stat[i].vmalloc_charged; ++ pbc += ub->ub_stat[i].pbcs; ++ swap += ub->ub_stat[i].swapin; ++ unmap += ub->ub_stat[i].unmap; ++ } ++ if (pages < 0) ++ pages = 0; ++ if (vmpages < 0) ++ vmpages = 0; ++ seq_printf(m, PROC_LINE_FMT, "pages", pages, PAGE_SIZE); ++ seq_printf(m, PROC_LINE_FMT, "vmalloced", 
vmpages, PAGE_SIZE); ++ ++ seq_printf(m, PROC_LINE_FMT, ub_rnames[UB_UNUSEDPRIVVM], ++ ub->ub_unused_privvmpages, PAGE_SIZE); ++ seq_printf(m, PROC_LINE_FMT, ub_rnames[UB_TMPFSPAGES], ++ ub->ub_tmpfs_respages, PAGE_SIZE); ++ seq_printf(m, PROC_LINE_FMT, ub_rnames[UB_SWAPPAGES], ++ ub->ub_swap_pages, PAGE_SIZE); ++ seq_printf(m, PROC_LINE_FMT, "pbcs", pbc, ++ (unsigned long)sizeof(struct page_beancounter)); ++ ++ seq_printf(m, PROC_LINE_FMT, "swapin", swap, 0UL); ++ seq_printf(m, PROC_LINE_FMT, "unmap", unmap, 0UL); ++ /* interrupts are disabled by locking ub_hash_lock */ ++ spin_lock(&cc_lock); ++ list_for_each_entry (cc, &ub->ub_cclist, ulist) { ++ kmem_cache_t *cachep; ++ ++ cachep = cc->cachep; ++ seq_printf(m, PROC_LINE_FMT, ++ cachep->name, ++ cc->counter, ++ (unsigned long)cachep->objuse); ++ } ++ spin_unlock(&cc_lock); ++ return 0; ++} ++ ++static struct seq_operations kmemdebug_op = { ++ .start = ubd_start, ++ .next = ubd_next, ++ .stop = ubd_stop, ++ .show = ubd_show, ++}; ++ ++static int kmem_debug_open(struct inode *inode, struct file *file) ++{ ++ return seq_open(file, &kmemdebug_op); ++} ++ ++static struct file_operations kmem_debug_ops = { ++ .open = kmem_debug_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; ++#endif ++ ++void __init ub_init_proc(void) ++{ ++ struct proc_dir_entry *entry; ++ ++ entry = create_proc_entry("user_beancounters", S_IRUGO, NULL); ++ if (entry) ++ entry->proc_fops = &ub_file_operations; ++ else ++ panic("Can't create /proc/user_beancounters entry!\n"); ++ ++ entry = create_proc_entry("user_beancounters_sub", S_IRUGO, NULL); ++ if (entry) ++ entry->proc_fops = &ub_file_operations; ++ else ++ panic("Can't create /proc/user_beancounters2 entry!\n"); ++ ++#ifdef CONFIG_UBC_DEBUG_KMEM ++ entry = create_proc_entry("user_beancounters_debug", S_IRUGO, NULL); ++ if (entry) ++ entry->proc_fops = &kmem_debug_ops; ++ else ++ panic("Can't create /proc/user_beancounters_debug entry!\n"); ++#endif ++} 
+diff -upr linux-2.6.16.orig/kernel/ub/ub_stat.c linux-2.6.16-026test015/kernel/ub/ub_stat.c +--- linux-2.6.16.orig/kernel/ub/ub_stat.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/ub/ub_stat.c 2006-07-04 14:41:37.000000000 +0400 +@@ -0,0 +1,465 @@ ++/* ++ * kernel/ub/ub_stat.c ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include <linux/config.h> ++#include <linux/timer.h> ++#include <linux/sched.h> ++#include <linux/init.h> ++#include <linux/jiffies.h> ++#include <linux/list.h> ++#include <linux/errno.h> ++#include <linux/suspend.h> ++ ++#include <asm/uaccess.h> ++#include <asm/param.h> ++ ++#include <ub/beancounter.h> ++#include <ub/ub_hash.h> ++#include <ub/ub_stat.h> ++ ++static spinlock_t ubs_notify_lock = SPIN_LOCK_UNLOCKED; ++static LIST_HEAD(ubs_notify_list); ++static long ubs_min_interval; ++static ubstattime_t ubs_start_time, ubs_end_time; ++static struct timer_list ubs_timer; ++ ++static int ubstat_get_list(void *buf, long size) ++{ ++ int retval; ++ unsigned long flags; ++ int slotnr; ++ struct ub_hash_slot *slot; ++ struct user_beancounter *ub, *last_ub; ++ long *page, *ptr, *end; ++ int len; ++ ++ page = (long *)__get_free_page(GFP_KERNEL); ++ if (page == NULL) ++ return -ENOMEM; ++ ++ retval = 0; ++ slotnr = 0; ++ slot = ub_hash; ++ last_ub = NULL; ++ while (1) { ++ ptr = page; ++ end = page + PAGE_SIZE / sizeof(*ptr); ++ ++ spin_lock_irqsave(&ub_hash_lock, flags); ++ if (last_ub == NULL) ++ ub = slot->ubh_beans; ++ else ++ ub = last_ub->ub_next; ++ while (1) { ++ for (; ub != NULL; ub = ub->ub_next) { ++ if (ub->parent != NULL) ++ continue; ++ *ptr++ = ub->ub_uid; ++ if (ptr == end) ++ break; ++ } ++ if (ptr == end) ++ break; ++ ++slot; ++ if (++slotnr >= UB_HASH_SIZE) ++ break; ++ ub = slot->ubh_beans; ++ } ++ if (ptr == page) ++ goto out_unlock; ++ if (ub != NULL) ++ get_beancounter(ub); ++ spin_unlock_irqrestore(&ub_hash_lock, 
flags); ++ ++ if (last_ub != NULL) ++ put_beancounter(last_ub); ++ last_ub = ub; /* last visited beancounter in the slot */ ++ ++ len = min_t(long, (ptr - page) * sizeof(*ptr), size); ++ if (copy_to_user(buf, page, len)) { ++ retval = -EFAULT; ++ break; ++ } ++ retval += len; ++ if (len < PAGE_SIZE) ++ break; ++ buf += len; ++ size -= len; ++ } ++out: ++ if (last_ub != NULL) ++ put_beancounter(last_ub); ++ free_page((unsigned long)page); ++ return retval; ++ ++out_unlock: ++ spin_unlock_irqrestore(&ub_hash_lock, flags); ++ goto out; ++} ++ ++static int ubstat_gettime(void *buf, long size) ++{ ++ ubgettime_t data; ++ int retval; ++ ++ spin_lock(&ubs_notify_lock); ++ data.start_time = ubs_start_time; ++ data.end_time = ubs_end_time; ++ data.cur_time = ubs_start_time + (jiffies - ubs_start_time * HZ) / HZ; ++ spin_unlock(&ubs_notify_lock); ++ ++ retval = min_t(long, sizeof(data), size); ++ if (copy_to_user(buf, &data, retval)) ++ retval = -EFAULT; ++ return retval; ++} ++ ++static int ubstat_do_read_one(struct user_beancounter *ub, int res, void *kbuf) ++{ ++ struct { ++ ubstattime_t start_time; ++ ubstattime_t end_time; ++ ubstatparm_t param[1]; ++ } *data; ++ ++ data = kbuf; ++ data->start_time = ubs_start_time; ++ data->end_time = ubs_end_time; ++ ++ data->param[0].maxheld = ub->ub_store[res].maxheld; ++ data->param[0].failcnt = ub->ub_store[res].failcnt; ++ ++ return sizeof(*data); ++} ++ ++static int ubstat_do_read_all(struct user_beancounter *ub, void *kbuf, int size) ++{ ++ int wrote; ++ struct { ++ ubstattime_t start_time; ++ ubstattime_t end_time; ++ ubstatparm_t param[UB_RESOURCES]; ++ } *data; ++ int resource; ++ ++ data = kbuf; ++ data->start_time = ubs_start_time; ++ data->end_time = ubs_end_time; ++ wrote = sizeof(data->start_time) + sizeof(data->end_time); ++ ++ for (resource = 0; resource < UB_RESOURCES; resource++) { ++ if (size < wrote + sizeof(data->param[resource])) ++ break; ++ data->param[resource].maxheld = ub->ub_store[resource].maxheld; ++ 
data->param[resource].failcnt = ub->ub_store[resource].failcnt; ++ wrote += sizeof(data->param[resource]); ++ } ++ ++ return wrote; ++} ++ ++static int ubstat_do_read_full(struct user_beancounter *ub, void *kbuf, ++ int size) ++{ ++ int wrote; ++ struct { ++ ubstattime_t start_time; ++ ubstattime_t end_time; ++ ubstatparmf_t param[UB_RESOURCES]; ++ } *data; ++ int resource; ++ ++ data = kbuf; ++ data->start_time = ubs_start_time; ++ data->end_time = ubs_end_time; ++ wrote = sizeof(data->start_time) + sizeof(data->end_time); ++ ++ for (resource = 0; resource < UB_RESOURCES; resource++) { ++ if (size < wrote + sizeof(data->param[resource])) ++ break; ++ /* The beginning of ubstatparmf_t matches struct ubparm. */ ++ memcpy(&data->param[resource], &ub->ub_store[resource], ++ sizeof(ub->ub_store[resource])); ++ data->param[resource].__unused1 = 0; ++ data->param[resource].__unused2 = 0; ++ wrote += sizeof(data->param[resource]); ++ } ++ return wrote; ++} ++ ++static int ubstat_get_stat(struct user_beancounter *ub, long cmd, ++ void *buf, long size) ++{ ++ void *kbuf; ++ int retval; ++ ++ kbuf = (void *)__get_free_page(GFP_KERNEL); ++ if (kbuf == NULL) ++ return -ENOMEM; ++ ++ spin_lock(&ubs_notify_lock); ++ switch (UBSTAT_CMD(cmd)) { ++ case UBSTAT_READ_ONE: ++ retval = -EINVAL; ++ if (UBSTAT_PARMID(cmd) >= UB_RESOURCES) ++ break; ++ retval = ubstat_do_read_one(ub, ++ UBSTAT_PARMID(cmd), kbuf); ++ break; ++ case UBSTAT_READ_ALL: ++ retval = ubstat_do_read_all(ub, kbuf, PAGE_SIZE); ++ break; ++ case UBSTAT_READ_FULL: ++ retval = ubstat_do_read_full(ub, kbuf, PAGE_SIZE); ++ break; ++ default: ++ retval = -EINVAL; ++ } ++ spin_unlock(&ubs_notify_lock); ++ ++ if (retval > 0) { ++ retval = min_t(long, retval, size); ++ if (copy_to_user(buf, kbuf, retval)) ++ retval = -EFAULT; ++ } ++ ++ free_page((unsigned long)kbuf); ++ return retval; ++} ++ ++static int ubstat_handle_notifrq(ubnotifrq_t *req) ++{ ++ int retval; ++ struct ub_stat_notify *new_notify; ++ struct list_head 
*entry; ++ struct task_struct *tsk_to_free; ++ ++ new_notify = kmalloc(sizeof(new_notify), GFP_KERNEL); ++ if (new_notify == NULL) ++ return -ENOMEM; ++ ++ tsk_to_free = NULL; ++ INIT_LIST_HEAD(&new_notify->list); ++ ++ spin_lock(&ubs_notify_lock); ++ list_for_each(entry, &ubs_notify_list) { ++ struct ub_stat_notify *notify; ++ ++ notify = list_entry(entry, struct ub_stat_notify, list); ++ if (notify->task == current) { ++ kfree(new_notify); ++ new_notify = notify; ++ break; ++ } ++ } ++ ++ retval = -EINVAL; ++ if (req->maxinterval < 1) ++ goto out_unlock; ++ if (req->maxinterval > TIME_MAX_SEC) ++ req->maxinterval = TIME_MAX_SEC; ++ if (req->maxinterval < ubs_min_interval) { ++ unsigned long dif; ++ ++ ubs_min_interval = req->maxinterval; ++ dif = (ubs_timer.expires - jiffies + HZ - 1) / HZ; ++ if (dif > req->maxinterval) ++ mod_timer(&ubs_timer, ++ ubs_timer.expires - ++ (dif - req->maxinterval) * HZ); ++ } ++ ++ if (entry != &ubs_notify_list) { ++ list_del(&new_notify->list); ++ tsk_to_free = new_notify->task; ++ } ++ if (req->signum) { ++ new_notify->task = current; ++ get_task_struct(new_notify->task); ++ new_notify->signum = req->signum; ++ list_add(&new_notify->list, &ubs_notify_list); ++ } else ++ kfree(new_notify); ++ retval = 0; ++out_unlock: ++ spin_unlock(&ubs_notify_lock); ++ if (tsk_to_free != NULL) ++ put_task_struct(tsk_to_free); ++ return retval; ++} ++ ++/* ++ * former sys_ubstat ++ */ ++long do_ubstat(int func, unsigned long arg1, unsigned long arg2, void *buf, ++ long size) ++{ ++ int retval; ++ struct user_beancounter *ub; ++ ++ if (func == UBSTAT_UBPARMNUM) ++ return UB_RESOURCES; ++ if (func == UBSTAT_UBLIST) ++ return ubstat_get_list(buf, size); ++ if (!(capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH))) ++ return -EPERM; ++ ++ if (func == UBSTAT_GETTIME) { ++ retval = ubstat_gettime(buf, size); ++ goto notify; ++ } ++ ++ ub = get_exec_ub(); ++ if (ub != NULL && ub->ub_uid == arg1) ++ get_beancounter(ub); ++ else /* FIXME must be if 
(ve_is_super) */ ++ ub = get_beancounter_byuid(arg1, 0); ++ ++ if (ub == NULL) ++ return -ESRCH; ++ ++ retval = ubstat_get_stat(ub, func, buf, size); ++ put_beancounter(ub); ++notify: ++ /* Handle request for notification */ ++ if (retval >= 0) { ++ ubnotifrq_t notifrq; ++ int err; ++ ++ err = -EFAULT; ++ if (!copy_from_user(¬ifrq, (void *)arg2, sizeof(notifrq))) ++ err = ubstat_handle_notifrq(¬ifrq); ++ if (err) ++ retval = err; ++ } ++ ++ return retval; ++} ++ ++static void ubstat_save_onestat(struct user_beancounter *ub) ++{ ++ int resource; ++ ++ /* called with local irq disabled */ ++ spin_lock(&ub->ub_lock); ++ for (resource = 0; resource < UB_RESOURCES; resource++) { ++ memcpy(&ub->ub_store[resource], &ub->ub_parms[resource], ++ sizeof(struct ubparm)); ++ ub->ub_parms[resource].minheld = ++ ub->ub_parms[resource].maxheld = ++ ub->ub_parms[resource].held; ++ } ++ spin_unlock(&ub->ub_lock); ++} ++ ++static void ubstat_save_statistics(void) ++{ ++ unsigned long flags; ++ int i; ++ struct user_beancounter *ub; ++ ++ spin_lock_irqsave(&ub_hash_lock, flags); ++ for_each_beancounter(i, ub) ++ ubstat_save_onestat(ub); ++ spin_unlock_irqrestore(&ub_hash_lock, flags); ++} ++ ++static void ubstatd_timeout(unsigned long __data) ++{ ++ struct task_struct *p; ++ ++ p = (struct task_struct *) __data; ++ wake_up_process(p); ++} ++ ++/* ++ * Safe wrapper for send_sig. It prevents a race with release_task ++ * for sighand. ++ * Should be called under tasklist_lock. 
++ */ ++static void task_send_sig(struct ub_stat_notify *notify) ++{ ++ if (likely(notify->task->sighand != NULL)) ++ send_sig(notify->signum, notify->task, 1); ++} ++ ++static inline void do_notifies(void) ++{ ++ LIST_HEAD(notif_free_list); ++ struct ub_stat_notify *notify; ++ struct ub_stat_notify *tmp; ++ ++ spin_lock(&ubs_notify_lock); ++ ubs_start_time = ubs_end_time; ++ /* ++ * the expression below relies on time being unsigned long and ++ * arithmetic promotion rules ++ */ ++ ubs_end_time += (ubs_timer.expires - ubs_start_time * HZ) / HZ; ++ mod_timer(&ubs_timer, ubs_timer.expires + ubs_min_interval * HZ); ++ ubs_min_interval = TIME_MAX_SEC; ++ /* save statistics accumulated for the interval */ ++ ubstat_save_statistics(); ++ /* send signals */ ++ read_lock(&tasklist_lock); ++ while (!list_empty(&ubs_notify_list)) { ++ notify = list_entry(ubs_notify_list.next, ++ struct ub_stat_notify, list); ++ task_send_sig(notify); ++ list_del(¬ify->list); ++ list_add(¬ify->list, ¬if_free_list); ++ } ++ read_unlock(&tasklist_lock); ++ spin_unlock(&ubs_notify_lock); ++ ++ list_for_each_entry_safe(notify, tmp, ¬if_free_list, list) { ++ put_task_struct(notify->task); ++ kfree(notify); ++ } ++} ++ ++/* ++ * Kernel thread ++ */ ++static int ubstatd(void *unused) ++{ ++ /* daemonize call will take care of signals */ ++ daemonize("ubstatd"); ++ ++ ubs_timer.data = (unsigned long)current; ++ ubs_timer.function = ubstatd_timeout; ++ add_timer(&ubs_timer); ++ ++ while (1) { ++ set_task_state(current, TASK_INTERRUPTIBLE); ++ if (time_after(ubs_timer.expires, jiffies)) { ++ schedule(); ++ try_to_freeze(); ++ continue; ++ } ++ ++ __set_task_state(current, TASK_RUNNING); ++ do_notifies(); ++ } ++ return 0; ++} ++ ++static int __init ubstatd_init(void) ++{ ++ init_timer(&ubs_timer); ++ ubs_timer.expires = TIME_MAX_JIF; ++ ubs_min_interval = TIME_MAX_SEC; ++ ubs_start_time = ubs_end_time = 0; ++ ++ kernel_thread(ubstatd, NULL, 0); ++ return 0; ++} ++ ++module_init(ubstatd_init); +diff 
-upr linux-2.6.16.orig/kernel/ub/ub_sys.c linux-2.6.16-026test015/kernel/ub/ub_sys.c +--- linux-2.6.16.orig/kernel/ub/ub_sys.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/ub/ub_sys.c 2006-07-04 14:41:38.000000000 +0400 +@@ -0,0 +1,154 @@ ++/* ++ * kernel/ub/ub_sys.c ++ * ++ * Copyright (C) 2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include <linux/config.h> ++#include <asm/uaccess.h> ++ ++#include <ub/beancounter.h> ++ ++#ifndef CONFIG_USER_RESOURCE ++asmlinkage long sys_getluid(void) ++{ ++ return -ENOSYS; ++} ++ ++asmlinkage long sys_setluid(uid_t uid) ++{ ++ return -ENOSYS; ++} ++ ++asmlinkage long sys_setublimit(uid_t uid, unsigned long resource, ++ unsigned long *limits) ++{ ++ return -ENOSYS; ++} ++ ++asmlinkage long sys_ubstat(int func, unsigned long arg1, unsigned long arg2, ++ void *buf, long size) ++{ ++ return -ENOSYS; ++} ++#else /* CONFIG_USER_RESOURCE */ ++ ++/* ++ * The (rather boring) getluid syscall ++ */ ++asmlinkage long sys_getluid(void) ++{ ++ struct user_beancounter *ub; ++ ++ ub = get_exec_ub(); ++ if (ub == NULL) ++ return -EINVAL; ++ ++ return ub->ub_uid; ++} ++ ++/* ++ * The setluid syscall ++ */ ++asmlinkage long sys_setluid(uid_t uid) ++{ ++ struct user_beancounter *ub; ++ struct task_beancounter *task_bc; ++ int error; ++ ++ task_bc = ¤t->task_bc; ++ ++ /* You may not disown a setluid */ ++ error = -EINVAL; ++ if (uid == (uid_t)-1) ++ goto out; ++ ++ /* You may only set an ub as root */ ++ error = -EPERM; ++ if (!capable(CAP_SETUID)) ++ goto out; ++ ++ /* Ok - set up a beancounter entry for this user */ ++ error = -ENOBUFS; ++ ub = get_beancounter_byuid(uid, 1); ++ if (ub == NULL) ++ goto out; ++ ++ ub_debug(UBD_ALLOC | UBD_LIMIT, "setluid, bean %p (count %d) " ++ "for %.20s pid %d\n", ++ ub, atomic_read(&ub->ub_refcount), ++ current->comm, current->pid); ++ /* install bc */ ++ put_beancounter(task_bc->exec_ub); ++ task_bc->exec_ub = ub; 
++ put_beancounter(task_bc->fork_sub); ++ task_bc->fork_sub = get_beancounter(ub); ++ error = 0; ++out: ++ return error; ++} ++ ++/* ++ * The setbeanlimit syscall ++ */ ++asmlinkage long sys_setublimit(uid_t uid, unsigned long resource, ++ unsigned long *limits) ++{ ++ int error; ++ unsigned long flags; ++ struct user_beancounter *ub; ++ unsigned long new_limits[2]; ++ ++ error = -EPERM; ++ if(!capable(CAP_SYS_RESOURCE)) ++ goto out; ++ ++ if (!ve_is_super(get_exec_env())) ++ goto out; ++ ++ error = -EINVAL; ++ if (resource >= UB_RESOURCES) ++ goto out; ++ ++ error = -EFAULT; ++ if (copy_from_user(&new_limits, limits, sizeof(new_limits))) ++ goto out; ++ ++ error = -EINVAL; ++ if (new_limits[0] > UB_MAXVALUE || new_limits[1] > UB_MAXVALUE) ++ goto out; ++ ++ error = -ENOENT; ++ ub = get_beancounter_byuid(uid, 0); ++ if (ub == NULL) { ++ ub_debug(UBD_LIMIT, "No login bc for uid %d\n", uid); ++ goto out; ++ } ++ ++ spin_lock_irqsave(&ub->ub_lock, flags); ++ ub->ub_parms[resource].barrier = new_limits[0]; ++ ub->ub_parms[resource].limit = new_limits[1]; ++ spin_unlock_irqrestore(&ub->ub_lock, flags); ++ ++ put_beancounter(ub); ++ ++ error = 0; ++out: ++ return error; ++} ++ ++extern long do_ubstat(int func, unsigned long arg1, unsigned long arg2, ++ void *buf, long size); ++asmlinkage long sys_ubstat(int func, unsigned long arg1, unsigned long arg2, ++ void *buf, long size) ++{ ++ if (!ve_is_super(get_exec_env())) ++ return -EPERM; ++ ++ return do_ubstat(func, arg1, arg2, buf, size); ++} ++#endif +diff -upr linux-2.6.16.orig/kernel/uid16.c linux-2.6.16-026test015/kernel/uid16.c +--- linux-2.6.16.orig/kernel/uid16.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/kernel/uid16.c 2006-07-04 14:41:36.000000000 +0400 +@@ -20,43 +20,67 @@ + + asmlinkage long sys_chown16(const char __user * filename, old_uid_t user, old_gid_t group) + { +- return sys_chown(filename, low2highuid(user), low2highgid(group)); ++ long ret = sys_chown(filename, low2highuid(user), 
low2highgid(group)); ++ /* avoid REGPARM breakage on x86: */ ++ prevent_tail_call(ret); ++ return ret; + } + + asmlinkage long sys_lchown16(const char __user * filename, old_uid_t user, old_gid_t group) + { +- return sys_lchown(filename, low2highuid(user), low2highgid(group)); ++ long ret = sys_lchown(filename, low2highuid(user), low2highgid(group)); ++ /* avoid REGPARM breakage on x86: */ ++ prevent_tail_call(ret); ++ return ret; + } + + asmlinkage long sys_fchown16(unsigned int fd, old_uid_t user, old_gid_t group) + { +- return sys_fchown(fd, low2highuid(user), low2highgid(group)); ++ long ret = sys_fchown(fd, low2highuid(user), low2highgid(group)); ++ /* avoid REGPARM breakage on x86: */ ++ prevent_tail_call(ret); ++ return ret; + } + + asmlinkage long sys_setregid16(old_gid_t rgid, old_gid_t egid) + { +- return sys_setregid(low2highgid(rgid), low2highgid(egid)); ++ long ret = sys_setregid(low2highgid(rgid), low2highgid(egid)); ++ /* avoid REGPARM breakage on x86: */ ++ prevent_tail_call(ret); ++ return ret; + } + + asmlinkage long sys_setgid16(old_gid_t gid) + { +- return sys_setgid(low2highgid(gid)); ++ long ret = sys_setgid(low2highgid(gid)); ++ /* avoid REGPARM breakage on x86: */ ++ prevent_tail_call(ret); ++ return ret; + } + + asmlinkage long sys_setreuid16(old_uid_t ruid, old_uid_t euid) + { +- return sys_setreuid(low2highuid(ruid), low2highuid(euid)); ++ long ret = sys_setreuid(low2highuid(ruid), low2highuid(euid)); ++ /* avoid REGPARM breakage on x86: */ ++ prevent_tail_call(ret); ++ return ret; + } + + asmlinkage long sys_setuid16(old_uid_t uid) + { +- return sys_setuid(low2highuid(uid)); ++ long ret = sys_setuid(low2highuid(uid)); ++ /* avoid REGPARM breakage on x86: */ ++ prevent_tail_call(ret); ++ return ret; + } + + asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid) + { +- return sys_setresuid(low2highuid(ruid), low2highuid(euid), +- low2highuid(suid)); ++ long ret = sys_setresuid(low2highuid(ruid), low2highuid(euid), 
++ low2highuid(suid)); ++ /* avoid REGPARM breakage on x86: */ ++ prevent_tail_call(ret); ++ return ret; + } + + asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid, old_uid_t __user *suid) +@@ -72,8 +96,11 @@ asmlinkage long sys_getresuid16(old_uid_ + + asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid) + { +- return sys_setresgid(low2highgid(rgid), low2highgid(egid), +- low2highgid(sgid)); ++ long ret = sys_setresgid(low2highgid(rgid), low2highgid(egid), ++ low2highgid(sgid)); ++ /* avoid REGPARM breakage on x86: */ ++ prevent_tail_call(ret); ++ return ret; + } + + asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid, old_gid_t __user *sgid) +@@ -89,12 +116,18 @@ asmlinkage long sys_getresgid16(old_gid_ + + asmlinkage long sys_setfsuid16(old_uid_t uid) + { +- return sys_setfsuid(low2highuid(uid)); ++ long ret = sys_setfsuid(low2highuid(uid)); ++ /* avoid REGPARM breakage on x86: */ ++ prevent_tail_call(ret); ++ return ret; + } + + asmlinkage long sys_setfsgid16(old_gid_t gid) + { +- return sys_setfsgid(low2highgid(gid)); ++ long ret = sys_setfsgid(low2highgid(gid)); ++ /* avoid REGPARM breakage on x86: */ ++ prevent_tail_call(ret); ++ return ret; + } + + static int groups16_to_user(old_gid_t __user *grouplist, +diff -upr linux-2.6.16.orig/kernel/user.c linux-2.6.16-026test015/kernel/user.c +--- linux-2.6.16.orig/kernel/user.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/kernel/user.c 2006-07-04 14:41:39.000000000 +0400 +@@ -14,6 +14,7 @@ + #include <linux/bitops.h> + #include <linux/key.h> + #include <linux/interrupt.h> ++#include <linux/module.h> + + /* + * UID task count cache, to get fast user lookup in "alloc_uid" +@@ -24,7 +25,20 @@ + #define UIDHASH_SZ (1 << UIDHASH_BITS) + #define UIDHASH_MASK (UIDHASH_SZ - 1) + #define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) +-#define uidhashentry(uid) (uidhash_table + __uidhashfn((uid))) ++#define 
__uidhashentry(uid) (uidhash_table + __uidhashfn((uid))) ++ ++#ifdef CONFIG_VE ++#define UIDHASH_MASK_VE (UIDHASH_SZ_VE - 1) ++#define __uidhashfn_ve(uid) (((uid >> UIDHASH_BITS_VE) ^ uid) & \ ++ UIDHASH_MASK_VE) ++#define __uidhashentry_ve(uid, envid) ((envid)->uidhash_table + \ ++ __uidhashfn_ve(uid)) ++#define uidhashentry_ve(uid) (ve_is_super(get_exec_env()) ? \ ++ __uidhashentry(uid) : \ ++ __uidhashentry_ve(uid, get_exec_env())) ++#else ++#define uidhashentry_ve(uid) __uidhashentry(uid) ++#endif + + static kmem_cache_t *uid_cachep; + static struct list_head uidhash_table[UIDHASH_SZ]; +@@ -96,7 +110,7 @@ struct user_struct *find_user(uid_t uid) + unsigned long flags; + + spin_lock_irqsave(&uidhash_lock, flags); +- ret = uid_hash_find(uid, uidhashentry(uid)); ++ ret = uid_hash_find(uid, uidhashentry_ve(uid)); + spin_unlock_irqrestore(&uidhash_lock, flags); + return ret; + } +@@ -115,10 +129,11 @@ void free_uid(struct user_struct *up) + } + local_irq_restore(flags); + } ++EXPORT_SYMBOL_GPL(free_uid); + + struct user_struct * alloc_uid(uid_t uid) + { +- struct list_head *hashent = uidhashentry(uid); ++ struct list_head *hashent = uidhashentry_ve(uid); + struct user_struct *up; + + spin_lock_irq(&uidhash_lock); +@@ -168,6 +183,7 @@ struct user_struct * alloc_uid(uid_t uid + } + return up; + } ++EXPORT_SYMBOL_GPL(alloc_uid); + + void switch_uid(struct user_struct *new_user) + { +@@ -186,21 +202,21 @@ void switch_uid(struct user_struct *new_ + free_uid(old_user); + suid_keys(current); + } +- ++EXPORT_SYMBOL_GPL(switch_uid); + + static int __init uid_cache_init(void) + { + int n; + + uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct), +- 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); ++ 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_UBC, NULL, NULL); + + for(n = 0; n < UIDHASH_SZ; ++n) + INIT_LIST_HEAD(uidhash_table + n); + + /* Insert the root user immediately (init already runs as root) */ + spin_lock_irq(&uidhash_lock); +- uid_hash_insert(&root_user, 
uidhashentry(0)); ++ uid_hash_insert(&root_user, __uidhashentry(0)); + spin_unlock_irq(&uidhash_lock); + + return 0; +diff -upr linux-2.6.16.orig/kernel/ve.c linux-2.6.16-026test015/kernel/ve.c +--- linux-2.6.16.orig/kernel/ve.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/ve.c 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,205 @@ ++/* ++ * linux/kernel/ve.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++/* ++ * 've.c' helper file performing VE sub-system initialization ++ */ ++ ++#include <linux/sched.h> ++#include <linux/delay.h> ++#include <linux/capability.h> ++#include <linux/ve.h> ++#include <linux/smp_lock.h> ++#include <linux/init.h> ++ ++#include <linux/errno.h> ++#include <linux/unistd.h> ++#include <linux/slab.h> ++#include <linux/sys.h> ++#include <linux/kdev_t.h> ++#include <linux/termios.h> ++#include <linux/tty_driver.h> ++#include <linux/netdevice.h> ++#include <linux/utsname.h> ++#include <linux/proc_fs.h> ++#include <linux/kernel_stat.h> ++#include <linux/module.h> ++#include <linux/rcupdate.h> ++#include <linux/ve_proto.h> ++#include <linux/ve_owner.h> ++#include <linux/devpts_fs.h> ++ ++#include <linux/nfcalls.h> ++ ++unsigned long vz_rstamp = 0x37e0f59d; ++ ++#ifdef CONFIG_MODULES ++struct module no_module = { .state = MODULE_STATE_GOING }; ++EXPORT_SYMBOL(no_module); ++#endif ++ ++#ifdef CONFIG_VE ++ ++DCL_VE_OWNER(SKB, struct sk_buff, owner_env) ++DCL_VE_OWNER(SK, struct sock, sk_owner_env) ++DCL_VE_OWNER(TW, struct tcp_tw_bucket, tw_owner_env) ++DCL_VE_OWNER(FILP, struct file, owner_env) ++DCL_VE_OWNER(FSTYPE, struct file_system_type, owner_env) ++ ++INIT_KSYM_MODULE(x_tables); ++INIT_KSYM_MODULE(xt_tcpudp); ++INIT_KSYM_MODULE(ip_tables); ++INIT_KSYM_MODULE(ip6_tables); ++INIT_KSYM_MODULE(iptable_filter); ++INIT_KSYM_MODULE(ip6table_filter); ++INIT_KSYM_MODULE(iptable_mangle); ++INIT_KSYM_MODULE(ip6table_mangle); 
++INIT_KSYM_MODULE(xt_limit); ++INIT_KSYM_MODULE(ipt_multiport); ++INIT_KSYM_MODULE(ip6t_multiport); ++INIT_KSYM_MODULE(ipt_tos); ++INIT_KSYM_MODULE(ipt_TOS); ++INIT_KSYM_MODULE(ipt_REJECT); ++INIT_KSYM_MODULE(ip6t_REJECT); ++INIT_KSYM_MODULE(ipt_TCPMSS); ++INIT_KSYM_MODULE(xt_tcpmss); ++INIT_KSYM_MODULE(ipt_ttl); ++INIT_KSYM_MODULE(ipt_LOG); ++INIT_KSYM_MODULE(ip6t_LOG); ++INIT_KSYM_MODULE(xt_length); ++INIT_KSYM_MODULE(ip_conntrack); ++INIT_KSYM_MODULE(ip_conntrack_ftp); ++INIT_KSYM_MODULE(ip_conntrack_irc); ++INIT_KSYM_MODULE(xt_conntrack); ++INIT_KSYM_MODULE(xt_state); ++INIT_KSYM_MODULE(xt_helper); ++INIT_KSYM_MODULE(ip_nat); ++INIT_KSYM_MODULE(iptable_nat); ++INIT_KSYM_MODULE(ip_nat_ftp); ++INIT_KSYM_MODULE(ip_nat_irc); ++INIT_KSYM_MODULE(ipt_REDIRECT); ++ ++INIT_KSYM_CALL(int, init_netfilter, (void)); ++INIT_KSYM_CALL(int, init_xtables, (void)); ++INIT_KSYM_CALL(int, init_xt_tcpudp, (void)); ++INIT_KSYM_CALL(int, init_iptables, (void)); ++INIT_KSYM_CALL(int, init_ip6tables, (void)); ++INIT_KSYM_CALL(int, init_iptable_filter, (void)); ++INIT_KSYM_CALL(int, init_ip6table_filter, (void)); ++INIT_KSYM_CALL(int, init_iptable_mangle, (void)); ++INIT_KSYM_CALL(int, init_ip6table_mangle, (void)); ++INIT_KSYM_CALL(int, init_xt_limit, (void)); ++INIT_KSYM_CALL(int, init_iptable_multiport, (void)); ++INIT_KSYM_CALL(int, init_ip6table_multiport, (void)); ++INIT_KSYM_CALL(int, init_iptable_tos, (void)); ++INIT_KSYM_CALL(int, init_iptable_TOS, (void)); ++INIT_KSYM_CALL(int, init_iptable_REJECT, (void)); ++INIT_KSYM_CALL(int, init_ip6table_REJECT, (void)); ++INIT_KSYM_CALL(int, init_iptable_TCPMSS, (void)); ++INIT_KSYM_CALL(int, init_xt_tcpmss, (void)); ++INIT_KSYM_CALL(int, init_iptable_ttl, (void)); ++INIT_KSYM_CALL(int, init_iptable_LOG, (void)); ++INIT_KSYM_CALL(int, init_ip6table_LOG, (void)); ++INIT_KSYM_CALL(int, init_xt_length, (void)); ++INIT_KSYM_CALL(int, init_iptable_conntrack, (void)); ++INIT_KSYM_CALL(int, init_iptable_ftp, (void)); ++INIT_KSYM_CALL(int, 
init_iptable_irc, (void)); ++INIT_KSYM_CALL(int, init_xt_conntrack_match, (void)); ++INIT_KSYM_CALL(int, init_xt_state, (void)); ++INIT_KSYM_CALL(int, init_xt_helper, (void)); ++INIT_KSYM_CALL(int, ip_nat_init, (void)); ++INIT_KSYM_CALL(int, init_iptable_nat, (void)); ++INIT_KSYM_CALL(int, init_iptable_nat_ftp, (void)); ++INIT_KSYM_CALL(int, init_iptable_nat_irc, (void)); ++INIT_KSYM_CALL(int, init_iptable_REDIRECT, (void)); ++INIT_KSYM_CALL(void, fini_iptable_nat_irc, (void)); ++INIT_KSYM_CALL(void, fini_iptable_nat_ftp, (void)); ++INIT_KSYM_CALL(void, fini_iptable_nat, (void)); ++INIT_KSYM_CALL(void, ip_nat_cleanup, (void)); ++INIT_KSYM_CALL(void, fini_xt_helper, (void)); ++INIT_KSYM_CALL(void, fini_xt_state, (void)); ++INIT_KSYM_CALL(void, fini_xt_conntrack_match, (void)); ++INIT_KSYM_CALL(void, fini_iptable_irc, (void)); ++INIT_KSYM_CALL(void, fini_iptable_ftp, (void)); ++INIT_KSYM_CALL(void, fini_iptable_conntrack, (void)); ++INIT_KSYM_CALL(void, fini_xt_length, (void)); ++INIT_KSYM_CALL(void, fini_ip6table_LOG, (void)); ++INIT_KSYM_CALL(void, fini_iptable_LOG, (void)); ++INIT_KSYM_CALL(void, fini_iptable_ttl, (void)); ++INIT_KSYM_CALL(void, fini_xt_tcpmss, (void)); ++INIT_KSYM_CALL(void, fini_iptable_TCPMSS, (void)); ++INIT_KSYM_CALL(void, fini_ip6table_REJECT, (void)); ++INIT_KSYM_CALL(void, fini_iptable_REJECT, (void)); ++INIT_KSYM_CALL(void, fini_iptable_TOS, (void)); ++INIT_KSYM_CALL(void, fini_iptable_tos, (void)); ++INIT_KSYM_CALL(void, fini_ip6table_multiport, (void)); ++INIT_KSYM_CALL(void, fini_iptable_multiport, (void)); ++INIT_KSYM_CALL(void, fini_xt_limit, (void)); ++INIT_KSYM_CALL(void, fini_iptable_filter, (void)); ++INIT_KSYM_CALL(void, fini_ip6table_filter, (void)); ++INIT_KSYM_CALL(void, fini_iptable_mangle, (void)); ++INIT_KSYM_CALL(void, fini_ip6table_mangle, (void)); ++INIT_KSYM_CALL(void, fini_ip6tables, (void)); ++INIT_KSYM_CALL(void, fini_iptables, (void)); ++INIT_KSYM_CALL(void, fini_xt_tcpudp, (void)); ++INIT_KSYM_CALL(void, 
fini_xtables, (void)); ++INIT_KSYM_CALL(void, fini_netfilter, (void)); ++INIT_KSYM_CALL(void, fini_iptable_REDIRECT, (void)); ++ ++INIT_KSYM_CALL(void, ipt_flush_table, (struct xt_table *table)); ++INIT_KSYM_CALL(void, ip6t_flush_table, (struct xt_table *table)); ++ ++#if defined(CONFIG_VE_CALLS_MODULE) || defined(CONFIG_VE_CALLS) ++INIT_KSYM_MODULE(vzmon); ++INIT_KSYM_CALL(int, real_get_device_perms_ve, ++ (int dev_type, dev_t dev, int access_mode)); ++INIT_KSYM_CALL(void, real_do_env_cleanup, (struct ve_struct *env)); ++INIT_KSYM_CALL(void, real_do_env_free, (struct ve_struct *env)); ++INIT_KSYM_CALL(void, real_update_load_avg_ve, (void)); ++ ++int get_device_perms_ve(int dev_type, dev_t dev, int access_mode) ++{ ++ return KSYMSAFECALL(int, vzmon, real_get_device_perms_ve, ++ (dev_type, dev, access_mode)); ++} ++EXPORT_SYMBOL(get_device_perms_ve); ++ ++void do_env_cleanup(struct ve_struct *env) ++{ ++ KSYMSAFECALL_VOID(vzmon, real_do_env_cleanup, (env)); ++} ++ ++void do_env_free(struct ve_struct *env) ++{ ++ KSYMSAFECALL_VOID(vzmon, real_do_env_free, (env)); ++} ++EXPORT_SYMBOL(do_env_free); ++ ++void do_update_load_avg_ve(void) ++{ ++ KSYMSAFECALL_VOID(vzmon, real_update_load_avg_ve, ()); ++} ++#endif ++ ++struct ve_struct ve0 = { ++ .utsname = &system_utsname, ++ .vetask_lh = LIST_HEAD_INIT(ve0.vetask_lh), ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++ ._net_dev_tail = &ve0._net_dev_base, ++ .ifindex = -1, ++#endif ++#ifdef CONFIG_UNIX98_PTYS ++ .devpts_config = &devpts_config, ++#endif ++}; ++ ++EXPORT_SYMBOL(ve0); ++ ++#endif /* CONFIG_VE */ +diff -upr linux-2.6.16.orig/kernel/vecalls.c linux-2.6.16-026test015/kernel/vecalls.c +--- linux-2.6.16.orig/kernel/vecalls.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/vecalls.c 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,3547 @@ ++/* ++ * linux/kernel/vecalls.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. 
++ * ++ */ ++ ++/* ++ * 'vecalls.c' is file with basic VE support. It provides basic primities ++ * along with initialization script ++ */ ++ ++#include <linux/sched.h> ++#include <linux/delay.h> ++#include <linux/capability.h> ++#include <linux/ve.h> ++#include <linux/smp_lock.h> ++#include <linux/init.h> ++#include <linux/list.h> ++#include <linux/ve_owner.h> ++#include <linux/errno.h> ++#include <linux/unistd.h> ++#include <linux/slab.h> ++#include <linux/vmalloc.h> ++#include <linux/sys.h> ++#include <linux/fs.h> ++#include <linux/namespace.h> ++#include <linux/termios.h> ++#include <linux/tty_driver.h> ++#include <linux/netdevice.h> ++#include <linux/wait.h> ++#include <linux/inetdevice.h> ++#include <net/addrconf.h> ++#include <linux/utsname.h> ++#include <linux/sysctl.h> ++#include <linux/proc_fs.h> ++#include <linux/seq_file.h> ++#include <linux/kernel_stat.h> ++#include <linux/module.h> ++#include <linux/suspend.h> ++#include <linux/rcupdate.h> ++#include <linux/in.h> ++#include <linux/major.h> ++#include <linux/kdev_t.h> ++#include <linux/idr.h> ++#include <linux/inetdevice.h> ++#include <net/pkt_sched.h> ++#include <linux/divert.h> ++#include <ub/beancounter.h> ++ ++#include <net/route.h> ++#include <net/ip_fib.h> ++#include <net/ip6_route.h> ++#include <net/arp.h> ++#include <net/ipv6.h> ++ ++#include <linux/ve_proto.h> ++#include <linux/venet.h> ++#include <linux/vzctl.h> ++#include <linux/vzcalluser.h> ++#ifdef CONFIG_FAIRSCHED ++#include <linux/fairsched.h> ++#endif ++ ++#include <linux/nfcalls.h> ++#include <linux/virtinfo.h> ++ ++struct ve_struct *ve_list_head = NULL; ++int nr_ve = 1; /* One VE always exists. 
Compatibility with vestat */ ++rwlock_t ve_list_guard = RW_LOCK_UNLOCKED; ++static rwlock_t devperms_hash_guard = RW_LOCK_UNLOCKED; ++ ++extern int glob_virt_pids; ++ ++static int do_env_enter(struct ve_struct *ve, unsigned int flags); ++static void do_clean_devperms(envid_t veid); ++static int alloc_ve_tty_drivers(struct ve_struct* ve); ++static void free_ve_tty_drivers(struct ve_struct* ve); ++static int register_ve_tty_drivers(struct ve_struct* ve); ++static void unregister_ve_tty_drivers(struct ve_struct* ve); ++static int init_ve_tty_drivers(struct ve_struct *); ++static void fini_ve_tty_drivers(struct ve_struct *); ++static void clear_termios(struct tty_driver* driver ); ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++static void ve_mapped_devs_cleanup(struct ve_struct *ve); ++#endif ++ ++static int ve_get_cpu_stat(envid_t veid, struct vz_cpu_stat *buf); ++ ++static void vecalls_exit(void); ++ ++struct ve_struct *__find_ve_by_id(envid_t veid) ++{ ++ struct ve_struct *ve; ++ for (ve = ve_list_head; ++ ve != NULL && ve->veid != veid; ++ ve = ve->next); ++ return ve; ++} ++ ++struct ve_struct *get_ve_by_id(envid_t veid) ++{ ++ struct ve_struct *ve; ++ read_lock(&ve_list_guard); ++ ve = __find_ve_by_id(veid); ++ get_ve(ve); ++ read_unlock(&ve_list_guard); ++ return ve; ++} ++ ++/* ++ * real_put_ve() MUST be used instead of put_ve() inside vecalls. 
++ */ ++void real_do_env_free(struct ve_struct *ve); ++static inline void real_put_ve(struct ve_struct *ve) ++{ ++ if (ve && atomic_dec_and_test(&ve->counter)) { ++ if (atomic_read(&ve->pcounter) > 0) ++ BUG(); ++ if (ve->is_running) ++ BUG(); ++ real_do_env_free(ve); ++ } ++} ++ ++extern struct file_system_type devpts_fs_type; ++extern struct file_system_type sysfs_fs_type; ++extern struct file_system_type tmpfs_fs_type; ++extern struct file_system_type proc_fs_type; ++ ++extern spinlock_t task_capability_lock; ++extern void ve_ipc_free(struct ve_struct * ve); ++extern void ip_fragment_cleanup(struct ve_struct *ve); ++ ++static int ve_get_cpu_stat(envid_t veid, struct vz_cpu_stat *buf) ++{ ++ struct ve_struct *ve; ++ struct vz_cpu_stat *vstat; ++ int retval; ++ int i, cpu; ++ unsigned long tmp; ++ ++ if (!ve_is_super(get_exec_env()) && (veid != get_exec_env()->veid)) ++ return -EPERM; ++ if (veid == 0) ++ return -ESRCH; ++ ++ vstat = kmalloc(sizeof(*vstat), GFP_KERNEL); ++ if (!vstat) ++ return -ENOMEM; ++ memset(vstat, 0, sizeof(*vstat)); ++ ++ retval = -ESRCH; ++ read_lock(&ve_list_guard); ++ ve = __find_ve_by_id(veid); ++ if (ve == NULL) ++ goto out_unlock; ++ for (cpu = 0; cpu < NR_CPUS; cpu++) { ++ struct ve_cpu_stats *st; ++ ++ st = VE_CPU_STATS(ve, cpu); ++ vstat->user_jif += st->user; ++ vstat->nice_jif += st->nice; ++ vstat->system_jif += st->system; ++ vstat->idle_clk += ve_sched_get_idle_time(ve, cpu); ++ } ++ vstat->uptime_clk = get_cycles() - ve->start_cycles; ++ vstat->uptime_jif = jiffies - ve->start_jiffies; ++ for (i = 0; i < 3; i++) { ++ tmp = ve->avenrun[i] + (FIXED_1/200); ++ vstat->avenrun[i].val_int = LOAD_INT(tmp); ++ vstat->avenrun[i].val_frac = LOAD_FRAC(tmp); ++ } ++ read_unlock(&ve_list_guard); ++ ++ retval = 0; ++ if (copy_to_user(buf, vstat, sizeof(*vstat))) ++ retval = -EFAULT; ++out_free: ++ kfree(vstat); ++ return retval; ++ ++out_unlock: ++ read_unlock(&ve_list_guard); ++ goto out_free; ++} ++ 
++/********************************************************************** ++ * Devices permissions routines, ++ * character and block devices separately ++ **********************************************************************/ ++ ++/* Rules applied in the following order: ++ MAJOR!=0, MINOR!=0 ++ MAJOR!=0, MINOR==0 ++ MAJOR==0, MINOR==0 ++*/ ++struct devperms_struct ++{ ++ dev_t dev; /* device id */ ++ unsigned char mask; ++ unsigned type; ++ envid_t veid; ++ ++ struct devperms_struct *devhash_next; ++ struct devperms_struct **devhash_pprev; ++}; ++ ++static struct devperms_struct original_perms[] = ++{{ ++ MKDEV(0,0), /*device*/ ++ S_IROTH | S_IWOTH, ++ S_IFCHR, /*type*/ ++ 0, /*veid*/ ++ NULL, NULL ++}, ++{ ++ MKDEV(0,0), /*device*/ ++ S_IXGRP | S_IROTH | S_IWOTH, ++ S_IFBLK, /*type*/ ++ 0, /*veid*/ ++ NULL, NULL ++}}; ++ ++static struct devperms_struct default_major_perms[] = { ++ {MKDEV(UNIX98_PTY_MASTER_MAJOR, 0), S_IROTH | S_IWOTH, S_IFCHR}, ++ {MKDEV(UNIX98_PTY_SLAVE_MAJOR, 0), S_IROTH | S_IWOTH, S_IFCHR}, ++ {MKDEV(PTY_MASTER_MAJOR, 0), S_IROTH | S_IWOTH, S_IFCHR}, ++ {MKDEV(PTY_SLAVE_MAJOR, 0), S_IROTH | S_IWOTH, S_IFCHR}, ++}; ++static struct devperms_struct default_minor_perms[] = { ++ {MKDEV(MEM_MAJOR, 3), S_IROTH | S_IWOTH, S_IFCHR}, /* null */ ++ {MKDEV(MEM_MAJOR, 5), S_IROTH | S_IWOTH, S_IFCHR}, /* zero */ ++ {MKDEV(MEM_MAJOR, 7), S_IROTH | S_IWOTH, S_IFCHR}, /* full */ ++ {MKDEV(TTYAUX_MAJOR, 0), S_IROTH | S_IWOTH, S_IFCHR},/* tty */ ++ {MKDEV(TTYAUX_MAJOR, 2), S_IROTH | S_IWOTH, S_IFCHR},/* ptmx */ ++ {MKDEV(MEM_MAJOR, 8), S_IROTH, S_IFCHR}, /* random */ ++ {MKDEV(MEM_MAJOR, 9), S_IROTH, S_IFCHR}, /* urandom */ ++}; ++ ++static struct devperms_struct default_deny_perms = { ++ MKDEV(0, 0), 0, S_IFCHR ++}; ++ ++static inline struct devperms_struct *find_default_devperms(int type, ++ dev_t dev) ++{ ++ int i; ++ ++ /* XXX all defaults perms are S_IFCHR */ ++ if (type != S_IFCHR) ++ return &default_deny_perms; ++ ++ for (i = 0; ++ i < 
sizeof(default_minor_perms)/sizeof(struct devperms_struct); ++ i++) ++ if (MAJOR(dev) == MAJOR(default_minor_perms[i].dev) && ++ MINOR(dev) == MINOR(default_minor_perms[i].dev)) ++ return &default_minor_perms[i]; ++ for (i = 0; ++ i < sizeof(default_major_perms)/sizeof(struct devperms_struct); ++ i++) ++ if (MAJOR(dev) == MAJOR(default_major_perms[i].dev)) ++ return &default_major_perms[i]; ++ ++ return &default_deny_perms; ++} ++ ++#define DEVPERMS_HASH_SZ 512 ++struct devperms_struct *devperms_hash[DEVPERMS_HASH_SZ]; ++ ++#define devperms_hashfn(id,dev) \ ++ ( (id << 5) ^ (id >> 5) ^ (MAJOR(dev)) ^ MINOR(dev) ) & \ ++ (DEVPERMS_HASH_SZ - 1) ++ ++static inline void hash_devperms(struct devperms_struct *p) ++{ ++ struct devperms_struct **htable = ++ &devperms_hash[devperms_hashfn(p->veid,p->dev)]; ++ ++ if ((p->devhash_next = *htable) != NULL) ++ (*htable)->devhash_pprev = &p->devhash_next; ++ *htable = p; ++ p->devhash_pprev = htable; ++} ++ ++static inline void unhash_devperms(struct devperms_struct *p) ++{ ++ if (p->devhash_next) ++ p->devhash_next->devhash_pprev = p->devhash_pprev; ++ *p->devhash_pprev = p->devhash_next; ++} ++ ++static int __init init_devperms_hash(void) ++{ ++ write_lock_irq(&devperms_hash_guard); ++ memset(devperms_hash, 0, sizeof(devperms_hash)); ++ hash_devperms(original_perms); ++ hash_devperms(original_perms+1); ++ write_unlock_irq(&devperms_hash_guard); ++ return 0; ++} ++ ++static inline void fini_devperms_hash(void) ++{ ++} ++ ++static inline struct devperms_struct *find_devperms(envid_t veid, ++ int type, ++ dev_t dev) ++{ ++ struct devperms_struct *p, **htable = ++ &devperms_hash[devperms_hashfn(veid,dev)]; ++ ++ for (p = *htable; p && !(p->type==type && ++ MAJOR(dev)==MAJOR(p->dev) && ++ MINOR(dev)==MINOR(p->dev) && ++ p->veid==veid); ++ p = p->devhash_next) ++ ; ++ return p; ++} ++ ++ ++static void do_clean_devperms(envid_t veid) ++{ ++ int i; ++ struct devperms_struct* ve; ++ ++ write_lock_irq(&devperms_hash_guard); ++ for (i = 
0; i < DEVPERMS_HASH_SZ; i++) ++ for (ve = devperms_hash[i]; ve;) { ++ struct devperms_struct *next = ve->devhash_next; ++ if (ve->veid == veid) { ++ unhash_devperms(ve); ++ kfree(ve); ++ } ++ ++ ve = next; ++ } ++ write_unlock_irq(&devperms_hash_guard); ++} ++ ++/* ++ * Mode is a mask of ++ * FMODE_READ for read access (configurable by S_IROTH) ++ * FMODE_WRITE for write access (configurable by S_IWOTH) ++ * FMODE_QUOTACTL for quotactl access (configurable by S_IXGRP) ++ */ ++int real_get_device_perms_ve(int dev_type, dev_t dev, int access_mode) ++{ ++ struct devperms_struct *perms; ++ struct ve_struct *ve; ++ envid_t veid; ++ ++ perms = NULL; ++ ve = get_exec_env(); ++ veid = ve->veid; ++ ++ read_lock(&devperms_hash_guard); ++ ++ perms = find_devperms(veid, dev_type|VE_USE_MINOR, dev); ++ if (perms) ++ goto end; ++ ++ perms = find_devperms(veid, dev_type|VE_USE_MAJOR, MKDEV(MAJOR(dev),0)); ++ if (perms) ++ goto end; ++ ++ perms = find_devperms(veid, dev_type, MKDEV(0,0)); ++ if (perms) ++ goto end; ++ ++ perms = find_default_devperms(dev_type, dev); ++ ++end: ++ read_unlock(&devperms_hash_guard); ++ ++ access_mode = "\000\004\002\006\010\014\012\016"[access_mode]; ++ return perms ? ++ (((perms->mask & access_mode) == access_mode) ? 
0 : -EACCES) : ++ -ENODEV; ++} ++EXPORT_SYMBOL(real_get_device_perms_ve); ++ ++int do_setdevperms(envid_t veid, unsigned type, dev_t dev, unsigned mask) ++{ ++ struct devperms_struct *perms; ++ ++ write_lock_irq(&devperms_hash_guard); ++ perms = find_devperms(veid, type, dev); ++ if (!perms) { ++ struct devperms_struct *perms_new; ++ write_unlock_irq(&devperms_hash_guard); ++ ++ perms_new = kmalloc(sizeof(struct devperms_struct), GFP_KERNEL); ++ if (!perms_new) ++ return -ENOMEM; ++ ++ write_lock_irq(&devperms_hash_guard); ++ perms = find_devperms(veid, type, dev); ++ if (perms) { ++ kfree(perms_new); ++ perms_new = perms; ++ } ++ ++ switch (type & VE_USE_MASK) { ++ case 0: ++ dev = 0; ++ break; ++ case VE_USE_MAJOR: ++ dev = MKDEV(MAJOR(dev),0); ++ break; ++ } ++ ++ perms_new->veid = veid; ++ perms_new->dev = dev; ++ perms_new->type = type; ++ perms_new->mask = mask & S_IALLUGO; ++ hash_devperms(perms_new); ++ } else ++ perms->mask = mask & S_IALLUGO; ++ write_unlock_irq(&devperms_hash_guard); ++ return 0; ++} ++EXPORT_SYMBOL(do_setdevperms); ++ ++int real_setdevperms(envid_t veid, unsigned type, dev_t dev, unsigned mask) ++{ ++ struct ve_struct *ve; ++ int err; ++ ++ if (!capable(CAP_SETVEID) || veid == 0) ++ return -EPERM; ++ ++ if ((ve = get_ve_by_id(veid)) == NULL) ++ return -ESRCH; ++ ++ down_read(&ve->op_sem); ++ err = -ESRCH; ++ if (ve->is_running) ++ err = do_setdevperms(veid, type, dev, mask); ++ up_read(&ve->op_sem); ++ real_put_ve(ve); ++ return err; ++} ++ ++void real_update_load_avg_ve(void) ++{ ++ struct ve_struct *ve; ++ unsigned long nr_active; ++ ++ read_lock(&ve_list_guard); ++ for (ve = ve_list_head; ve != NULL; ve = ve->next) { ++ nr_active = nr_running_ve(ve) + nr_uninterruptible_ve(ve); ++ nr_active *= FIXED_1; ++ CALC_LOAD(ve->avenrun[0], EXP_1, nr_active); ++ CALC_LOAD(ve->avenrun[1], EXP_5, nr_active); ++ CALC_LOAD(ve->avenrun[2], EXP_15, nr_active); ++ } ++ read_unlock(&ve_list_guard); ++} ++ ++ 
++/********************************************************************** ++ ********************************************************************** ++ * ++ * FS-related helpers to VE start/stop ++ * ++ ********************************************************************** ++ **********************************************************************/ ++ ++/* ++ * DEVPTS needs a virtualization: each environment should see each own list of ++ * pseudo-terminals. ++ * To implement it we need to have separate devpts superblocks for each ++ * VE, and each VE should mount its own one. ++ * Thus, separate vfsmount structures are required. ++ * To minimize intrusion into vfsmount lookup code, separate file_system_type ++ * structures are created. ++ * ++ * In addition to this, patch fo character device itself is required, as file ++ * system itself is used only for MINOR/MAJOR lookup. ++ */ ++static int register_ve_fs_type(struct ve_struct *ve, ++ struct file_system_type *template, ++ struct file_system_type **p_fs_type, struct vfsmount **p_mnt) ++{ ++ struct vfsmount *mnt; ++ struct file_system_type *local_fs_type; ++ int ret; ++ ++ VZTRACE("register_ve_fs_type(\"%s\")\n", template->name); ++ ++ local_fs_type = kmalloc(sizeof(*local_fs_type) + sizeof(void *), ++ GFP_KERNEL); ++ if (local_fs_type == NULL) ++ return -ENOMEM; ++ ++ memset(local_fs_type, 0, sizeof(*local_fs_type)); ++ local_fs_type->name = template->name; ++ local_fs_type->fs_flags = template->fs_flags; ++ local_fs_type->get_sb = template->get_sb; ++ local_fs_type->kill_sb = template->kill_sb; ++ local_fs_type->owner = template->owner; ++ /* ++ * 1. we do not have refcounter on fstype ++ * 2. fstype holds reference to ve using get_ve()/put_ve(). 
++ * so we free fstype when freeing ve and we are sure it's ok to free it ++ */ ++ SET_VE_OWNER_FSTYPE(local_fs_type, ve); ++ get_filesystem(local_fs_type); /* get_ve() inside */ ++ ++ ret = register_filesystem(local_fs_type); /* does not get */ ++ if (ret) ++ goto reg_err; ++ ++ mnt = kern_mount(local_fs_type); ++ if (IS_ERR(mnt)) ++ goto mnt_err; ++ ++ /* Usage counters after succesful execution kern_mount: ++ * local_fs_type - +1 (get_fs_type,get_sb_single,put_filesystem) ++ * mnt - +1 == 1 (alloc_vfsmnt) ++ */ ++ ++ *p_fs_type = local_fs_type; ++ *p_mnt = mnt; ++ return 0; ++ ++mnt_err: ++ ret = PTR_ERR(mnt); ++ unregister_filesystem(local_fs_type); /* does not put */ ++ ++reg_err: ++ put_filesystem(local_fs_type); ++ kfree(local_fs_type); ++ printk(KERN_DEBUG ++ "register_ve_fs_type(\"%s\") err=%d\n", template->name, ret); ++ return ret; ++} ++ ++static void umount_ve_fs_type(struct file_system_type *local_fs_type) ++{ ++ struct vfsmount *mnt; ++ struct list_head *p, *q; ++ LIST_HEAD(kill); ++ LIST_HEAD(umount_list); ++ ++ down_write(&namespace_sem); ++ spin_lock(&vfsmount_lock); ++ list_for_each_safe(p, q, ¤t->namespace->list) { ++ mnt = list_entry(p, struct vfsmount, mnt_list); ++ if (mnt->mnt_sb->s_type != local_fs_type) ++ continue; ++ list_del(p); ++ list_add(p, &kill); ++ } ++ ++ while (!list_empty(&kill)) { ++ mnt = list_entry(kill.next, struct vfsmount, mnt_list); ++ umount_tree(mnt, 1, &umount_list); ++ } ++ spin_unlock(&vfsmount_lock); ++ up_write(&namespace_sem); ++ release_mounts(&umount_list); ++} ++ ++static void unregister_ve_fs_type(struct file_system_type *local_fs_type, ++ struct vfsmount *local_fs_mount) ++{ ++ if (local_fs_mount == NULL || ++ local_fs_type == NULL) { ++ if (local_fs_mount != NULL || ++ local_fs_type != NULL) ++ BUG(); ++ return; ++ } ++ ++ VZTRACE("unregister_ve_fs_type(\"%s\")\n", local_fs_type->name); ++ ++ unregister_filesystem(local_fs_type); ++ umount_ve_fs_type(local_fs_type); ++ kern_umount(local_fs_mount); /* alias 
to mntput, drop our ref */ ++ put_filesystem(local_fs_type); ++} ++ ++ ++/********************************************************************** ++ ********************************************************************** ++ * ++ * FS-related helpers to VE start/stop ++ * ++ ********************************************************************** ++ **********************************************************************/ ++ ++#ifdef CONFIG_SYSCTL ++static ctl_table ve_sysctl_tables[] = { ++ /* kernel */ ++ { ++ .ctl_name = CTL_KERN, ++ .procname = "kernel", ++ .mode = 0555, ++ .child = &ve_sysctl_tables[2], ++ }, ++ { .ctl_name = 0 }, ++ /* kernel/[vars] */ ++ { ++ .ctl_name = KERN_NODENAME, ++ .procname = "hostname", ++ .maxlen = 64, ++ .mode = 0644, ++ .proc_handler = &proc_doutsstring, ++ .strategy = &sysctl_string, ++ }, ++ { ++ .ctl_name = KERN_DOMAINNAME, ++ .procname = "domainname", ++ .maxlen = 64, ++ .mode = 0644, ++ .proc_handler = &proc_doutsstring, ++ .strategy = &sysctl_string, ++ }, ++ { ++ .ctl_name = KERN_SHMMAX, ++ .procname = "shmmax", ++ .maxlen = sizeof(size_t), ++ .mode = 0644, ++ .proc_handler = &proc_doulongvec_minmax, ++ }, ++ { ++ .ctl_name = KERN_SHMALL, ++ .procname = "shmall", ++ .maxlen = sizeof(size_t), ++ .mode = 0644, ++ .proc_handler = &proc_doulongvec_minmax, ++ }, ++ { ++ .ctl_name = KERN_SHMMNI, ++ .procname = "shmmni", ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { ++ .ctl_name = KERN_MSGMAX, ++ .procname = "msgmax", ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { ++ .ctl_name = KERN_MSGMNI, ++ .procname = "msgmni", ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { ++ .ctl_name = KERN_MSGMNB, ++ .procname = "msgmnb", ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, ++ { ++ .ctl_name = KERN_SEM, ++ .procname = "sem", ++ .maxlen = 4 * sizeof(int), ++ .mode = 0644, ++ .proc_handler = 
&proc_dointvec ++ }, ++ { .ctl_name = 0, } ++}; ++ ++static int register_ve_sysctltables(struct ve_struct *ve) ++{ ++ struct ctl_table_header *header; ++ ctl_table *root, *table; ++ ++ VZTRACE("register_ve_sysctltables\n"); ++ ++ root = clone_sysctl_template(ve_sysctl_tables, ++ sizeof(ve_sysctl_tables) / sizeof(ctl_table)); ++ if (root == NULL) ++ goto out; ++ ++ table = root->child; ++ table[0].data = &ve->utsname->nodename; ++ table[1].data = &ve->utsname->domainname; ++ table[2].data = &ve->_shm_ctlmax; ++ table[3].data = &ve->_shm_ctlall; ++ table[4].data = &ve->_shm_ctlmni; ++ table[5].data = &ve->_msg_ctlmax; ++ table[6].data = &ve->_msg_ctlmni; ++ table[7].data = &ve->_msg_ctlmnb; ++ table[8].data = &ve->_sem_ctls[0]; ++ ++ /* insert at head to override kern entries */ ++ header = register_sysctl_table(root, 1); ++ if (header == NULL) ++ goto out_free; ++ ++ ve->kern_header = header; ++ ve->kern_table = root; ++ return 0; ++ ++out_free: ++ free_sysctl_clone(root); ++out: ++ return -ENOMEM; ++} ++ ++static inline void unregister_ve_sysctltables(struct ve_struct *ve) ++{ ++ unregister_sysctl_table(ve->kern_header); ++} ++ ++static inline void free_ve_sysctltables(struct ve_struct *ve) ++{ ++ free_sysctl_clone(ve->kern_table); ++} ++#endif ++ ++ ++/********************************************************************** ++ ********************************************************************** ++ * ++ * VE start: subsystems ++ * ++ ********************************************************************** ++ **********************************************************************/ ++ ++extern struct new_utsname virt_utsname; ++ ++static int init_ve_utsname(struct ve_struct *ve) ++{ ++ ve->utsname = kmalloc(sizeof(*ve->utsname), GFP_KERNEL); ++ if (ve->utsname == NULL) ++ return -ENOMEM; ++ ++ down_read(&uts_sem); /* protect the source */ ++ memcpy(ve->utsname, &system_utsname, sizeof(*ve->utsname)); ++ memcpy(ve->utsname->release, virt_utsname.release, ++ 
sizeof(virt_utsname.release)); ++ up_read(&uts_sem); ++ ++ return 0; ++} ++ ++static void free_ve_utsname(struct ve_struct *ve) ++{ ++ kfree(ve->utsname); ++ ve->utsname = NULL; ++} ++ ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++#include <net/ip.h> ++#include <net/tcp.h> ++#include <net/udp.h> ++#include <net/icmp.h> ++ ++static int init_fini_ve_mibs(struct ve_struct *ve, int fini) ++{ ++ if (fini) ++ goto fini; ++ if (!(ve->_net_statistics[0] = alloc_percpu(struct linux_mib))) ++ goto out1; ++ if (!(ve->_net_statistics[1] = alloc_percpu(struct linux_mib))) ++ goto out2; ++ if (!(ve->_ip_statistics[0] = alloc_percpu(struct ipstats_mib))) ++ goto out3; ++ if (!(ve->_ip_statistics[1] = alloc_percpu(struct ipstats_mib))) ++ goto out4; ++ if (!(ve->_icmp_statistics[0] = alloc_percpu(struct icmp_mib))) ++ goto out5; ++ if (!(ve->_icmp_statistics[1] = alloc_percpu(struct icmp_mib))) ++ goto out6; ++ if (!(ve->_tcp_statistics[0] = alloc_percpu(struct tcp_mib))) ++ goto out7; ++ if (!(ve->_tcp_statistics[1] = alloc_percpu(struct tcp_mib))) ++ goto out8; ++ if (!(ve->_udp_statistics[0] = alloc_percpu(struct udp_mib))) ++ goto out9; ++ if (!(ve->_udp_statistics[1] = alloc_percpu(struct udp_mib))) ++ goto out10; ++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) ++ if (!(ve->_ipv6_statistics[0] = alloc_percpu(struct ipstats_mib))) ++ goto out11; ++ if (!(ve->_ipv6_statistics[1] = alloc_percpu(struct ipstats_mib))) ++ goto out12; ++ if (!(ve->_icmpv6_statistics[0] = alloc_percpu(struct icmpv6_mib))) ++ goto out13; ++ if (!(ve->_icmpv6_statistics[1] = alloc_percpu(struct icmpv6_mib))) ++ goto out14; ++ if (!(ve->_udp_stats_in6[0] = alloc_percpu(struct udp_mib))) ++ goto out15; ++ if (!(ve->_udp_stats_in6[1] = alloc_percpu(struct udp_mib))) ++ goto out16; ++#endif ++ return 0; ++fini: ++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) ++ free_percpu(ve->_udp_stats_in6[1]); ++out16: ++ free_percpu(ve->_udp_stats_in6[0]); ++out15: ++ 
free_percpu(ve->_icmpv6_statistics[1]); ++out14: ++ free_percpu(ve->_icmpv6_statistics[0]); ++out13: ++ free_percpu(ve->_ipv6_statistics[1]); ++out12: ++ free_percpu(ve->_ipv6_statistics[0]); ++out11: ++#endif ++ free_percpu(ve->_udp_statistics[1]); ++out10: ++ free_percpu(ve->_udp_statistics[0]); ++out9: ++ free_percpu(ve->_tcp_statistics[1]); ++out8: ++ free_percpu(ve->_tcp_statistics[0]); ++out7: ++ free_percpu(ve->_icmp_statistics[1]); ++out6: ++ free_percpu(ve->_icmp_statistics[0]); ++out5: ++ free_percpu(ve->_ip_statistics[1]); ++out4: ++ free_percpu(ve->_ip_statistics[0]); ++out3: ++ free_percpu(ve->_net_statistics[1]); ++out2: ++ free_percpu(ve->_net_statistics[0]); ++out1: ++ return -ENOMEM; ++} ++ ++static inline int init_ve_mibs(struct ve_struct *ve) ++{ ++ return init_fini_ve_mibs(ve, 0); ++} ++ ++static inline void fini_ve_mibs(struct ve_struct *ve) ++{ ++ (void)init_fini_ve_mibs(ve, 1); ++} ++ ++extern struct net_device templ_loopback_dev; ++static void veloop_setup(struct net_device *dev) ++{ ++ int padded; ++ padded = dev->padded; ++ memcpy(dev, &templ_loopback_dev, sizeof(struct net_device)); ++ dev->padded = padded; ++} ++ ++static int init_ve_netdev(void) ++{ ++ struct ve_struct *ve; ++ struct net_device_stats *stats; ++ int err; ++ ++ ve = get_exec_env(); ++ INIT_HLIST_HEAD(&ve->_net_dev_head); ++ ve->_net_dev_base = NULL; ++ ve->_net_dev_tail = &ve->_net_dev_base; ++ ++ ve->_loopback_dev = alloc_netdev(0, templ_loopback_dev.name, ++ veloop_setup); ++ if (ve->_loopback_dev == NULL) ++ return -ENOMEM; ++ if (loopback_dev.get_stats != NULL) { ++ stats = kmalloc(sizeof(struct net_device_stats), GFP_KERNEL); ++ if (stats != NULL) { ++ memset(stats, 0, sizeof(struct net_device_stats)); ++ ve->_loopback_dev->priv = stats; ++ ve->_loopback_dev->get_stats = loopback_dev.get_stats; ++ ve->_loopback_dev->destructor = loopback_dev.destructor; ++ } ++ } ++ err = register_netdev(ve->_loopback_dev); ++ if (err) { ++ if (ve->_loopback_dev->priv != NULL) ++ 
kfree(ve->_loopback_dev->priv); ++ free_netdev(ve->_loopback_dev); ++ } ++ return err; ++} ++ ++static void fini_ve_netdev(void) ++{ ++ struct ve_struct *ve; ++ struct net_device *dev; ++ ++ ve = get_exec_env(); ++ while (1) { ++ rtnl_lock(); ++ /* ++ * loopback is special, it can be referenced in fib's, ++ * so it must be freed the last. Doing so is ++ * sufficient to guarantee absence of such references. ++ */ ++ if (dev_base == ve->_loopback_dev) ++ dev = dev_base->next; ++ else ++ dev = dev_base; ++ if (dev == NULL) ++ break; ++ unregister_netdevice(dev); ++ rtnl_unlock(); ++ free_netdev(dev); ++ } ++ unregister_netdevice(ve->_loopback_dev); ++ rtnl_unlock(); ++ free_netdev(ve->_loopback_dev); ++ ve->_loopback_dev = NULL; ++} ++#else ++#define init_ve_mibs(ve) (0) ++#define fini_ve_mibs(ve) do { } while (0) ++#define init_ve_netdev() (0) ++#define fini_ve_netdev() do { } while (0) ++#endif ++ ++static int prepare_proc_root(struct ve_struct *ve) ++{ ++ struct proc_dir_entry *de; ++ ++ de = kmalloc(sizeof(struct proc_dir_entry) + 6, GFP_KERNEL); ++ if (de == NULL) ++ return -ENOMEM; ++ memset(de, 0, sizeof(struct proc_dir_entry)); ++ memcpy(de + 1, "/proc", 6); ++ de->name = (char *)(de + 1); ++ de->namelen = 5; ++ de->mode = S_IFDIR | S_IRUGO | S_IXUGO; ++ de->nlink = 2; ++ atomic_set(&de->count, 1); ++ ++ ve->proc_root = de; ++ return 0; ++} ++ ++#ifdef CONFIG_PROC_FS ++static int init_ve_proc(struct ve_struct *ve) ++{ ++ int err; ++ struct proc_dir_entry *de; ++ ++ err = prepare_proc_root(ve); ++ if (err) ++ goto out_root; ++ ++ err = register_ve_fs_type(ve, &proc_fs_type, ++ &ve->proc_fstype, &ve->proc_mnt); ++ if (err) ++ goto out_reg; ++ ++ /* create necessary /proc subdirs in VE local proc tree */ ++ err = -ENOMEM; ++ de = create_proc_entry("vz", S_IFDIR|S_IRUGO|S_IXUGO, NULL); ++ if (!de) ++ goto out_vz; ++ ++#ifdef CONFIG_VE_IPTABLES ++ proc_net = proc_mkdir("net", NULL); ++ if (!proc_net) ++ goto out_net; ++#endif ++ ++#if defined(CONFIG_IPV6) || 
defined(CONFIG_IPV6_MODULE) ++ if (ve_snmp_proc_init()) ++ goto out_snmp; ++#endif ++ ++ return 0; ++ ++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) ++out_snmp: ++ remove_proc_entry("net", NULL); ++#endif ++#ifdef CONFIG_VE_IPTABLES ++out_net: ++ remove_proc_entry("vz", NULL); ++#endif ++out_vz: ++ unregister_ve_fs_type(ve->proc_fstype, ve->proc_mnt); ++ ve->proc_mnt = NULL; ++out_reg: ++ /* proc_fstype and proc_root are freed in real_put_ve -> free_ve_proc */ ++ ; ++out_root: ++ return err; ++} ++ ++static void fini_ve_proc(struct ve_struct *ve) ++{ ++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) ++ ve_snmp_proc_fini(); ++#endif ++#ifdef CONFIG_VE_IPTABLES ++ remove_proc_entry("net", NULL); ++ proc_net = NULL; ++#endif ++ remove_proc_entry("vz", NULL); ++ unregister_ve_fs_type(ve->proc_fstype, ve->proc_mnt); ++ ve->proc_mnt = NULL; ++} ++ ++static void free_ve_proc(struct ve_struct *ve) ++{ ++ /* proc filesystem frees proc_dir_entries on remove_proc_entry() only, ++ so we check that everything was removed and not lost */ ++ if (ve->proc_root && ve->proc_root->subdir) { ++ struct proc_dir_entry *p = ve->proc_root; ++ printk(KERN_WARNING "VPS: %d: proc entry /proc", ve->veid); ++ while ((p = p->subdir) != NULL) ++ printk("/%s", p->name); ++ printk(" is not removed!\n"); ++ } ++ ++ kfree(ve->proc_root); ++ kfree(ve->proc_fstype); ++ ++ ve->proc_fstype = NULL; ++ ve->proc_root = NULL; ++} ++#else ++#define init_ve_proc(ve) (0) ++#define fini_ve_proc(ve) do { } while (0) ++#define free_ve_proc(ve) do { } while (0) ++#endif ++ ++#ifdef CONFIG_SYSCTL ++static int init_ve_sysctl(struct ve_struct *ve) ++{ ++ int err; ++ ++#ifdef CONFIG_PROC_FS ++ err = -ENOMEM; ++ ve->proc_sys_root = proc_mkdir("sys", 0); ++ if (ve->proc_sys_root == NULL) ++ goto out_proc; ++#endif ++ INIT_LIST_HEAD(&ve->sysctl_lh); ++ err = register_ve_sysctltables(ve); ++ if (err) ++ goto out_reg; ++ ++ err = devinet_sysctl_init(ve); ++ if (err) ++ goto out_dev; ++ ++#if 
defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) ++ err = addrconf_sysctl_init(ve); ++ if (err) ++ goto out_dev6; ++#endif ++ ++ return 0; ++ ++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) ++out_dev6: ++ devinet_sysctl_fini(ve); ++#endif ++out_dev: ++ unregister_ve_sysctltables(ve); ++ free_ve_sysctltables(ve); ++out_reg: ++#ifdef CONFIG_PROC_FS ++ remove_proc_entry("sys", NULL); ++out_proc: ++#endif ++ return err; ++} ++ ++static void fini_ve_sysctl(struct ve_struct *ve) ++{ ++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) ++ addrconf_sysctl_fini(ve); ++#endif ++ devinet_sysctl_fini(ve); ++ unregister_ve_sysctltables(ve); ++ remove_proc_entry("sys", NULL); ++} ++ ++static void free_ve_sysctl(struct ve_struct *ve) ++{ ++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) ++ addrconf_sysctl_free(ve); ++#endif ++ devinet_sysctl_free(ve); ++ free_ve_sysctltables(ve); ++} ++#else ++#define init_ve_sysctl(ve) (0) ++#define fini_ve_sysctl(ve) do { } while (0) ++#define free_ve_sysctl(ve) do { } while (0) ++#endif ++ ++#ifdef CONFIG_UNIX98_PTYS ++#include <linux/devpts_fs.h> ++ ++static int init_ve_devpts(struct ve_struct *ve) ++{ ++ int err; ++ ++ err = -ENOMEM; ++ ve->devpts_config = kmalloc(sizeof(struct devpts_config), GFP_KERNEL); ++ if (ve->devpts_config == NULL) ++ goto out; ++ memset(ve->devpts_config, 0, sizeof(struct devpts_config)); ++ ve->devpts_config->mode = 0600; ++ err = register_ve_fs_type(ve, &devpts_fs_type, ++ &ve->devpts_fstype, &ve->devpts_mnt); ++ if (err) { ++ kfree(ve->devpts_config); ++ ve->devpts_config = NULL; ++ } ++out: ++ return err; ++} ++ ++static void fini_ve_devpts(struct ve_struct *ve) ++{ ++ unregister_ve_fs_type(ve->devpts_fstype, ve->devpts_mnt); ++ /* devpts_fstype is freed in real_put_ve -> free_ve_filesystems */ ++ ve->devpts_mnt = NULL; ++ kfree(ve->devpts_config); ++ ve->devpts_config = NULL; ++} ++#else ++#define init_ve_devpts(ve) (0) ++#define fini_ve_devpts(ve) do { } while (0) ++#endif ++ ++static 
int init_ve_shmem(struct ve_struct *ve) ++{ ++ return register_ve_fs_type(ve, ++ &tmpfs_fs_type, ++ &ve->shmem_fstype, ++ &ve->shmem_mnt); ++} ++ ++static void fini_ve_shmem(struct ve_struct *ve) ++{ ++ unregister_ve_fs_type(ve->shmem_fstype, ve->shmem_mnt); ++ /* shmem_fstype is freed in real_put_ve -> free_ve_filesystems */ ++ ve->shmem_mnt = NULL; ++} ++ ++static inline int init_ve_sysfs_root(struct ve_struct *ve) ++{ ++ struct sysfs_dirent *sysfs_root; ++ ++ sysfs_root = kmalloc(sizeof(struct sysfs_dirent), GFP_KERNEL); ++ if (sysfs_root == NULL) ++ return -ENOMEM; ++ ++ memset(sysfs_root, 0, sizeof(struct sysfs_dirent)); ++ INIT_LIST_HEAD(&sysfs_root->s_sibling); ++ INIT_LIST_HEAD(&sysfs_root->s_children); ++ sysfs_root->s_type = SYSFS_ROOT; ++ ve->sysfs_root = sysfs_root; ++ return 0; ++} ++ ++static int init_ve_sysfs(struct ve_struct *ve) ++{ ++ struct subsystem *subsys; ++ struct class *nc; ++ int err; ++ extern struct subsystem class_obj_subsys; ++ extern struct subsystem class_subsys; ++ extern struct class net_class; ++ ++#ifdef CONFIG_SYSFS ++ err = 0; ++ if (ve->features & VE_FEATURE_SYSFS) { ++ err = init_ve_sysfs_root(ve); ++ if (err != 0) ++ goto out; ++ err = register_ve_fs_type(ve, ++ &sysfs_fs_type, ++ &ve->sysfs_fstype, ++ &ve->sysfs_mnt); ++ } ++ if (err != 0) ++ goto out_fs_type; ++#endif ++ err = -ENOMEM; ++ subsys = kmalloc(sizeof(*subsys), GFP_KERNEL); ++ if (subsys == NULL) ++ goto out_class_obj; ++ /* ick, this is ugly, the things we go through to keep from showing up ++ * in sysfs... 
*/ ++ memset(subsys, 0, sizeof(*subsys)); ++ memcpy(&subsys->kset.kobj.name, &class_obj_subsys.kset.kobj.name, ++ sizeof(subsys->kset.kobj.name)); ++ subsys->kset.ktype = class_obj_subsys.kset.ktype; ++ subsys->kset.uevent_ops = class_obj_subsys.kset.uevent_ops; ++ subsystem_init(subsys); ++ if (!subsys->kset.subsys) ++ subsys->kset.subsys = subsys; ++ ve->class_obj_subsys = subsys; ++ ++ err = -ENOMEM; ++ subsys = kmalloc(sizeof(*subsys), GFP_KERNEL); ++ if (subsys == NULL) ++ goto out_class_subsys; ++ /* ick, this is ugly, the things we go through to keep from showing up ++ * in sysfs... */ ++ memset(subsys, 0, sizeof(*subsys)); ++ memcpy(&subsys->kset.kobj.name, &class_subsys.kset.kobj.name, ++ sizeof(subsys->kset.kobj.name)); ++ subsys->kset.ktype = class_subsys.kset.ktype; ++ subsys->kset.uevent_ops = class_subsys.kset.uevent_ops; ++ ve->class_subsys = subsys; ++ err = subsystem_register(subsys); ++ if (err != 0) ++ goto out_register; ++ ++ err = -ENOMEM; ++ nc = kmalloc(sizeof(*nc), GFP_KERNEL); ++ if (nc == NULL) ++ goto out_nc; ++ memset(nc, 0, sizeof(*nc)); ++ nc->name = net_class.name; ++ nc->release = net_class.release; ++ nc->uevent = net_class.uevent; ++ err = class_register(nc); ++ if (err != 0) ++ goto out_class_register; ++ ve->net_class = nc; ++ ++ return err; ++ ++out_class_register: ++ kfree(nc); ++out_nc: ++ subsystem_unregister(subsys); ++out_register: ++ kfree(ve->class_subsys); ++out_class_subsys: ++ kfree(ve->class_obj_subsys); ++out_class_obj: ++#ifdef CONFIG_SYSFS ++ unregister_ve_fs_type(ve->sysfs_fstype, ve->sysfs_mnt); ++ /* sysfs_fstype is freed in real_put_ve -> free_ve_filesystems */ ++out_fs_type: ++ kfree(ve->sysfs_root); ++ ve->sysfs_root = NULL; ++#endif ++ ve->class_subsys = NULL; ++ ve->class_obj_subsys = NULL; ++out: ++ return err; ++} ++ ++static void fini_ve_sysfs(struct ve_struct *ve) ++{ ++ class_unregister(ve->net_class); ++ subsystem_unregister(ve->class_subsys); ++ ++ kfree(ve->net_class); ++ kfree(ve->class_subsys); ++ 
kfree(ve->class_obj_subsys); ++ ++ ve->net_class = NULL; ++ ve->class_subsys = NULL; ++ ve->class_obj_subsys = NULL; ++#ifdef CONFIG_SYSFS ++ unregister_ve_fs_type(ve->sysfs_fstype, ve->sysfs_mnt); ++ ve->sysfs_mnt = NULL; ++ kfree(ve->sysfs_root); ++ ve->sysfs_root = NULL; ++ /* sysfs_fstype is freed in real_put_ve -> free_ve_filesystems */ ++#endif ++} ++ ++static void free_ve_filesystems(struct ve_struct *ve) ++{ ++#ifdef CONFIG_SYSFS ++ kfree(ve->sysfs_fstype); ++ ve->sysfs_fstype = NULL; ++#endif ++ kfree(ve->shmem_fstype); ++ ve->shmem_fstype = NULL; ++ ++ kfree(ve->devpts_fstype); ++ ve->devpts_fstype = NULL; ++ ++ free_ve_proc(ve); ++} ++ ++static int init_printk(struct ve_struct *ve) ++{ ++ struct ve_prep_printk { ++ wait_queue_head_t log_wait; ++ unsigned long log_start; ++ unsigned long log_end; ++ unsigned long logged_chars; ++ } *tmp; ++ ++ tmp = kmalloc(sizeof(struct ve_prep_printk), GFP_KERNEL); ++ if (!tmp) ++ return -ENOMEM; ++ memset(tmp, 0, sizeof(struct ve_prep_printk)); ++ init_waitqueue_head(&tmp->log_wait); ++ ve->_log_wait = &tmp->log_wait; ++ ve->_log_start = &tmp->log_start; ++ ve->_log_end = &tmp->log_end; ++ ve->_logged_chars = &tmp->logged_chars; ++ /* ve->log_buf will be initialized later by ve_log_init() */ ++ return 0; ++} ++ ++static void fini_printk(struct ve_struct *ve) ++{ ++ /* ++ * there is no spinlock protection here because nobody can use ++ * log_buf at the moments when this code is called. ++ */ ++ kfree(ve->log_buf); ++ kfree(ve->_log_wait); ++} ++ ++static void fini_venet(struct ve_struct *ve) ++{ ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++ tcp_v4_kill_ve_sockets(ve); ++#endif ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++ ve_mapped_devs_cleanup(ve); ++#endif ++} ++ ++static int init_ve_sched(struct ve_struct *ve) ++{ ++#ifdef CONFIG_FAIRSCHED ++ int err; ++ ++ /* ++ * We refuse to switch to an already existing node since nodes ++ * keep a pointer to their ve_struct... 
++ */ ++ err = sys_fairsched_mknod(0, 1, ve->veid); ++ if (err < 0) { ++ printk(KERN_WARNING "Can't create fairsched node %d\n", ++ ve->veid); ++ return err; ++ } ++ err = sys_fairsched_mvpr(current->pid, ve->veid); ++ if (err) { ++ printk(KERN_WARNING "Can't switch to fairsched node %d\n", ++ ve->veid); ++ if (sys_fairsched_rmnod(ve->veid)) ++ printk(KERN_ERR "Can't clean fairsched node %d\n", ++ ve->veid); ++ return err; ++ } ++#endif ++ ve_sched_attach(ve); ++ return 0; ++} ++ ++static void fini_ve_sched(struct ve_struct *ve) ++{ ++#ifdef CONFIG_FAIRSCHED ++ if (task_vsched_id(current) == ve->veid) ++ if (sys_fairsched_mvpr(current->pid, fairsched_init_node.id)) ++ printk(KERN_WARNING "Can't leave fairsched node %d\n", ++ ve->veid); ++ if (sys_fairsched_rmnod(ve->veid)) ++ printk(KERN_ERR "Can't remove fairsched node %d\n", ++ ve->veid); ++#endif ++} ++ ++static int init_ve_struct(struct ve_struct *ve, envid_t veid, ++ u32 class_id, env_create_param_t *data, ++ struct task_struct *init_tsk) ++{ ++ int n; ++ ++ memset(ve, 0, sizeof(*ve)); ++ (void)get_ve(ve); ++ ve->veid = veid; ++ ve->class_id = class_id; ++ ve->init_entry = init_tsk; ++ ve->features = data->feature_mask; ++ INIT_LIST_HEAD(&ve->vetask_lh); ++ init_rwsem(&ve->op_sem); ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++ ve->ifindex = -1; ++#endif ++ ++ for(n = 0; n < UIDHASH_SZ_VE; ++n) ++ INIT_LIST_HEAD(&ve->uidhash_table[n]); ++ ++ do_posix_clock_monotonic_gettime(&ve->start_timespec); ++ ve->start_jiffies = jiffies; ++ ve->start_cycles = get_cycles(); ++ ve->virt_pids = glob_virt_pids; ++ ++ return 0; ++} ++ ++/********************************************************************** ++ ********************************************************************** ++ * ++ * /proc/meminfo virtualization ++ * ++ ********************************************************************** ++ **********************************************************************/ ++static int 
ve_set_meminfo(envid_t veid, unsigned long val) ++{ ++ struct ve_struct *ve; ++ ++ ve = get_ve_by_id(veid); ++ if (!ve) ++ return -EINVAL; ++ ++ ve->meminfo_val = val; ++ real_put_ve(ve); ++ return 0; ++} ++ ++static int init_ve_meminfo(struct ve_struct *ve) ++{ ++ ve->meminfo_val = 0; ++ return 0; ++} ++ ++static inline void fini_ve_meminfo(struct ve_struct *ve) ++{ ++} ++ ++static void set_ve_root(struct ve_struct *ve, struct task_struct *tsk) ++{ ++ read_lock(&tsk->fs->lock); ++ ve->fs_rootmnt = tsk->fs->rootmnt; ++ ve->fs_root = tsk->fs->root; ++ read_unlock(&tsk->fs->lock); ++ mark_tree_virtual(ve->fs_rootmnt, ve->fs_root); ++} ++ ++static void set_ve_caps(struct ve_struct *ve, struct task_struct *tsk) ++{ ++ /* required for real_setdevperms from register_ve_<fs> above */ ++ memcpy(&ve->cap_default, &tsk->cap_effective, sizeof(kernel_cap_t)); ++ cap_lower(ve->cap_default, CAP_SETVEID); ++} ++ ++static int ve_list_add(struct ve_struct *ve) ++{ ++ write_lock_irq(&ve_list_guard); ++ if (__find_ve_by_id(ve->veid) != NULL) ++ goto err_exists; ++ ++ ve->prev = NULL; ++ ve->next = ve_list_head; ++ if (ve_list_head) ++ ve_list_head->prev = ve; ++ ve_list_head = ve; ++ nr_ve++; ++ write_unlock_irq(&ve_list_guard); ++ return 0; ++ ++err_exists: ++ write_unlock_irq(&ve_list_guard); ++ return -EEXIST; ++} ++ ++static void ve_list_del(struct ve_struct *ve) ++{ ++ write_lock_irq(&ve_list_guard); ++ if (ve->prev) ++ ve->prev->next = ve->next; ++ else ++ ve_list_head = ve->next; ++ if (ve->next) ++ ve->next->prev = ve->prev; ++ nr_ve--; ++ write_unlock_irq(&ve_list_guard); ++} ++ ++static void set_task_ve_caps(struct task_struct *tsk, struct ve_struct *ve) ++{ ++ spin_lock(&task_capability_lock); ++ cap_mask(tsk->cap_effective, ve->cap_default); ++ cap_mask(tsk->cap_inheritable, ve->cap_default); ++ cap_mask(tsk->cap_permitted, ve->cap_default); ++ spin_unlock(&task_capability_lock); ++} ++ ++static void move_task(struct task_struct *tsk, struct ve_struct *new, ++ struct 
ve_struct *old) ++{ ++ /* this probihibts ptracing of task entered to VPS from host system */ ++ tsk->mm->vps_dumpable = 0; ++ /* setup capabilities before enter */ ++ set_task_ve_caps(tsk, new); ++ ++ write_lock_irq(&tasklist_lock); ++ VE_TASK_INFO(tsk)->owner_env = new; ++ VE_TASK_INFO(tsk)->exec_env = new; ++ REMOVE_VE_LINKS(tsk); ++ SET_VE_LINKS(tsk); ++ ++ atomic_dec(&old->pcounter); ++ atomic_inc(&new->pcounter); ++ real_put_ve(old); ++ get_ve(new); ++ write_unlock_irq(&tasklist_lock); ++} ++ ++#ifdef CONFIG_VE_IPTABLES ++extern int init_netfilter(void); ++extern void fini_netfilter(void); ++#define init_ve_netfilter() init_netfilter() ++#define fini_ve_netfilter() fini_netfilter() ++ ++#define KSYMIPTINIT(mask, ve, full_mask, mod, name, args) \ ++({ \ ++ int ret = 0; \ ++ if (VE_IPT_CMP(mask, full_mask) && \ ++ VE_IPT_CMP((ve)->_iptables_modules, \ ++ full_mask & ~(full_mask##_MOD))) { \ ++ ret = KSYMERRCALL(1, mod, name, args); \ ++ if (ret == 0) \ ++ (ve)->_iptables_modules |= \ ++ full_mask##_MOD; \ ++ if (ret == 1) \ ++ ret = 0; \ ++ } \ ++ ret; \ ++}) ++ ++#define KSYMIPTFINI(mask, full_mask, mod, name, args) \ ++({ \ ++ if (VE_IPT_CMP(mask, full_mask##_MOD)) \ ++ KSYMSAFECALL_VOID(mod, name, args); \ ++}) ++ ++ ++static int do_ve_iptables(struct ve_struct *ve, __u64 init_mask, ++ int init_or_cleanup) ++{ ++ int err; ++ ++ err = 0; ++ if (!init_or_cleanup) ++ goto cleanup; ++ ++ /* init part */ ++#if defined(CONFIG_NETFILTER_XTABLES) || \ ++ defined(CONFIG_NETFILTER_XTABLES_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_IPTABLES, ++ x_tables, init_xtables, ()); ++ if (err < 0) ++ goto err_xtables; ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_IPTABLES, ++ xt_tcpudp, init_xt_tcpudp, ()); ++ if (err < 0) ++ goto err_xt_tcpudp; ++#endif ++#if defined(CONFIG_IP_NF_IPTABLES) || \ ++ defined(CONFIG_IP_NF_IPTABLES_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_IPTABLES, ++ ip_tables, init_iptables, ()); ++ if (err < 0) ++ goto err_iptables; ++#endif ++#if 
defined(CONFIG_IP6_NF_IPTABLES) || \ ++ defined(CONFIG_IP6_NF_IPTABLES_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_IPTABLES, ++ ip6_tables, init_ip6tables, ()); ++ if (err < 0) ++ goto err_ip6tables; ++#endif ++#if defined(CONFIG_IP_NF_CONNTRACK) || \ ++ defined(CONFIG_IP_NF_CONNTRACK_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_CONNTRACK, ++ ip_conntrack, init_iptable_conntrack, ()); ++ if (err < 0) ++ goto err_iptable_conntrack; ++#endif ++#if defined(CONFIG_IP_NF_FTP) || \ ++ defined(CONFIG_IP_NF_FTP_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_CONNTRACK_FTP, ++ ip_conntrack_ftp, init_iptable_ftp, ()); ++ if (err < 0) ++ goto err_iptable_ftp; ++#endif ++#if defined(CONFIG_IP_NF_IRC) || \ ++ defined(CONFIG_IP_NF_IRC_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_CONNTRACK_IRC, ++ ip_conntrack_irc, init_iptable_irc, ()); ++ if (err < 0) ++ goto err_iptable_irc; ++#endif ++#if defined(CONFIG_NETFILTER_XT_MATCH_CONNTRACK) || \ ++ defined(CONFIG_NETFILTER_XT_MATCH_CONNTRACK_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_MATCH_CONNTRACK, ++ xt_conntrack, init_xt_conntrack_match, ()); ++ if (err < 0) ++ goto err_xt_conntrack_match; ++#endif ++#if defined(CONFIG_NETFILTER_XT_MATCH_STATE) || \ ++ defined(CONFIG_NETFILTER_XT_MATCH_STATE_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_MATCH_STATE, ++ xt_state, init_xt_state, ()); ++ if (err < 0) ++ goto err_xt_state; ++#endif ++#if defined(CONFIG_NETFILTER_XT_MATCH_HELPER) || \ ++ defined(CONFIG_NETFILTER_XT_MATCH_HELPER_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_MATCH_HELPER, ++ xt_helper, init_xt_helper, ()); ++ if (err < 0) ++ goto err_xt_helper; ++#endif ++#if defined(CONFIG_IP_NF_NAT) || \ ++ defined(CONFIG_IP_NF_NAT_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_NAT, ++ ip_nat, ip_nat_init, ()); ++ if (err < 0) ++ goto err_iptable_nat; ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_NAT, ++ iptable_nat, init_iptable_nat, ()); ++ if (err < 0) ++ goto err_iptable_nat2; ++#endif ++#if 
defined(CONFIG_IP_NF_NAT_FTP) || \ ++ defined(CONFIG_IP_NF_NAT_FTP_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_NAT_FTP, ++ ip_nat_ftp, init_iptable_nat_ftp, ()); ++ if (err < 0) ++ goto err_iptable_nat_ftp; ++#endif ++#if defined(CONFIG_IP_NF_NAT_IRC) || \ ++ defined(CONFIG_IP_NF_NAT_IRC_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_NAT_IRC, ++ ip_nat_irc, init_iptable_nat_irc, ()); ++ if (err < 0) ++ goto err_iptable_nat_irc; ++#endif ++#if defined(CONFIG_IP_NF_FILTER) || \ ++ defined(CONFIG_IP_NF_FILTER_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_FILTER, ++ iptable_filter, init_iptable_filter, ()); ++ if (err < 0) ++ goto err_iptable_filter; ++#endif ++#if defined(CONFIG_IP6_NF_FILTER) || \ ++ defined(CONFIG_IP6_NF_FILTER_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_FILTER, ++ ip6table_filter, init_ip6table_filter, ()); ++ if (err < 0) ++ goto err_ip6table_filter; ++#endif ++#if defined(CONFIG_IP_NF_MANGLE) || \ ++ defined(CONFIG_IP_NF_MANGLE_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_MANGLE, ++ iptable_mangle, init_iptable_mangle, ()); ++ if (err < 0) ++ goto err_iptable_mangle; ++#endif ++#if defined(CONFIG_IP6_NF_MANGLE) || \ ++ defined(CONFIG_IP6_NF_MANGLE_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_MANGLE, ++ ip6table_mangle, init_ip6table_mangle, ()); ++ if (err < 0) ++ goto err_ip6table_mangle; ++#endif ++#if defined(CONFIG_NETFILTER_XT_MATCH_LIMIT) || \ ++ defined(CONFIG_NETFILTER_XT_MATCH_LIMIT_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_MATCH_LIMIT, ++ xt_limit, init_xt_limit, ()); ++ if (err < 0) ++ goto err_xt_limit; ++#endif ++#if defined(CONFIG_IP_NF_MATCH_MULTIPORT) || \ ++ defined(CONFIG_IP_NF_MATCH_MULTIPORT_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_MATCH_MULTIPORT, ++ ipt_multiport, init_iptable_multiport, ()); ++ if (err < 0) ++ goto err_iptable_multiport; ++#endif ++#if defined(CONFIG_IP6_NF_MATCH_MULTIPORT) || \ ++ defined(CONFIG_IP6_NF_MATCH_MULTIPORT_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, 
VE_IP_MATCH_MULTIPORT, ++ ip6t_multiport, init_ip6table_multiport, ()); ++ if (err < 0) ++ goto err_ip6table_multiport; ++#endif ++#if defined(CONFIG_IP_NF_MATCH_TOS) || \ ++ defined(CONFIG_IP_NF_MATCH_TOS_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_MATCH_TOS, ++ ipt_tos, init_iptable_tos, ()); ++ if (err < 0) ++ goto err_iptable_tos; ++#endif ++#if defined(CONFIG_IP_NF_TARGET_TOS) || \ ++ defined(CONFIG_IP_NF_TARGET_TOS_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_TARGET_TOS, ++ ipt_TOS, init_iptable_TOS, ()); ++ if (err < 0) ++ goto err_iptable_TOS; ++#endif ++#if defined(CONFIG_IP_NF_TARGET_REJECT) || \ ++ defined(CONFIG_IP_NF_TARGET_REJECT_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_TARGET_REJECT, ++ ipt_REJECT, init_iptable_REJECT, ()); ++ if (err < 0) ++ goto err_iptable_REJECT; ++#endif ++#if defined(CONFIG_IP6_NF_TARGET_REJECT) || \ ++ defined(CONFIG_IP6_NF_TARGET_REJECT_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_TARGET_REJECT, ++ ip6t_REJECT, init_ip6table_REJECT, ()); ++ if (err < 0) ++ goto err_ip6table_REJECT; ++#endif ++#if defined(CONFIG_IP_NF_TARGET_TCPMSS) || \ ++ defined(CONFIG_IP_NF_TARGET_TCPMSS_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_TARGET_TCPMSS, ++ ipt_TCPMSS, init_iptable_TCPMSS, ()); ++ if (err < 0) ++ goto err_iptable_TCPMSS; ++#endif ++#if defined(CONFIG_NETFILTER_XT_MATCH_TCPMSS) || \ ++ defined(CONFIG_NETFILTER_XT_MATCH_TCPMSS_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_MATCH_TCPMSS, ++ xt_tcpmss, init_xt_tcpmss, ()); ++ if (err < 0) ++ goto err_xt_tcpmss; ++#endif ++#if defined(CONFIG_IP_NF_MATCH_TTL) || \ ++ defined(CONFIG_IP_NF_MATCH_TTL_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_MATCH_TTL, ++ ipt_ttl, init_iptable_ttl, ()); ++ if (err < 0) ++ goto err_iptable_ttl; ++#endif ++#if defined(CONFIG_IP_NF_TARGET_LOG) || \ ++ defined(CONFIG_IP_NF_TARGET_LOG_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_TARGET_LOG, ++ ipt_LOG, init_iptable_LOG, ()); ++ if (err < 0) ++ goto err_iptable_LOG; 
++#endif ++#if defined(CONFIG_IP6_NF_TARGET_LOG) || \ ++ defined(CONFIG_IP6_NF_TARGET_LOG_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_TARGET_LOG, ++ ip6t_LOG, init_ip6table_LOG, ()); ++ if (err < 0) ++ goto err_ip6table_LOG; ++#endif ++#if defined(CONFIG_NETFILTER_XT_MATCH_LENGTH) || \ ++ defined(CONFIG_NETFILTER_XT_MATCH_LENGTH_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_MATCH_LENGTH, ++ xt_length, init_xt_length, ()); ++ if (err < 0) ++ goto err_xt_length; ++#endif ++#if defined(CONFIG_IP_NF_TARGET_REDIRECT) || \ ++ defined(CONFIG_IP_NF_TARGET_REDIRECT_MODULE) ++ err = KSYMIPTINIT(init_mask, ve, VE_IP_TARGET_REDIRECT, ++ ipt_REDIRECT, init_iptable_REDIRECT, ()); ++ if (err < 0) ++ goto err_iptable_REDIRECT; ++#endif ++ return 0; ++ ++/* ------------------------------------------------------------------------- */ ++ ++cleanup: ++#if defined(CONFIG_IP_NF_TARGET_REDIRECT) || \ ++ defined(CONFIG_IP_NF_TARGET_REDIRECT_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_TARGET_REDIRECT, ++ ipt_REDIRECT, fini_iptable_REDIRECT, ()); ++err_iptable_REDIRECT: ++#endif ++#if defined(CONFIG_NETFILTER_XT_MATCH_LENGTH) || \ ++ defined(CONFIG_NETFILTER_XT_MATCH_LENGTH_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_MATCH_LENGTH, ++ xt_length, fini_xt_length, ()); ++err_xt_length: ++#endif ++#if defined(CONFIG_IP6_NF_TARGET_LOG) || \ ++ defined(CONFIG_IP6_NF_TARGET_LOG_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_TARGET_LOG, ++ ip6t_LOG, fini_ip6table_LOG, ()); ++err_ip6table_LOG: ++#endif ++#if defined(CONFIG_IP_NF_TARGET_LOG) || \ ++ defined(CONFIG_IP_NF_TARGET_LOG_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_TARGET_LOG, ++ ipt_LOG, fini_iptable_LOG, ()); ++err_iptable_LOG: ++#endif ++#if defined(CONFIG_IP_NF_MATCH_TTL) || \ ++ defined(CONFIG_IP_NF_MATCH_TTL_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_MATCH_TTL, ++ ipt_ttl, fini_iptable_ttl, ()); ++err_iptable_ttl: ++#endif ++#if defined(CONFIG_NETFILTER_XT_MATCH_TCPMSS) || \ ++ 
defined(CONFIG_NETFILTER_XT_MATCH_TCPMSS_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_MATCH_TCPMSS, ++ xt_tcpmss, fini_xt_tcpmss, ()); ++err_xt_tcpmss: ++#endif ++#if defined(CONFIG_IP_NF_TARGET_TCPMSS) || \ ++ defined(CONFIG_IP_NF_TARGET_TCPMSS_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_TARGET_TCPMSS, ++ ipt_TCPMSS, fini_iptable_TCPMSS, ()); ++err_iptable_TCPMSS: ++#endif ++#if defined(CONFIG_IP6_NF_TARGET_REJECT) || \ ++ defined(CONFIG_IP6_NF_TARGET_REJECT_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_TARGET_REJECT, ++ ip6t_REJECT, fini_ip6table_REJECT, ()); ++err_ip6table_REJECT: ++#endif ++#if defined(CONFIG_IP_NF_TARGET_REJECT) || \ ++ defined(CONFIG_IP_NF_TARGET_REJECT_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_TARGET_REJECT, ++ ipt_REJECT, fini_iptable_REJECT, ()); ++err_iptable_REJECT: ++#endif ++#if defined(CONFIG_IP_NF_TARGET_TOS) || \ ++ defined(CONFIG_IP_NF_TARGET_TOS_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_TARGET_TOS, ++ ipt_TOS, fini_iptable_TOS, ()); ++err_iptable_TOS: ++#endif ++#if defined(CONFIG_IP_NF_MATCH_TOS) || \ ++ defined(CONFIG_IP_NF_MATCH_TOS_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_MATCH_TOS, ++ ipt_tos, fini_iptable_tos, ()); ++err_iptable_tos: ++#endif ++#if defined(CONFIG_IP6_NF_MATCH_MULTIPORT) || \ ++ defined(CONFIG_IP6_NF_MATCH_MULTIPORT_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_MATCH_MULTIPORT, ++ ip6t_multiport, fini_ip6table_multiport, ()); ++err_ip6table_multiport: ++#endif ++#if defined(CONFIG_IP_NF_MATCH_MULTIPORT) || \ ++ defined(CONFIG_IP_NF_MATCH_MULTIPORT_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_MATCH_MULTIPORT, ++ ipt_multiport, fini_iptable_multiport, ()); ++err_iptable_multiport: ++#endif ++#if defined(CONFIG_NETFILTER_XT_MATCH_LIMIT) || \ ++ defined(CONFIG_NETFILTER_XT_MATCH_LIMIT_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_MATCH_LIMIT, ++ xt_limit, fini_xt_limit, ()); ++err_xt_limit: ++#endif ++#if defined(CONFIG_IP6_NF_MANGLE) || \ ++ 
defined(CONFIG_IP6_NF_MANGLE_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_MANGLE, ++ ip6table_mangle, fini_ip6table_mangle, ()); ++err_ip6table_mangle: ++#endif ++#if defined(CONFIG_IP_NF_MANGLE) || \ ++ defined(CONFIG_IP_NF_MANGLE_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_MANGLE, ++ iptable_mangle, fini_iptable_mangle, ()); ++err_iptable_mangle: ++#endif ++#if defined(CONFIG_IP6_NF_FILTER) || \ ++ defined(CONFIG_IP6_NF_FILTER_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_FILTER, ++ ip6table_filter, fini_ip6table_filter, ()); ++err_ip6table_filter: ++#endif ++#if defined(CONFIG_IP_NF_FILTER) || \ ++ defined(CONFIG_IP_NF_FILTER_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_FILTER, ++ iptable_filter, fini_iptable_filter, ()); ++err_iptable_filter: ++#endif ++#if defined(CONFIG_IP_NF_NAT_IRC) || \ ++ defined(CONFIG_IP_NF_NAT_IRC_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_NAT_IRC, ++ ip_nat_irc, fini_iptable_nat_irc, ()); ++err_iptable_nat_irc: ++#endif ++#if defined(CONFIG_IP_NF_NAT_FTP) || \ ++ defined(CONFIG_IP_NF_NAT_FTP_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_NAT_FTP, ++ ip_nat_ftp, fini_iptable_nat_ftp, ()); ++err_iptable_nat_ftp: ++#endif ++#if defined(CONFIG_IP_NF_NAT) || \ ++ defined(CONFIG_IP_NF_NAT_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_NAT, ++ iptable_nat, fini_iptable_nat, ()); ++err_iptable_nat2: ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_NAT, ++ ip_nat, ip_nat_cleanup, ()); ++err_iptable_nat: ++#endif ++#if defined(CONFIG_NETFILTER_XT_MATCH_HELPER) || \ ++ defined(CONFIG_NETFILTER_XT_MATCH_HELPER_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_MATCH_HELPER, ++ xt_helper, fini_xt_helper, ()); ++err_xt_helper: ++#endif ++#if defined(CONFIG_NETFILTER_XT_MATCH_STATE) || \ ++ defined(CONFIG_NETFILTER_XT_MATCH_STATE_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_MATCH_STATE, ++ xt_state, fini_xt_state, ()); ++err_xt_state: ++#endif ++#if defined(CONFIG_NETFILTER_XT_MATCH_CONNTRACK) || \ 
++ defined(CONFIG_NETFILTER_XT_MATCH_CONNTRACK_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_MATCH_CONNTRACK, ++ xt_conntrack, fini_xt_conntrack_match, ()); ++err_xt_conntrack_match: ++#endif ++#if defined(CONFIG_IP_NF_IRC) || \ ++ defined(CONFIG_IP_NF_IRC_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_CONNTRACK_IRC, ++ ip_conntrack_irc, fini_iptable_irc, ()); ++err_iptable_irc: ++#endif ++#if defined(CONFIG_IP_NF_FTP) || \ ++ defined(CONFIG_IP_NF_FTP_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_CONNTRACK_FTP, ++ ip_conntrack_ftp, fini_iptable_ftp, ()); ++err_iptable_ftp: ++#endif ++#if defined(CONFIG_IP_NF_CONNTRACK) || \ ++ defined(CONFIG_IP_NF_CONNTRACK_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_CONNTRACK, ++ ip_conntrack, fini_iptable_conntrack, ()); ++err_iptable_conntrack: ++#endif ++#if defined(CONFIG_IP6_NF_IPTABLES) || \ ++ defined(CONFIG_IP6_NF_IPTABLES_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_IPTABLES, ++ ip6_tables, fini_ip6tables, ()); ++err_ip6tables: ++#endif ++#if defined(CONFIG_IP_NF_IPTABLES) || \ ++ defined(CONFIG_IP_NF_IPTABLES_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_IPTABLES, ++ ip_tables, fini_iptables, ()); ++err_iptables: ++#endif ++#if defined(CONFIG_NETFILTER_XTABLES) || \ ++ defined(CONFIG_NETFILTER_XTABLES_MODULE) ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_IPTABLES, ++ xt_tcpudp, fini_xt_tcpudp, ()); ++err_xt_tcpudp: ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_IPTABLES, ++ x_tables, fini_xtables, ()); ++err_xtables: ++#endif ++ ve->_iptables_modules = 0; ++ ++ return err; ++} ++ ++static inline int init_ve_iptables(struct ve_struct *ve, __u64 init_mask) ++{ ++ return do_ve_iptables(ve, init_mask, 1); ++} ++ ++static inline void fini_ve_iptables(struct ve_struct *ve, __u64 init_mask) ++{ ++ (void)do_ve_iptables(ve, init_mask, 0); ++} ++ ++static void flush_ve_iptables(struct ve_struct *ve) ++{ ++ /* ++ * flush all rule tables first, ++ * this helps us to avoid refs to freed objs ++ */ ++ 
KSYMIPTFINI(ve->_iptables_modules, VE_IP_MANGLE, ip_tables, ++ ipt_flush_table, (ve->_ipt_mangle_table)); ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_MANGLE, ip6_tables, ++ ip6t_flush_table, (ve->_ip6t_mangle_table)); ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_FILTER, ip_tables, ++ ipt_flush_table, (ve->_ve_ipt_filter_pf)); ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_FILTER, ip6_tables, ++ ip6t_flush_table, (ve->_ve_ip6t_filter_pf)); ++ KSYMIPTFINI(ve->_iptables_modules, VE_IP_NAT, ip_tables, ++ ipt_flush_table, (ve->_ip_conntrack->_ip_nat_table)); ++} ++#else ++#define init_ve_iptables(x, y) (0) ++#define fini_ve_iptables(x, y) do { } while (0) ++#define flush_ve_iptables(x) do { } while (0) ++#define init_ve_netfilter() (0) ++#define fini_ve_netfilter() do { } while (0) ++#endif ++ ++static struct list_head ve_hooks[VE_MAX_HOOKS]; ++static DECLARE_RWSEM(ve_hook_sem); ++ ++int ve_hook_register(struct ve_hook *vh) ++{ ++ struct list_head *lh; ++ struct ve_hook *tmp; ++ ++ down_write(&ve_hook_sem); ++ list_for_each(lh, &ve_hooks[vh->hooknum]) { ++ tmp = list_entry(lh, struct ve_hook, list); ++ if (vh->priority < tmp->priority) ++ break; ++ } ++ list_add_tail(&vh->list, lh); ++ up_write(&ve_hook_sem); ++ return 0; ++} ++EXPORT_SYMBOL(ve_hook_register); ++ ++void ve_hook_unregister(struct ve_hook *vh) ++{ ++ down_write(&ve_hook_sem); ++ list_del(&vh->list); ++ up_write(&ve_hook_sem); ++} ++EXPORT_SYMBOL(ve_hook_unregister); ++ ++static int ve_hook_iterate(unsigned int hooknum, void *data) ++{ ++ struct ve_hook *vh; ++ int err; ++ ++ err = 0; ++ down_read(&ve_hook_sem); ++ list_for_each_entry(vh, &ve_hooks[hooknum], list) { ++ if (!try_module_get(vh->owner)) ++ continue; ++ err = vh->hook(hooknum, data); ++ module_put(vh->owner); ++ if (err) ++ break; ++ } ++ ++ if (err) { ++ list_for_each_entry_continue_reverse(vh, ++ &ve_hooks[hooknum], list) { ++ if (!try_module_get(vh->owner)) ++ continue; ++ if (vh->undo) ++ vh->undo(hooknum, data); ++ module_put(vh->owner); ++ } 
++ } ++ up_read(&ve_hook_sem); ++ return err; ++} ++ ++static void ve_hook_iterate_cleanup(unsigned int hooknum, void *data) ++{ ++ struct ve_hook *vh; ++ ++ down_read(&ve_hook_sem); ++ list_for_each_entry_reverse(vh, &ve_hooks[hooknum], list) { ++ if (!try_module_get(vh->owner)) ++ continue; ++ (void)vh->hook(hooknum, data); ++ module_put(vh->owner); ++ } ++ up_read(&ve_hook_sem); ++} ++ ++static int do_env_create(envid_t veid, unsigned int flags, u32 class_id, ++ env_create_param_t *data, int datalen) ++{ ++ struct task_struct *tsk; ++ struct ve_struct *old; ++ struct ve_struct *old_exec; ++ struct ve_struct *ve; ++ __u64 init_mask; ++ int err; ++ ++ tsk = current; ++ old = VE_TASK_INFO(tsk)->owner_env; ++ ++ if (!thread_group_leader(tsk)) ++ return -EINVAL; ++ ++ if (tsk->signal->tty) { ++ printk("ERR: VE init has controlling terminal\n"); ++ return -EINVAL; ++ } ++ if (tsk->signal->pgrp != tsk->pid || tsk->signal->session != tsk->pid) { ++ int may_setsid; ++ read_lock(&tasklist_lock); ++ may_setsid = (find_pid(PIDTYPE_PGID, tsk->pid) == NULL); ++ read_unlock(&tasklist_lock); ++ if (!may_setsid) { ++ printk("ERR: VE init is process group leader\n"); ++ return -EINVAL; ++ } ++ } ++ ++ ++ VZTRACE("%s: veid=%d classid=%d pid=%d\n", ++ __FUNCTION__, veid, class_id, current->pid); ++ ++ err = -ENOMEM; ++ ve = kmalloc(sizeof(struct ve_struct), GFP_KERNEL); ++ if (ve == NULL) ++ goto err_struct; ++ ++ init_ve_struct(ve, veid, class_id, data, tsk); ++ __module_get(THIS_MODULE); ++ down_write(&ve->op_sem); ++ if (flags & VE_LOCK) ++ ve->is_locked = 1; ++ if ((err = ve_list_add(ve)) < 0) ++ goto err_exist; ++ ++ /* this should be done before context switching */ ++ if ((err = init_printk(ve)) < 0) ++ goto err_log_wait; ++ ++ old_exec = set_exec_env(ve); ++ ++ if ((err = init_ve_sched(ve)) < 0) ++ goto err_sched; ++ ++ /* move user to VE */ ++ if ((err = set_user(0, 0)) < 0) ++ goto err_set_user; ++ ++ set_ve_root(ve, tsk); ++ ++ if ((err = init_ve_utsname(ve))) ++ goto 
err_utsname; ++ ++ if ((err = init_ve_mibs(ve))) ++ goto err_mibs; ++ ++ if ((err = init_ve_proc(ve))) ++ goto err_proc; ++ ++ if ((err = init_ve_sysctl(ve))) ++ goto err_sysctl; ++ ++ if ((err = init_ve_sysfs(ve))) ++ goto err_sysfs; ++ ++ if ((err = ve_arp_init(ve)) < 0) ++ goto err_route; ++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) ++ if ((err = ve_ndisc_init(ve)) < 0) ++ goto err_route; ++#endif ++ ++ if ((err = init_ve_route(ve)) < 0) ++ goto err_route; ++ ++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) ++ if ((err = init_ve_route6(ve)) < 0) ++ goto err_route; ++#endif ++ ++ if ((err = init_ve_netdev())) ++ goto err_dev; ++ ++ if ((err = init_ve_tty_drivers(ve)) < 0) ++ goto err_tty; ++ ++ if ((err = init_ve_shmem(ve))) ++ goto err_shmem; ++ ++ if ((err = init_ve_devpts(ve))) ++ goto err_devpts; ++ ++ if((err = init_ve_meminfo(ve))) ++ goto err_meminf; ++ ++ /* init SYSV IPC variables */ ++ if ((err = init_ve_ipc(ve)) < 0) ++ goto err_ipc; ++ ++ set_ve_caps(ve, tsk); ++ ++ /* It is safe to initialize netfilter here as routing initialization and ++ interface setup will be done below. This means that NO skb can be ++ passed inside. Den */ ++ /* iptables ve initialization for non ve0; ++ ve0 init is in module_init */ ++ if ((err = init_ve_netfilter()) < 0) ++ goto err_netfilter; ++ ++ init_mask = data ? 
data->iptables_mask : VE_IP_DEFAULT; ++ if ((err = init_ve_iptables(ve, init_mask)) < 0) ++ goto err_iptables; ++ ++ if ((err = alloc_vpid(tsk->pid, 1)) < 0) ++ goto err_vpid; ++ ++ if ((err = ve_hook_iterate(VE_HOOK_INIT, (void *)ve)) < 0) ++ goto err_ve_hook; ++ ++ /* finally: set vpids and move inside */ ++ move_task(tsk, ve, old); ++ ++ set_virt_pid(tsk, 1); ++ set_virt_tgid(tsk, 1); ++ ++ set_special_pids(tsk->pid, tsk->pid); ++ current->signal->tty_old_pgrp = 0; ++ set_virt_pgid(tsk, 1); ++ set_virt_sid(tsk, 1); ++ ++ ve->is_running = 1; ++ up_write(&ve->op_sem); ++ ++ printk(KERN_INFO "VPS: %d: started\n", veid); ++ return veid; ++ ++err_ve_hook: ++ free_vpid(1, ve); ++err_vpid: ++ fini_venet(ve); ++ fini_ve_iptables(ve, init_mask); ++err_iptables: ++ fini_ve_netfilter(); ++err_netfilter: ++ fini_ve_ipc(ve); ++err_ipc: ++ fini_ve_meminfo(ve); ++err_meminf: ++ fini_ve_devpts(ve); ++err_devpts: ++ fini_ve_shmem(ve); ++err_shmem: ++ fini_ve_tty_drivers(ve); ++err_tty: ++ fini_ve_netdev(); ++err_dev: ++ fini_ve_route(ve); ++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) ++ fini_ve_route6(ve); ++#endif ++err_route: ++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) ++ ve_ndisc_fini(ve); ++#endif ++ ve_arp_fini(ve); ++ fini_ve_sysfs(ve); ++err_sysfs: ++ fini_ve_sysctl(ve); ++err_sysctl: ++ fini_ve_proc(ve); ++err_proc: ++ do_clean_devperms(ve->veid); /* register procfs adds devperms */ ++ fini_ve_mibs(ve); ++err_mibs: ++ /* free_ve_utsname() is called inside real_put_ve() */ ; ++err_utsname: ++ /* It is safe to restore current->envid here because ++ * ve_fairsched_detach does not use current->envid. */ ++ /* Really fairsched code uses current->envid in sys_fairsched_mknod ++ * only. It is correct if sys_fairsched_mknod is called from ++ * userspace. If sys_fairsched_mknod is called from ++ * ve_fairsched_attach, then node->envid and node->parent_node->envid ++ * are explicitly set to valid value after the call. 
*/ ++ /* FIXME */ ++ VE_TASK_INFO(tsk)->owner_env = old; ++ VE_TASK_INFO(tsk)->exec_env = old_exec; ++ /* move user back */ ++ if (set_user(0, 0) < 0) ++ printk(KERN_WARNING"Can't restore UID\n"); ++ ++err_set_user: ++ fini_ve_sched(ve); ++err_sched: ++ (void)set_exec_env(old_exec); ++ ++ /* we can jump here having incorrect envid */ ++ VE_TASK_INFO(tsk)->owner_env = old; ++ fini_printk(ve); ++err_log_wait: ++ ve_list_del(ve); ++ up_write(&ve->op_sem); ++ ++ real_put_ve(ve); ++err_struct: ++ printk(KERN_INFO "VPS: %d: failed to start with err=%d\n", veid, err); ++ return err; ++ ++err_exist: ++ kfree(ve); ++ goto err_struct; ++} ++ ++ ++/********************************************************************** ++ ********************************************************************** ++ * ++ * VE start/stop callbacks ++ * ++ ********************************************************************** ++ **********************************************************************/ ++ ++int real_env_create(envid_t veid, unsigned flags, u32 class_id, ++ env_create_param_t *data, int datalen) ++{ ++ int status; ++ struct ve_struct *ve; ++ ++ if (!flags) { ++ status = get_exec_env()->veid; ++ goto out; ++ } ++ ++ status = -EPERM; ++ if (!capable(CAP_SETVEID)) ++ goto out; ++ ++ status = -EINVAL; ++ if ((flags & VE_TEST) && (flags & (VE_ENTER|VE_CREATE))) ++ goto out; ++ ++ status = -EINVAL; ++ ve = get_ve_by_id(veid); ++ if (ve) { ++ if (flags & VE_TEST) { ++ status = 0; ++ goto out_put; ++ } ++ if (flags & VE_EXCLUSIVE) { ++ status = -EACCES; ++ goto out_put; ++ } ++ if (flags & VE_CREATE) { ++ flags &= ~VE_CREATE; ++ flags |= VE_ENTER; ++ } ++ } else { ++ if (flags & (VE_TEST|VE_ENTER)) { ++ status = -ESRCH; ++ goto out; ++ } ++ } ++ ++ if (flags & VE_CREATE) { ++ status = do_env_create(veid, flags, class_id, data, datalen); ++ goto out; ++ } else if (flags & VE_ENTER) ++ status = do_env_enter(ve, flags); ++ ++ /* else: returning EINVAL */ ++ ++out_put: ++ real_put_ve(ve); ++out: ++ 
return status; ++} ++ ++static int do_env_enter(struct ve_struct *ve, unsigned int flags) ++{ ++ struct task_struct *tsk = current; ++ int err; ++ ++ VZTRACE("%s: veid=%d\n", __FUNCTION__, ve->veid); ++ ++ err = -EBUSY; ++ down_read(&ve->op_sem); ++ if (!ve->is_running) ++ goto out_up; ++ if (ve->is_locked && !(flags & VE_SKIPLOCK)) ++ goto out_up; ++ ++#ifdef CONFIG_FAIRSCHED ++ err = sys_fairsched_mvpr(current->pid, ve->veid); ++ if (err) ++ goto out_up; ++#endif ++ ++ ve_sched_attach(ve); ++ move_task(current, ve, VE_TASK_INFO(tsk)->owner_env); ++ err = VE_TASK_INFO(tsk)->owner_env->veid; ++ ++out_up: ++ up_read(&ve->op_sem); ++ return err; ++} ++ ++static void env_cleanup(struct ve_struct *ve) ++{ ++ struct ve_struct *old_ve; ++ ++ VZTRACE("real_do_env_cleanup\n"); ++ ++ down_read(&ve->op_sem); ++ old_ve = set_exec_env(ve); ++ ++ ve_hook_iterate_cleanup(VE_HOOK_FINI, (void *)ve); ++ ++ fini_venet(ve); ++ ++ /* no new packets in flight beyond this point */ ++ synchronize_net(); ++ /* skb hold dst_entry, and in turn lies in the ip fragment queue */ ++ ip_fragment_cleanup(ve); ++ ++ fini_ve_netdev(); ++ fini_ve_route(ve); ++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) ++ fini_ve_route6(ve); ++#endif ++ ve_arp_fini(ve); ++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) ++ ve_ndisc_fini(ve); ++#endif ++ ++ /* kill iptables */ ++ /* No skb belonging to VE can exist at this point as unregister_netdev ++ is an operation awaiting until ALL skb's gone */ ++ flush_ve_iptables(ve); ++ fini_ve_iptables(ve, ve->_iptables_modules); ++ fini_ve_netfilter(); ++ ++ ve_ipc_cleanup(); ++ ++ fini_ve_sched(ve); ++ do_clean_devperms(ve->veid); ++ ++ fini_ve_devpts(ve); ++ fini_ve_shmem(ve); ++ fini_ve_sysfs(ve); ++ unregister_ve_tty_drivers(ve); ++ fini_ve_sysctl(ve); ++ fini_ve_proc(ve); ++ fini_ve_meminfo(ve); ++ ++ fini_ve_mibs(ve); ++ ++ (void)set_exec_env(old_ve); ++ fini_printk(ve); /* no printk can happen in ve context anymore */ ++ ++ ve_list_del(ve); ++ 
up_read(&ve->op_sem); ++ ++ real_put_ve(ve); ++} ++ ++static struct list_head ve_cleanup_list; ++static spinlock_t ve_cleanup_lock; ++ ++static DECLARE_COMPLETION(vzmond_complete); ++static struct task_struct *vzmond_thread; ++static volatile int stop_vzmond; ++ ++void real_do_env_cleanup(struct ve_struct *ve) ++{ ++ spin_lock(&ve_cleanup_lock); ++ list_add_tail(&ve->cleanup_list, &ve_cleanup_list); ++ spin_unlock(&ve_cleanup_lock); ++ wake_up_process(vzmond_thread); ++} ++ ++static void do_pending_env_cleanups(void) ++{ ++ struct ve_struct *ve; ++ ++ spin_lock(&ve_cleanup_lock); ++ while (1) { ++ if (list_empty(&ve_cleanup_list) || need_resched()) ++ break; ++ ve = list_entry(ve_cleanup_list.next, struct ve_struct, ++ cleanup_list); ++ list_del(&ve->cleanup_list); ++ spin_unlock(&ve_cleanup_lock); ++ env_cleanup(ve); ++ spin_lock(&ve_cleanup_lock); ++ } ++ spin_unlock(&ve_cleanup_lock); ++} ++ ++static int have_pending_cleanups(void) ++{ ++ return !list_empty(&ve_cleanup_list); ++} ++ ++static int vzmond(void *arg) ++{ ++ daemonize("vzmond"); ++ vzmond_thread = current; ++ set_current_state(TASK_INTERRUPTIBLE); ++ ++ while (!stop_vzmond) { ++ schedule(); ++ try_to_freeze(); ++ if (signal_pending(current)) ++ flush_signals(current); ++ ++ do_pending_env_cleanups(); ++ set_current_state(TASK_INTERRUPTIBLE); ++ if (have_pending_cleanups()) ++ __set_current_state(TASK_RUNNING); ++ } ++ ++ __set_task_state(current, TASK_RUNNING); ++ complete_and_exit(&vzmond_complete, 0); ++} ++ ++static int __init init_vzmond(void) ++{ ++ INIT_LIST_HEAD(&ve_cleanup_list); ++ spin_lock_init(&ve_cleanup_lock); ++ stop_vzmond = 0; ++ return kernel_thread(vzmond, NULL, 0); ++} ++ ++static void fini_vzmond(void) ++{ ++ stop_vzmond = 1; ++ wake_up_process(vzmond_thread); ++ wait_for_completion(&vzmond_complete); ++ WARN_ON(!list_empty(&ve_cleanup_list)); ++} ++ ++void real_do_env_free(struct ve_struct *ve) ++{ ++ VZTRACE("real_do_env_free\n"); ++ ++ ve_ipc_free(ve); /* free SYSV IPC 
resources */ ++ free_ve_tty_drivers(ve); ++ free_ve_utsname(ve); ++ free_ve_sysctl(ve); /* free per ve sysctl data */ ++ free_ve_filesystems(ve); ++ printk(KERN_INFO "VPS: %d: stopped\n", VEID(ve)); ++ kfree(ve); ++ ++ module_put(THIS_MODULE); ++} ++EXPORT_SYMBOL(real_do_env_free); ++ ++ ++/********************************************************************** ++ ********************************************************************** ++ * ++ * VE TTY handling ++ * ++ ********************************************************************** ++ **********************************************************************/ ++ ++DCL_VE_OWNER(TTYDRV, struct tty_driver, owner_env) ++ ++static struct tty_driver *alloc_ve_tty_driver(struct tty_driver *base, ++ struct ve_struct *ve) ++{ ++ size_t size; ++ struct tty_driver *driver; ++ ++ driver = kmalloc(sizeof(struct tty_driver), GFP_KERNEL); ++ if (!driver) ++ goto out; ++ ++ memcpy(driver, base, sizeof(struct tty_driver)); ++ ++ driver->driver_state = NULL; ++ ++ size = base->num * 3 * sizeof(void *); ++ if (!(driver->flags & TTY_DRIVER_DEVPTS_MEM)) { ++ void **p; ++ p = kmalloc(size, GFP_KERNEL); ++ if (!p) ++ goto out_free; ++ memset(p, 0, size); ++ driver->ttys = (struct tty_struct **)p; ++ driver->termios = (struct termios **)(p + driver->num); ++ driver->termios_locked = (struct termios **)(p + driver->num * 2); ++ } else { ++ driver->ttys = NULL; ++ driver->termios = NULL; ++ driver->termios_locked = NULL; ++ } ++ ++ SET_VE_OWNER_TTYDRV(driver, ve); ++ driver->flags |= TTY_DRIVER_INSTALLED; ++ ++ return driver; ++ ++out_free: ++ kfree(driver); ++out: ++ return NULL; ++} ++ ++static void free_ve_tty_driver(struct tty_driver *driver) ++{ ++ if (!driver) ++ return; ++ ++ clear_termios(driver); ++ kfree(driver->ttys); ++ kfree(driver); ++} ++ ++static int alloc_ve_tty_drivers(struct ve_struct* ve) ++{ ++#ifdef CONFIG_LEGACY_PTYS ++ /* Traditional BSD devices */ ++ ve->pty_driver = alloc_ve_tty_driver(pty_driver, ve); ++ if 
(!ve->pty_driver) ++ goto out_mem; ++ ++ ve->pty_slave_driver = alloc_ve_tty_driver(pty_slave_driver, ve); ++ if (!ve->pty_slave_driver) ++ goto out_mem; ++ ++ ve->pty_driver->other = ve->pty_slave_driver; ++ ve->pty_slave_driver->other = ve->pty_driver; ++#endif ++ ++#ifdef CONFIG_UNIX98_PTYS ++ ve->ptm_driver = alloc_ve_tty_driver(ptm_driver, ve); ++ if (!ve->ptm_driver) ++ goto out_mem; ++ ++ ve->pts_driver = alloc_ve_tty_driver(pts_driver, ve); ++ if (!ve->pts_driver) ++ goto out_mem; ++ ++ ve->ptm_driver->other = ve->pts_driver; ++ ve->pts_driver->other = ve->ptm_driver; ++ ++ ve->allocated_ptys = kmalloc(sizeof(*ve->allocated_ptys), GFP_KERNEL); ++ if (!ve->allocated_ptys) ++ goto out_mem; ++ idr_init(ve->allocated_ptys); ++#endif ++ return 0; ++ ++out_mem: ++ free_ve_tty_drivers(ve); ++ return -ENOMEM; ++} ++ ++static void free_ve_tty_drivers(struct ve_struct* ve) ++{ ++#ifdef CONFIG_LEGACY_PTYS ++ free_ve_tty_driver(ve->pty_driver); ++ free_ve_tty_driver(ve->pty_slave_driver); ++ ve->pty_driver = ve->pty_slave_driver = NULL; ++#endif ++#ifdef CONFIG_UNIX98_PTYS ++ free_ve_tty_driver(ve->ptm_driver); ++ free_ve_tty_driver(ve->pts_driver); ++ kfree(ve->allocated_ptys); ++ ve->ptm_driver = ve->pts_driver = NULL; ++ ve->allocated_ptys = NULL; ++#endif ++} ++ ++static inline void __register_tty_driver(struct tty_driver *driver) ++{ ++ list_add(&driver->tty_drivers, &tty_drivers); ++} ++ ++static inline void __unregister_tty_driver(struct tty_driver *driver) ++{ ++ if (!driver) ++ return; ++ list_del(&driver->tty_drivers); ++} ++ ++static int register_ve_tty_drivers(struct ve_struct* ve) ++{ ++ write_lock_irq(&tty_driver_guard); ++#ifdef CONFIG_UNIX98_PTYS ++ __register_tty_driver(ve->ptm_driver); ++ __register_tty_driver(ve->pts_driver); ++#endif ++#ifdef CONFIG_LEGACY_PTYS ++ __register_tty_driver(ve->pty_driver); ++ __register_tty_driver(ve->pty_slave_driver); ++#endif ++ write_unlock_irq(&tty_driver_guard); ++ ++ return 0; ++} ++ ++static void 
unregister_ve_tty_drivers(struct ve_struct* ve) ++{ ++ VZTRACE("unregister_ve_tty_drivers\n"); ++ ++ write_lock_irq(&tty_driver_guard); ++ __unregister_tty_driver(ve->pty_driver); ++ __unregister_tty_driver(ve->pty_slave_driver); ++#ifdef CONFIG_UNIX98_PTYS ++ __unregister_tty_driver(ve->ptm_driver); ++ __unregister_tty_driver(ve->pts_driver); ++#endif ++ write_unlock_irq(&tty_driver_guard); ++} ++ ++static int init_ve_tty_drivers(struct ve_struct *ve) ++{ ++ int err; ++ ++ if ((err = alloc_ve_tty_drivers(ve))) ++ goto err_ttyalloc; ++ if ((err = register_ve_tty_drivers(ve))) ++ goto err_ttyreg; ++ return 0; ++ ++err_ttyreg: ++ free_ve_tty_drivers(ve); ++err_ttyalloc: ++ return err; ++} ++ ++static void fini_ve_tty_drivers(struct ve_struct *ve) ++{ ++ unregister_ve_tty_drivers(ve); ++ free_ve_tty_drivers(ve); ++} ++ ++/* ++ * Free the termios and termios_locked structures because ++ * we don't want to get memory leaks when modular tty ++ * drivers are removed from the kernel. ++ */ ++static void clear_termios(struct tty_driver *driver) ++{ ++ int i; ++ struct termios *tp; ++ ++ if (driver->termios == NULL) ++ return; ++ for (i = 0; i < driver->num; i++) { ++ tp = driver->termios[i]; ++ if (tp) { ++ driver->termios[i] = NULL; ++ kfree(tp); ++ } ++ tp = driver->termios_locked[i]; ++ if (tp) { ++ driver->termios_locked[i] = NULL; ++ kfree(tp); ++ } ++ } ++} ++ ++ ++/********************************************************************** ++ ********************************************************************** ++ * ++ * Pieces of VE network ++ * ++ ********************************************************************** ++ **********************************************************************/ ++ ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++#include <asm/uaccess.h> ++#include <net/sock.h> ++#include <linux/netlink.h> ++#include <linux/rtnetlink.h> ++#include <net/route.h> ++#include <net/ip_fib.h> ++#endif ++ ++#if defined(CONFIG_VE_NETDEV) || 
defined(CONFIG_VE_NETDEV_MODULE) ++static void ve_del_ip_addrs(struct net_device *dev) ++{ ++ struct in_device *in_dev; ++ ++ in_dev = in_dev_get(dev); ++ if (in_dev == NULL) ++ return; ++ ++ while (in_dev->ifa_list != NULL) { ++ inet_del_ifa(in_dev, &in_dev->ifa_list, 1); ++ } ++ in_dev_put(in_dev); ++} ++ ++static void ve_del_ipv6_addrs(struct net_device *dev) ++{ ++#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) ++ addrconf_ifdown(dev, 2); ++#endif ++} ++ ++static int ve_netdev_cleanup(struct net_device *dev, int to_ve) ++{ ++ int err; ++ ++ err = 0; ++ ve_del_ip_addrs(dev); ++ ve_del_ipv6_addrs(dev); ++ if ((dev->flags & IFF_UP) != 0) ++ err = dev_close(dev); ++ synchronize_net(); ++ dev_shutdown(dev); ++ dev_mc_discard(dev); ++ free_divert_blk(dev); ++ synchronize_net(); ++ ++ if (to_ve) ++ dev->orig_mtu = dev->mtu; ++ else { ++ int rc = dev_set_mtu(dev, dev->orig_mtu); ++ if (err == 0) ++ err = rc; ++ } ++ ++ return err; ++} ++ ++static void __ve_dev_move(struct net_device *dev, struct ve_struct *ve_src, ++ struct ve_struct *ve_dst, struct user_beancounter *exec_ub) ++{ ++ struct net_device **dp, *d; ++ struct user_beancounter *ub; ++ ++ for (d = ve_src->_net_dev_base, dp = NULL; d != NULL; ++ dp = &d->next, d = d->next) { ++ if (d == dev) { ++ hlist_del(&dev->name_hlist); ++ hlist_del(&dev->index_hlist); ++ if (ve_src->_net_dev_tail == &dev->next) ++ ve_src->_net_dev_tail = dp; ++ if (dp) ++ *dp = dev->next; ++ dev->next = NULL; ++ break; ++ } ++ } ++ *ve_dst->_net_dev_tail = dev; ++ ve_dst->_net_dev_tail = &dev->next; ++ hlist_add_head(&dev->name_hlist, dev_name_hash(dev->name, ve_dst)); ++ hlist_add_head(&dev->index_hlist, dev_index_hash(dev->ifindex, ve_dst)); ++ dev->owner_env = ve_dst; ++ ++ ub = netdev_bc(dev)->exec_ub; ++ netdev_bc(dev)->exec_ub = get_beancounter(exec_ub); ++ put_beancounter(ub); ++} ++ ++static int ve_dev_add(envid_t veid, char *dev_name) ++{ ++ int err; ++ struct net_device *dev; ++ struct ve_struct *ve; ++ struct hlist_node 
*p; ++ ++ dev = NULL; ++ err = -ESRCH; ++ ++ ve = get_ve_by_id(veid); ++ if (ve == NULL) ++ goto out; ++ ++ rtnl_lock(); ++ ++ read_lock(&dev_base_lock); ++ hlist_for_each(p, dev_name_hash(dev_name, get_ve0())) { ++ struct net_device *d = hlist_entry(p, struct net_device, ++ name_hlist); ++ if (strncmp(d->name, dev_name, IFNAMSIZ) == 0) { ++ dev = d; ++ break; ++ } ++ } ++ read_unlock(&dev_base_lock); ++ if (dev == NULL) ++ goto out_unlock; ++ ++ err = -EPERM; ++ if (!ve_is_dev_movable(dev)) ++ goto out_unlock; ++ ++ err = -EINVAL; ++ if (dev->flags & (IFF_SLAVE|IFF_MASTER)) ++ goto out_unlock; ++ ++ ve_netdev_cleanup(dev, 1); ++ ++ write_lock_bh(&dev_base_lock); ++ __ve_dev_move(dev, get_ve0(), ve, get_exec_ub()); ++ write_unlock_bh(&dev_base_lock); ++ ++ err = 0; ++ ++out_unlock: ++ rtnl_unlock(); ++ real_put_ve(ve); ++ ++ if (dev == NULL) ++ printk(KERN_WARNING "Device %s not found\n", dev_name); ++ ++out: ++ return err; ++} ++ ++static int ve_dev_del(envid_t veid, char *dev_name) ++{ ++ int err; ++ struct net_device *dev; ++ struct ve_struct *ve, *old_exec; ++ struct hlist_node *p; ++ ++ dev = NULL; ++ err = -ESRCH; ++ ++ ve = get_ve_by_id(veid); ++ if (ve == NULL) ++ goto out; ++ ++ rtnl_lock(); ++ ++ read_lock(&dev_base_lock); ++ hlist_for_each(p, dev_name_hash(dev_name, ve)) { ++ struct net_device *d = hlist_entry(p, struct net_device, ++ name_hlist); ++ if (strncmp(d->name, dev_name, IFNAMSIZ) == 0) { ++ dev = d; ++ break; ++ } ++ } ++ read_unlock(&dev_base_lock); ++ if (dev == NULL) ++ goto out_unlock; ++ ++ err = -EPERM; ++ if (!ve_is_dev_movable(dev)) ++ goto out_unlock; ++ ++ old_exec = set_exec_env(ve); ++ ve_netdev_cleanup(dev, 0); ++ (void)set_exec_env(old_exec); ++ ++ write_lock_bh(&dev_base_lock); ++ __ve_dev_move(dev, ve, get_ve0(), netdev_bc(dev)->owner_ub); ++ write_unlock_bh(&dev_base_lock); ++ ++ err = 0; ++ ++out_unlock: ++ rtnl_unlock(); ++ real_put_ve(ve); ++ ++ if (dev == NULL) ++ printk(KERN_WARNING "Device %s not found\n", dev_name); ++ 
++out: ++ return err; ++} ++ ++int real_ve_dev_map(envid_t veid, int op, char *dev_name) ++{ ++ int err; ++ err = -EPERM; ++ if (!capable(CAP_SETVEID)) ++ goto out; ++ switch (op) ++ { ++ case VE_NETDEV_ADD: ++ err = ve_dev_add(veid, dev_name); ++ break; ++ case VE_NETDEV_DEL: ++ err = ve_dev_del(veid, dev_name); ++ break; ++ default: ++ err = -EINVAL; ++ break; ++ } ++out: ++ return err; ++} ++ ++static void ve_mapped_devs_cleanup(struct ve_struct *ve) ++{ ++ struct net_device *dev; ++ ++ rtnl_lock(); ++ write_lock_bh(&dev_base_lock); ++restart: ++ for (dev = ve->_net_dev_base; dev != NULL; dev = dev->next) ++ { ++ if ((dev->features & NETIF_F_VENET) || ++ (dev == ve->_loopback_dev)) /* Skip loopback dev */ ++ continue; ++ write_unlock_bh(&dev_base_lock); ++ ve_netdev_cleanup(dev, 0); ++ write_lock_bh(&dev_base_lock); ++ __ve_dev_move(dev, ve, get_ve0(), netdev_bc(dev)->owner_ub); ++ goto restart; ++ } ++ write_unlock_bh(&dev_base_lock); ++ rtnl_unlock(); ++} ++#endif ++ ++ ++/********************************************************************** ++ ********************************************************************** ++ * ++ * VE information via /proc ++ * ++ ********************************************************************** ++ **********************************************************************/ ++#ifdef CONFIG_PROC_FS ++static int devperms_seq_show(struct seq_file *m, void *v) ++{ ++ struct devperms_struct *dp; ++ char dev_s[32], type_c; ++ unsigned use, type; ++ dev_t dev; ++ ++ dp = (struct devperms_struct *)v; ++ if (dp == (struct devperms_struct *)1L) { ++ seq_printf(m, "Version: 2.7\n"); ++ return 0; ++ } ++ ++ use = dp->type & VE_USE_MASK; ++ type = dp->type & S_IFMT; ++ dev = dp->dev; ++ ++ if ((use | VE_USE_MINOR) == use) ++ snprintf(dev_s, sizeof(dev_s), "%d:%d", MAJOR(dev), MINOR(dev)); ++ else if ((use | VE_USE_MAJOR) == use) ++ snprintf(dev_s, sizeof(dev_s), "%d:*", MAJOR(dp->dev)); ++ else ++ snprintf(dev_s, sizeof(dev_s), "*:*"); ++ ++ if 
(type == S_IFCHR) ++ type_c = 'c'; ++ else if (type == S_IFBLK) ++ type_c = 'b'; ++ else ++ type_c = '?'; ++ ++ seq_printf(m, "%10u %c %03o %s\n", dp->veid, type_c, dp->mask, dev_s); ++ return 0; ++} ++ ++static void *devperms_seq_start(struct seq_file *m, loff_t *pos) ++{ ++ loff_t cpos; ++ long slot; ++ struct devperms_struct *dp; ++ ++ cpos = *pos; ++ read_lock(&devperms_hash_guard); ++ if (cpos-- == 0) ++ return (void *)1L; ++ ++ for (slot = 0; slot < DEVPERMS_HASH_SZ; slot++) ++ for (dp = devperms_hash[slot]; dp; dp = dp->devhash_next) ++ if (cpos-- == 0) { ++ m->private = (void *)slot; ++ return dp; ++ } ++ return NULL; ++} ++ ++static void *devperms_seq_next(struct seq_file *m, void *v, loff_t *pos) ++{ ++ long slot; ++ struct devperms_struct *dp; ++ ++ dp = (struct devperms_struct *)v; ++ ++ if (dp == (struct devperms_struct *)1L) ++ slot = 0; ++ else if (dp->devhash_next == NULL) ++ slot = (long)m->private + 1; ++ else { ++ (*pos)++; ++ return dp->devhash_next; ++ } ++ ++ for (; slot < DEVPERMS_HASH_SZ; slot++) ++ if (devperms_hash[slot]) { ++ (*pos)++; ++ m->private = (void *)slot; ++ return devperms_hash[slot]; ++ } ++ return NULL; ++} ++ ++static void devperms_seq_stop(struct seq_file *m, void *v) ++{ ++ read_unlock(&devperms_hash_guard); ++} ++ ++static struct seq_operations devperms_seq_op = { ++ .start = devperms_seq_start, ++ .next = devperms_seq_next, ++ .stop = devperms_seq_stop, ++ .show = devperms_seq_show, ++}; ++ ++static int devperms_open(struct inode *inode, struct file *file) ++{ ++ return seq_open(file, &devperms_seq_op); ++} ++ ++static struct file_operations proc_devperms_ops = { ++ .open = devperms_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; ++ ++#if BITS_PER_LONG == 32 ++#define VESTAT_LINE_WIDTH (6 * 11 + 6 * 21) ++#define VESTAT_LINE_FMT "%10u %10lu %10lu %10lu %10lu %20Lu %20Lu %20Lu %20Lu %20Lu %20Lu %10lu\n" ++#define VESTAT_HEAD_FMT "%10s %10s %10s %10s %10s %20s %20s %20s %20s %20s %20s 
%10s\n" ++#else ++#define VESTAT_LINE_WIDTH (12 * 21) ++#define VESTAT_LINE_FMT "%20u %20lu %20lu %20lu %20lu %20Lu %20Lu %20Lu %20Lu %20Lu %20Lu %20lu\n" ++#define VESTAT_HEAD_FMT "%20s %20s %20s %20s %20s %20s %20s %20s %20s %20s %20s %20s\n" ++#endif ++ ++static int vestat_seq_show(struct seq_file *m, void *v) ++{ ++ struct ve_struct *ve = (struct ve_struct *)v; ++ struct ve_struct *curve; ++ int cpu; ++ unsigned long user_ve, nice_ve, system_ve, uptime; ++ cycles_t uptime_cycles, idle_time, strv_time, used; ++ ++ curve = get_exec_env(); ++ if (ve == ve_list_head || ++ (!ve_is_super(curve) && ve == curve)) { ++ /* print header */ ++ seq_printf(m, "%-*s\n", ++ VESTAT_LINE_WIDTH - 1, ++ "Version: 2.2"); ++ seq_printf(m, VESTAT_HEAD_FMT, "VEID", ++ "user", "nice", "system", ++ "uptime", "idle", ++ "strv", "uptime", "used", ++ "maxlat", "totlat", "numsched"); ++ } ++ ++ if (ve == get_ve0()) ++ return 0; ++ ++ user_ve = nice_ve = system_ve = 0; ++ idle_time = strv_time = used = 0; ++ ++ for (cpu = 0; cpu < NR_CPUS; cpu++) { ++ struct ve_cpu_stats *st; ++ ++ st = VE_CPU_STATS(ve, cpu); ++ user_ve += st->user; ++ nice_ve += st->nice; ++ system_ve += st->system; ++ used += VE_CPU_STATS(ve, cpu)->used_time; ++ idle_time += ve_sched_get_idle_time(ve, cpu); ++ } ++ uptime_cycles = get_cycles() - ve->start_cycles; ++ uptime = jiffies - ve->start_jiffies; ++ ++ seq_printf(m, VESTAT_LINE_FMT, ve->veid, ++ user_ve, nice_ve, system_ve, ++ uptime, idle_time, ++ strv_time, uptime_cycles, used, ++ ve->sched_lat_ve.last.maxlat, ++ ve->sched_lat_ve.last.totlat, ++ ve->sched_lat_ve.last.count); ++ return 0; ++} ++ ++static void *ve_seq_start(struct seq_file *m, loff_t *pos) ++{ ++ struct ve_struct *ve, *curve; ++ loff_t l; ++ ++ curve = get_exec_env(); ++ read_lock(&ve_list_guard); ++ if (!ve_is_super(curve)) { ++ if (*pos != 0) ++ return NULL; ++ return curve; ++ } ++ for (ve = ve_list_head, l = *pos; ++ ve != NULL && l > 0; ++ ve = ve->next, l--); ++ return ve; ++} ++ ++static void 
*ve_seq_next(struct seq_file *m, void *v, loff_t *pos) ++{ ++ struct ve_struct *ve = (struct ve_struct *)v; ++ ++ if (!ve_is_super(get_exec_env())) ++ return NULL; ++ (*pos)++; ++ return ve->next; ++} ++ ++static void ve_seq_stop(struct seq_file *m, void *v) ++{ ++ read_unlock(&ve_list_guard); ++} ++ ++static struct seq_operations vestat_seq_op = { ++ start: ve_seq_start, ++ next: ve_seq_next, ++ stop: ve_seq_stop, ++ show: vestat_seq_show ++}; ++ ++static int vestat_open(struct inode *inode, struct file *file) ++{ ++ return seq_open(file, &vestat_seq_op); ++} ++ ++static struct file_operations proc_vestat_operations = { ++ open: vestat_open, ++ read: seq_read, ++ llseek: seq_lseek, ++ release: seq_release ++}; ++ ++static inline unsigned long ve_used_mem(struct user_beancounter *ub) ++{ ++ return ub->ub_parms[UB_OOMGUARPAGES].held; ++} ++ ++static inline void ve_mi_replace(struct meminfo *mi) ++{ ++ struct user_beancounter *ub; ++ unsigned long meminfo_val; ++ unsigned long nodettram; ++ unsigned long usedmem; ++ ++ meminfo_val = get_exec_env()->meminfo_val; ++ ++ if(!meminfo_val) ++ return; /* No virtualization */ ++ ++ nodettram = mi->si.totalram; ++ ub = current->mm->mm_ub; ++ usedmem = ve_used_mem(ub); ++ ++ memset(mi, 0, sizeof(*mi)); ++ ++ mi->si.totalram = (meminfo_val > nodettram) ? ++ nodettram : meminfo_val; ++ mi->si.freeram = (mi->si.totalram > usedmem) ? 
++ (mi->si.totalram - usedmem) : 0; ++} ++ ++static int meminfo_call(struct vnotifier_block *self, ++ unsigned long event, void *arg, int old_ret) ++{ ++ if (event != VIRTINFO_MEMINFO) ++ return old_ret; ++ ++ ve_mi_replace((struct meminfo *)arg); ++ ++ return NOTIFY_OK; ++} ++ ++ ++static struct vnotifier_block meminfo_notifier_block = { ++ .notifier_call = meminfo_call ++}; ++ ++static int __init init_vecalls_proc(void) ++{ ++ struct proc_dir_entry *de; ++ ++ de = create_proc_glob_entry("vz/vestat", ++ S_IFREG|S_IRUSR, NULL); ++ if (de == NULL) { ++ /* create "vz" subdirectory, if not exist */ ++ (void) create_proc_glob_entry("vz", ++ S_IFDIR|S_IRUGO|S_IXUGO, NULL); ++ de = create_proc_glob_entry("vz/vestat", ++ S_IFREG|S_IRUSR, NULL); ++ } ++ if (de) ++ de->proc_fops = &proc_vestat_operations; ++ else ++ printk(KERN_WARNING ++ "VZMON: can't make vestat proc entry\n"); ++ ++ de = create_proc_entry("vz/devperms", S_IFREG | S_IRUSR, NULL); ++ if (de) ++ de->proc_fops = &proc_devperms_ops; ++ else ++ printk(KERN_WARNING ++ "VZMON: can't make devperms proc entry\n"); ++ ++ virtinfo_notifier_register(VITYPE_GENERAL, &meminfo_notifier_block); ++ ++ return 0; ++} ++ ++static void fini_vecalls_proc(void) ++{ ++ remove_proc_entry("vz/devperms", NULL); ++ remove_proc_entry("vz/vestat", NULL); ++ virtinfo_notifier_unregister(VITYPE_GENERAL, &meminfo_notifier_block); ++} ++#else ++#define init_vecalls_proc() (0) ++#define fini_vecalls_proc() do { } while (0) ++#endif /* CONFIG_PROC_FS */ ++ ++ ++/********************************************************************** ++ ********************************************************************** ++ * ++ * User ctl ++ * ++ ********************************************************************** ++ **********************************************************************/ ++ ++int vzcalls_ioctl(struct inode *, struct file *, unsigned int, unsigned long); ++static struct vzioctlinfo vzcalls = { ++ type: VZCTLTYPE, ++ func: vzcalls_ioctl, 
++ owner: THIS_MODULE, ++}; ++ ++int vzcalls_ioctl(struct inode *ino, struct file *file, unsigned int cmd, ++ unsigned long arg) ++{ ++ int err; ++ ++ err = -ENOTTY; ++ switch(cmd) { ++ case VZCTL_MARK_ENV_TO_DOWN: { ++ /* Compatibility issue */ ++ err = 0; ++ } ++ break; ++ case VZCTL_SETDEVPERMS: { ++ /* Device type was mistakenly declared as dev_t ++ * in the old user-kernel interface. ++ * That's wrong, dev_t is a kernel internal type. ++ * I use `unsigned' not having anything better in mind. ++ * 2001/08/11 SAW */ ++ struct vzctl_setdevperms s; ++ err = -EFAULT; ++ if (copy_from_user(&s, (void *)arg, sizeof(s))) ++ break; ++ err = real_setdevperms(s.veid, s.type, ++ new_decode_dev(s.dev), s.mask); ++ } ++ break; ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++ case VZCTL_VE_NETDEV: { ++ struct vzctl_ve_netdev d; ++ char *s; ++ err = -EFAULT; ++ if (copy_from_user(&d, (void *)arg, sizeof(d))) ++ break; ++ err = -ENOMEM; ++ s = kmalloc(IFNAMSIZ+1, GFP_KERNEL); ++ if (s == NULL) ++ break; ++ err = -EFAULT; ++ if (strncpy_from_user(s, d.dev_name, IFNAMSIZ) > 0) { ++ s[IFNAMSIZ] = 0; ++ err = real_ve_dev_map(d.veid, d.op, s); ++ } ++ kfree(s); ++ } ++ break; ++#endif ++ case VZCTL_ENV_CREATE: { ++ struct vzctl_env_create s; ++ err = -EFAULT; ++ if (copy_from_user(&s, (void *)arg, sizeof(s))) ++ break; ++ err = real_env_create(s.veid, s.flags, s.class_id, ++ NULL, 0); ++ } ++ break; ++ case VZCTL_ENV_CREATE_DATA: { ++ struct vzctl_env_create_data s; ++ env_create_param_t *data; ++ err = -EFAULT; ++ if (copy_from_user(&s, (void *)arg, sizeof(s))) ++ break; ++ err=-EINVAL; ++ if (s.datalen < VZCTL_ENV_CREATE_DATA_MINLEN || ++ s.datalen > VZCTL_ENV_CREATE_DATA_MAXLEN || ++ s.data == 0) ++ break; ++ err = -ENOMEM; ++ data = kmalloc(sizeof(*data), GFP_KERNEL); ++ if (!data) ++ break; ++ memset(data, 0, sizeof(*data)); ++ err = -EFAULT; ++ if (copy_from_user(data, (void *)s.data, s.datalen)) ++ goto free_data; ++ err = real_env_create(s.veid, 
s.flags, s.class_id, ++ data, s.datalen); ++free_data: ++ kfree(data); ++ } ++ break; ++ case VZCTL_GET_CPU_STAT: { ++ struct vzctl_cpustatctl s; ++ err = -EFAULT; ++ if (copy_from_user(&s, (void *)arg, sizeof(s))) ++ break; ++ err = ve_get_cpu_stat(s.veid, s.cpustat); ++ } ++ break; ++ case VZCTL_VE_MEMINFO: { ++ struct vzctl_ve_meminfo s; ++ err = -EFAULT; ++ if (copy_from_user(&s, (void *)arg, sizeof(s))) ++ break; ++ err = ve_set_meminfo(s.veid, s.val); ++ } ++ break; ++ } ++ return err; ++} ++EXPORT_SYMBOL(real_env_create); ++ ++ ++/********************************************************************** ++ ********************************************************************** ++ * ++ * Init/exit stuff ++ * ++ ********************************************************************** ++ **********************************************************************/ ++ ++#ifdef CONFIG_VE_CALLS_MODULE ++static int __init init_vecalls_symbols(void) ++{ ++ KSYMRESOLVE(real_get_device_perms_ve); ++ KSYMRESOLVE(real_do_env_cleanup); ++ KSYMRESOLVE(real_do_env_free); ++ KSYMRESOLVE(real_update_load_avg_ve); ++ KSYMMODRESOLVE(vzmon); ++ return 0; ++} ++ ++static void fini_vecalls_symbols(void) ++{ ++ KSYMMODUNRESOLVE(vzmon); ++ KSYMUNRESOLVE(real_get_device_perms_ve); ++ KSYMUNRESOLVE(real_do_env_cleanup); ++ KSYMUNRESOLVE(real_do_env_free); ++ KSYMUNRESOLVE(real_update_load_avg_ve); ++} ++#else ++#define init_vecalls_symbols() (0) ++#define fini_vecalls_symbols() do { } while (0) ++#endif ++ ++static inline __init int init_vecalls_ioctls(void) ++{ ++ vzioctl_register(&vzcalls); ++ return 0; ++} ++ ++static inline void fini_vecalls_ioctls(void) ++{ ++ vzioctl_unregister(&vzcalls); ++} ++ ++static int __init vecalls_init(void) ++{ ++ int err; ++ int i; ++ ++ ve_list_head = get_ve0(); ++ ++ err = init_vzmond(); ++ if (err < 0) ++ goto out_vzmond; ++ ++ err = init_devperms_hash(); ++ if (err < 0) ++ goto out_perms; ++ ++ err = init_vecalls_symbols(); ++ if (err < 0) ++ goto out_sym; 
++ ++ err = init_vecalls_proc(); ++ if (err < 0) ++ goto out_proc; ++ ++ err = init_vecalls_ioctls(); ++ if (err < 0) ++ goto out_ioctls; ++ ++ for (i = 0; i < VE_MAX_HOOKS; i++) ++ INIT_LIST_HEAD(&ve_hooks[i]); ++ ++ return 0; ++ ++out_ioctls: ++ fini_vecalls_proc(); ++out_proc: ++ fini_vecalls_symbols(); ++out_sym: ++ fini_devperms_hash(); ++out_perms: ++ fini_vzmond(); ++out_vzmond: ++ return err; ++} ++ ++static void vecalls_exit(void) ++{ ++ fini_vecalls_ioctls(); ++ fini_vecalls_proc(); ++ fini_vecalls_symbols(); ++ fini_devperms_hash(); ++ fini_vzmond(); ++} ++ ++EXPORT_SYMBOL(get_ve_by_id); ++EXPORT_SYMBOL(__find_ve_by_id); ++EXPORT_SYMBOL(ve_list_guard); ++EXPORT_SYMBOL(ve_list_head); ++EXPORT_SYMBOL(nr_ve); ++ ++MODULE_AUTHOR("SWsoft <info@sw-soft.com>"); ++MODULE_DESCRIPTION("Virtuozzo Control"); ++MODULE_LICENSE("GPL v2"); ++ ++module_init(vecalls_init) ++module_exit(vecalls_exit) +diff -upr linux-2.6.16.orig/kernel/veowner.c linux-2.6.16-026test015/kernel/veowner.c +--- linux-2.6.16.orig/kernel/veowner.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/veowner.c 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,308 @@ ++/* ++ * kernel/veowner.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. 
++ * ++ */ ++ ++#include <linux/sched.h> ++#include <linux/ve.h> ++#include <linux/ve_owner.h> ++#include <linux/ve_proto.h> ++#include <linux/ipc.h> ++#include <linux/fs.h> ++#include <linux/proc_fs.h> ++#include <linux/file.h> ++#include <linux/mm.h> ++#include <linux/delay.h> ++#include <linux/vmalloc.h> ++#include <linux/init.h> ++#include <linux/module.h> ++#include <linux/list.h> ++#include <linux/inetdevice.h> ++#include <asm/system.h> ++#include <asm/io.h> ++ ++#include <net/tcp.h> ++ ++void prepare_ve0_process(struct task_struct *tsk) ++{ ++ set_virt_pid(tsk, tsk->pid); ++ set_virt_tgid(tsk, tsk->tgid); ++ if (tsk->signal) { ++ set_virt_pgid(tsk, tsk->signal->pgrp); ++ set_virt_sid(tsk, tsk->signal->session); ++ } ++ VE_TASK_INFO(tsk)->exec_env = get_ve0(); ++ VE_TASK_INFO(tsk)->owner_env = get_ve0(); ++ VE_TASK_INFO(tsk)->sleep_time = 0; ++ VE_TASK_INFO(tsk)->wakeup_stamp = 0; ++ VE_TASK_INFO(tsk)->sched_time = 0; ++ seqcount_init(&VE_TASK_INFO(tsk)->wakeup_lock); ++ ++ if (tsk->pid) { ++ SET_VE_LINKS(tsk); ++ atomic_inc(&get_ve0()->pcounter); ++ } ++} ++ ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++void prepare_ve0_loopback(void) ++{ ++ get_ve0()->_loopback_dev = &loopback_dev; ++} ++#endif ++ ++/* ++ * ------------------------------------------------------------------------ ++ * proc entries ++ * ------------------------------------------------------------------------ ++ */ ++ ++#ifdef CONFIG_PROC_FS ++static void proc_move(struct proc_dir_entry *ddir, ++ struct proc_dir_entry *sdir, ++ const char *name) ++{ ++ struct proc_dir_entry **p, *q; ++ int len; ++ ++ len = strlen(name); ++ for (p = &sdir->subdir, q = *p; q != NULL; p = &q->next, q = *p) ++ if (proc_match(len, name, q)) ++ break; ++ if (q == NULL) ++ return; ++ *p = q->next; ++ q->parent = ddir; ++ q->next = ddir->subdir; ++ ddir->subdir = q; ++} ++static void prepare_proc_misc(void) ++{ ++ static char *table[] = { ++ "loadavg", ++ "uptime", ++ "meminfo", ++ "version", 
++ "stat", ++ "filesystems", ++ "locks", ++ "swaps", ++ "mounts", ++ "net", ++ "cpuinfo", ++ "sysvipc", ++ "sys", ++ "fs", ++ "vz", ++ "user_beancounters", ++ "cmdline", ++ "vmstat", ++ "modules", ++ "kmsg", ++ NULL, ++ }; ++ char **p; ++ ++ for (p = table; *p != NULL; p++) ++ proc_move(&proc_root, ve0.proc_root, *p); ++} ++int prepare_proc(void) ++{ ++ struct ve_struct *envid; ++ struct proc_dir_entry *de; ++ struct proc_dir_entry *ve_root; ++ ++ envid = set_exec_env(&ve0); ++ ve_root = ve0.proc_root->subdir; ++ /* move the whole tree to be visible in VE0 only */ ++ ve0.proc_root->subdir = proc_root.subdir; ++ for (de = ve0.proc_root->subdir; de->next != NULL; de = de->next) ++ de->parent = ve0.proc_root; ++ de->parent = ve0.proc_root; ++ de->next = ve_root; ++ ++ /* move back into the global scope some specific entries */ ++ proc_root.subdir = NULL; ++ prepare_proc_misc(); ++ proc_net = proc_mkdir("net", ve0.proc_root); ++ proc_net_stat = proc_mkdir("stat", proc_net); ++ proc_mkdir("vz", 0); ++#ifdef CONFIG_SYSVIPC ++ proc_mkdir("sysvipc", 0); ++#endif ++ proc_root_fs = proc_mkdir("fs", 0); ++ /* XXX proc_tty_init(); */ ++ ++ /* XXX process inodes */ ++ ++ (void)set_exec_env(envid); ++ ++ (void)create_proc_glob_entry("vz", S_IFDIR|S_IRUGO|S_IXUGO, NULL); ++ return 0; ++} ++ ++static struct proc_dir_entry ve0_proc_root = { ++ .name = "/proc", ++ .namelen = 5, ++ .mode = S_IFDIR | S_IRUGO | S_IXUGO, ++ .nlink = 2 ++}; ++ ++void prepare_ve0_proc_root(void) ++{ ++ ve0.proc_root = &ve0_proc_root; ++} ++#endif ++ ++/* ++ * ------------------------------------------------------------------------ ++ * Virtualized sysctl ++ * ------------------------------------------------------------------------ ++ */ ++ ++static int semmin[4] = { 1, 1, 1, 1 }; ++static int semmax[4] = { 8000, INT_MAX, 1000, IPCMNI }; ++static ctl_table kern_table[] = { ++ {KERN_NODENAME, "hostname", system_utsname.nodename, 64, ++ 0644, NULL, &proc_doutsstring, &sysctl_string}, ++ {KERN_DOMAINNAME, 
"domainname", system_utsname.domainname, 64, ++ 0644, NULL, &proc_doutsstring, &sysctl_string}, ++#ifdef CONFIG_SYSVIPC ++#define get_ve0_field(fname) &ve0._##fname ++ {KERN_SHMMAX, "shmmax", get_ve0_field(shm_ctlmax), sizeof (size_t), ++ 0644, NULL, &proc_doulongvec_minmax }, ++ {KERN_SHMALL, "shmall", get_ve0_field(shm_ctlall), sizeof (size_t), ++ 0644, NULL, &proc_doulongvec_minmax }, ++ {KERN_SHMMNI, "shmmni", get_ve0_field(shm_ctlmni), sizeof (int), ++ 0644, NULL, &proc_dointvec_minmax, NULL, ++ NULL, &semmin[0], &semmax[3] }, ++ {KERN_MSGMAX, "msgmax", get_ve0_field(msg_ctlmax), sizeof (int), ++ 0644, NULL, &proc_dointvec }, ++ {KERN_MSGMNI, "msgmni", get_ve0_field(msg_ctlmni), sizeof (int), ++ 0644, NULL, &proc_dointvec_minmax, NULL, ++ NULL, &semmin[0], &semmax[3] }, ++ {KERN_MSGMNB, "msgmnb", get_ve0_field(msg_ctlmnb), sizeof (int), ++ 0644, NULL, &proc_dointvec }, ++ {KERN_SEM, "sem", get_ve0_field(sem_ctls), 4*sizeof (int), ++ 0644, NULL, &proc_dointvec }, ++#endif ++ {0} ++}; ++static ctl_table root_table[] = { ++ {CTL_KERN, "kernel", NULL, 0, 0555, kern_table}, ++ {0} ++}; ++extern int ip_rt_src_check; ++extern int ve_area_access_check; ++static ctl_table vz_ipv4_route_table[] = { ++ { ++ ctl_name: NET_IPV4_ROUTE_SRC_CHECK, ++ procname: "src_check", ++ data: &ip_rt_src_check, ++ maxlen: sizeof(int), ++ mode: 0644, ++ proc_handler: &proc_dointvec, ++ }, ++ { 0 } ++}; ++static ctl_table vz_ipv4_table[] = { ++ {NET_IPV4_ROUTE, "route", NULL, 0, 0555, vz_ipv4_route_table}, ++ { 0 } ++}; ++static ctl_table vz_net_table[] = { ++ {NET_IPV4, "ipv4", NULL, 0, 0555, vz_ipv4_table}, ++ { 0 } ++}; ++static ctl_table vz_fs_table[] = { ++ { ++ ctl_name: 226, ++ procname: "ve-area-access-check", ++ data: &ve_area_access_check, ++ maxlen: sizeof(int), ++ mode: 0644, ++ proc_handler: &proc_dointvec, ++ }, ++ { 0 } ++}; ++static ctl_table root_table2[] = { ++ {CTL_NET, "net", NULL, 0, 0555, vz_net_table}, ++ {CTL_FS, "fs", NULL, 0, 0555, vz_fs_table}, ++ { 0 } ++}; 
++int prepare_sysctl(void) ++{ ++ struct ve_struct *envid; ++ ++ envid = set_exec_env(&ve0); ++ ve0.kern_header = register_sysctl_table(root_table, 1); ++ register_sysctl_table(root_table2, 0); ++ (void)set_exec_env(envid); ++ return 0; ++} ++ ++void prepare_ve0_sysctl(void) ++{ ++ INIT_LIST_HEAD(&ve0.sysctl_lh); ++#ifdef CONFIG_SYSCTL ++ ve0.proc_sys_root = proc_mkdir("sys", 0); ++#endif ++} ++ ++/* ++ * ------------------------------------------------------------------------ ++ * XXX init_ve_system ++ * ------------------------------------------------------------------------ ++ */ ++ ++void init_ve_system(void) ++{ ++ struct task_struct *init_entry, *p, *tsk; ++ struct ve_struct *ptr; ++ unsigned long flags; ++ int i; ++ ++ ptr = get_ve0(); ++ (void)get_ve(ptr); ++ atomic_set(&ptr->pcounter, 1); ++ ++ /* Don't forget about idle tasks */ ++ write_lock_irqsave(&tasklist_lock, flags); ++ for (i = 0; i < NR_CPUS; i++) { ++ tsk = idle_task(i); ++ if (tsk == NULL) ++ continue; ++ ++ prepare_ve0_process(tsk); ++ } ++ do_each_thread_all(p, tsk) { ++ prepare_ve0_process(tsk); ++ } while_each_thread_all(p, tsk); ++ write_unlock_irqrestore(&tasklist_lock, flags); ++ ++ init_entry = child_reaper; ++ ptr->init_entry = init_entry; ++ /* XXX: why? 
*/ ++ cap_set_full(ptr->cap_default); ++ ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++ ptr->_ipv4_devconf = &ipv4_devconf; ++ ptr->_ipv4_devconf_dflt = &ipv4_devconf_dflt; ++#endif ++ ++ read_lock(&init_entry->fs->lock); ++ ptr->fs_rootmnt = init_entry->fs->rootmnt; ++ ptr->fs_root = init_entry->fs->root; ++ read_unlock(&init_entry->fs->lock); ++ ++ /* common prepares */ ++#ifdef CONFIG_PROC_FS ++ prepare_proc(); ++#endif ++ prepare_sysctl(); ++ prepare_ipc(); ++} +diff -upr linux-2.6.16.orig/kernel/vzdev.c linux-2.6.16-026test015/kernel/vzdev.c +--- linux-2.6.16.orig/kernel/vzdev.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/vzdev.c 2006-07-04 14:41:39.000000000 +0400 +@@ -0,0 +1,129 @@ ++/* ++ * kernel/vzdev.c ++ * ++ * Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include <linux/fs.h> ++#include <linux/list.h> ++#include <linux/init.h> ++#include <linux/module.h> ++#include <linux/vzctl.h> ++#include <linux/slab.h> ++#include <linux/vmalloc.h> ++#include <linux/vzcalluser.h> ++#include <asm/uaccess.h> ++#include <asm/pgalloc.h> ++#include <linux/device.h> ++#include <linux/smp_lock.h> ++ ++#define VZCTL_MAJOR 126 ++#define VZCTL_NAME "vzctl" ++ ++MODULE_AUTHOR("SWsoft <info@sw-soft.com>"); ++MODULE_DESCRIPTION("Virtuozzo Interface"); ++MODULE_LICENSE("GPL v2"); ++ ++static LIST_HEAD(ioctls); ++static spinlock_t ioctl_lock = SPIN_LOCK_UNLOCKED; ++ ++int vzctl_ioctl(struct inode *ino, struct file *file, unsigned int cmd, ++ unsigned long arg) ++{ ++ int err; ++ struct list_head *p; ++ struct vzioctlinfo *inf; ++ ++ err = -ENOTTY; ++ spin_lock(&ioctl_lock); ++ list_for_each(p, &ioctls) { ++ inf = list_entry(p, struct vzioctlinfo, list); ++ if (inf->type != _IOC_TYPE(cmd)) ++ continue; ++ ++ err = try_module_get(inf->owner) ? 
0 : -EBUSY; ++ spin_unlock(&ioctl_lock); ++ if (!err) { ++ unlock_kernel(); ++ err = (*inf->func)(ino, file, cmd, arg); ++ lock_kernel(); ++ module_put(inf->owner); ++ } ++ return err; ++ } ++ spin_unlock(&ioctl_lock); ++ return err; ++} ++ ++void vzioctl_register(struct vzioctlinfo *inf) ++{ ++ spin_lock(&ioctl_lock); ++ list_add(&inf->list, &ioctls); ++ spin_unlock(&ioctl_lock); ++} ++ ++void vzioctl_unregister(struct vzioctlinfo *inf) ++{ ++ spin_lock(&ioctl_lock); ++ list_del_init(&inf->list); ++ spin_unlock(&ioctl_lock); ++} ++ ++EXPORT_SYMBOL(vzioctl_register); ++EXPORT_SYMBOL(vzioctl_unregister); ++ ++/* ++ * Init/exit stuff. ++ */ ++static struct file_operations vzctl_fops = { ++ .owner = THIS_MODULE, ++ .ioctl = vzctl_ioctl, ++}; ++ ++static struct class *vzctl_class; ++ ++static void __exit vzctl_exit(void) ++{ ++ class_device_destroy(vzctl_class, MKDEV(VZCTL_MAJOR, 0)); ++ class_destroy(vzctl_class); ++ unregister_chrdev(VZCTL_MAJOR, VZCTL_NAME); ++} ++ ++static int __init vzctl_init(void) ++{ ++ int ret; ++ struct class_device *class_err; ++ ++ ret = register_chrdev(VZCTL_MAJOR, VZCTL_NAME, &vzctl_fops); ++ if (ret < 0) ++ goto out; ++ ++ vzctl_class = class_create(THIS_MODULE, "vzctl"); ++ if (IS_ERR(vzctl_class)) { ++ ret = PTR_ERR(vzctl_class); ++ goto out_cleandev; ++ } ++ ++ class_err = class_device_create(vzctl_class, NULL, MKDEV(VZCTL_MAJOR, 0), ++ NULL, VZCTL_NAME); ++ if (IS_ERR(class_err)) { ++ ret = PTR_ERR(class_err); ++ goto out_rmclass; ++ } ++ ++ goto out; ++ ++out_rmclass: ++ class_destroy(vzctl_class); ++out_cleandev: ++ unregister_chrdev(VZCTL_MAJOR, VZCTL_NAME); ++out: ++ return ret; ++} ++ ++module_init(vzctl_init) ++module_exit(vzctl_exit); +diff -upr linux-2.6.16.orig/kernel/vzwdog.c linux-2.6.16-026test015/kernel/vzwdog.c +--- linux-2.6.16.orig/kernel/vzwdog.c 2006-07-04 14:41:41.000000000 +0400 ++++ linux-2.6.16-026test015/kernel/vzwdog.c 2006-07-04 14:41:38.000000000 +0400 +@@ -0,0 +1,278 @@ ++/* ++ * kernel/vzwdog.c ++ * ++ * 
Copyright (C) 2000-2005 SWsoft ++ * All rights reserved. ++ * ++ * Licensing governed by "linux/COPYING.SWsoft" file. ++ * ++ */ ++ ++#include <linux/sched.h> ++#include <linux/fs.h> ++#include <linux/list.h> ++#include <linux/ctype.h> ++#include <linux/kobject.h> ++#include <linux/genhd.h> ++#include <linux/module.h> ++#include <linux/init.h> ++#include <linux/kernel.h> ++#include <linux/kernel_stat.h> ++#include <linux/smp_lock.h> ++#include <linux/errno.h> ++#include <linux/suspend.h> ++#include <linux/ve.h> ++#include <linux/vzstat.h> ++ ++/* Staff regading kernel thread polling VE validity */ ++static int sleep_timeout = 60; ++static pid_t wdog_thread_pid; ++static int wdog_thread_continue = 1; ++static DECLARE_COMPLETION(license_thread_exited); ++ ++extern void show_mem(void); ++extern struct ve_struct *ve_list_head; ++ ++#if 0 ++static char page[PAGE_SIZE]; ++ ++static void parse_irq_list(int len) ++{ ++ int i, k, skip; ++ for (i = 0; i < len; ) { ++ k = i; ++ while (i < len && page[i] != '\n' && page[i] != ':') ++ i++; ++ skip = 0; ++ if (i < len && page[i] != '\n') { ++ i++; /* skip ':' */ ++ while (i < len && (page[i] == ' ' || page[i] == '0')) ++ i++; ++ skip = (i < len && (page[i] < '0' || page[i] > '9')); ++ while (i < len && page[i] != '\n') ++ i++; ++ } ++ if (!skip) ++ printk("\n%.*s", i - k, page + k); ++ if (i < len) ++ i++; /* skip '\n' */ ++ } ++} ++#endif ++ ++static void show_irq_list(void) ++{ ++#if 0 ++ i = KSYMSAFECALL(int, get_irq_list, (page)); ++ parse_irq_list(i); /* Safe, zero was returned if unassigned */ ++#endif ++} ++ ++static void show_alloc_latency(void) ++{ ++ static const char *alloc_descr[KSTAT_ALLOCSTAT_NR] = { ++ "A0", ++ "L0", ++ "H0", ++ "L1", ++ "H1" ++ }; ++ int i; ++ ++ printk("lat: "); ++ for (i = 0; i < KSTAT_ALLOCSTAT_NR; i++) { ++ struct kstat_lat_struct *p; ++ cycles_t maxlat, avg0, avg1, avg2; ++ ++ p = &kstat_glob.alloc_lat[i]; ++ spin_lock_irq(&kstat_glb_lock); ++ maxlat = p->last.maxlat; ++ avg0 = p->avg[0]; ++ 
avg1 = p->avg[1]; ++ avg2 = p->avg[2]; ++ spin_unlock_irq(&kstat_glb_lock); ++ ++ printk("%s %Lu (%Lu %Lu %Lu)", ++ alloc_descr[i], ++ maxlat, ++ avg0, ++ avg1, ++ avg2); ++ } ++ printk("\n"); ++} ++ ++static void show_schedule_latency(void) ++{ ++ struct kstat_lat_pcpu_struct *p; ++ cycles_t maxlat, totlat, avg0, avg1, avg2; ++ unsigned long count; ++ ++ p = &kstat_glob.sched_lat; ++ spin_lock_irq(&kstat_glb_lock); ++ maxlat = p->last.maxlat; ++ totlat = p->last.totlat; ++ count = p->last.count; ++ avg0 = p->avg[0]; ++ avg1 = p->avg[1]; ++ avg2 = p->avg[2]; ++ spin_unlock_irq(&kstat_glb_lock); ++ ++ printk("sched lat: %Lu/%Lu/%lu (%Lu %Lu %Lu)\n", ++ maxlat, ++ totlat, ++ count, ++ avg0, ++ avg1, ++ avg2); ++} ++ ++static void show_header(void) ++{ ++ struct timeval tv; ++ ++ do_gettimeofday(&tv); ++ printk("*** VZWDOG 1.14: time %lu.%06lu uptime %Lu CPU %d ***\n", ++ tv.tv_sec, tv.tv_usec, ++ get_jiffies_64(), smp_processor_id()); ++#ifdef CONFIG_FAIRSCHED ++ printk("*** cycles_per_jiffy %lu jiffies_per_second %u ***\n", ++ cycles_per_jiffy, HZ); ++#else ++ printk("*** jiffies_per_second %u ***\n", HZ); ++#endif ++} ++ ++static void show_pgdatinfo(void) ++{ ++ pg_data_t *pgdat; ++ ++ printk("pgdat:"); ++ for_each_pgdat(pgdat) { ++ printk(" %d: %lu,%lu,%lu,%p", ++ pgdat->node_id, ++ pgdat->node_start_pfn, ++ pgdat->node_present_pages, ++ pgdat->node_spanned_pages, ++ pgdat->node_mem_map); ++ } ++ printk("\n"); ++} ++ ++static void show_diskio(void) ++{ ++ struct gendisk *gd; ++ char buf[BDEVNAME_SIZE]; ++ ++ printk("disk_io: "); ++ ++ down_read(&block_subsys.rwsem); ++ list_for_each_entry(gd, &block_subsys.kset.list, kobj.entry) { ++ char *name; ++ name = disk_name(gd, 0, buf); ++ if ((strlen(name) > 4) && (strncmp(name, "loop", 4) == 0) && ++ isdigit(name[4])) ++ continue; ++ if ((strlen(name) > 3) && (strncmp(name, "ram", 3) == 0) && ++ isdigit(name[3])) ++ continue; ++ printk("(%u,%u) %s r(%u %u %u) w(%u %u %u)\n", ++ gd->major, gd->first_minor, ++ name, ++ 
disk_stat_read(gd, ios[READ]), ++ disk_stat_read(gd, sectors[READ]), ++ disk_stat_read(gd, merges[READ]), ++ disk_stat_read(gd, ios[WRITE]), ++ disk_stat_read(gd, sectors[WRITE]), ++ disk_stat_read(gd, merges[WRITE])); ++ } ++ up_read(&block_subsys.rwsem); ++ ++ printk("\n"); ++} ++ ++static void show_nrprocs(void) ++{ ++ unsigned long _nr_running, _nr_sleeping, ++ _nr_unint, _nr_zombie, _nr_dead, _nr_stopped; ++ ++ _nr_running = nr_running(); ++ _nr_unint = nr_uninterruptible(); ++ _nr_sleeping = nr_sleeping(); ++ _nr_zombie = nr_zombie; ++ _nr_dead = atomic_read(&nr_dead); ++ _nr_stopped = nr_stopped(); ++ ++ printk("VEnum: %d, proc R %lu, S %lu, D %lu, " ++ "Z %lu, X %lu, T %lu (tot %d)\n", ++ nr_ve, _nr_running, _nr_sleeping, _nr_unint, ++ _nr_zombie, _nr_dead, _nr_stopped, nr_threads); ++} ++ ++static void wdog_print(void) ++{ ++ show_header(); ++ show_irq_list(); ++ show_pgdatinfo(); ++ show_mem(); ++ show_diskio(); ++ show_schedule_latency(); ++ show_alloc_latency(); ++ show_nrprocs(); ++} ++ ++static int wdog_loop(void* data) ++{ ++ struct task_struct *tsk = current; ++ DECLARE_WAIT_QUEUE_HEAD(thread_wait_queue); ++ ++ /* ++ * This thread doesn't need any user-level access, ++ * so get rid of all our resources ++ */ ++ daemonize("wdogd"); ++ ++ spin_lock_irq(&tsk->sighand->siglock); ++ sigfillset(&tsk->blocked); ++ sigdelset(&tsk->blocked, SIGHUP); ++ recalc_sigpending(); ++ spin_unlock_irq(&tsk->sighand->siglock); ++ ++ while (wdog_thread_continue) { ++ wdog_print(); ++ interruptible_sleep_on_timeout(&thread_wait_queue, ++ sleep_timeout*HZ); ++ try_to_freeze(); ++ /* clear all signals */ ++ if (signal_pending(tsk)) ++ flush_signals(tsk); ++ } ++ ++ complete_and_exit(&license_thread_exited, 0); ++} ++ ++static int __init wdog_init(void) ++{ ++ wdog_thread_pid = kernel_thread(wdog_loop, NULL, 0); ++ if (wdog_thread_pid < 0) ++ return wdog_thread_pid; ++ ++ return 0; ++} ++ ++static void __exit wdog_exit(void) ++{ ++ wdog_thread_continue = 0; ++ if 
(wdog_thread_pid > 0) { ++ kill_proc(wdog_thread_pid, SIGHUP, 1); ++ wait_for_completion(&license_thread_exited); ++ } ++} ++ ++module_param(sleep_timeout, int, 0); ++MODULE_AUTHOR("SWsoft <info@sw-soft.com>"); ++MODULE_DESCRIPTION("Virtuozzo WDOG"); ++MODULE_LICENSE("GPL v2"); ++ ++module_init(wdog_init) ++module_exit(wdog_exit) +diff -upr linux-2.6.16.orig/lib/Kconfig.debug linux-2.6.16-026test015/lib/Kconfig.debug +--- linux-2.6.16.orig/lib/Kconfig.debug 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/lib/Kconfig.debug 2006-07-04 14:41:39.000000000 +0400 +@@ -48,7 +48,7 @@ config LOG_BUF_SHIFT + + config DETECT_SOFTLOCKUP + bool "Detect Soft Lockups" +- depends on DEBUG_KERNEL ++ depends on DEBUG_KERNEL && !SCHED_VCPU + default y + help + Say Y here to enable the kernel to detect "soft lockups", +diff -upr linux-2.6.16.orig/lib/bust_spinlocks.c linux-2.6.16-026test015/lib/bust_spinlocks.c +--- linux-2.6.16.orig/lib/bust_spinlocks.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/lib/bust_spinlocks.c 2006-07-04 14:41:37.000000000 +0400 +@@ -20,19 +20,11 @@ void bust_spinlocks(int yes) + if (yes) { + oops_in_progress = 1; + } else { +- int loglevel_save = console_loglevel; + #ifdef CONFIG_VT + unblank_screen(); + #endif + oops_in_progress = 0; +- /* +- * OK, the message is on the console. Now we call printk() +- * without oops_in_progress set so that printk() will give klogd +- * and the blanked console a poke. Hold onto your hats... 
+- */ +- console_loglevel = 15; /* NMI oopser may have shut the console up */ +- printk(" "); +- console_loglevel = loglevel_save; ++ wake_up_klogd(); + } + } + +diff -upr linux-2.6.16.orig/mm/filemap_xip.c linux-2.6.16-026test015/mm/filemap_xip.c +--- linux-2.6.16.orig/mm/filemap_xip.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/mm/filemap_xip.c 2006-07-04 14:41:37.000000000 +0400 +@@ -15,6 +15,7 @@ + #include <linux/rmap.h> + #include <asm/tlbflush.h> + #include "filemap.h" ++#include <ub/ub_vmpages.h> + + /* + * This is a file read routine for execute in place files, and uses +@@ -190,7 +191,10 @@ __xip_unmap (struct address_space * mapp + flush_cache_page(vma, address, pte_pfn(*pte)); + pteval = ptep_clear_flush(vma, address, pte); + page_remove_rmap(page); ++ pb_remove_ref(page, mm); ++ ub_unused_privvm_inc(mm, vma); + dec_mm_counter(mm, file_rss); ++ dec_vma_rss(vma); + BUG_ON(pte_dirty(pteval)); + pte_unmap_unlock(pte, ptl); + page_cache_release(page); +diff -upr linux-2.6.16.orig/mm/fremap.c linux-2.6.16-026test015/mm/fremap.c +--- linux-2.6.16.orig/mm/fremap.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/mm/fremap.c 2006-07-04 14:41:39.000000000 +0400 +@@ -20,6 +20,8 @@ + #include <asm/cacheflush.h> + #include <asm/tlbflush.h> + ++#include <ub/ub_vmpages.h> ++ + static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) + { +@@ -34,6 +36,7 @@ static int zap_pte(struct mm_struct *mm, + if (pte_dirty(pte)) + set_page_dirty(page); + page_remove_rmap(page); ++ pb_remove_ref(page, mm); + page_cache_release(page); + } + } else { +@@ -57,6 +60,10 @@ int install_page(struct mm_struct *mm, s + pte_t *pte; + pte_t pte_val; + spinlock_t *ptl; ++ struct page_beancounter *pbc; ++ ++ if (unlikely(pb_alloc(&pbc))) ++ goto out_nopb; + + pte = get_locked_pte(mm, addr, &ptl); + if (!pte) +@@ -75,11 +82,15 @@ int install_page(struct mm_struct *mm, s + if (page_mapcount(page) > INT_MAX/2) + 
goto unlock; + +- if (pte_none(*pte) || !zap_pte(mm, vma, addr, pte)) ++ if (pte_none(*pte) || !zap_pte(mm, vma, addr, pte)) { ++ ub_unused_privvm_dec(mm, vma); + inc_mm_counter(mm, file_rss); ++ inc_vma_rss(vma); ++ } + + flush_icache_page(vma, page); + set_pte_at(mm, addr, pte, mk_pte(page, prot)); ++ pb_add_ref(page, mm, &pbc); + page_add_file_rmap(page); + pte_val = *pte; + update_mmu_cache(vma, addr, pte_val); +@@ -87,6 +98,8 @@ int install_page(struct mm_struct *mm, s + unlock: + pte_unmap_unlock(pte, ptl); + out: ++ pb_free(&pbc); ++out_nopb: + return err; + } + EXPORT_SYMBOL(install_page); +@@ -109,7 +122,9 @@ int install_file_pte(struct mm_struct *m + + if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) { + update_hiwater_rss(mm); ++ ub_unused_privvm_inc(mm, vma); + dec_mm_counter(mm, file_rss); ++ dec_vma_rss(vma); + } + + set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); +@@ -220,4 +235,5 @@ asmlinkage long sys_remap_file_pages(uns + + return err; + } ++EXPORT_SYMBOL_GPL(sys_remap_file_pages); + +diff -upr linux-2.6.16.orig/mm/madvise.c linux-2.6.16-026test015/mm/madvise.c +--- linux-2.6.16.orig/mm/madvise.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/mm/madvise.c 2006-07-04 14:41:36.000000000 +0400 +@@ -168,6 +168,9 @@ static long madvise_remove(struct vm_are + return -EINVAL; + } + ++ if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE)) ++ return -EACCES; ++ + mapping = vma->vm_file->f_mapping; + + offset = (loff_t)(start - vma->vm_start) +diff -upr linux-2.6.16.orig/mm/memory.c linux-2.6.16-026test015/mm/memory.c +--- linux-2.6.16.orig/mm/memory.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/mm/memory.c 2006-07-04 14:41:39.000000000 +0400 +@@ -58,6 +58,8 @@ + #include <linux/swapops.h> + #include <linux/elf.h> + ++#include <ub/ub_vmpages.h> ++ + #ifndef CONFIG_NEED_MULTIPLE_NODES + /* use the per-pgdat data instead for discontigmem - mbligh */ + unsigned long max_mapnr; +@@ -81,6 +83,7 @@ unsigned 
long vmalloc_earlyreserve; + EXPORT_SYMBOL(num_physpages); + EXPORT_SYMBOL(high_memory); + EXPORT_SYMBOL(vmalloc_earlyreserve); ++EXPORT_SYMBOL_GPL(empty_zero_page); + + int randomize_va_space __read_mostly = 1; + +@@ -103,18 +106,21 @@ void pgd_clear_bad(pgd_t *pgd) + pgd_ERROR(*pgd); + pgd_clear(pgd); + } ++EXPORT_SYMBOL_GPL(pgd_clear_bad); + + void pud_clear_bad(pud_t *pud) + { + pud_ERROR(*pud); + pud_clear(pud); + } ++EXPORT_SYMBOL_GPL(pud_clear_bad); + + void pmd_clear_bad(pmd_t *pmd) + { + pmd_ERROR(*pmd); + pmd_clear(pmd); + } ++EXPORT_SYMBOL_GPL(pmd_clear_bad); + + /* + * Note: this doesn't free the actual pages themselves. That +@@ -318,6 +324,7 @@ int __pte_alloc(struct mm_struct *mm, pm + spin_unlock(&mm->page_table_lock); + return 0; + } ++EXPORT_SYMBOL_GPL(__pte_alloc); + + int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) + { +@@ -418,6 +425,7 @@ struct page *vm_normal_page(struct vm_ar + */ + return pfn_to_page(pfn); + } ++EXPORT_SYMBOL_GPL(vm_normal_page); + + /* + * copy one vm_area from one task to the other. 
Assumes the page tables +@@ -428,7 +436,7 @@ struct page *vm_normal_page(struct vm_ar + static inline void + copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, +- unsigned long addr, int *rss) ++ unsigned long addr, int *rss, struct page_beancounter **pbc) + { + unsigned long vm_flags = vma->vm_flags; + pte_t pte = *src_pte; +@@ -471,6 +479,7 @@ copy_one_pte(struct mm_struct *dst_mm, s + if (page) { + get_page(page); + page_dup_rmap(page); ++ pb_dup_ref(page, dst_mm, pbc); + rss[!!PageAnon(page)]++; + } + +@@ -478,20 +487,36 @@ out_set_pte: + set_pte_at(dst_mm, addr, dst_pte, pte); + } + ++#define pte_ptrs(a) (PTRS_PER_PTE - ((a >> PAGE_SHIFT)&(PTRS_PER_PTE - 1))) ++#ifdef CONFIG_USER_RESOURCE ++#define same_ub(mm1, mm2) ((mm1)->mm_ub == (mm2)->mm_ub) ++#else ++#define same_ub(mm1, mm2) (1) ++#endif ++ + static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, +- pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, ++ pmd_t *dst_pmd, pmd_t *src_pmd, ++ struct vm_area_struct *dst_vma, ++ struct vm_area_struct *vma, + unsigned long addr, unsigned long end) + { + pte_t *src_pte, *dst_pte; + spinlock_t *src_ptl, *dst_ptl; + int progress = 0; +- int rss[2]; ++ int rss[2], rss_tot; ++ struct page_beancounter *pbc; ++ int err; + ++ err = -ENOMEM; ++ pbc = same_ub(src_mm, dst_mm) ? 
PBC_COPY_SAME : NULL; + again: ++ if (pbc != PBC_COPY_SAME && pb_alloc_list(&pbc, pte_ptrs(addr))) ++ goto out; + rss[1] = rss[0] = 0; + dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); + if (!dst_pte) +- return -ENOMEM; ++ goto out; ++ + src_pte = pte_offset_map_nested(src_pmd, addr); + src_ptl = pte_lockptr(src_mm, src_pmd); + spin_lock(src_ptl); +@@ -512,22 +537,32 @@ again: + progress++; + continue; + } +- copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss); ++ copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, ++ vma, addr, rss, &pbc); + progress += 8; + } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); + + spin_unlock(src_ptl); + pte_unmap_nested(src_pte - 1); ++ rss_tot = rss[0] + rss[1]; ++ add_vma_rss(dst_vma, rss_tot); ++ ub_unused_privvm_sub(dst_mm, dst_vma, rss_tot); + add_mm_rss(dst_mm, rss[0], rss[1]); + pte_unmap_unlock(dst_pte - 1, dst_ptl); + cond_resched(); + if (addr != end) + goto again; +- return 0; ++ ++ err = 0; ++out: ++ pb_free_list(&pbc); ++ return err; + } + + static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, +- pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma, ++ pud_t *dst_pud, pud_t *src_pud, ++ struct vm_area_struct *dst_vma, ++ struct vm_area_struct *vma, + unsigned long addr, unsigned long end) + { + pmd_t *src_pmd, *dst_pmd; +@@ -542,14 +577,16 @@ static inline int copy_pmd_range(struct + if (pmd_none_or_clear_bad(src_pmd)) + continue; + if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, +- vma, addr, next)) ++ dst_vma, vma, addr, next)) + return -ENOMEM; + } while (dst_pmd++, src_pmd++, addr = next, addr != end); + return 0; + } + + static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, +- pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma, ++ pgd_t *dst_pgd, pgd_t *src_pgd, ++ struct vm_area_struct *dst_vma, ++ struct vm_area_struct *vma, + unsigned long addr, unsigned long end) + { + pud_t *src_pud, *dst_pud; 
+@@ -564,19 +601,20 @@ static inline int copy_pud_range(struct + if (pud_none_or_clear_bad(src_pud)) + continue; + if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, +- vma, addr, next)) ++ dst_vma, vma, addr, next)) + return -ENOMEM; + } while (dst_pud++, src_pud++, addr = next, addr != end); + return 0; + } + +-int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, +- struct vm_area_struct *vma) ++int __copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *vma, ++ unsigned long addr, size_t size) + { ++ struct mm_struct *dst_mm = dst_vma->vm_mm; ++ struct mm_struct *src_mm = vma->vm_mm; + pgd_t *src_pgd, *dst_pgd; + unsigned long next; +- unsigned long addr = vma->vm_start; +- unsigned long end = vma->vm_end; ++ unsigned long end = addr + size; + + /* + * Don't copy ptes where a page fault will fill them correctly. +@@ -599,11 +637,22 @@ int copy_page_range(struct mm_struct *ds + if (pgd_none_or_clear_bad(src_pgd)) + continue; + if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, +- vma, addr, next)) ++ dst_vma, vma, addr, next)) + return -ENOMEM; + } while (dst_pgd++, src_pgd++, addr = next, addr != end); + return 0; + } ++EXPORT_SYMBOL_GPL(__copy_page_range); ++ ++int copy_page_range(struct mm_struct *dst, struct mm_struct *src, ++ struct vm_area_struct *dst_vma, struct vm_area_struct *vma) ++{ ++ if (dst_vma->vm_mm != dst) ++ BUG(); ++ if (vma->vm_mm != src) ++ BUG(); ++ return __copy_page_range(dst_vma, vma, vma->vm_start, vma->vm_end-vma->vm_start); ++} + + static unsigned long zap_pte_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pmd_t *pmd, +@@ -615,6 +664,7 @@ static unsigned long zap_pte_range(struc + spinlock_t *ptl; + int file_rss = 0; + int anon_rss = 0; ++ int rss; + + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + do { +@@ -668,6 +718,7 @@ static unsigned long zap_pte_range(struc + file_rss--; + } + page_remove_rmap(page); ++ pb_remove_ref(page, mm); + tlb_remove_page(tlb, page); + continue; + } 
+@@ -682,6 +733,9 @@ static unsigned long zap_pte_range(struc + pte_clear_full(mm, addr, pte, tlb->fullmm); + } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); + ++ rss = -(file_rss + anon_rss); ++ ub_unused_privvm_add(mm, vma, rss); ++ sub_vma_rss(vma, rss); + add_mm_rss(mm, file_rss, anon_rss); + pte_unmap_unlock(pte - 1, ptl); + +@@ -1087,12 +1141,14 @@ int get_user_pages(struct task_struct *t + } + EXPORT_SYMBOL(get_user_pages); + +-static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd, ++static int zeromap_pte_range(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, unsigned long end, pgprot_t prot) + { + pte_t *pte; + spinlock_t *ptl; ++ struct mm_struct *mm; + ++ mm = vma->vm_mm; + pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); + if (!pte) + return -ENOMEM; +@@ -1102,6 +1158,8 @@ static int zeromap_pte_range(struct mm_s + page_cache_get(page); + page_add_file_rmap(page); + inc_mm_counter(mm, file_rss); ++ inc_vma_rss(vma); ++ ub_unused_privvm_dec(mm, vma); + BUG_ON(!pte_none(*pte)); + set_pte_at(mm, addr, pte, zero_pte); + } while (pte++, addr += PAGE_SIZE, addr != end); +@@ -1109,35 +1167,35 @@ static int zeromap_pte_range(struct mm_s + return 0; + } + +-static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud, ++static inline int zeromap_pmd_range(struct vm_area_struct *vma, pud_t *pud, + unsigned long addr, unsigned long end, pgprot_t prot) + { + pmd_t *pmd; + unsigned long next; + +- pmd = pmd_alloc(mm, pud, addr); ++ pmd = pmd_alloc(vma->vm_mm, pud, addr); + if (!pmd) + return -ENOMEM; + do { + next = pmd_addr_end(addr, end); +- if (zeromap_pte_range(mm, pmd, addr, next, prot)) ++ if (zeromap_pte_range(vma, pmd, addr, next, prot)) + return -ENOMEM; + } while (pmd++, addr = next, addr != end); + return 0; + } + +-static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd, ++static inline int zeromap_pud_range(struct vm_area_struct *vma, pgd_t *pgd, + unsigned long addr, unsigned long end, 
pgprot_t prot) + { + pud_t *pud; + unsigned long next; + +- pud = pud_alloc(mm, pgd, addr); ++ pud = pud_alloc(vma->vm_mm, pgd, addr); + if (!pud) + return -ENOMEM; + do { + next = pud_addr_end(addr, end); +- if (zeromap_pmd_range(mm, pud, addr, next, prot)) ++ if (zeromap_pmd_range(vma, pud, addr, next, prot)) + return -ENOMEM; + } while (pud++, addr = next, addr != end); + return 0; +@@ -1149,15 +1207,14 @@ int zeromap_page_range(struct vm_area_st + pgd_t *pgd; + unsigned long next; + unsigned long end = addr + size; +- struct mm_struct *mm = vma->vm_mm; + int err; + + BUG_ON(addr >= end); +- pgd = pgd_offset(mm, addr); ++ pgd = pgd_offset(vma->vm_mm, addr); + flush_cache_range(vma, addr, end); + do { + next = pgd_addr_end(addr, end); +- err = zeromap_pud_range(mm, pgd, addr, next, prot); ++ err = zeromap_pud_range(vma, pgd, addr, next, prot); + if (err) + break; + } while (pgd++, addr = next, addr != end); +@@ -1183,11 +1240,14 @@ pte_t * fastcall get_locked_pte(struct m + * old drivers should use this, and they needed to mark their + * pages reserved for the old functions anyway. + */ +-static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *page, pgprot_t prot) ++static int insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot) + { + int retval; + pte_t *pte; +- spinlock_t *ptl; ++ spinlock_t *ptl; ++ struct mm_struct *mm; ++ ++ mm = vma->vm_mm; + + retval = -EINVAL; + if (PageAnon(page)) +@@ -1204,6 +1264,7 @@ static int insert_page(struct mm_struct + /* Ok, finally just insert the thing.. 
*/ + get_page(page); + inc_mm_counter(mm, file_rss); ++ inc_vma_rss(vma); + page_add_file_rmap(page); + set_pte_at(mm, addr, pte, mk_pte(page, prot)); + +@@ -1240,7 +1301,7 @@ int vm_insert_page(struct vm_area_struct + if (!page_count(page)) + return -EINVAL; + vma->vm_flags |= VM_INSERTPAGE; +- return insert_page(vma->vm_mm, addr, page, vma->vm_page_prot); ++ return insert_page(vma, addr, page, vma->vm_page_prot); + } + EXPORT_SYMBOL(vm_insert_page); + +@@ -1449,6 +1510,7 @@ static int do_wp_page(struct mm_struct * + struct page *old_page, *new_page; + pte_t entry; + int ret = VM_FAULT_MINOR; ++ struct page_beancounter *pbc; + + old_page = vm_normal_page(vma, address, orig_pte); + if (!old_page) +@@ -1476,6 +1538,9 @@ static int do_wp_page(struct mm_struct * + gotten: + pte_unmap_unlock(page_table, ptl); + ++ if (unlikely(pb_alloc(&pbc))) ++ goto oom_nopb; ++ + if (unlikely(anon_vma_prepare(vma))) + goto oom; + if (old_page == ZERO_PAGE(address)) { +@@ -1496,12 +1561,16 @@ gotten: + if (likely(pte_same(*page_table, orig_pte))) { + if (old_page) { + page_remove_rmap(old_page); ++ pb_remove_ref(old_page, mm); + if (!PageAnon(old_page)) { + dec_mm_counter(mm, file_rss); + inc_mm_counter(mm, anon_rss); + } +- } else ++ } else { ++ ub_unused_privvm_dec(mm, vma); + inc_mm_counter(mm, anon_rss); ++ inc_vma_rss(vma); ++ } + flush_cache_page(vma, address, pte_pfn(orig_pte)); + entry = mk_pte(new_page, vma->vm_page_prot); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); +@@ -1510,6 +1579,7 @@ gotten: + lazy_mmu_prot_update(entry); + lru_cache_add_active(new_page); + page_add_new_anon_rmap(new_page, vma, address); ++ pb_add_ref(new_page, mm, &pbc); + + /* Free the old page.. 
*/ + new_page = old_page; +@@ -1519,10 +1589,13 @@ gotten: + page_cache_release(new_page); + if (old_page) + page_cache_release(old_page); ++ pb_free(&pbc); + unlock: + pte_unmap_unlock(page_table, ptl); + return ret; + oom: ++ pb_free(&pbc); ++oom_nopb: + if (old_page) + page_cache_release(old_page); + return VM_FAULT_OOM; +@@ -1877,10 +1950,16 @@ static int do_swap_page(struct mm_struct + swp_entry_t entry; + pte_t pte; + int ret = VM_FAULT_MINOR; ++ struct page_beancounter *pbc; ++ cycles_t start; + + if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) +- goto out; ++ goto out_nostat; ++ ++ if (unlikely(pb_alloc(&pbc))) ++ return VM_FAULT_OOM; + ++ start = get_cycles(); + entry = pte_to_swp_entry(orig_pte); + again: + page = lookup_swap_cache(entry); +@@ -1928,6 +2007,8 @@ again: + /* The page isn't present yet, go ahead with the fault. */ + + inc_mm_counter(mm, anon_rss); ++ inc_vma_rss(vma); ++ ub_swapin_inc(mm); + pte = mk_pte(page, vma->vm_page_prot); + if (write_access && can_share_swap_page(page)) { + pte = maybe_mkwrite(pte_mkdirty(pte), vma); +@@ -1937,6 +2018,8 @@ again: + flush_icache_page(vma, page); + set_pte_at(mm, address, page_table, pte); + page_add_anon_rmap(page, vma, address); ++ pb_add_ref(page, mm, &pbc); ++ ub_unused_privvm_dec(mm, vma); + + swap_free(entry); + if (vm_swap_full()) +@@ -1947,7 +2030,7 @@ again: + if (do_wp_page(mm, vma, address, + page_table, pmd, ptl, pte) == VM_FAULT_OOM) + ret = VM_FAULT_OOM; +- goto out; ++ goto out_wp; + } + + /* No need to invalidate - it was non-present before */ +@@ -1955,10 +2038,16 @@ again: + lazy_mmu_prot_update(pte); + unlock: + pte_unmap_unlock(page_table, ptl); +-out: ++out_wp: ++ pb_free(&pbc); ++ spin_lock_irq(&kstat_glb_lock); ++ KSTAT_LAT_ADD(&kstat_glob.swap_in, get_cycles() - start); ++ spin_unlock_irq(&kstat_glb_lock); ++out_nostat: + return ret; + out_nomap: + pte_unmap_unlock(page_table, ptl); ++ pb_free(&pbc); + unlock_page(page); + page_cache_release(page); + return ret; +@@ 
-1976,11 +2065,15 @@ static int do_anonymous_page(struct mm_s + struct page *page; + spinlock_t *ptl; + pte_t entry; ++ struct page_beancounter *pbc; + + if (write_access) { + /* Allocate our own private page. */ + pte_unmap(page_table); + ++ if (unlikely(pb_alloc(&pbc))) ++ goto oom_nopb; ++ + if (unlikely(anon_vma_prepare(vma))) + goto oom; + page = alloc_zeroed_user_highpage(vma, address); +@@ -1996,7 +2089,10 @@ static int do_anonymous_page(struct mm_s + inc_mm_counter(mm, anon_rss); + lru_cache_add_active(page); + page_add_new_anon_rmap(page, vma, address); ++ pb_add_ref(page, mm, &pbc); + } else { ++ pbc = NULL; ++ + /* Map the ZERO_PAGE - vm_page_prot is readonly */ + page = ZERO_PAGE(address); + page_cache_get(page); +@@ -2010,18 +2106,23 @@ static int do_anonymous_page(struct mm_s + page_add_file_rmap(page); + } + ++ inc_vma_rss(vma); ++ ub_unused_privvm_dec(mm, vma); + set_pte_at(mm, address, page_table, entry); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(vma, address, entry); + lazy_mmu_prot_update(entry); + unlock: ++ pb_free(&pbc); + pte_unmap_unlock(page_table, ptl); + return VM_FAULT_MINOR; + release: + page_cache_release(page); + goto unlock; + oom: ++ pb_free(&pbc); ++oom_nopb: + return VM_FAULT_OOM; + } + +@@ -2049,6 +2150,7 @@ static int do_no_page(struct mm_struct * + unsigned int sequence = 0; + int ret = VM_FAULT_MINOR; + int anon = 0; ++ struct page_beancounter *pbc; + + pte_unmap(page_table); + BUG_ON(vma->vm_flags & VM_PFNMAP); +@@ -2058,6 +2160,9 @@ static int do_no_page(struct mm_struct * + sequence = mapping->truncate_count; + smp_rmb(); /* serializes i_size against truncate_count */ + } ++ ++ if (unlikely(pb_alloc(&pbc))) ++ goto oom_nopb; + retry: + new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret); + /* +@@ -2070,9 +2175,9 @@ retry: + + /* no page was available -- either SIGBUS or OOM */ + if (new_page == NOPAGE_SIGBUS) +- return VM_FAULT_SIGBUS; ++ goto bus_nopg; + if (new_page == 
NOPAGE_OOM) +- return VM_FAULT_OOM; ++ goto oom_nopg; + + /* + * Should we do an early C-O-W break? +@@ -2131,6 +2236,9 @@ retry: + inc_mm_counter(mm, file_rss); + page_add_file_rmap(new_page); + } ++ inc_vma_rss(vma); ++ pb_add_ref(new_page, mm, &pbc); ++ ub_unused_privvm_dec(mm, vma); + } else { + /* One of our sibling threads was faster, back out. */ + page_cache_release(new_page); +@@ -2142,10 +2250,18 @@ retry: + lazy_mmu_prot_update(entry); + unlock: + pte_unmap_unlock(page_table, ptl); ++ pb_free(&pbc); + return ret; + oom: + page_cache_release(new_page); ++oom_nopg: ++ pb_free(&pbc); ++oom_nopb: + return VM_FAULT_OOM; ++ ++bus_nopg: ++ pb_free(&pbc); ++ return VM_FAULT_SIGBUS; + } + + /* +@@ -2314,6 +2430,8 @@ int __pud_alloc(struct mm_struct *mm, pg + } + #endif /* __PAGETABLE_PUD_FOLDED */ + ++EXPORT_SYMBOL_GPL(__pud_alloc); ++ + #ifndef __PAGETABLE_PMD_FOLDED + /* + * Allocate page middle directory. +@@ -2348,6 +2466,8 @@ int __pmd_alloc(struct mm_struct *mm, pu + } + #endif /* __PAGETABLE_PMD_FOLDED */ + ++EXPORT_SYMBOL_GPL(__pmd_alloc); ++ + int make_pages_present(unsigned long addr, unsigned long end) + { + int ret, len, write; +diff -upr linux-2.6.16.orig/mm/mempolicy.c linux-2.6.16-026test015/mm/mempolicy.c +--- linux-2.6.16.orig/mm/mempolicy.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/mm/mempolicy.c 2006-07-04 14:41:38.000000000 +0400 +@@ -933,7 +933,7 @@ asmlinkage long sys_migrate_pages(pid_t + + /* Find the mm_struct */ + read_lock(&tasklist_lock); +- task = pid ? find_task_by_pid(pid) : current; ++ task = pid ? 
find_task_by_pid_ve(pid) : current; + if (!task) { + read_unlock(&tasklist_lock); + return -ESRCH; +@@ -1796,7 +1796,6 @@ static void gather_stats(struct page *pa + md->mapcount_max = count; + + md->node[page_to_nid(page)]++; +- cond_resched(); + } + + #ifdef CONFIG_HUGETLB_PAGE +diff -upr linux-2.6.16.orig/mm/mempool.c linux-2.6.16-026test015/mm/mempool.c +--- linux-2.6.16.orig/mm/mempool.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/mm/mempool.c 2006-07-04 14:41:37.000000000 +0400 +@@ -14,6 +14,7 @@ + #include <linux/mempool.h> + #include <linux/blkdev.h> + #include <linux/writeback.h> ++#include <linux/kmem_cache.h> + + static void add_element(mempool_t *pool, void *element) + { +@@ -78,6 +79,8 @@ mempool_t *mempool_create_node(int min_n + init_waitqueue_head(&pool->wait); + pool->alloc = alloc_fn; + pool->free = free_fn; ++ if (alloc_fn == mempool_alloc_slab) ++ kmem_mark_nocharge((kmem_cache_t *)pool_data); + + /* + * First pre-allocate the guaranteed number of buffers. 
+@@ -119,6 +122,7 @@ int mempool_resize(mempool_t *pool, int + unsigned long flags; + + BUG_ON(new_min_nr <= 0); ++ gfp_mask &= ~__GFP_UBC; + + spin_lock_irqsave(&pool->lock, flags); + if (new_min_nr <= pool->min_nr) { +@@ -212,6 +216,7 @@ void * mempool_alloc(mempool_t *pool, gf + gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */ + gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */ + gfp_mask |= __GFP_NOWARN; /* failures are OK */ ++ gfp_mask &= ~__GFP_UBC; + + gfp_temp = gfp_mask & ~(__GFP_WAIT|__GFP_IO); + +diff -upr linux-2.6.16.orig/mm/mlock.c linux-2.6.16-026test015/mm/mlock.c +--- linux-2.6.16.orig/mm/mlock.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/mm/mlock.c 2006-07-04 14:41:39.000000000 +0400 +@@ -8,9 +8,11 @@ + #include <linux/capability.h> + #include <linux/mman.h> + #include <linux/mm.h> ++#include <linux/module.h> + #include <linux/mempolicy.h> + #include <linux/syscalls.h> + ++#include <ub/ub_vmpages.h> + + static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, + unsigned long start, unsigned long end, unsigned int newflags) +@@ -25,6 +27,14 @@ static int mlock_fixup(struct vm_area_st + goto out; + } + ++ if (newflags & VM_LOCKED) { ++ ret = ub_locked_charge(mm, end - start); ++ if (ret < 0) { ++ *prev = vma; ++ goto out; ++ } ++ } ++ + pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); + *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, + vma->vm_file, pgoff, vma_policy(vma)); +@@ -38,13 +48,13 @@ static int mlock_fixup(struct vm_area_st + if (start != vma->vm_start) { + ret = split_vma(mm, vma, start, 1); + if (ret) +- goto out; ++ goto out_uncharge; + } + + if (end != vma->vm_end) { + ret = split_vma(mm, vma, end, 0); + if (ret) +- goto out; ++ goto out_uncharge; + } + + success: +@@ -63,13 +73,19 @@ success: + pages = -pages; + if (!(newflags & VM_IO)) + ret = make_pages_present(start, end); +- } ++ } else ++ ub_locked_uncharge(mm, end - 
start); + + vma->vm_mm->locked_vm -= pages; + out: + if (ret == -ENOMEM) + ret = -EAGAIN; + return ret; ++ ++out_uncharge: ++ if (newflags & VM_LOCKED) ++ ub_locked_uncharge(mm, end - start); ++ goto out; + } + + static int do_mlock(unsigned long start, size_t len, int on) +@@ -146,6 +162,7 @@ asmlinkage long sys_mlock(unsigned long + up_write(¤t->mm->mmap_sem); + return error; + } ++EXPORT_SYMBOL_GPL(sys_mlock); + + asmlinkage long sys_munlock(unsigned long start, size_t len) + { +@@ -158,6 +175,7 @@ asmlinkage long sys_munlock(unsigned lon + up_write(¤t->mm->mmap_sem); + return ret; + } ++EXPORT_SYMBOL_GPL(sys_munlock); + + static int do_mlockall(int flags) + { +diff -upr linux-2.6.16.orig/mm/mmap.c linux-2.6.16-026test015/mm/mmap.c +--- linux-2.6.16.orig/mm/mmap.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/mm/mmap.c 2006-07-04 14:41:39.000000000 +0400 +@@ -25,14 +25,18 @@ + #include <linux/mount.h> + #include <linux/mempolicy.h> + #include <linux/rmap.h> ++#include <linux/virtinfo.h> + + #include <asm/uaccess.h> + #include <asm/cacheflush.h> + #include <asm/tlb.h> + ++#include <ub/ub_vmpages.h> ++ + static void unmap_region(struct mm_struct *mm, + struct vm_area_struct *vma, struct vm_area_struct *prev, + unsigned long start, unsigned long end); ++static unsigned long __do_brk(unsigned long addr, unsigned long len, int soft); + + /* + * WARNING: the debugging will use recursive algorithms so never enable this +@@ -87,6 +91,16 @@ int __vm_enough_memory(long pages, int c + + vm_acct_memory(pages); + ++ switch (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_ENOUGHMEM, ++ (void *)pages) ++ & (NOTIFY_OK | NOTIFY_FAIL)) { ++ case NOTIFY_OK: ++ return 0; ++ case NOTIFY_FAIL: ++ vm_unacct_memory(pages); ++ return -ENOMEM; ++ } ++ + /* + * Sometimes we want to use more memory than we have + */ +@@ -201,11 +215,16 @@ static struct vm_area_struct *remove_vma + struct vm_area_struct *next = vma->vm_next; + + might_sleep(); ++ ++ 
ub_memory_uncharge(vma->vm_mm, vma->vm_end - vma->vm_start, ++ vma->vm_flags, vma->vm_file); + if (vma->vm_ops && vma->vm_ops->close) + vma->vm_ops->close(vma); + if (vma->vm_file) + fput(vma->vm_file); + mpol_free(vma_policy(vma)); ++ if (get_vma_rss(vma)) ++ warn_bad_rss(vma, 0); + kmem_cache_free(vm_area_cachep, vma); + return next; + } +@@ -242,7 +261,7 @@ asmlinkage unsigned long sys_brk(unsigne + goto out; + + /* Ok, looks good - let it rip. */ +- if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) ++ if (__do_brk(oldbrk, newbrk-oldbrk, UB_HARD) != oldbrk) + goto out; + set_brk: + mm->brk = brk; +@@ -726,7 +745,7 @@ struct vm_area_struct *vma_merge(struct + else + next = mm->mmap; + area = next; +- if (next && next->vm_end == end) /* cases 6, 7, 8 */ ++ if (next && next->vm_end == end) /* cases 6, 7, 8 */ + next = next->vm_next; + + /* +@@ -746,11 +765,22 @@ struct vm_area_struct *vma_merge(struct + is_mergeable_anon_vma(prev->anon_vma, + next->anon_vma)) { + /* cases 1, 6 */ ++ add_vma_rss(prev, get_vma_rss(next)); ++ if (area != next) /* case 6 */ ++ add_vma_rss(prev, get_vma_rss(area)); + vma_adjust(prev, prev->vm_start, + next->vm_end, prev->vm_pgoff, NULL); +- } else /* cases 2, 5, 7 */ ++ } else { /* cases 2, 5, 7 */ ++ if (next && addr == next->vm_start) { /* case 5 */ ++ unsigned long rss; ++ rss = pages_in_vma_range(next, addr, end); ++ sub_vma_rss(next, rss); ++ add_vma_rss(prev, rss); ++ } else if (area != next) /* case 7 */ ++ add_vma_rss(prev, get_vma_rss(area)); + vma_adjust(prev, prev->vm_start, + end, prev->vm_pgoff, NULL); ++ } + return prev; + } + +@@ -761,12 +791,19 @@ struct vm_area_struct *vma_merge(struct + mpol_equal(policy, vma_policy(next)) && + can_vma_merge_before(next, vm_flags, + anon_vma, file, pgoff+pglen)) { +- if (prev && addr < prev->vm_end) /* case 4 */ ++ if (prev && addr < prev->vm_end) { /* case 4 */ ++ unsigned long rss; ++ rss = pages_in_vma_range(prev, addr, end); ++ sub_vma_rss(prev, rss); ++ add_vma_rss(next, rss); + 
vma_adjust(prev, prev->vm_start, + addr, prev->vm_pgoff, NULL); +- else /* cases 3, 8 */ ++ } else { /* cases 3, 8 */ ++ if (area != next) /* case 8 */ ++ add_vma_rss(area, get_vma_rss(next)); + vma_adjust(area, addr, next->vm_end, + next->vm_pgoff - pglen, NULL); ++ } + return area; + } + +@@ -1033,6 +1070,10 @@ munmap_back: + } + } + ++ if (ub_memory_charge(mm, len, vm_flags, file, ++ (flags & MAP_EXECPRIO ? UB_SOFT : UB_HARD))) ++ goto charge_error; ++ + /* + * Can we just expand an old private anonymous mapping? + * The VM_SHARED test is necessary because shmem_zero_setup +@@ -1048,7 +1089,8 @@ munmap_back: + * specific mapper. the address has already been validated, but + * not unmapped, but the maps are removed from the list. + */ +- vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); ++ vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL | ++ (flags & MAP_EXECPRIO ? __GFP_SOFT_UBC : 0)); + if (!vma) { + error = -ENOMEM; + goto unacct_error; +@@ -1107,6 +1149,19 @@ munmap_back: + if (correct_wcount) + atomic_inc(&inode->i_writecount); + } else { ++ unsigned long rss; ++ ++ rss = get_vma_rss(vma); ++ if (rss > 0) { ++ if (prev->vm_next && prev->vm_next->vm_start == addr) ++ /* vma_merge expanded next vm_area */ ++ add_vma_rss(prev->vm_next, rss); ++ else ++ /* vma_merge expanded prev vm_area ++ * and probably splitted it with next ++ */ ++ add_vma_rss(prev, rss); ++ } + if (file) { + if (correct_wcount) + atomic_inc(&inode->i_writecount); +@@ -1142,6 +1197,8 @@ unmap_and_free_vma: + free_vma: + kmem_cache_free(vm_area_cachep, vma); + unacct_error: ++ ub_memory_uncharge(mm, len, vm_flags, file); ++charge_error: + if (charged) + vm_unacct_memory(charged); + return error; +@@ -1471,12 +1528,16 @@ static int acct_stack_growth(struct vm_a + return -ENOMEM; + } + ++ if (ub_memory_charge(mm, grow << PAGE_SHIFT, vma->vm_flags, ++ vma->vm_file, UB_SOFT)) ++ goto fail_charge; ++ + /* + * Overcommit.. 
This must be the final test, as it will + * update security statistics. + */ + if (security_vm_enough_memory(grow)) +- return -ENOMEM; ++ goto fail_sec; + + /* Ok, everything looks good - let it rip */ + mm->total_vm += grow; +@@ -1484,6 +1545,11 @@ static int acct_stack_growth(struct vm_a + mm->locked_vm += grow; + vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); + return 0; ++ ++fail_sec: ++ ub_memory_uncharge(mm, grow << PAGE_SHIFT, vma->vm_flags, vma->vm_file); ++fail_charge: ++ return -ENOMEM; + } + + #if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64) +@@ -1744,8 +1810,13 @@ int split_vma(struct mm_struct * mm, str + else + vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); + ++ /* protected with mmap sem */ ++ set_vma_rss(vma, pages_in_vma(vma)); ++ set_vma_rss(new, pages_in_vma(new)); ++ + return 0; + } ++EXPORT_SYMBOL_GPL(split_vma); + + /* Munmap is split into 2 main parts -- this part which finds + * what needs doing, and the areas themselves, which do the +@@ -1839,7 +1910,7 @@ static inline void verify_mm_writelocked + * anonymous maps. eventually we may be able to do some + * brk-specific accounting here. + */ +-unsigned long do_brk(unsigned long addr, unsigned long len) ++static unsigned long __do_brk(unsigned long addr, unsigned long len, int soft) + { + struct mm_struct * mm = current->mm; + struct vm_area_struct * vma, * prev; +@@ -1891,11 +1962,14 @@ unsigned long do_brk(unsigned long addr, + if (mm->map_count > sysctl_max_map_count) + return -ENOMEM; + +- if (security_vm_enough_memory(len >> PAGE_SHIFT)) +- return -ENOMEM; +- + flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; + ++ if (ub_memory_charge(mm, len, flags, NULL, soft)) ++ goto fail_charge; ++ ++ if (security_vm_enough_memory(len >> PAGE_SHIFT)) ++ goto fail_sec; ++ + /* Can we just expand an old private anonymous mapping? 
*/ + if (vma_merge(mm, prev, addr, addr + len, flags, + NULL, NULL, pgoff, NULL)) +@@ -1904,11 +1978,11 @@ unsigned long do_brk(unsigned long addr, + /* + * create a vma struct for an anonymous mapping + */ +- vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); +- if (!vma) { +- vm_unacct_memory(len >> PAGE_SHIFT); +- return -ENOMEM; +- } ++ vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL | ++ (soft == UB_SOFT ? __GFP_SOFT_UBC : 0)); ++ if (!vma) ++ goto fail_alloc; ++ + memset(vma, 0, sizeof(*vma)); + + vma->vm_mm = mm; +@@ -1925,8 +1999,19 @@ out: + make_pages_present(addr, addr + len); + } + return addr; ++ ++fail_alloc: ++ vm_unacct_memory(len >> PAGE_SHIFT); ++fail_sec: ++ ub_memory_uncharge(mm, len, flags, NULL); ++fail_charge: ++ return -ENOMEM; + } + ++unsigned long do_brk(unsigned long addr, unsigned long len) ++{ ++ return __do_brk(addr, len, UB_SOFT); ++} + EXPORT_SYMBOL(do_brk); + + /* Release all mmaps. */ +@@ -2036,6 +2121,7 @@ struct vm_area_struct *copy_vma(struct v + new_vma->vm_start = addr; + new_vma->vm_end = addr + len; + new_vma->vm_pgoff = pgoff; ++ set_vma_rss(new_vma, 0); + if (new_vma->vm_file) + get_file(new_vma->vm_file); + if (new_vma->vm_ops && new_vma->vm_ops->open) +diff -upr linux-2.6.16.orig/mm/mprotect.c linux-2.6.16-026test015/mm/mprotect.c +--- linux-2.6.16.orig/mm/mprotect.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/mm/mprotect.c 2006-07-04 14:41:39.000000000 +0400 +@@ -9,6 +9,7 @@ + */ + + #include <linux/mm.h> ++#include <linux/module.h> + #include <linux/hugetlb.h> + #include <linux/slab.h> + #include <linux/shm.h> +@@ -25,6 +26,8 @@ + #include <asm/cacheflush.h> + #include <asm/tlbflush.h> + ++#include <ub/ub_vmpages.h> ++ + static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, unsigned long end, pgprot_t newprot) + { +@@ -109,12 +112,20 @@ mprotect_fixup(struct vm_area_struct *vm + pgprot_t newprot; + pgoff_t pgoff; + int error; ++ unsigned long ch_size; ++ int 
ch_dir; + + if (newflags == oldflags) { + *pprev = vma; + return 0; + } + ++ error = -ENOMEM; ++ ch_size = nrpages - pages_in_vma_range(vma, start, end); ++ ch_dir = ub_protected_charge(mm, ch_size, newflags, vma); ++ if (ch_dir == PRIVVM_ERROR) ++ goto fail_ch; ++ + /* + * If we make a private mapping writable we increase our commit; + * but (without finer accounting) cannot reduce our commit if we +@@ -127,7 +138,7 @@ mprotect_fixup(struct vm_area_struct *vm + if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) { + charged = nrpages; + if (security_vm_enough_memory(charged)) +- return -ENOMEM; ++ goto fail_sec; + newflags |= VM_ACCOUNT; + } + } +@@ -169,10 +180,16 @@ success: + change_protection(vma, start, end, newprot); + vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); + vm_stat_account(mm, newflags, vma->vm_file, nrpages); ++ if (ch_dir == PRIVVM_TO_SHARED) ++ __ub_unused_privvm_dec(mm, ch_size); + return 0; + + fail: + vm_unacct_memory(charged); ++fail_sec: ++ if (ch_dir == PRIVVM_TO_PRIVATE) ++ __ub_unused_privvm_dec(mm, ch_size); ++fail_ch: + return error; + } + +@@ -280,3 +297,4 @@ out: + up_write(¤t->mm->mmap_sem); + return error; + } ++EXPORT_SYMBOL_GPL(sys_mprotect); +diff -upr linux-2.6.16.orig/mm/mremap.c linux-2.6.16-026test015/mm/mremap.c +--- linux-2.6.16.orig/mm/mremap.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/mm/mremap.c 2006-07-04 14:41:37.000000000 +0400 +@@ -23,6 +23,8 @@ + #include <asm/cacheflush.h> + #include <asm/tlbflush.h> + ++#include <ub/ub_vmpages.h> ++ + static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) + { + pgd_t *pgd; +@@ -106,6 +108,8 @@ static void move_ptes(struct vm_area_str + pte = ptep_clear_flush(vma, old_addr, old_pte); + /* ZERO_PAGE can be dependant on virtual addr */ + pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr); ++ dec_vma_rss(vma); ++ inc_vma_rss(new_vma); + set_pte_at(mm, new_addr, new_pte, pte); + } + +@@ -166,17 +170,21 @@ static 
unsigned long move_vma(struct vm_ + unsigned long hiwater_vm; + int split = 0; + ++ if (ub_memory_charge(mm, new_len, vm_flags, ++ vma->vm_file, UB_HARD)) ++ goto err; ++ + /* + * We'd prefer to avoid failure later on in do_munmap: + * which may split one vma into three before unmapping. + */ + if (mm->map_count >= sysctl_max_map_count - 3) +- return -ENOMEM; ++ goto err_nomem; + + new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); + new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff); + if (!new_vma) +- return -ENOMEM; ++ goto err_nomem; + + moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len); + if (moved_len < old_len) { +@@ -235,7 +243,13 @@ static unsigned long move_vma(struct vm_ + new_addr + new_len); + } + +- return new_addr; ++ if (new_addr != -ENOMEM) ++ return new_addr; ++ ++err_nomem: ++ ub_memory_uncharge(mm, new_len, vm_flags, vma->vm_file); ++err: ++ return -ENOMEM; + } + + /* +@@ -359,7 +373,15 @@ unsigned long do_mremap(unsigned long ad + max_addr = vma->vm_next->vm_start; + /* can we just expand the current mapping? 
*/ + if (max_addr - addr >= new_len) { +- int pages = (new_len - old_len) >> PAGE_SHIFT; ++ int len; ++ int pages; ++ ++ len = new_len - old_len; ++ pages = len >> PAGE_SHIFT; ++ ret = -ENOMEM; ++ if (ub_memory_charge(mm, len, vma->vm_flags, ++ vma->vm_file, UB_HARD)) ++ goto out; + + vma_adjust(vma, vma->vm_start, + addr + new_len, vma->vm_pgoff, NULL); +diff -upr linux-2.6.16.orig/mm/oom_kill.c linux-2.6.16-026test015/mm/oom_kill.c +--- linux-2.6.16.orig/mm/oom_kill.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/mm/oom_kill.c 2006-07-04 14:41:38.000000000 +0400 +@@ -176,7 +176,7 @@ static struct task_struct *select_bad_pr + *ppoints = 0; + + do_posix_clock_monotonic_gettime(&uptime); +- do_each_thread(g, p) { ++ do_each_thread_all(g, p) { + unsigned long points; + int releasing; + +@@ -205,7 +205,7 @@ static struct task_struct *select_bad_pr + chosen = p; + *ppoints = points; + } +- } while_each_thread(g, p); ++ } while_each_thread_all(g, p); + return chosen; + } + +@@ -261,10 +261,10 @@ static struct mm_struct *oom_kill_task(t + * kill all processes that share the ->mm (i.e. 
all threads), + * but are in a different thread group + */ +- do_each_thread(g, q) ++ do_each_thread_all(g, q) { + if (q->mm == mm && q->tgid != p->tgid) + __oom_kill_task(q, message); +- while_each_thread(g, q); ++ } while_each_thread_all(g, q); + + return mm; + } +diff -upr linux-2.6.16.orig/mm/page_alloc.c linux-2.6.16-026test015/mm/page_alloc.c +--- linux-2.6.16.orig/mm/page_alloc.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/mm/page_alloc.c 2006-07-04 14:41:38.000000000 +0400 +@@ -41,6 +41,8 @@ + #include <asm/tlbflush.h> + #include "internal.h" + ++#include <ub/ub_mem.h> ++ + /* + * MCD - HACK: Find somewhere to initialize this EARLY, or make this + * initializer cleaner +@@ -50,6 +52,7 @@ EXPORT_SYMBOL(node_online_map); + nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; + EXPORT_SYMBOL(node_possible_map); + struct pglist_data *pgdat_list __read_mostly; ++EXPORT_SYMBOL(pgdat_list); + unsigned long totalram_pages __read_mostly; + unsigned long totalhigh_pages __read_mostly; + long nr_swap_pages; +@@ -153,7 +156,8 @@ static void bad_page(struct page *page) + 1 << PG_reclaim | + 1 << PG_slab | + 1 << PG_swapcache | +- 1 << PG_writeback ); ++ 1 << PG_writeback | ++ 1 << PG_buddy ); + set_page_count(page, 0); + reset_page_mapcount(page); + page->mapping = NULL; +@@ -224,12 +228,12 @@ static inline unsigned long page_order(s + + static inline void set_page_order(struct page *page, int order) { + set_page_private(page, order); +- __SetPagePrivate(page); ++ __SetPageBuddy(page); + } + + static inline void rmv_page_order(struct page *page) + { +- __ClearPagePrivate(page); ++ __ClearPageBuddy(page); + set_page_private(page, 0); + } + +@@ -268,11 +272,13 @@ __find_combined_index(unsigned long page + * This function checks whether a page is free && is the buddy + * we can do coalesce a page and its buddy if + * (a) the buddy is not in a hole && +- * (b) the buddy is free && +- * (c) the buddy is on the buddy system && +- * (d) a page and 
its buddy have the same order. +- * for recording page's order, we use page_private(page) and PG_private. ++ * (b) the buddy is in the buddy system && ++ * (c) a page and its buddy have the same order. + * ++ * For recording whether a page is in the buddy system, we use PG_buddy. ++ * Setting, clearing, and testing PG_buddy is serialized by zone->lock. ++ * ++ * For recording page's order, we use page_private(page). + */ + static inline int page_is_buddy(struct page *page, int order) + { +@@ -281,10 +287,10 @@ static inline int page_is_buddy(struct p + return 0; + #endif + +- if (PagePrivate(page) && +- (page_order(page) == order) && +- page_count(page) == 0) ++ if (PageBuddy(page) && page_order(page) == order) { ++ BUG_ON(page_count(page) != 0); + return 1; ++ } + return 0; + } + +@@ -301,7 +307,7 @@ static inline int page_is_buddy(struct p + * as necessary, plus some accounting needed to play nicely with other + * parts of the VM system. + * At each level, we keep a list of pages, which are heads of continuous +- * free pages of length of (1 << order) and marked with PG_Private.Page's ++ * free pages of length of (1 << order) and marked with PG_buddy. Page's + * order is recorded in page_private(page) field. + * So when we are allocating or freeing one, we can derive the state of the + * other. 
That is, if we allocate a small block, and both were +@@ -364,7 +370,8 @@ static inline int free_pages_check(struc + 1 << PG_slab | + 1 << PG_swapcache | + 1 << PG_writeback | +- 1 << PG_reserved )))) ++ 1 << PG_reserved | ++ 1 << PG_buddy )))) + bad_page(page); + if (PageDirty(page)) + __ClearPageDirty(page); +@@ -434,6 +441,7 @@ static void __free_pages_ok(struct page + return; + + kernel_map_pages(page, 1 << order, 0); ++ ub_page_uncharge(page, order); + local_irq_save(flags); + __mod_page_state(pgfree, 1 << order); + free_one_page(page_zone(page), page, order); +@@ -522,7 +530,8 @@ static int prep_new_page(struct page *pa + 1 << PG_slab | + 1 << PG_swapcache | + 1 << PG_writeback | +- 1 << PG_reserved )))) ++ 1 << PG_reserved | ++ 1 << PG_buddy )))) + bad_page(page); + + /* +@@ -721,6 +730,7 @@ static void fastcall free_hot_cold_page( + kernel_map_pages(page, 1, 0); + + pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; ++ ub_page_uncharge(page, 0); + local_irq_save(flags); + __inc_page_state(pgfree); + list_add(&page->lru, &pcp->list); +@@ -894,6 +904,28 @@ get_page_from_freelist(gfp_t gfp_mask, u + return page; + } + ++static void __alloc_collect_stats(unsigned int gfp_mask, ++ unsigned int order, struct page *page, cycles_t time) ++{ ++ int ind; ++ unsigned long flags; ++ ++ time = get_cycles() - time; ++ if (!(gfp_mask & __GFP_WAIT)) ++ ind = 0; ++ else if (!(gfp_mask & __GFP_HIGHMEM)) ++ ind = (order > 0 ? 2 : 1); ++ else ++ ind = (order > 0 ? 4 : 3); ++ spin_lock_irqsave(&kstat_glb_lock, flags); ++ KSTAT_LAT_ADD(&kstat_glob.alloc_lat[ind], time); ++ if (!page) ++ kstat_glob.alloc_fails[ind]++; ++ spin_unlock_irqrestore(&kstat_glb_lock, flags); ++} ++ ++int alloc_fail_warn; ++ + /* + * This is the 'heart' of the zoned buddy allocator. 
+ */ +@@ -909,6 +941,7 @@ __alloc_pages(gfp_t gfp_mask, unsigned i + int do_retry; + int alloc_flags; + int did_some_progress; ++ cycles_t start; + + might_sleep_if(wait); + +@@ -920,6 +953,7 @@ restart: + return NULL; + } + ++ start = get_cycles(); + page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, + zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET); + if (page) +@@ -944,7 +978,8 @@ restart: + alloc_flags |= ALLOC_HARDER; + if (gfp_mask & __GFP_HIGH) + alloc_flags |= ALLOC_HIGH; +- alloc_flags |= ALLOC_CPUSET; ++ if (wait) ++ alloc_flags |= ALLOC_CPUSET; + + /* + * Go through the zonelist again. Let __GFP_HIGH and allocations +@@ -1038,14 +1073,22 @@ rebalance: + } + + nopage: +- if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { ++ __alloc_collect_stats(gfp_mask, order, page, start); ++ if (alloc_fail_warn && !(gfp_mask & __GFP_NOWARN) && ++ printk_ratelimit()) { + printk(KERN_WARNING "%s: page allocation failure." + " order:%d, mode:0x%x\n", + p->comm, order, gfp_mask); + dump_stack(); + show_mem(); + } ++ return NULL; ++ + got_pg: ++ if (ub_page_charge(page, order, gfp_mask)) { ++ __free_pages(page, order); ++ page = NULL; ++ } + return page; + } + +@@ -2378,7 +2421,10 @@ static void *vmstat_start(struct seq_fil + m->private = ps; + if (!ps) + return ERR_PTR(-ENOMEM); +- get_full_page_state(ps); ++ if (ve_is_super(get_exec_env())) ++ get_full_page_state(ps); ++ else ++ memset(ps, 0, sizeof(*ps)); + ps->pgpgin /= 2; /* sectors -> kbytes */ + ps->pgpgout /= 2; + return (unsigned long *)ps + *pos; +diff -upr linux-2.6.16.orig/mm/rmap.c linux-2.6.16-026test015/mm/rmap.c +--- linux-2.6.16.orig/mm/rmap.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/mm/rmap.c 2006-07-04 14:41:39.000000000 +0400 +@@ -56,6 +56,8 @@ + + #include <asm/tlbflush.h> + ++#include <ub/ub_vmpages.h> ++ + //#define RMAP_DEBUG /* can be enabled only for debugging */ + + kmem_cache_t *anon_vma_cachep; +@@ -117,6 +119,7 @@ int anon_vma_prepare(struct vm_area_stru + } + 
return 0; + } ++EXPORT_SYMBOL_GPL(anon_vma_prepare); + + void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next) + { +@@ -145,6 +148,7 @@ void anon_vma_link(struct vm_area_struct + spin_unlock(&anon_vma->lock); + } + } ++EXPORT_SYMBOL_GPL(anon_vma_link); + + void anon_vma_unlink(struct vm_area_struct *vma) + { +@@ -180,14 +184,15 @@ static void anon_vma_ctor(void *data, km + void __init anon_vma_init(void) + { + anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), +- 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor, NULL); ++ 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC|SLAB_UBC, ++ anon_vma_ctor, NULL); + } + + /* + * Getting a lock on a stable anon_vma from a page off the LRU is + * tricky: page_lock_anon_vma rely on RCU to guard against the races. + */ +-static struct anon_vma *page_lock_anon_vma(struct page *page) ++struct anon_vma *page_lock_anon_vma(struct page *page) + { + struct anon_vma *anon_vma = NULL; + unsigned long anon_mapping; +@@ -205,6 +210,7 @@ out: + rcu_read_unlock(); + return anon_vma; + } ++EXPORT_SYMBOL_GPL(page_lock_anon_vma); + + #ifdef CONFIG_MIGRATION + /* +@@ -220,6 +226,7 @@ void remove_from_swap(struct page *page) + struct anon_vma *anon_vma; + struct vm_area_struct *vma; + unsigned long mapping; ++ struct page_beancounter *pb; + + if (!PageSwapCache(page)) + return; +@@ -229,6 +236,10 @@ void remove_from_swap(struct page *page) + if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0) + return; + ++ pb = NULL; ++ if (pb_alloc_all(&pb)) ++ return; ++ + /* + * We hold the mmap_sem lock. So no need to call page_lock_anon_vma. 
+ */ +@@ -236,10 +247,12 @@ void remove_from_swap(struct page *page) + spin_lock(&anon_vma->lock); + + list_for_each_entry(vma, &anon_vma->head, anon_vma_node) +- remove_vma_swap(vma, page); ++ remove_vma_swap(vma, page, &pb); + + spin_unlock(&anon_vma->lock); + delete_from_swap_cache(page); ++ ++ pb_free_list(&pb); + } + EXPORT_SYMBOL(remove_from_swap); + #endif +@@ -638,7 +651,11 @@ static int try_to_unmap_one(struct page + } else + dec_mm_counter(mm, file_rss); + ++ dec_vma_rss(vma); + page_remove_rmap(page); ++ ub_unused_privvm_inc(mm, vma); ++ ub_unmap_inc(mm); ++ pb_remove_ref(page, mm); + page_cache_release(page); + + out_unmap: +@@ -729,8 +746,12 @@ static void try_to_unmap_cluster(unsigne + set_page_dirty(page); + + page_remove_rmap(page); ++ ub_unmap_inc(mm); ++ pb_remove_ref(page, mm); ++ ub_unused_privvm_inc(mm, vma); + page_cache_release(page); + dec_mm_counter(mm, file_rss); ++ dec_vma_rss(vma); + (*mapcount)--; + } + pte_unmap_unlock(pte - 1, ptl); +diff -upr linux-2.6.16.orig/mm/shmem.c linux-2.6.16-026test015/mm/shmem.c +--- linux-2.6.16.orig/mm/shmem.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/mm/shmem.c 2006-07-04 14:41:39.000000000 +0400 +@@ -50,6 +50,8 @@ + #include <asm/div64.h> + #include <asm/pgtable.h> + ++#include <ub/ub_vmpages.h> ++ + /* This magic number is used in glibc for posix shared memory */ + #define TMPFS_MAGIC 0x01021994 + +@@ -211,7 +213,7 @@ static void shmem_free_blocks(struct ino + * + * It has to be called with the spinlock held. 
+ */ +-static void shmem_recalc_inode(struct inode *inode) ++static void shmem_recalc_inode(struct inode *inode, long swp_freed) + { + struct shmem_inode_info *info = SHMEM_I(inode); + long freed; +@@ -221,6 +223,8 @@ static void shmem_recalc_inode(struct in + info->alloced -= freed; + shmem_unacct_blocks(info->flags, freed); + shmem_free_blocks(inode, freed); ++ if (freed > swp_freed) ++ ub_tmpfs_respages_sub(info, freed - swp_freed); + } + } + +@@ -326,6 +330,11 @@ static void shmem_swp_set(struct shmem_i + struct page *page = kmap_atomic_to_page(entry); + set_page_private(page, page_private(page) + incdec); + } ++ ++ if (incdec == 1) ++ ub_tmpfs_respages_dec(info); ++ else ++ ub_tmpfs_respages_inc(info); + } + + /* +@@ -342,14 +351,24 @@ static swp_entry_t *shmem_swp_alloc(stru + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + struct page *page = NULL; + swp_entry_t *entry; ++ unsigned long ub_val; + + if (sgp != SGP_WRITE && + ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) + return ERR_PTR(-EINVAL); + ++ ub_val = 0; ++ if (info->next_index <= index) { ++ ub_val = index + 1 - info->next_index; ++ if (ub_shmpages_charge(info, ub_val)) ++ return ERR_PTR(-ENOSPC); ++ } ++ + while (!(entry = shmem_swp_entry(info, index, &page))) { +- if (sgp == SGP_READ) +- return shmem_swp_map(ZERO_PAGE(0)); ++ if (sgp == SGP_READ) { ++ entry = shmem_swp_map(ZERO_PAGE(0)); ++ goto out; ++ } + /* + * Test free_blocks against 1 not 0, since we have 1 data + * page (and perhaps indirect index pages) yet to allocate: +@@ -359,7 +378,8 @@ static swp_entry_t *shmem_swp_alloc(stru + spin_lock(&sbinfo->stat_lock); + if (sbinfo->free_blocks <= 1) { + spin_unlock(&sbinfo->stat_lock); +- return ERR_PTR(-ENOSPC); ++ entry = ERR_PTR(-ENOSPC); ++ goto out; + } + sbinfo->free_blocks--; + inode->i_blocks += BLOCKS_PER_PAGE; +@@ -367,31 +387,43 @@ static swp_entry_t *shmem_swp_alloc(stru + } + + spin_unlock(&info->lock); +- page = 
shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | __GFP_ZERO); ++ page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | ++ __GFP_ZERO | __GFP_UBC); + if (page) + set_page_private(page, 0); + spin_lock(&info->lock); + + if (!page) { +- shmem_free_blocks(inode, 1); +- return ERR_PTR(-ENOMEM); ++ entry = ERR_PTR(-ENOMEM); ++ goto out_block; + } + if (sgp != SGP_WRITE && + ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { + entry = ERR_PTR(-EINVAL); +- break; ++ goto out_dir; + } +- if (info->next_index <= index) ++ if (info->next_index <= index) { ++ ub_val = 0; + info->next_index = index + 1; ++ } + } + if (page) { + /* another task gave its page, or truncated the file */ + shmem_free_blocks(inode, 1); + shmem_dir_free(page); + } +- if (info->next_index <= index && !IS_ERR(entry)) ++ if (info->next_index <= index) + info->next_index = index + 1; + return entry; ++ ++out_dir: ++ shmem_dir_free(page); ++out_block: ++ shmem_free_blocks(inode, 1); ++out: ++ if (ub_val) ++ ub_shmpages_uncharge(info, ub_val); ++ return entry; + } + + /* +@@ -484,6 +516,7 @@ static void shmem_truncate_range(struct + return; + + spin_lock(&info->lock); ++ ub_shmpages_uncharge(info, info->next_index - idx); + info->flags |= SHMEM_TRUNCATE; + if (likely(end == (loff_t) -1)) { + limit = info->next_index; +@@ -613,7 +646,7 @@ done2: + info->swapped -= nr_swaps_freed; + if (nr_pages_to_free) + shmem_free_blocks(inode, nr_pages_to_free); +- shmem_recalc_inode(inode); ++ shmem_recalc_inode(inode, nr_swaps_freed); + spin_unlock(&info->lock); + + /* +@@ -696,6 +729,7 @@ static void shmem_delete_inode(struct in + sbinfo->free_inodes++; + spin_unlock(&sbinfo->stat_lock); + } ++ shmi_ub_put(info); + clear_inode(inode); + } + +@@ -817,6 +851,12 @@ int shmem_unuse(swp_entry_t entry, struc + return found; + } + ++#ifdef CONFIG_USER_RESOURCE ++#define shm_get_swap_page(info) (get_swap_page((info)->shmi_ub)) ++#else ++#define shm_get_swap_page(info) (get_swap_page(NULL)) ++#endif ++ + /* 
+ * Move the page from the page cache to the swap cache. + */ +@@ -837,12 +877,12 @@ static int shmem_writepage(struct page * + info = SHMEM_I(inode); + if (info->flags & VM_LOCKED) + goto redirty; +- swap = get_swap_page(); ++ swap = shm_get_swap_page(info); + if (!swap.val) + goto redirty; + + spin_lock(&info->lock); +- shmem_recalc_inode(inode); ++ shmem_recalc_inode(inode, 0); + if (index >= info->next_index) { + BUG_ON(!(info->flags & SHMEM_TRUNCATE)); + goto unlock; +@@ -1030,7 +1070,7 @@ repeat: + goto failed; + + spin_lock(&info->lock); +- shmem_recalc_inode(inode); ++ shmem_recalc_inode(inode, 0); + entry = shmem_swp_alloc(info, idx, sgp); + if (IS_ERR(entry)) { + spin_unlock(&info->lock); +@@ -1206,6 +1246,7 @@ repeat: + spin_unlock(&info->lock); + flush_dcache_page(filepage); + SetPageUptodate(filepage); ++ ub_tmpfs_respages_inc(info); + } + done: + if (*pagep != filepage) { +@@ -1307,28 +1348,6 @@ shmem_get_policy(struct vm_area_struct * + } + #endif + +-int shmem_lock(struct file *file, int lock, struct user_struct *user) +-{ +- struct inode *inode = file->f_dentry->d_inode; +- struct shmem_inode_info *info = SHMEM_I(inode); +- int retval = -ENOMEM; +- +- spin_lock(&info->lock); +- if (lock && !(info->flags & VM_LOCKED)) { +- if (!user_shm_lock(inode->i_size, user)) +- goto out_nomem; +- info->flags |= VM_LOCKED; +- } +- if (!lock && (info->flags & VM_LOCKED) && user) { +- user_shm_unlock(inode->i_size, user); +- info->flags &= ~VM_LOCKED; +- } +- retval = 0; +-out_nomem: +- spin_unlock(&info->lock); +- return retval; +-} +- + int shmem_mmap(struct file *file, struct vm_area_struct *vma) + { + file_accessed(file); +@@ -1365,6 +1384,7 @@ shmem_get_inode(struct super_block *sb, + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + info = SHMEM_I(inode); + memset(info, 0, (char *)inode - (char *)info); ++ shmi_ub_set(info, get_exec_ub()); + spin_lock_init(&info->lock); + INIT_LIST_HEAD(&info->swaplist); + +@@ -2100,6 +2120,7 @@ static int 
shmem_fill_super(struct super + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = TMPFS_MAGIC; + sb->s_op = &shmem_ops; ++ sb->s_time_gran = 1; + + inode = shmem_get_inode(sb, S_IFDIR | mode, 0); + if (!inode) +@@ -2172,6 +2193,7 @@ static struct address_space_operations s + .prepare_write = shmem_prepare_write, + .commit_write = simple_commit_write, + #endif ++ .migratepage = migrate_page, + }; + + static struct file_operations shmem_file_operations = { +@@ -2226,6 +2248,10 @@ static struct vm_operations_struct shmem + #endif + }; + ++int is_shmem_mapping(struct address_space *map) ++{ ++ return (map != NULL && map->a_ops == &shmem_aops); ++} + + static struct super_block *shmem_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +@@ -2233,13 +2259,19 @@ static struct super_block *shmem_get_sb( + return get_sb_nodev(fs_type, flags, data, shmem_fill_super); + } + +-static struct file_system_type tmpfs_fs_type = { ++struct file_system_type tmpfs_fs_type = { + .owner = THIS_MODULE, + .name = "tmpfs", + .get_sb = shmem_get_sb, + .kill_sb = kill_litter_super, + }; ++EXPORT_SYMBOL(tmpfs_fs_type); ++ ++#ifdef CONFIG_VE ++#define shm_mnt (get_exec_env()->shmem_mnt) ++#else + static struct vfsmount *shm_mnt; ++#endif + + static int __init init_tmpfs(void) + { +@@ -2276,6 +2308,36 @@ out3: + } + module_init(init_tmpfs) + ++static inline int shm_charge_ahead(struct inode *inode) ++{ ++#ifdef CONFIG_USER_RESOURCE ++ struct shmem_inode_info *info = SHMEM_I(inode); ++ unsigned long idx; ++ swp_entry_t *entry; ++ ++ if (!inode->i_size) ++ return 0; ++ idx = (inode->i_size - 1) >> PAGE_CACHE_SHIFT; ++ /* ++ * Just touch info to allocate space for entry and ++ * make all UBC checks ++ */ ++ spin_lock(&info->lock); ++ entry = shmem_swp_alloc(info, idx, SGP_CACHE); ++ if (IS_ERR(entry)) ++ goto err; ++ shmem_swp_unmap(entry); ++ spin_unlock(&info->lock); ++ return 0; ++ ++err: ++ spin_unlock(&info->lock); ++ return PTR_ERR(entry); ++#else 
++ return 0; ++#endif ++} ++ + /* + * shmem_file_setup - get an unlinked file living in tmpfs + * +@@ -2323,6 +2385,10 @@ struct file *shmem_file_setup(char *name + d_instantiate(dentry, inode); + inode->i_size = size; + inode->i_nlink = 0; /* It is unlinked */ ++ error = shm_charge_ahead(inode); ++ if (error) ++ goto close_file; ++ + file->f_vfsmnt = mntget(shm_mnt); + file->f_dentry = dentry; + file->f_mapping = inode->i_mapping; +@@ -2338,6 +2404,7 @@ put_memory: + shmem_unacct_size(flags, size); + return ERR_PTR(error); + } ++EXPORT_SYMBOL_GPL(shmem_file_setup); + + /* + * shmem_zero_setup - setup a shared anonymous mapping +@@ -2355,6 +2422,8 @@ int shmem_zero_setup(struct vm_area_stru + + if (vma->vm_file) + fput(vma->vm_file); ++ else if (vma->vm_flags & VM_WRITE) ++ __ub_unused_privvm_dec(vma->vm_mm, size >> PAGE_SHIFT); + vma->vm_file = file; + vma->vm_ops = &shmem_vm_ops; + return 0; +diff -upr linux-2.6.16.orig/mm/slab.c linux-2.6.16-026test015/mm/slab.c +--- linux-2.6.16.orig/mm/slab.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/mm/slab.c 2006-07-04 14:41:38.000000000 +0400 +@@ -105,32 +105,19 @@ + #include <linux/nodemask.h> + #include <linux/mempolicy.h> + #include <linux/mutex.h> ++#include <linux/kmem_slab.h> ++#include <linux/kmem_cache.h> + + #include <asm/uaccess.h> + #include <asm/cacheflush.h> + #include <asm/tlbflush.h> + #include <asm/page.h> + +-/* +- * DEBUG - 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL, +- * SLAB_RED_ZONE & SLAB_POISON. +- * 0 for faster, smaller code (especially in the critical paths). +- * +- * STATS - 1 to collect stats for /proc/slabinfo. +- * 0 for faster, smaller code (especially in the critical paths). 
+- * +- * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible) +- */ ++#include <ub/ub_mem.h> + +-#ifdef CONFIG_DEBUG_SLAB +-#define DEBUG 1 +-#define STATS 1 +-#define FORCED_DEBUG 1 +-#else +-#define DEBUG 0 +-#define STATS 0 +-#define FORCED_DEBUG 0 +-#endif ++#define DEBUG SLAB_DEBUG ++#define STATS SLAB_STATS ++#define FORCED_DEBUG SLAB_FORCED_DEBUG + + /* Shouldn't this be in a header file somewhere? */ + #define BYTES_PER_WORD sizeof(void *) +@@ -173,134 +160,20 @@ + SLAB_NO_REAP | SLAB_CACHE_DMA | \ + SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \ + SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ +- SLAB_DESTROY_BY_RCU) ++ SLAB_DESTROY_BY_RCU | SLAB_UBC | SLAB_NO_CHARGE) + #else + # define CREATE_MASK (SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \ + SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \ + SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ +- SLAB_DESTROY_BY_RCU) ++ SLAB_DESTROY_BY_RCU | SLAB_UBC | SLAB_NO_CHARGE) + #endif + +-/* +- * kmem_bufctl_t: +- * +- * Bufctl's are used for linking objs within a slab +- * linked offsets. +- * +- * This implementation relies on "struct page" for locating the cache & +- * slab an object belongs to. +- * This allows the bufctl structure to be small (one int), but limits +- * the number of objects a slab (not a cache) can contain when off-slab +- * bufctls are used. The limit is the size of the largest general cache +- * that does not use off-slab slabs. +- * For 32bit archs with 4 kB pages, is this 56. +- * This is not serious, as it is only for large objects, when it is unwise +- * to have too many per slab. +- * Note: This limit can be raised by introducing a general cache whose size +- * is less than 512 (PAGE_SIZE<<3), but greater than 256. +- */ +- +-typedef unsigned int kmem_bufctl_t; +-#define BUFCTL_END (((kmem_bufctl_t)(~0U))-0) +-#define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1) +-#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-2) +- + /* Max number of objs-per-slab for caches which use off-slab slabs. 
+ * Needed to avoid a possible looping condition in cache_grow(). + */ + static unsigned long offslab_limit; + + /* +- * struct slab +- * +- * Manages the objs in a slab. Placed either at the beginning of mem allocated +- * for a slab, or allocated from an general cache. +- * Slabs are chained into three list: fully used, partial, fully free slabs. +- */ +-struct slab { +- struct list_head list; +- unsigned long colouroff; +- void *s_mem; /* including colour offset */ +- unsigned int inuse; /* num of objs active in slab */ +- kmem_bufctl_t free; +- unsigned short nodeid; +-}; +- +-/* +- * struct slab_rcu +- * +- * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to +- * arrange for kmem_freepages to be called via RCU. This is useful if +- * we need to approach a kernel structure obliquely, from its address +- * obtained without the usual locking. We can lock the structure to +- * stabilize it and check it's still at the given address, only if we +- * can be sure that the memory has not been meanwhile reused for some +- * other kind of object (which our subsystem's lock might corrupt). +- * +- * rcu_read_lock before reading the address, then rcu_read_unlock after +- * taking the spinlock within the structure expected at that address. +- * +- * We assume struct slab_rcu can overlay struct slab when destroying. +- */ +-struct slab_rcu { +- struct rcu_head head; +- struct kmem_cache *cachep; +- void *addr; +-}; +- +-/* +- * struct array_cache +- * +- * Purpose: +- * - LIFO ordering, to hand out cache-warm objects from _alloc +- * - reduce the number of linked list operations +- * - reduce spinlock operations +- * +- * The limit is stored in the per-cpu structure to reduce the data cache +- * footprint. 
+- * +- */ +-struct array_cache { +- unsigned int avail; +- unsigned int limit; +- unsigned int batchcount; +- unsigned int touched; +- spinlock_t lock; +- void *entry[0]; /* +- * Must have this definition in here for the proper +- * alignment of array_cache. Also simplifies accessing +- * the entries. +- * [0] is for gcc 2.95. It should really be []. +- */ +-}; +- +-/* bootstrap: The caches do not work without cpuarrays anymore, +- * but the cpuarrays are allocated from the generic caches... +- */ +-#define BOOT_CPUCACHE_ENTRIES 1 +-struct arraycache_init { +- struct array_cache cache; +- void *entries[BOOT_CPUCACHE_ENTRIES]; +-}; +- +-/* +- * The slab lists for all objects. +- */ +-struct kmem_list3 { +- struct list_head slabs_partial; /* partial list first, better asm code */ +- struct list_head slabs_full; +- struct list_head slabs_free; +- unsigned long free_objects; +- unsigned long next_reap; +- int free_touched; +- unsigned int free_limit; +- unsigned int colour_next; /* Per-node cache coloring */ +- spinlock_t list_lock; +- struct array_cache *shared; /* shared per node */ +- struct array_cache **alien; /* on other nodes */ +-}; +- +-/* + * Need this for bootstrapping a per node allocator. + */ + #define NUM_INIT_LISTS (2 * MAX_NUMNODES + 1) +@@ -364,79 +237,6 @@ static void kmem_list3_init(struct kmem_ + MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ + } while (0) + +-/* +- * struct kmem_cache +- * +- * manages a cache. 
+- */ +- +-struct kmem_cache { +-/* 1) per-cpu data, touched during every alloc/free */ +- struct array_cache *array[NR_CPUS]; +- unsigned int batchcount; +- unsigned int limit; +- unsigned int shared; +- unsigned int buffer_size; +-/* 2) touched by every alloc & free from the backend */ +- struct kmem_list3 *nodelists[MAX_NUMNODES]; +- unsigned int flags; /* constant flags */ +- unsigned int num; /* # of objs per slab */ +- spinlock_t spinlock; +- +-/* 3) cache_grow/shrink */ +- /* order of pgs per slab (2^n) */ +- unsigned int gfporder; +- +- /* force GFP flags, e.g. GFP_DMA */ +- gfp_t gfpflags; +- +- size_t colour; /* cache colouring range */ +- unsigned int colour_off; /* colour offset */ +- struct kmem_cache *slabp_cache; +- unsigned int slab_size; +- unsigned int dflags; /* dynamic flags */ +- +- /* constructor func */ +- void (*ctor) (void *, struct kmem_cache *, unsigned long); +- +- /* de-constructor func */ +- void (*dtor) (void *, struct kmem_cache *, unsigned long); +- +-/* 4) cache creation/removal */ +- const char *name; +- struct list_head next; +- +-/* 5) statistics */ +-#if STATS +- unsigned long num_active; +- unsigned long num_allocations; +- unsigned long high_mark; +- unsigned long grown; +- unsigned long reaped; +- unsigned long errors; +- unsigned long max_freeable; +- unsigned long node_allocs; +- unsigned long node_frees; +- atomic_t allochit; +- atomic_t allocmiss; +- atomic_t freehit; +- atomic_t freemiss; +-#endif +-#if DEBUG +- /* +- * If debugging is enabled, then the allocator can add additional +- * fields and/or padding to every object. buffer_size contains the total +- * object size including these internal fields, the following two +- * variables contain the offset to the user object and its size. 
+- */ +- int obj_offset; +- int obj_size; +-#endif +-}; +- +-#define CFLGS_OFF_SLAB (0x80000000UL) +-#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) +- + #define BATCHREFILL_LIMIT 16 + /* Optimization question: fewer reaps means less + * probability for unnessary cpucache drain/refill cycles. +@@ -573,42 +373,6 @@ static void **dbg_userword(struct kmem_c + #define BREAK_GFP_ORDER_LO 0 + static int slab_break_gfp_order = BREAK_GFP_ORDER_LO; + +-/* Functions for storing/retrieving the cachep and or slab from the +- * global 'mem_map'. These are used to find the slab an obj belongs to. +- * With kfree(), these are used to find the cache which an obj belongs to. +- */ +-static inline void page_set_cache(struct page *page, struct kmem_cache *cache) +-{ +- page->lru.next = (struct list_head *)cache; +-} +- +-static inline struct kmem_cache *page_get_cache(struct page *page) +-{ +- return (struct kmem_cache *)page->lru.next; +-} +- +-static inline void page_set_slab(struct page *page, struct slab *slab) +-{ +- page->lru.prev = (struct list_head *)slab; +-} +- +-static inline struct slab *page_get_slab(struct page *page) +-{ +- return (struct slab *)page->lru.prev; +-} +- +-static inline struct kmem_cache *virt_to_cache(const void *obj) +-{ +- struct page *page = virt_to_page(obj); +- return page_get_cache(page); +-} +- +-static inline struct slab *virt_to_slab(const void *obj) +-{ +- struct page *page = virt_to_page(obj); +- return page_get_slab(page); +-} +- + /* These are the default caches for kmalloc. Custom caches can have other sizes. 
*/ + struct cache_sizes malloc_sizes[] = { + #define CACHE(x) { .cs_size = (x) }, +@@ -715,9 +479,17 @@ struct kmem_cache *kmem_find_general_cac + } + EXPORT_SYMBOL(kmem_find_general_cachep); + +-static size_t slab_mgmt_size(size_t nr_objs, size_t align) ++static size_t slab_mgmt_size_noalign(size_t nr_objs, int flags) + { +- return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align); ++ size_t size_noub; ++ ++ size_noub = sizeof(struct slab) + nr_objs * sizeof(kmem_bufctl_t); ++ return ALIGN(size_noub, UB_ALIGN(flags)) + nr_objs * UB_EXTRA(flags); ++} ++ ++static size_t slab_mgmt_size(size_t nr_objs, size_t align, int flags) ++{ ++ return ALIGN(slab_mgmt_size_noalign(nr_objs, flags), align); + } + + /* Calculate the number of objects and left-over bytes for a given +@@ -761,20 +533,23 @@ static void cache_estimate(unsigned long + * into account. + */ + nr_objs = (slab_size - sizeof(struct slab)) / +- (buffer_size + sizeof(kmem_bufctl_t)); ++ (buffer_size + sizeof(kmem_bufctl_t) + ++ UB_EXTRA(flags)); + + /* + * This calculated number will be either the right + * amount, or one greater than what we want. 
+ */ +- if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size +- > slab_size) ++ if (slab_mgmt_size(nr_objs, align, flags) + ++ nr_objs * buffer_size > slab_size) + nr_objs--; ++ BUG_ON(slab_mgmt_size(nr_objs, align, flags) + ++ nr_objs * buffer_size > slab_size); + + if (nr_objs > SLAB_LIMIT) + nr_objs = SLAB_LIMIT; + +- mgmt_size = slab_mgmt_size(nr_objs, align); ++ mgmt_size = slab_mgmt_size(nr_objs, align, flags); + } + *num = nr_objs; + *left_over = slab_size - nr_objs*buffer_size - mgmt_size; +@@ -1254,6 +1029,7 @@ void __init kmem_cache_init(void) + sizes[INDEX_AC].cs_size, + ARCH_KMALLOC_MINALIGN, + (ARCH_KMALLOC_FLAGS | ++ SLAB_UBC|SLAB_NO_CHARGE | + SLAB_PANIC), NULL, NULL); + + if (INDEX_AC != INDEX_L3) +@@ -1261,8 +1037,9 @@ void __init kmem_cache_init(void) + kmem_cache_create(names[INDEX_L3].name, + sizes[INDEX_L3].cs_size, + ARCH_KMALLOC_MINALIGN, +- (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, +- NULL); ++ (ARCH_KMALLOC_FLAGS | ++ SLAB_UBC | SLAB_NO_CHARGE | ++ SLAB_PANIC), NULL, NULL); + + while (sizes->cs_size != ULONG_MAX) { + /* +@@ -1277,14 +1054,14 @@ void __init kmem_cache_init(void) + sizes->cs_size, + ARCH_KMALLOC_MINALIGN, + (ARCH_KMALLOC_FLAGS ++ | SLAB_UBC ++ | SLAB_NO_CHARGE + | SLAB_PANIC), + NULL, NULL); + + /* Inc off-slab bufctl limit until the ceiling is hit. 
*/ +- if (!(OFF_SLAB(sizes->cs_cachep))) { +- offslab_limit = sizes->cs_size - sizeof(struct slab); +- offslab_limit /= sizeof(kmem_bufctl_t); +- } ++ if (!(OFF_SLAB(sizes->cs_cachep))) ++ offslab_limit = sizes->cs_size; + + sizes->cs_dmacachep = kmem_cache_create(names->name_dma, + sizes->cs_size, +@@ -1704,8 +1481,13 @@ static inline size_t calculate_slab_orde + continue; + + /* More than offslab_limit objects will cause problems */ +- if ((flags & CFLGS_OFF_SLAB) && num > offslab_limit) +- break; ++ if (flags & CFLGS_OFF_SLAB) { ++ unsigned long slab_size; ++ ++ slab_size = slab_mgmt_size_noalign(num, flags); ++ if (slab_size > offslab_limit) ++ break; ++ } + + /* Found something acceptable - save it away */ + cachep->num = num; +@@ -1950,8 +1732,7 @@ kmem_cache_create (const char *name, siz + cachep = NULL; + goto oops; + } +- slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) +- + sizeof(struct slab), align); ++ slab_size = slab_mgmt_size(cachep->num, align, flags); + + /* + * If the slab has been placed off-slab, and we have enough space then +@@ -1964,8 +1745,7 @@ kmem_cache_create (const char *name, siz + + if (flags & CFLGS_OFF_SLAB) { + /* really off slab. 
No need for manual alignment */ +- slab_size = +- cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab); ++ slab_size = slab_mgmt_size_noalign(cachep->num, flags); + } + + cachep->colour_off = cache_line_size(); +@@ -2045,6 +1825,7 @@ kmem_cache_create (const char *name, siz + + /* cache setup completed, link it into the list */ + list_add(&cachep->next, &cache_chain); ++ set_cache_objuse(cachep); + oops: + if (!cachep && (flags & SLAB_PANIC)) + panic("kmem_cache_create(): failed to create slab `%s'\n", +@@ -2266,6 +2047,8 @@ int kmem_cache_destroy(struct kmem_cache + kfree(l3); + } + } ++ ++ ub_kmemcache_free(cachep); + kmem_cache_free(&cache_cache, cachep); + + unlock_cpu_hotplug(); +@@ -2282,7 +2065,8 @@ static struct slab *alloc_slabmgmt(struc + + if (OFF_SLAB(cachep)) { + /* Slab management obj is off-slab. */ +- slabp = kmem_cache_alloc(cachep->slabp_cache, local_flags); ++ slabp = kmem_cache_alloc(cachep->slabp_cache, ++ local_flags & (~__GFP_UBC)); + if (!slabp) + return NULL; + } else { +@@ -2292,15 +2076,11 @@ static struct slab *alloc_slabmgmt(struc + slabp->inuse = 0; + slabp->colouroff = colour_off; + slabp->s_mem = objp + colour_off; ++ init_slab_ubps(cachep, slabp); + + return slabp; + } + +-static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) +-{ +- return (kmem_bufctl_t *) (slabp + 1); +-} +- + static void cache_init_objs(struct kmem_cache *cachep, + struct slab *slabp, unsigned long ctor_flags) + { +@@ -2470,7 +2250,7 @@ static int cache_grow(struct kmem_cache + /* Get mem for the objs. + * Attempt to allocate a physical page from 'nodeid', + */ +- if (!(objp = kmem_getpages(cachep, flags, nodeid))) ++ if (!(objp = kmem_getpages(cachep, flags & (~__GFP_UBC), nodeid))) + goto failed; + + /* Get slab management. 
*/ +@@ -2823,6 +2603,11 @@ __cache_alloc(struct kmem_cache *cachep, + objp = cache_alloc_debugcheck_after(cachep, flags, objp, + caller); + prefetchw(objp); ++ ++ if (objp && ub_slab_charge(objp, flags)) { ++ kmem_cache_free(cachep, objp); ++ objp = NULL; ++ } + return objp; + } + +@@ -2997,6 +2782,8 @@ static inline void __cache_free(struct k + check_irq_off(); + objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); + ++ ub_slab_uncharge(objp); ++ + /* Make sure we are not freeing a object from another + * node to the array cache on this cpu. + */ +@@ -3128,6 +2915,10 @@ void *kmem_cache_alloc_node(struct kmem_ + ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, + __builtin_return_address(0)); + ++ if (ptr && ub_slab_charge(ptr, flags)) { ++ kmem_cache_free(cachep, ptr); ++ ptr = NULL; ++ } + return ptr; + } + EXPORT_SYMBOL(kmem_cache_alloc_node); +@@ -3543,6 +3334,7 @@ static void cache_reap(void *unused) + return; + } + ++ {KSTAT_PERF_ENTER(cache_reap) + list_for_each(walk, &cache_chain) { + struct kmem_cache *searchp; + struct list_head *p; +@@ -3608,6 +3400,7 @@ static void cache_reap(void *unused) + check_irq_on(); + mutex_unlock(&cache_chain_mutex); + next_reap_node(); ++ KSTAT_PERF_LEAVE(cache_reap)} + /* Setup the next iteration */ + schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); + } +diff -upr linux-2.6.16.orig/mm/swap_state.c linux-2.6.16-026test015/mm/swap_state.c +--- linux-2.6.16.orig/mm/swap_state.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/mm/swap_state.c 2006-07-04 14:41:38.000000000 +0400 +@@ -18,6 +18,8 @@ + + #include <asm/pgtable.h> + ++#include <ub/ub_vmpages.h> ++ + /* + * swapper_space is a fiction, retained to simplify the path through + * vmscan's shrink_list, to make sync_page look nicer, and to allow +@@ -52,14 +54,18 @@ static struct { + unsigned long find_total; + unsigned long noent_race; + unsigned long exist_race; ++ unsigned long remove_race; + } swap_cache_info; 
++EXPORT_SYMBOL(swap_cache_info); + + void show_swap_cache_info(void) + { +- printk("Swap cache: add %lu, delete %lu, find %lu/%lu, race %lu+%lu\n", ++ printk("Swap cache: add %lu, delete %lu, find %lu/%lu, " ++ "race %lu+%lu+%lu\n", + swap_cache_info.add_total, swap_cache_info.del_total, + swap_cache_info.find_success, swap_cache_info.find_total, +- swap_cache_info.noent_race, swap_cache_info.exist_race); ++ swap_cache_info.noent_race, swap_cache_info.exist_race, ++ swap_cache_info.remove_race); + printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10)); + printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); + } +@@ -151,7 +157,14 @@ int add_to_swap(struct page * page, gfp_ + BUG(); + + for (;;) { +- entry = get_swap_page(); ++ struct user_beancounter *ub; ++ ++ ub = pb_grab_page_ub(page); ++ if (IS_ERR(ub)) ++ return 0; ++ ++ entry = get_swap_page(ub); ++ put_beancounter(ub); + if (!entry.val) + return 0; + +@@ -252,10 +265,13 @@ int move_from_swap_cache(struct page *pa + */ + static inline void free_swap_cache(struct page *page) + { +- if (PageSwapCache(page) && !TestSetPageLocked(page)) { ++ if (!PageSwapCache(page)) ++ return; ++ if (!TestSetPageLocked(page)) { + remove_exclusive_swap_page(page); + unlock_page(page); +- } ++ } else ++ INC_CACHE_INFO(remove_race); + } + + /* +diff -upr linux-2.6.16.orig/mm/swapfile.c linux-2.6.16-026test015/mm/swapfile.c +--- linux-2.6.16.orig/mm/swapfile.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/mm/swapfile.c 2006-07-04 14:41:39.000000000 +0400 +@@ -33,6 +33,8 @@ + #include <asm/tlbflush.h> + #include <linux/swapops.h> + ++#include <ub/ub_vmpages.h> ++ + DEFINE_SPINLOCK(swap_lock); + unsigned int nr_swapfiles; + long total_swap_pages; +@@ -172,7 +174,7 @@ no_page: + return 0; + } + +-swp_entry_t get_swap_page(void) ++swp_entry_t get_swap_page(struct user_beancounter *ub) + { + struct swap_info_struct *si; + pgoff_t offset; +@@ -202,6 +204,7 @@ swp_entry_t get_swap_page(void) 
+ offset = scan_swap_map(si); + if (offset) { + spin_unlock(&swap_lock); ++ ub_swapentry_inc(si, offset, ub); + return swp_entry(type, offset); + } + next = swap_list.next; +@@ -277,6 +280,7 @@ static int swap_entry_free(struct swap_i + count--; + p->swap_map[offset] = count; + if (!count) { ++ ub_swapentry_dec(p, offset); + if (offset < p->lowest_bit) + p->lowest_bit = offset; + if (offset > p->highest_bit) +@@ -423,11 +427,18 @@ void free_swap_and_cache(swp_entry_t ent + * force COW, vm_page_prot omits write permission from any private vma. + */ + static void unuse_pte(struct vm_area_struct *vma, pte_t *pte, +- unsigned long addr, swp_entry_t entry, struct page *page) ++ unsigned long addr, swp_entry_t entry, struct page *page, ++ struct page_beancounter **pb) + { +- inc_mm_counter(vma->vm_mm, anon_rss); ++ struct mm_struct *mm; ++ ++ mm = vma->vm_mm; ++ inc_mm_counter(mm, anon_rss); ++ inc_vma_rss(vma); ++ ub_unused_privvm_dec(mm, vma); ++ pb_add_ref(page, mm, pb); + get_page(page); +- set_pte_at(vma->vm_mm, addr, pte, ++ set_pte_at(mm, addr, pte, + pte_mkold(mk_pte(page, vma->vm_page_prot))); + page_add_anon_rmap(page, vma, addr); + swap_free(entry); +@@ -440,7 +451,8 @@ static void unuse_pte(struct vm_area_str + + static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, unsigned long end, +- swp_entry_t entry, struct page *page) ++ swp_entry_t entry, struct page *page, ++ struct page_beancounter **pb) + { + pte_t swp_pte = swp_entry_to_pte(entry); + pte_t *pte; +@@ -454,7 +466,7 @@ static int unuse_pte_range(struct vm_are + * Test inline before going to call unuse_pte. 
+ */ + if (unlikely(pte_same(*pte, swp_pte))) { +- unuse_pte(vma, pte++, addr, entry, page); ++ unuse_pte(vma, pte++, addr, entry, page, pb); + found = 1; + break; + } +@@ -465,7 +477,8 @@ static int unuse_pte_range(struct vm_are + + static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, + unsigned long addr, unsigned long end, +- swp_entry_t entry, struct page *page) ++ swp_entry_t entry, struct page *page, ++ struct page_beancounter **pb) + { + pmd_t *pmd; + unsigned long next; +@@ -475,7 +488,7 @@ static inline int unuse_pmd_range(struct + next = pmd_addr_end(addr, end); + if (pmd_none_or_clear_bad(pmd)) + continue; +- if (unuse_pte_range(vma, pmd, addr, next, entry, page)) ++ if (unuse_pte_range(vma, pmd, addr, next, entry, page, pb)) + return 1; + } while (pmd++, addr = next, addr != end); + return 0; +@@ -483,7 +496,8 @@ static inline int unuse_pmd_range(struct + + static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd, + unsigned long addr, unsigned long end, +- swp_entry_t entry, struct page *page) ++ swp_entry_t entry, struct page *page, ++ struct page_beancounter **pb) + { + pud_t *pud; + unsigned long next; +@@ -493,14 +507,15 @@ static inline int unuse_pud_range(struct + next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(pud)) + continue; +- if (unuse_pmd_range(vma, pud, addr, next, entry, page)) ++ if (unuse_pmd_range(vma, pud, addr, next, entry, page, pb)) + return 1; + } while (pud++, addr = next, addr != end); + return 0; + } + + static int unuse_vma(struct vm_area_struct *vma, +- swp_entry_t entry, struct page *page) ++ swp_entry_t entry, struct page *page, ++ struct page_beancounter **pb) + { + pgd_t *pgd; + unsigned long addr, end, next; +@@ -521,14 +536,15 @@ static int unuse_vma(struct vm_area_stru + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) + continue; +- if (unuse_pud_range(vma, pgd, addr, next, entry, page)) ++ if (unuse_pud_range(vma, pgd, addr, next, entry, page, pb)) + 
return 1; + } while (pgd++, addr = next, addr != end); + return 0; + } + + static int unuse_mm(struct mm_struct *mm, +- swp_entry_t entry, struct page *page) ++ swp_entry_t entry, struct page *page, ++ struct page_beancounter **pb) + { + struct vm_area_struct *vma; + +@@ -543,7 +559,7 @@ static int unuse_mm(struct mm_struct *mm + lock_page(page); + } + for (vma = mm->mmap; vma; vma = vma->vm_next) { +- if (vma->anon_vma && unuse_vma(vma, entry, page)) ++ if (vma->anon_vma && unuse_vma(vma, entry, page, pb)) + break; + } + up_read(&mm->mmap_sem); +@@ -555,11 +571,12 @@ static int unuse_mm(struct mm_struct *mm + } + + #ifdef CONFIG_MIGRATION +-int remove_vma_swap(struct vm_area_struct *vma, struct page *page) ++int remove_vma_swap(struct vm_area_struct *vma, struct page *page, ++ struct page_beancounter **pb) + { + swp_entry_t entry = { .val = page_private(page) }; + +- return unuse_vma(vma, entry, page); ++ return unuse_vma(vma, entry, page, pb); + } + #endif + +@@ -618,6 +635,7 @@ static int try_to_unuse(unsigned int typ + int retval = 0; + int reset_overflow = 0; + int shmem; ++ struct page_beancounter *pb; + + /* + * When searching mms for an entry, a good strategy is to +@@ -670,6 +688,13 @@ again: + break; + } + ++ pb = NULL; ++ if (pb_alloc_all(&pb)) { ++ page_cache_release(page); ++ retval = -ENOMEM; ++ break; ++ } ++ + /* + * Don't hold on to start_mm if it looks like exiting. + */ +@@ -698,6 +723,20 @@ again: + } + wait_on_page_writeback(page); + ++ /* If read failed we cannot map not-uptodate page to ++ * user space. Actually, we are in serious troubles, ++ * we do not even know what process to kill. So, the only ++ * variant remains: to stop swapoff() and allow someone ++ * to kill processes to zap invalid pages. ++ */ ++ if (unlikely(!PageUptodate(page))) { ++ pb_free_list(&pb); ++ unlock_page(page); ++ page_cache_release(page); ++ retval = -EIO; ++ break; ++ } ++ + /* + * Remove all references to entry. 
+ * Whenever we reach init_mm, there's no address space +@@ -709,7 +748,7 @@ again: + if (start_mm == &init_mm) + shmem = shmem_unuse(entry, page); + else +- retval = unuse_mm(start_mm, entry, page); ++ retval = unuse_mm(start_mm, entry, page, &pb); + } + if (*swap_map > 1) { + int set_start_mm = (*swap_map >= swcount); +@@ -741,7 +780,7 @@ again: + set_start_mm = 1; + shmem = shmem_unuse(entry, page); + } else +- retval = unuse_mm(mm, entry, page); ++ retval = unuse_mm(mm, entry, page, &pb); + if (set_start_mm && *swap_map < swcount) { + mmput(new_start_mm); + atomic_inc(&mm->mm_users); +@@ -755,6 +794,8 @@ again: + mmput(start_mm); + start_mm = new_start_mm; + } ++ ++ pb_free_list(&pb); + if (retval) { + unlock_page(page); + page_cache_release(page); +@@ -1100,6 +1141,10 @@ asmlinkage long sys_swapoff(const char _ + int i, type, prev; + int err; + ++ /* VE admin check is just to be on the safe side, the admin may affect ++ * swaps only if he has access to special, i.e. if he has been granted ++ * access to the block device or if the swap file is in the area ++ * visible to him. 
*/ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + +@@ -1199,6 +1244,7 @@ asmlinkage long sys_swapoff(const char _ + spin_unlock(&swap_lock); + mutex_unlock(&swapon_mutex); + vfree(swap_map); ++ ub_swap_fini(p); + inode = mapping->host; + if (S_ISBLK(inode->i_mode)) { + struct block_device *bdev = I_BDEV(inode); +@@ -1557,6 +1603,11 @@ asmlinkage long sys_swapon(const char __ + goto bad_swap; + } + ++ if (ub_swap_init(p, maxpages)) { ++ error = -ENOMEM; ++ goto bad_swap; ++ } ++ + mutex_lock(&swapon_mutex); + spin_lock(&swap_lock); + p->flags = SWP_ACTIVE; +diff -upr linux-2.6.16.orig/mm/vmalloc.c linux-2.6.16-026test015/mm/vmalloc.c +--- linux-2.6.16.orig/mm/vmalloc.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/mm/vmalloc.c 2006-07-04 14:41:37.000000000 +0400 +@@ -20,6 +20,8 @@ + #include <asm/uaccess.h> + #include <asm/tlbflush.h> + ++#include <ub/ub_debug.h> ++ + + DEFINE_RWLOCK(vmlist_lock); + struct vm_struct *vmlist; +@@ -256,6 +258,68 @@ struct vm_struct *get_vm_area_node(unsig + return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node); + } + ++struct vm_struct * get_vm_area_best(unsigned long size, unsigned long flags) ++{ ++ unsigned long addr, best_addr, delta, best_delta; ++ struct vm_struct **p, **best_p, *tmp, *area; ++ ++ area = (struct vm_struct *) kmalloc(sizeof(*area), GFP_KERNEL); ++ if (!area) ++ return NULL; ++ ++ size += PAGE_SIZE; /* one-page gap at the end */ ++ addr = VMALLOC_START; ++ best_addr = 0UL; ++ best_p = NULL; ++ best_delta = PAGE_ALIGN(VMALLOC_END) - VMALLOC_START; ++ ++ write_lock(&vmlist_lock); ++ for (p = &vmlist; (tmp = *p) && ++ (tmp->addr <= (void *)PAGE_ALIGN(VMALLOC_END)); ++ p = &tmp->next) { ++ if ((size + addr) < addr) ++ break; ++ delta = (unsigned long) tmp->addr - (size + addr); ++ if (delta < best_delta) { ++ best_delta = delta; ++ best_addr = addr; ++ best_p = p; ++ } ++ addr = tmp->size + (unsigned long) tmp->addr; ++ if (addr > VMALLOC_END-size) ++ break; ++ } ++ ++ if (!tmp 
|| (tmp->addr > (void *)PAGE_ALIGN(VMALLOC_END))) { ++ /* check free area after list end */ ++ delta = (unsigned long) PAGE_ALIGN(VMALLOC_END) - (size + addr); ++ if (delta < best_delta) { ++ best_delta = delta; ++ best_addr = addr; ++ best_p = p; ++ } ++ } ++ if (best_addr) { ++ area->flags = flags; ++ /* allocate at the end of this area */ ++ area->addr = (void *)(best_addr + best_delta); ++ area->size = size; ++ area->next = *best_p; ++ area->pages = NULL; ++ area->nr_pages = 0; ++ area->phys_addr = 0; ++ *best_p = area; ++ /* check like in __vunmap */ ++ WARN_ON((PAGE_SIZE - 1) & (unsigned long)area->addr); ++ } else { ++ kfree(area); ++ area = NULL; ++ } ++ write_unlock(&vmlist_lock); ++ ++ return area; ++} ++ + /* Caller must hold vmlist_lock */ + struct vm_struct *__remove_vm_area(void *addr) + { +@@ -296,7 +360,7 @@ struct vm_struct *remove_vm_area(void *a + return v; + } + +-void __vunmap(void *addr, int deallocate_pages) ++void __vunmap(void *addr, int deallocate_pages, int uncharge) + { + struct vm_struct *area; + +@@ -320,6 +384,8 @@ void __vunmap(void *addr, int deallocate + if (deallocate_pages) { + int i; + ++ if (uncharge) ++ dec_vmalloc_charged(area); + for (i = 0; i < area->nr_pages; i++) { + if (unlikely(!area->pages[i])) + BUG(); +@@ -350,7 +416,7 @@ void __vunmap(void *addr, int deallocate + void vfree(void *addr) + { + BUG_ON(in_interrupt()); +- __vunmap(addr, 1); ++ __vunmap(addr, 1, 1); + } + EXPORT_SYMBOL(vfree); + +@@ -367,7 +433,7 @@ EXPORT_SYMBOL(vfree); + void vunmap(void *addr) + { + BUG_ON(in_interrupt()); +- __vunmap(addr, 0); ++ __vunmap(addr, 0, 0); + } + EXPORT_SYMBOL(vunmap); + +@@ -439,10 +505,12 @@ void *__vmalloc_area_node(struct vm_stru + + if (map_vm_area(area, prot, &pages)) + goto fail; ++ ++ inc_vmalloc_charged(area, gfp_mask); + return area->addr; + + fail: +- vfree(area->addr); ++ __vunmap(area->addr, 1, 0); + return NULL; + } + +@@ -486,6 +554,21 @@ void *__vmalloc(unsigned long size, gfp_ + } + 
EXPORT_SYMBOL(__vmalloc); + ++static void *____vmalloc(unsigned long size, gfp_t mask, pgprot_t prot) ++{ ++ struct vm_struct *area; ++ ++ size = PAGE_ALIGN(size); ++ if (!size || (size >> PAGE_SHIFT) > num_physpages) ++ return NULL; ++ ++ area = get_vm_area_best(size, VM_ALLOC); ++ if (!area) ++ return NULL; ++ ++ return __vmalloc_area_node(area, mask, prot, -1); ++} ++ + /** + * vmalloc - allocate virtually contiguous memory + * +@@ -503,6 +586,26 @@ void *vmalloc(unsigned long size) + } + EXPORT_SYMBOL(vmalloc); + ++void *ub_vmalloc(unsigned long size) ++{ ++ return __vmalloc(size, GFP_KERNEL_UBC | __GFP_HIGHMEM, PAGE_KERNEL); ++} ++EXPORT_SYMBOL(ub_vmalloc); ++ ++void *vmalloc_best(unsigned long size) ++{ ++ return ____vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); ++} ++ ++EXPORT_SYMBOL(vmalloc_best); ++ ++void *ub_vmalloc_best(unsigned long size) ++{ ++ return ____vmalloc(size, GFP_KERNEL_UBC | __GFP_HIGHMEM, PAGE_KERNEL); ++} ++ ++EXPORT_SYMBOL(ub_vmalloc_best); ++ + /** + * vmalloc_node - allocate memory on a specific node + * +@@ -521,6 +624,12 @@ void *vmalloc_node(unsigned long size, i + } + EXPORT_SYMBOL(vmalloc_node); + ++void *ub_vmalloc_node(unsigned long size, int node) ++{ ++ return __vmalloc_node(size, GFP_KERNEL_UBC | __GFP_HIGHMEM, PAGE_KERNEL, node); ++} ++EXPORT_SYMBOL(ub_vmalloc_node); ++ + #ifndef PAGE_KERNEL_EXEC + # define PAGE_KERNEL_EXEC PAGE_KERNEL + #endif +@@ -631,3 +740,37 @@ finished: + read_unlock(&vmlist_lock); + return buf - buf_start; + } ++ ++void vprintstat(void) ++{ ++ struct vm_struct *p, *last_p = NULL; ++ unsigned long addr, size, free_size, max_free_size; ++ int num; ++ ++ addr = VMALLOC_START; ++ size = max_free_size = 0; ++ num = 0; ++ ++ read_lock(&vmlist_lock); ++ for (p = vmlist; p; p = p->next) { ++ free_size = (unsigned long)p->addr - addr; ++ if (free_size > max_free_size) ++ max_free_size = free_size; ++ addr = (unsigned long)p->addr + p->size; ++ size += p->size; ++ ++num; ++ last_p = p; ++ } ++ if 
(last_p) { ++ free_size = VMALLOC_END - ++ ((unsigned long)last_p->addr + last_p->size); ++ if (free_size > max_free_size) ++ max_free_size = free_size; ++ } ++ read_unlock(&vmlist_lock); ++ ++ printk("VMALLOC Used: %luKB Total: %luKB Entries: %d\n" ++ " Max_Free: %luKB Start: %lx End: %lx\n", ++ size/1024, (VMALLOC_END - VMALLOC_START)/1024, num, ++ max_free_size/1024, VMALLOC_START, VMALLOC_END); ++} +diff -upr linux-2.6.16.orig/mm/vmscan.c linux-2.6.16-026test015/mm/vmscan.c +--- linux-2.6.16.orig/mm/vmscan.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/mm/vmscan.c 2006-07-04 14:41:38.000000000 +0400 +@@ -949,6 +949,17 @@ redo: + goto unlock_both; + } + ++ /* Make sure the dirty bit is up to date */ ++ if (try_to_unmap(page, 1) == SWAP_FAIL) { ++ rc = -EPERM; ++ goto unlock_both; ++ } ++ ++ if (page_mapcount(page)) { ++ rc = -EAGAIN; ++ goto unlock_both; ++ } ++ + /* + * Default handling if a filesystem does not provide + * a migration function. We can only migrate clean +@@ -1243,6 +1254,7 @@ refill_inactive_zone(struct zone *zone, + reclaim_mapped = 1; + } + ++ {KSTAT_PERF_ENTER(refill_inact) + lru_add_drain(); + spin_lock_irq(&zone->lru_lock); + pgmoved = isolate_lru_pages(nr_pages, &zone->active_list, +@@ -1322,6 +1334,7 @@ refill_inactive_zone(struct zone *zone, + local_irq_enable(); + + pagevec_release(&pvec); ++ KSTAT_PERF_LEAVE(refill_inact)} + } + + /* +@@ -1438,6 +1451,7 @@ int try_to_free_pages(struct zone **zone + unsigned long lru_pages = 0; + int i; + ++ KSTAT_PERF_ENTER(ttfp); + sc.gfp_mask = gfp_mask; + sc.may_writepage = !laptop_mode; + sc.may_swap = 1; +@@ -1500,6 +1514,7 @@ out: + + zone->prev_priority = zone->temp_priority; + } ++ KSTAT_PERF_LEAVE(ttfp); + return ret; + } + +@@ -1832,7 +1847,8 @@ static int __init kswapd_init(void) + swap_setup(); + for_each_pgdat(pgdat) + pgdat->kswapd +- = find_task_by_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL)); ++ = find_task_by_pid_all(kernel_thread(kswapd, ++ pgdat, 
CLONE_KERNEL)); + total_memory = nr_free_pagecache_pages(); + hotcpu_notifier(cpu_callback, 0); + return 0; +diff -upr linux-2.6.16.orig/net/atm/clip.c linux-2.6.16-026test015/net/atm/clip.c +--- linux-2.6.16.orig/net/atm/clip.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/atm/clip.c 2006-07-04 14:41:36.000000000 +0400 +@@ -613,12 +613,19 @@ static int clip_create(int number) + + + static int clip_device_event(struct notifier_block *this,unsigned long event, +- void *dev) ++ void *arg) + { ++ struct net_device *dev = arg; ++ ++ if (event == NETDEV_UNREGISTER) { ++ neigh_ifdown(&clip_tbl, dev); ++ return NOTIFY_DONE; ++ } ++ + /* ignore non-CLIP devices */ +- if (((struct net_device *) dev)->type != ARPHRD_ATM || +- ((struct net_device *) dev)->hard_start_xmit != clip_start_xmit) ++ if (dev->type != ARPHRD_ATM || dev->hard_start_xmit != clip_start_xmit) + return NOTIFY_DONE; ++ + switch (event) { + case NETDEV_UP: + DPRINTK("clip_device_event NETDEV_UP\n"); +@@ -686,14 +693,12 @@ static struct notifier_block clip_inet_n + static void atmarpd_close(struct atm_vcc *vcc) + { + DPRINTK("atmarpd_close\n"); +- atmarpd = NULL; /* assumed to be atomic */ +- barrier(); +- unregister_inetaddr_notifier(&clip_inet_notifier); +- unregister_netdevice_notifier(&clip_dev_notifier); +- if (skb_peek(&sk_atm(vcc)->sk_receive_queue)) +- printk(KERN_ERR "atmarpd_close: closing with requests " +- "pending\n"); ++ ++ rtnl_lock(); ++ atmarpd = NULL; + skb_queue_purge(&sk_atm(vcc)->sk_receive_queue); ++ rtnl_unlock(); ++ + DPRINTK("(done)\n"); + module_put(THIS_MODULE); + } +@@ -714,7 +719,12 @@ static struct atm_dev atmarpd_dev = { + + static int atm_init_atmarp(struct atm_vcc *vcc) + { +- if (atmarpd) return -EADDRINUSE; ++ rtnl_lock(); ++ if (atmarpd) { ++ rtnl_unlock(); ++ return -EADDRINUSE; ++ } ++ + if (start_timer) { + start_timer = 0; + init_timer(&idle_timer); +@@ -731,10 +741,7 @@ static int atm_init_atmarp(struct atm_vc + vcc->push = NULL; + vcc->pop = 
NULL; /* crash */ + vcc->push_oam = NULL; /* crash */ +- if (register_netdevice_notifier(&clip_dev_notifier)) +- printk(KERN_ERR "register_netdevice_notifier failed\n"); +- if (register_inetaddr_notifier(&clip_inet_notifier)) +- printk(KERN_ERR "register_inetaddr_notifier failed\n"); ++ rtnl_unlock(); + return 0; + } + +@@ -992,6 +999,8 @@ static int __init atm_clip_init(void) + + clip_tbl_hook = &clip_tbl; + register_atm_ioctl(&clip_ioctl_ops); ++ register_netdevice_notifier(&clip_dev_notifier); ++ register_inetaddr_notifier(&clip_inet_notifier); + + #ifdef CONFIG_PROC_FS + { +@@ -1012,6 +1021,9 @@ static void __exit atm_clip_exit(void) + + remove_proc_entry("arp", atm_proc_root); + ++ unregister_inetaddr_notifier(&clip_inet_notifier); ++ unregister_netdevice_notifier(&clip_dev_notifier); ++ + deregister_atm_ioctl(&clip_ioctl_ops); + + /* First, stop the idle timer, so it stops banging +diff -upr linux-2.6.16.orig/net/bridge/br_netfilter.c linux-2.6.16-026test015/net/bridge/br_netfilter.c +--- linux-2.6.16.orig/net/bridge/br_netfilter.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/bridge/br_netfilter.c 2006-07-04 14:41:36.000000000 +0400 +@@ -739,6 +739,15 @@ out: + return NF_STOLEN; + } + ++static int br_nf_dev_queue_xmit(struct sk_buff *skb) ++{ ++ if (skb->protocol == htons(ETH_P_IP) && ++ skb->len > skb->dev->mtu && ++ !(skb_shinfo(skb)->ufo_size || skb_shinfo(skb)->tso_size)) ++ return ip_fragment(skb, br_dev_queue_push_xmit); ++ else ++ return br_dev_queue_push_xmit(skb); ++} + + /* PF_BRIDGE/POST_ROUTING ********************************************/ + static unsigned int br_nf_post_routing(unsigned int hook, struct sk_buff **pskb, +@@ -798,7 +807,7 @@ static unsigned int br_nf_post_routing(u + realoutdev = nf_bridge->netoutdev; + #endif + NF_HOOK(pf, NF_IP_POST_ROUTING, skb, NULL, realoutdev, +- br_dev_queue_push_xmit); ++ br_nf_dev_queue_xmit); + + return NF_STOLEN; + +@@ -843,7 +852,7 @@ static unsigned int ip_sabotage_out(unsi + 
if ((out->hard_start_xmit == br_dev_xmit && + okfn != br_nf_forward_finish && + okfn != br_nf_local_out_finish && +- okfn != br_dev_queue_push_xmit) ++ okfn != br_nf_dev_queue_xmit) + #if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE) + || ((out->priv_flags & IFF_802_1Q_VLAN) && + VLAN_DEV_INFO(out)->real_dev->hard_start_xmit == br_dev_xmit) +diff -upr linux-2.6.16.orig/net/compat.c linux-2.6.16-026test015/net/compat.c +--- linux-2.6.16.orig/net/compat.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/compat.c 2006-07-04 14:41:36.000000000 +0400 +@@ -308,107 +308,6 @@ void scm_detach_fds_compat(struct msghdr + } + + /* +- * For now, we assume that the compatibility and native version +- * of struct ipt_entry are the same - sfr. FIXME +- */ +-struct compat_ipt_replace { +- char name[IPT_TABLE_MAXNAMELEN]; +- u32 valid_hooks; +- u32 num_entries; +- u32 size; +- u32 hook_entry[NF_IP_NUMHOOKS]; +- u32 underflow[NF_IP_NUMHOOKS]; +- u32 num_counters; +- compat_uptr_t counters; /* struct ipt_counters * */ +- struct ipt_entry entries[0]; +-}; +- +-static int do_netfilter_replace(int fd, int level, int optname, +- char __user *optval, int optlen) +-{ +- struct compat_ipt_replace __user *urepl; +- struct ipt_replace __user *repl_nat; +- char name[IPT_TABLE_MAXNAMELEN]; +- u32 origsize, tmp32, num_counters; +- unsigned int repl_nat_size; +- int ret; +- int i; +- compat_uptr_t ucntrs; +- +- urepl = (struct compat_ipt_replace __user *)optval; +- if (get_user(origsize, &urepl->size)) +- return -EFAULT; +- +- /* Hack: Causes ipchains to give correct error msg --RR */ +- if (optlen != sizeof(*urepl) + origsize) +- return -ENOPROTOOPT; +- +- /* XXX Assumes that size of ipt_entry is the same both in +- * native and compat environments. 
+- */ +- repl_nat_size = sizeof(*repl_nat) + origsize; +- repl_nat = compat_alloc_user_space(repl_nat_size); +- +- ret = -EFAULT; +- if (put_user(origsize, &repl_nat->size)) +- goto out; +- +- if (!access_ok(VERIFY_READ, urepl, optlen) || +- !access_ok(VERIFY_WRITE, repl_nat, optlen)) +- goto out; +- +- if (__copy_from_user(name, urepl->name, sizeof(urepl->name)) || +- __copy_to_user(repl_nat->name, name, sizeof(repl_nat->name))) +- goto out; +- +- if (__get_user(tmp32, &urepl->valid_hooks) || +- __put_user(tmp32, &repl_nat->valid_hooks)) +- goto out; +- +- if (__get_user(tmp32, &urepl->num_entries) || +- __put_user(tmp32, &repl_nat->num_entries)) +- goto out; +- +- if (__get_user(num_counters, &urepl->num_counters) || +- __put_user(num_counters, &repl_nat->num_counters)) +- goto out; +- +- if (__get_user(ucntrs, &urepl->counters) || +- __put_user(compat_ptr(ucntrs), &repl_nat->counters)) +- goto out; +- +- if (__copy_in_user(&repl_nat->entries[0], +- &urepl->entries[0], +- origsize)) +- goto out; +- +- for (i = 0; i < NF_IP_NUMHOOKS; i++) { +- if (__get_user(tmp32, &urepl->hook_entry[i]) || +- __put_user(tmp32, &repl_nat->hook_entry[i]) || +- __get_user(tmp32, &urepl->underflow[i]) || +- __put_user(tmp32, &repl_nat->underflow[i])) +- goto out; +- } +- +- /* +- * Since struct ipt_counters just contains two u_int64_t members +- * we can just do the access_ok check here and pass the (converted) +- * pointer into the standard syscall. We hope that the pointer is +- * not misaligned ... +- */ +- if (!access_ok(VERIFY_WRITE, compat_ptr(ucntrs), +- num_counters * sizeof(struct ipt_counters))) +- goto out; +- +- +- ret = sys_setsockopt(fd, level, optname, +- (char __user *)repl_nat, repl_nat_size); +- +-out: +- return ret; +-} +- +-/* + * A struct sock_filter is architecture independent. 
+ */ + struct compat_sock_fprog { +@@ -460,10 +359,6 @@ static int do_set_sock_timeout(int fd, i + asmlinkage long compat_sys_setsockopt(int fd, int level, int optname, + char __user *optval, int optlen) + { +- /* SO_SET_REPLACE seems to be the same in all levels */ +- if (optname == IPT_SO_SET_REPLACE) +- return do_netfilter_replace(fd, level, optname, +- optval, optlen); + if (level == SOL_SOCKET && optname == SO_ATTACH_FILTER) + return do_set_attach_filter(fd, level, optname, + optval, optlen); +diff -upr linux-2.6.16.orig/net/core/datagram.c linux-2.6.16-026test015/net/core/datagram.c +--- linux-2.6.16.orig/net/core/datagram.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/core/datagram.c 2006-07-04 14:41:37.000000000 +0400 +@@ -56,6 +56,8 @@ + #include <net/sock.h> + #include <net/tcp_states.h> + ++#include <ub/ub_net.h> ++ + /* + * Is a socket 'connection oriented' ? + */ +@@ -493,6 +495,7 @@ unsigned int datagram_poll(struct file * + { + struct sock *sk = sock->sk; + unsigned int mask; ++ int no_ubc_space; + + poll_wait(file, sk->sk_sleep, wait); + mask = 0; +@@ -500,8 +503,14 @@ unsigned int datagram_poll(struct file * + /* exceptional events? */ + if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) + mask |= POLLERR; +- if (sk->sk_shutdown == SHUTDOWN_MASK) ++ if (sk->sk_shutdown == SHUTDOWN_MASK) { ++ no_ubc_space = 0; + mask |= POLLHUP; ++ } else { ++ no_ubc_space = ub_sock_makewres_other(sk, SOCK_MIN_UBCSPACE_CH); ++ if (no_ubc_space) ++ ub_sock_sndqueueadd_other(sk, SOCK_MIN_UBCSPACE_CH); ++ } + + /* readable? */ + if (!skb_queue_empty(&sk->sk_receive_queue) || +@@ -518,7 +527,7 @@ unsigned int datagram_poll(struct file * + } + + /* writable? 
*/ +- if (sock_writeable(sk)) ++ if (!no_ubc_space && sock_writeable(sk)) + mask |= POLLOUT | POLLWRNORM | POLLWRBAND; + else + set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); +diff -upr linux-2.6.16.orig/net/core/dev.c linux-2.6.16-026test015/net/core/dev.c +--- linux-2.6.16.orig/net/core/dev.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/core/dev.c 2006-07-04 14:41:39.000000000 +0400 +@@ -115,6 +115,10 @@ + #include <net/iw_handler.h> + #endif /* CONFIG_NET_RADIO */ + #include <asm/current.h> ++#include <ub/beancounter.h> ++ ++#include <ub/beancounter.h> ++#include <ub/ub_mem.h> + + /* + * The list of packet types we will receive (as opposed to discard) +@@ -167,25 +171,40 @@ static struct list_head ptype_all; /* T + * unregister_netdevice(), which must be called with the rtnl + * semaphore held. + */ ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++#define dev_tail (get_exec_env()->_net_dev_tail) ++#else + struct net_device *dev_base; + static struct net_device **dev_tail = &dev_base; ++EXPORT_SYMBOL(dev_base); ++#endif + DEFINE_RWLOCK(dev_base_lock); + +-EXPORT_SYMBOL(dev_base); + EXPORT_SYMBOL(dev_base_lock); + ++#ifdef CONFIG_VE ++#define MAX_UNMOVABLE_NETDEVICES (8*4096) ++static uint8_t unmovable_ifindex_list[MAX_UNMOVABLE_NETDEVICES/8]; ++static LIST_HEAD(dev_global_list); ++#endif ++ + #define NETDEV_HASHBITS 8 + static struct hlist_head dev_name_head[1<<NETDEV_HASHBITS]; + static struct hlist_head dev_index_head[1<<NETDEV_HASHBITS]; + +-static inline struct hlist_head *dev_name_hash(const char *name) ++struct hlist_head *dev_name_hash(const char *name, struct ve_struct *env) + { +- unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ)); ++ unsigned hash; ++ if (!ve_is_super(env)) ++ return visible_dev_head(env); ++ hash = full_name_hash(name, strnlen(name, IFNAMSIZ)); + return &dev_name_head[hash & ((1<<NETDEV_HASHBITS)-1)]; + } + +-static inline struct hlist_head *dev_index_hash(int ifindex) 
++struct hlist_head *dev_index_hash(int ifindex, struct ve_struct *env) + { ++ if (!ve_is_super(env)) ++ return visible_dev_index_head(env); + return &dev_index_head[ifindex & ((1<<NETDEV_HASHBITS)-1)]; + } + +@@ -469,7 +488,7 @@ struct net_device *__dev_get_by_name(con + { + struct hlist_node *p; + +- hlist_for_each(p, dev_name_hash(name)) { ++ hlist_for_each(p, dev_name_hash(name, get_exec_env())) { + struct net_device *dev + = hlist_entry(p, struct net_device, name_hlist); + if (!strncmp(dev->name, name, IFNAMSIZ)) +@@ -502,6 +521,32 @@ struct net_device *dev_get_by_name(const + } + + /** ++ * __dev_global_get_by_name - find a device by its name in dev_global_list ++ * @name: name to find ++ * ++ * Find an interface by name. Must be called under RTNL semaphore ++ * If the name is found a pointer to the device ++ * is returned. If the name is not found then %NULL is returned. The ++ * reference counters are not incremented so the caller must be ++ * careful with locks. ++ */ ++ ++#ifdef CONFIG_VE ++struct net_device *__dev_global_get_by_name(const char *name) ++{ ++ struct net_device *dev; ++ /* It's called relatively rarely */ ++ list_for_each_entry(dev, &dev_global_list, dev_global_list_entry) { ++ if (strncmp(dev->name, name, IFNAMSIZ) == 0) ++ return dev; ++ } ++ return NULL; ++} ++#else /* CONFIG_VE */ ++#define __dev_global_get_by_name(name) __dev_get_by_name(name) ++#endif /* CONFIG_VE */ ++ ++/** + * __dev_get_by_index - find a device by its ifindex + * @ifindex: index of device + * +@@ -516,7 +561,7 @@ struct net_device *__dev_get_by_index(in + { + struct hlist_node *p; + +- hlist_for_each(p, dev_index_hash(ifindex)) { ++ hlist_for_each(p, dev_index_hash(ifindex, get_exec_env())) { + struct net_device *dev + = hlist_entry(p, struct net_device, index_hlist); + if (dev->ifindex == ifindex) +@@ -635,6 +680,23 @@ int dev_valid_name(const char *name) + || strchr(name, '/')); + } + ++static inline void __dev_check_name(const char *dev_name, const char *name, 
++ long *inuse, const int max_netdevices) ++{ ++ int i = 0; ++ char buf[IFNAMSIZ]; ++ ++ if (!sscanf(dev_name, name, &i)) ++ return; ++ if (i < 0 || i >= max_netdevices) ++ return; ++ ++ /* avoid cases where sscanf is not exact inverse of printf */ ++ snprintf(buf, sizeof(buf), name, i); ++ if (!strncmp(buf, dev_name, IFNAMSIZ)) ++ set_bit(i, inuse); ++} ++ + /** + * dev_alloc_name - allocate a name for a device + * @dev: device +@@ -671,16 +733,20 @@ int dev_alloc_name(struct net_device *de + if (!inuse) + return -ENOMEM; + +- for (d = dev_base; d; d = d->next) { +- if (!sscanf(d->name, name, &i)) +- continue; +- if (i < 0 || i >= max_netdevices) +- continue; +- +- /* avoid cases where sscanf is not exact inverse of printf */ +- snprintf(buf, sizeof(buf), name, i); +- if (!strncmp(buf, d->name, IFNAMSIZ)) +- set_bit(i, inuse); ++#ifdef CONFIG_VE ++ if (ve_is_super(get_exec_env())) { ++ list_for_each_entry(d, &dev_global_list, ++ dev_global_list_entry) { ++ __dev_check_name(d->name, name, inuse, ++ max_netdevices); ++ } ++ } else ++#endif ++ { ++ for (d = dev_base; d; d = d->next) { ++ __dev_check_name(d->name, name, inuse, ++ max_netdevices); ++ } + } + + i = find_first_zero_bit(inuse, max_netdevices); +@@ -688,7 +754,11 @@ int dev_alloc_name(struct net_device *de + } + + snprintf(buf, sizeof(buf), name, i); +- if (!__dev_get_by_name(buf)) { ++ if (ve_is_super(get_exec_env())) ++ d = __dev_global_get_by_name(buf); ++ else ++ d = __dev_get_by_name(buf); ++ if (d == NULL) { + strlcpy(dev->name, buf, IFNAMSIZ); + return i; + } +@@ -721,13 +791,14 @@ int dev_change_name(struct net_device *d + if (!dev_valid_name(newname)) + return -EINVAL; + ++ /* Rename of devices in VE is prohibited by CAP_NET_ADMIN */ + if (strchr(newname, '%')) { + err = dev_alloc_name(dev, newname); + if (err < 0) + return err; + strcpy(newname, dev->name); + } +- else if (__dev_get_by_name(newname)) ++ else if (__dev_global_get_by_name(newname)) + return -EEXIST; + else + strlcpy(dev->name, 
newname, IFNAMSIZ); +@@ -735,7 +806,8 @@ int dev_change_name(struct net_device *d + err = class_device_rename(&dev->class_dev, dev->name); + if (!err) { + hlist_del(&dev->name_hlist); +- hlist_add_head(&dev->name_hlist, dev_name_hash(dev->name)); ++ hlist_add_head(&dev->name_hlist, dev_name_hash(dev->name, ++ get_exec_env())); + notifier_call_chain(&netdev_chain, NETDEV_CHANGENAME, dev); + } + +@@ -1294,6 +1366,25 @@ int dev_queue_xmit(struct sk_buff *skb) + skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS); + #endif + if (q->enqueue) { ++ struct user_beancounter *ub; ++ ++ ub = netdev_bc(dev)->exec_ub; ++ /* the skb CAN be already charged if it transmitted via ++ * something like bonding device */ ++ if (ub && (skb_bc(skb)->resource == 0)) { ++ unsigned long chargesize; ++ chargesize = skb_charge_fullsize(skb); ++ if (charge_beancounter(ub, UB_OTHERSOCKBUF, ++ chargesize, UB_SOFT)) { ++ rcu_read_unlock(); ++ rc = -ENOMEM; ++ goto out_kfree_skb; ++ } ++ skb_bc(skb)->ub = ub; ++ skb_bc(skb)->charged = chargesize; ++ skb_bc(skb)->resource = UB_OTHERSOCKBUF; ++ } ++ + /* Grab device queue */ + spin_lock(&dev->queue_lock); + +@@ -1580,6 +1671,7 @@ int netif_receive_skb(struct sk_buff *sk + struct net_device *orig_dev; + int ret = NET_RX_DROP; + unsigned short type; ++ struct ve_struct *old_env; + + /* if we've gotten here through NAPI, check netpoll */ + if (skb->dev->poll && netpoll_rx(skb)) +@@ -1598,6 +1690,17 @@ int netif_receive_skb(struct sk_buff *sk + skb->h.raw = skb->nh.raw = skb->data; + skb->mac_len = skb->nh.raw - skb->mac.raw; + ++#ifdef CONFIG_VE ++ /* ++ * Skb might be alloced in another VE context, than its device works. ++ * So, set the correct owner_env. 
++ */ ++ skb->owner_env = skb->dev->owner_env; ++ BUG_ON(skb->owner_env == NULL); ++#endif ++ ++ old_env = set_exec_env(VE_OWNER_SKB(skb)); ++ + pt_prev = NULL; + + rcu_read_lock(); +@@ -1663,6 +1766,7 @@ ncls: + + out: + rcu_read_unlock(); ++ (void)set_exec_env(old_env); + return ret; + } + +@@ -2038,7 +2142,7 @@ static int __init dev_proc_init(void) + { + int rc = -ENOMEM; + +- if (!proc_net_fops_create("dev", S_IRUGO, &dev_seq_fops)) ++ if (!proc_glob_fops_create("net/dev", S_IRUGO, &dev_seq_fops)) + goto out; + if (!proc_net_fops_create("softnet_stat", S_IRUGO, &softnet_seq_fops)) + goto out_dev; +@@ -2050,7 +2154,7 @@ out: + out_softnet: + proc_net_remove("softnet_stat"); + out_dev: +- proc_net_remove("dev"); ++ remove_proc_glob_entry("net/dev", NULL); + goto out; + } + #else +@@ -2115,6 +2219,9 @@ void dev_set_promiscuity(struct net_devi + dev->flags &= ~IFF_PROMISC; + else + dev->flags |= IFF_PROMISC; ++ /* Promiscous mode on these devices does not mean anything */ ++ if (dev->flags & (IFF_LOOPBACK|IFF_POINTOPOINT)) ++ return; + if (dev->flags != old_flags) { + dev_mc_upload(dev); + printk(KERN_INFO "device %s %s promiscuous mode\n", +@@ -2529,9 +2636,28 @@ int dev_ioctl(unsigned int cmd, void __u + * - require strict serialization. 
+ * - do not return a value + */ ++ case SIOCSIFMTU: ++ if (!capable(CAP_NET_ADMIN) && ++ !capable(CAP_VE_NET_ADMIN)) ++ return -EPERM; ++ dev_load(ifr.ifr_name); ++ rtnl_lock(); ++ if (!ve_is_super(get_exec_env())) { ++ struct net_device *dev; ++ ret = -ENODEV; ++ if ((dev = __dev_get_by_name(ifr.ifr_name)) == NULL) ++ goto out_set_mtu_unlock; ++ ret = -EPERM; ++ if (ifr.ifr_mtu > dev->orig_mtu) ++ goto out_set_mtu_unlock; ++ } ++ ret = dev_ifsioc(&ifr, cmd); ++out_set_mtu_unlock: ++ rtnl_unlock(); ++ return ret; ++ + case SIOCSIFFLAGS: + case SIOCSIFMETRIC: +- case SIOCSIFMTU: + case SIOCSIFMAP: + case SIOCSIFHWADDR: + case SIOCSIFSLAVE: +@@ -2613,20 +2739,73 @@ int dev_ioctl(unsigned int cmd, void __u + * dev_new_index - allocate an ifindex + * + * Returns a suitable unique value for a new device interface +- * number. The caller must hold the rtnl semaphore or the ++ * number. The caller must hold the rtnl semaphore or the + * dev_base_lock to be sure it remains unique. ++ * ++ * Note: dev->name must be valid on entrance + */ +-static int dev_new_index(void) ++static int dev_ve_new_index(void) + { +- static int ifindex; ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++ int *ifindex = &get_exec_env()->ifindex; ++ int delta = 2; ++#else ++ static int s_ifindex; ++ int *ifindex = &s_ifindex; ++ int delta = 1; ++#endif + for (;;) { +- if (++ifindex <= 0) +- ifindex = 1; +- if (!__dev_get_by_index(ifindex)) +- return ifindex; ++ *ifindex += delta; ++ if (*ifindex <= 0) ++ *ifindex = 1; ++ if (!__dev_get_by_index(*ifindex)) ++ return *ifindex; + } + } + ++#ifdef CONFIG_VE ++static int dev_glb_new_index(void) ++{ ++ int i; ++ ++ i = find_first_zero_bit((long*)unmovable_ifindex_list, ++ MAX_UNMOVABLE_NETDEVICES); ++ ++ if (i == MAX_UNMOVABLE_NETDEVICES) ++ return -EMFILE; ++ ++ __set_bit(i, (long*)unmovable_ifindex_list); ++ return (i + 1) * 2; ++} ++#endif ++ ++static void dev_glb_free_index(struct net_device *dev) ++{ ++#ifdef CONFIG_VE ++ int 
bit; ++ ++ bit = dev->ifindex / 2 - 1; ++ BUG_ON(bit >= MAX_UNMOVABLE_NETDEVICES); ++ __clear_bit(bit, (long*)unmovable_ifindex_list); ++#endif ++} ++ ++static int dev_new_index(struct net_device *dev) ++{ ++#ifdef CONFIG_VE ++ if (ve_is_super(get_exec_env()) && ve_is_dev_movable(dev)) ++ return dev_glb_new_index(); ++#endif ++ ++ return dev_ve_new_index(); ++} ++ ++static void dev_free_index(struct net_device *dev) ++{ ++ if ((dev->ifindex % 2) == 0) ++ dev_glb_free_index(dev); ++} ++ + static int dev_boot_phase = 1; + + /* Delayed registration/unregisteration */ +@@ -2669,6 +2848,10 @@ int register_netdevice(struct net_device + /* When net_device's are persistent, this will be fatal. */ + BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); + ++ ret = -EPERM; ++ if (!ve_is_super(get_exec_env()) && ve_is_dev_movable(dev)) ++ goto out; ++ + spin_lock_init(&dev->queue_lock); + spin_lock_init(&dev->xmit_lock); + dev->xmit_lock_owner = -1; +@@ -2688,27 +2871,32 @@ int register_netdevice(struct net_device + if (ret) { + if (ret > 0) + ret = -EIO; +- goto out_err; ++ goto out_free_div; + } + } + + if (!dev_valid_name(dev->name)) { + ret = -EINVAL; +- goto out_err; ++ goto out_free_div; ++ } ++ ++ dev->ifindex = dev_new_index(dev); ++ if (dev->ifindex < 0) { ++ ret = dev->ifindex; ++ goto out_free_div; + } + +- dev->ifindex = dev_new_index(); + if (dev->iflink == -1) + dev->iflink = dev->ifindex; + + /* Check for existence of name */ +- head = dev_name_hash(dev->name); ++ head = dev_name_hash(dev->name, get_exec_env()); + hlist_for_each(p, head) { + struct net_device *d + = hlist_entry(p, struct net_device, name_hlist); + if (!strncmp(d->name, dev->name, IFNAMSIZ)) { + ret = -EEXIST; +- goto out_err; ++ goto out_free_ind; + } + } + +@@ -2760,12 +2948,21 @@ int register_netdevice(struct net_device + set_bit(__LINK_STATE_PRESENT, &dev->state); + + dev->next = NULL; ++ dev->owner_env = get_exec_env(); ++ dev->orig_mtu = dev->mtu; ++ netdev_bc(dev)->owner_ub = 
get_beancounter(get_exec_ub()); ++ netdev_bc(dev)->exec_ub = get_beancounter(get_exec_ub()); + dev_init_scheduler(dev); ++#ifdef CONFIG_VE ++ if (ve_is_super(get_exec_env())) ++ list_add_tail(&dev->dev_global_list_entry, &dev_global_list); ++#endif + write_lock_bh(&dev_base_lock); + *dev_tail = dev; + dev_tail = &dev->next; + hlist_add_head(&dev->name_hlist, head); +- hlist_add_head(&dev->index_hlist, dev_index_hash(dev->ifindex)); ++ hlist_add_head(&dev->index_hlist, dev_index_hash(dev->ifindex, ++ get_exec_env())); + dev_hold(dev); + dev->reg_state = NETREG_REGISTERING; + write_unlock_bh(&dev_base_lock); +@@ -2779,7 +2976,9 @@ int register_netdevice(struct net_device + + out: + return ret; +-out_err: ++out_free_ind: ++ dev_free_index(dev); ++out_free_div: + free_divert_blk(dev); + goto out; + } +@@ -2825,6 +3024,10 @@ int register_netdev(struct net_device *d + err = register_netdevice(dev); + out: + rtnl_unlock(); ++ if (err == 0 && dev->reg_state != NETREG_REGISTERED) { ++ unregister_netdev(dev); ++ err = -ENOMEM; ++ } + return err; + } + EXPORT_SYMBOL(register_netdev); +@@ -2907,6 +3110,7 @@ void netdev_run_todo(void) + { + struct list_head list = LIST_HEAD_INIT(list); + int err; ++ struct ve_struct *current_env; + + + /* Need to guard against multiple cpu's getting out of order. 
*/ +@@ -2925,22 +3129,30 @@ void netdev_run_todo(void) + list_splice_init(&net_todo_list, &list); + spin_unlock(&net_todo_list_lock); + ++ current_env = get_exec_env(); + while (!list_empty(&list)) { + struct net_device *dev + = list_entry(list.next, struct net_device, todo_list); + list_del(&dev->todo_list); + ++ (void)set_exec_env(dev->owner_env); + switch(dev->reg_state) { + case NETREG_REGISTERING: ++ dev->reg_state = NETREG_REGISTERED; + err = netdev_register_sysfs(dev); +- if (err) ++ if (err) { + printk(KERN_ERR "%s: failed sysfs registration (%d)\n", + dev->name, err); +- dev->reg_state = NETREG_REGISTERED; ++ dev->reg_state = NETREG_REGISTER_ERR; ++ break; ++ } + break; + + case NETREG_UNREGISTERING: + netdev_unregister_sysfs(dev); ++ /* fall through */ ++ ++ case NETREG_REGISTER_ERR: + dev->reg_state = NETREG_UNREGISTERED; + + netdev_wait_allrefs(dev); +@@ -2951,6 +3163,10 @@ void netdev_run_todo(void) + BUG_TRAP(!dev->ip6_ptr); + BUG_TRAP(!dev->dn_ptr); + ++ put_beancounter(netdev_bc(dev)->exec_ub); ++ put_beancounter(netdev_bc(dev)->owner_ub); ++ netdev_bc(dev)->exec_ub = NULL; ++ netdev_bc(dev)->owner_ub = NULL; + + /* It must be the very last action, + * after this 'dev' may point to freed up memory. 
+@@ -2965,6 +3181,7 @@ void netdev_run_todo(void) + break; + } + } ++ (void)set_exec_env(current_env); + + out: + up(&net_todo_run_mutex); +@@ -2990,7 +3207,7 @@ struct net_device *alloc_netdev(int size + alloc_size = (sizeof(*dev) + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST; + alloc_size += sizeof_priv + NETDEV_ALIGN_CONST; + +- p = kmalloc(alloc_size, GFP_KERNEL); ++ p = ub_kmalloc(alloc_size, GFP_KERNEL); + if (!p) { + printk(KERN_ERR "alloc_dev: Unable to allocate device.\n"); + return NULL; +@@ -3070,7 +3287,8 @@ int unregister_netdevice(struct net_devi + return -ENODEV; + } + +- BUG_ON(dev->reg_state != NETREG_REGISTERED); ++ BUG_ON(dev->reg_state != NETREG_REGISTERED && ++ dev->reg_state != NETREG_REGISTER_ERR); + + /* If device is running, close it first. */ + if (dev->flags & IFF_UP) +@@ -3086,6 +3304,10 @@ int unregister_netdevice(struct net_devi + dev_tail = dp; + *dp = d->next; + write_unlock_bh(&dev_base_lock); ++#ifdef CONFIG_VE ++ if (ve_is_super(get_exec_env())) ++ list_del(&dev->dev_global_list_entry); ++#endif + break; + } + } +@@ -3095,7 +3317,8 @@ int unregister_netdevice(struct net_devi + return -ENODEV; + } + +- dev->reg_state = NETREG_UNREGISTERING; ++ if (dev->reg_state != NETREG_REGISTER_ERR) ++ dev->reg_state = NETREG_UNREGISTERING; + + synchronize_net(); + +@@ -3119,6 +3342,8 @@ int unregister_netdevice(struct net_devi + /* Notifier chain MUST detach us from master device. 
*/ + BUG_TRAP(!dev->master); + ++ dev_free_index(dev); ++ + free_divert_blk(dev); + + /* Finish processing unregister after unlock */ +@@ -3276,6 +3501,8 @@ EXPORT_SYMBOL(dev_close); + EXPORT_SYMBOL(dev_get_by_flags); + EXPORT_SYMBOL(dev_get_by_index); + EXPORT_SYMBOL(dev_get_by_name); ++EXPORT_SYMBOL(dev_name_hash); ++EXPORT_SYMBOL(dev_index_hash); + EXPORT_SYMBOL(dev_open); + EXPORT_SYMBOL(dev_queue_xmit); + EXPORT_SYMBOL(dev_remove_pack); +diff -upr linux-2.6.16.orig/net/core/dev_mcast.c linux-2.6.16-026test015/net/core/dev_mcast.c +--- linux-2.6.16.orig/net/core/dev_mcast.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/core/dev_mcast.c 2006-07-04 14:41:38.000000000 +0400 +@@ -290,9 +290,10 @@ static struct file_operations dev_mc_seq + + void __init dev_mcast_init(void) + { +- proc_net_fops_create("dev_mcast", 0, &dev_mc_seq_fops); ++ proc_glob_fops_create("net/dev_mcast", 0, &dev_mc_seq_fops); + } + + EXPORT_SYMBOL(dev_mc_add); + EXPORT_SYMBOL(dev_mc_delete); + EXPORT_SYMBOL(dev_mc_upload); ++EXPORT_SYMBOL(dev_mc_discard); +diff -upr linux-2.6.16.orig/net/core/dst.c linux-2.6.16-026test015/net/core/dst.c +--- linux-2.6.16.orig/net/core/dst.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/core/dst.c 2006-07-04 14:41:39.000000000 +0400 +@@ -95,12 +95,11 @@ static void dst_run_gc(unsigned long dum + dst_gc_timer_inc = DST_GC_INC; + dst_gc_timer_expires = DST_GC_MIN; + } +- dst_gc_timer.expires = jiffies + dst_gc_timer_expires; + #if RT_CACHE_DEBUG >= 2 + printk("dst_total: %d/%d %ld\n", + atomic_read(&dst_total), delayed, dst_gc_timer_expires); + #endif +- add_timer(&dst_gc_timer); ++ mod_timer(&dst_gc_timer, jiffies + dst_gc_timer_expires); + + out: + spin_unlock(&dst_lock); +@@ -260,11 +259,14 @@ static int dst_dev_event(struct notifier + switch (event) { + case NETDEV_UNREGISTER: + case NETDEV_DOWN: +- spin_lock_bh(&dst_lock); ++ local_bh_disable(); ++ dst_run_gc(0); ++ spin_lock(&dst_lock); + for (dst = 
dst_garbage_list; dst; dst = dst->next) { + dst_ifdown(dst, dev, event != NETDEV_DOWN); + } +- spin_unlock_bh(&dst_lock); ++ spin_unlock(&dst_lock); ++ local_bh_enable(); + break; + } + return NOTIFY_DONE; +diff -upr linux-2.6.16.orig/net/core/dv.c linux-2.6.16-026test015/net/core/dv.c +--- linux-2.6.16.orig/net/core/dv.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/core/dv.c 2006-07-04 14:41:37.000000000 +0400 +@@ -547,3 +547,5 @@ void divert_frame(struct sk_buff *skb) + break; + } + } ++ ++EXPORT_SYMBOL(free_divert_blk); +diff -upr linux-2.6.16.orig/net/core/filter.c linux-2.6.16-026test015/net/core/filter.c +--- linux-2.6.16.orig/net/core/filter.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/core/filter.c 2006-07-04 14:41:37.000000000 +0400 +@@ -34,6 +34,7 @@ + #include <linux/timer.h> + #include <asm/system.h> + #include <asm/uaccess.h> ++#include <asm/unaligned.h> + #include <linux/filter.h> + + /* No hurry in this branch */ +@@ -177,7 +178,7 @@ unsigned int sk_run_filter(struct sk_buf + load_w: + ptr = load_pointer(skb, k, 4, &tmp); + if (ptr != NULL) { +- A = ntohl(*(u32 *)ptr); ++ A = ntohl(get_unaligned((u32 *)ptr)); + continue; + } + break; +@@ -186,7 +187,7 @@ load_w: + load_h: + ptr = load_pointer(skb, k, 2, &tmp); + if (ptr != NULL) { +- A = ntohs(*(u16 *)ptr); ++ A = ntohs(get_unaligned((u16 *)ptr)); + continue; + } + break; +@@ -406,7 +407,7 @@ int sk_attach_filter(struct sock_fprog * + if (fprog->filter == NULL) + return -EINVAL; + +- fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL); ++ fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL_UBC); + if (!fp) + return -ENOMEM; + if (copy_from_user(fp->insns, fprog->filter, fsize)) { +diff -upr linux-2.6.16.orig/net/core/neighbour.c linux-2.6.16-026test015/net/core/neighbour.c +--- linux-2.6.16.orig/net/core/neighbour.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/core/neighbour.c 2006-07-04 14:41:39.000000000 +0400 +@@ -33,6 
+33,7 @@ + #include <linux/rtnetlink.h> + #include <linux/random.h> + #include <linux/string.h> ++#include <ub/beancounter.h> + + #define NEIGH_DEBUG 1 + +@@ -639,6 +640,8 @@ static void neigh_periodic_timer(unsigne + struct neigh_table *tbl = (struct neigh_table *)arg; + struct neighbour *n, **np; + unsigned long expire, now = jiffies; ++ struct ve_struct *env = set_exec_env(tbl->owner_env); ++ struct user_beancounter *ub = set_exec_ub(tbl->owner_ub); + + NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs); + +@@ -700,6 +703,8 @@ next_elt: + mod_timer(&tbl->gc_timer, now + expire); + + write_unlock(&tbl->lock); ++ set_exec_ub(ub); ++ set_exec_env(env); + } + + static __inline__ int neigh_max_probes(struct neighbour *n) +@@ -727,6 +732,11 @@ static void neigh_timer_handler(unsigned + struct neighbour *neigh = (struct neighbour *)arg; + unsigned state; + int notify = 0; ++ struct ve_struct *env; ++ struct user_beancounter *ub; ++ ++ env = set_exec_env(neigh->dev->owner_env); ++ ub = set_exec_ub(netdev_bc(neigh->dev)->exec_ub); + + write_lock(&neigh->lock); + +@@ -824,6 +834,8 @@ out: + neigh_app_notify(neigh); + #endif + neigh_release(neigh); ++ (void)set_exec_ub(ub); ++ (void)set_exec_env(env); + } + + int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) +@@ -1202,6 +1214,9 @@ static void neigh_proxy_process(unsigned + unsigned long now = jiffies; + struct sk_buff *skb; + ++ struct ve_struct *env = set_exec_env(tbl->owner_env); ++ struct user_beancounter *ub = set_exec_ub(tbl->owner_ub); ++ + spin_lock(&tbl->proxy_queue.lock); + + skb = tbl->proxy_queue.next; +@@ -1213,6 +1228,7 @@ static void neigh_proxy_process(unsigned + skb = skb->next; + if (tdif <= 0) { + struct net_device *dev = back->dev; ++ + __skb_unlink(back, &tbl->proxy_queue); + if (tbl->proxy_redo && netif_running(dev)) + tbl->proxy_redo(back); +@@ -1220,6 +1236,7 @@ static void neigh_proxy_process(unsigned + kfree_skb(back); + + dev_put(dev); ++ + } else if (!sched_next || tdif < sched_next) 
+ sched_next = tdif; + } +@@ -1227,6 +1244,8 @@ static void neigh_proxy_process(unsigned + if (sched_next) + mod_timer(&tbl->proxy_timer, jiffies + sched_next); + spin_unlock(&tbl->proxy_queue.lock); ++ (void)set_exec_ub(ub); ++ (void)set_exec_env(env); + } + + void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, +@@ -1323,12 +1342,15 @@ void neigh_parms_destroy(struct neigh_pa + } + + +-void neigh_table_init(struct neigh_table *tbl) ++int neigh_table_init(struct neigh_table *tbl) + { + unsigned long now = jiffies; + unsigned long phsize; + + atomic_set(&tbl->parms.refcnt, 1); ++ atomic_set(&tbl->entries, 0); ++ tbl->hash_chain_gc = 0; ++ tbl->parms.next = NULL; + INIT_RCU_HEAD(&tbl->parms.rcu_head); + tbl->parms.reachable_time = + neigh_rand_reach_time(tbl->parms.base_reachable_time); +@@ -1336,22 +1358,30 @@ void neigh_table_init(struct neigh_table + if (!tbl->kmem_cachep) + tbl->kmem_cachep = kmem_cache_create(tbl->id, + tbl->entry_size, +- 0, SLAB_HWCACHE_ALIGN, ++ 0, SLAB_HWCACHE_ALIGN | SLAB_UBC, + NULL, NULL); + + if (!tbl->kmem_cachep) +- panic("cannot create neighbour cache"); ++ return -ENOMEM; ++ ++ tbl->owner_env = get_ve(get_exec_env()); ++ tbl->owner_ub = get_beancounter(get_exec_ub()); + + tbl->stats = alloc_percpu(struct neigh_statistics); + if (!tbl->stats) +- panic("cannot create neighbour cache statistics"); ++ return -ENOMEM; + + #ifdef CONFIG_PROC_FS +- tbl->pde = create_proc_entry(tbl->id, 0, proc_net_stat); +- if (!tbl->pde) +- panic("cannot create neighbour proc dir entry"); +- tbl->pde->proc_fops = &neigh_stat_seq_fops; +- tbl->pde->data = tbl; ++ if (ve_is_super(get_exec_env())) { ++ char name[strlen(tbl->id) + sizeof("net/stat/")]; ++ strcpy(name, "net/stat/"); ++ strcat(name, tbl->id); ++ tbl->pde = create_proc_glob_entry(name, S_IRUGO, NULL); ++ if (tbl->pde) { ++ tbl->pde->proc_fops = &neigh_stat_seq_fops; ++ tbl->pde->data = tbl; ++ } ++ } + #endif + + tbl->hash_mask = 1; +@@ -1361,7 +1391,7 @@ void 
neigh_table_init(struct neigh_table + tbl->phash_buckets = kmalloc(phsize, GFP_KERNEL); + + if (!tbl->hash_buckets || !tbl->phash_buckets) +- panic("cannot allocate neighbour cache hashes"); ++ goto nomem; + + memset(tbl->phash_buckets, 0, phsize); + +@@ -1385,6 +1415,24 @@ void neigh_table_init(struct neigh_table + tbl->next = neigh_tables; + neigh_tables = tbl; + write_unlock(&neigh_tbl_lock); ++ return 0; ++ ++nomem: ++ if (tbl->hash_buckets) { ++ neigh_hash_free(tbl->hash_buckets, tbl->hash_mask + 1); ++ tbl->hash_buckets = NULL; ++ } ++ if (tbl->phash_buckets) { ++ kfree(tbl->phash_buckets); ++ tbl->phash_buckets = NULL; ++ } ++ if (tbl->stats) { ++ free_percpu(tbl->stats); ++ tbl->stats = NULL; ++ } ++ put_beancounter(tbl->owner_ub); ++ put_ve(tbl->owner_env); ++ return -ENOMEM; + } + + int neigh_table_clear(struct neigh_table *tbl) +@@ -1398,6 +1446,15 @@ int neigh_table_clear(struct neigh_table + neigh_ifdown(tbl, NULL); + if (atomic_read(&tbl->entries)) + printk(KERN_CRIT "neighbour leakage\n"); ++#ifdef CONFIG_PROC_FS ++ if (ve_is_super(get_exec_env())) { ++ char name[strlen(tbl->id) + sizeof("net/stat/")]; ++ strcpy(name, "net/stat/"); ++ strcat(name, tbl->id); ++ remove_proc_glob_entry(name, NULL); ++ } ++#endif ++ + write_lock(&neigh_tbl_lock); + for (tp = &neigh_tables; *tp; tp = &(*tp)->next) { + if (*tp == tbl) { +@@ -1413,6 +1470,9 @@ int neigh_table_clear(struct neigh_table + kfree(tbl->phash_buckets); + tbl->phash_buckets = NULL; + ++ put_beancounter(tbl->owner_ub); ++ put_ve(tbl->owner_env); ++ + return 0; + } + +@@ -1435,6 +1495,8 @@ int neigh_delete(struct sk_buff *skb, st + + if (tbl->family != ndm->ndm_family) + continue; ++ if (!ve_accessible_strict(tbl->owner_env, get_exec_env())) ++ continue; + read_unlock(&neigh_tbl_lock); + + err = -EINVAL; +@@ -1488,6 +1550,8 @@ int neigh_add(struct sk_buff *skb, struc + + if (tbl->family != ndm->ndm_family) + continue; ++ if (!ve_accessible_strict(tbl->owner_env, get_exec_env())) ++ continue; + 
read_unlock(&neigh_tbl_lock); + + err = -EINVAL; +@@ -1720,6 +1784,9 @@ int neightbl_set(struct sk_buff *skb, st + if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family) + continue; + ++ if (!ve_accessible_strict(tbl->owner_env, get_exec_env())) ++ continue; ++ + if (!rtattr_strcmp(tb[NDTA_NAME - 1], tbl->id)) + break; + } +@@ -1941,6 +2008,8 @@ int neigh_dump_info(struct sk_buff *skb, + s_t = cb->args[0]; + + for (tbl = neigh_tables, t = 0; tbl; tbl = tbl->next, t++) { ++ if (!ve_accessible_strict(tbl->owner_env, get_exec_env())) ++ continue; + if (t < s_t || (family && tbl->family != family)) + continue; + if (t > s_t) +@@ -2530,11 +2599,12 @@ int neigh_sysctl_register(struct net_dev + int p_id, int pdev_id, char *p_name, + proc_handler *handler, ctl_handler *strategy) + { +- struct neigh_sysctl_table *t = kmalloc(sizeof(*t), GFP_KERNEL); ++ struct neigh_sysctl_table *t; + const char *dev_name_source = NULL; + char *dev_name = NULL; + int err = 0; + ++ t = kmalloc(sizeof(*t), GFP_KERNEL); + if (!t) + return -ENOBUFS; + memcpy(t, &neigh_sysctl_template, sizeof(*t)); +diff -upr linux-2.6.16.orig/net/core/net-sysfs.c linux-2.6.16-026test015/net/core/net-sysfs.c +--- linux-2.6.16.orig/net/core/net-sysfs.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/core/net-sysfs.c 2006-07-04 14:41:38.000000000 +0400 +@@ -388,12 +388,13 @@ static void netdev_release(struct class_ + struct net_device *dev + = container_of(cd, struct net_device, class_dev); + +- BUG_ON(dev->reg_state != NETREG_RELEASED); ++ BUG_ON(dev->reg_state != NETREG_RELEASED && ++ dev->reg_state != NETREG_REGISTERING); + + kfree((char *)dev - dev->padded); + } + +-static struct class net_class = { ++struct class net_class = { + .name = "net", + .release = netdev_release, + .class_dev_attrs = net_class_attributes, +@@ -401,6 +402,13 @@ static struct class net_class = { + .uevent = netdev_uevent, + #endif + }; ++EXPORT_SYMBOL(net_class); ++ ++#ifndef CONFIG_VE ++#define 
visible_net_class net_class ++#else ++#define visible_net_class (*get_exec_env()->net_class) ++#endif + + void netdev_unregister_sysfs(struct net_device * net) + { +@@ -424,7 +432,7 @@ int netdev_register_sysfs(struct net_dev + struct class_device *class_dev = &(net->class_dev); + int ret; + +- class_dev->class = &net_class; ++ class_dev->class = &visible_net_class; + class_dev->class_data = net; + + strlcpy(class_dev->class_id, net->name, BUS_ID_SIZE); +@@ -453,12 +461,21 @@ out_cleanup: + out_unreg: + printk(KERN_WARNING "%s: sysfs attribute registration failed %d\n", + net->name, ret); +- class_device_unregister(class_dev); ++ /* put is called in free_netdev() */ ++ class_device_del(class_dev); + out: + return ret; + } + ++void prepare_sysfs_netdev(void) ++{ ++#ifdef CONFIG_VE ++ get_ve0()->net_class = &net_class; ++#endif ++} ++ + int netdev_sysfs_init(void) + { ++ prepare_sysfs_netdev(); + return class_register(&net_class); + } +diff -upr linux-2.6.16.orig/net/core/rtnetlink.c linux-2.6.16-026test015/net/core/rtnetlink.c +--- linux-2.6.16.orig/net/core/rtnetlink.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/core/rtnetlink.c 2006-07-04 14:41:38.000000000 +0400 +@@ -434,6 +434,8 @@ static int rtnetlink_dump_all(struct sk_ + if (rtnetlink_links[idx] == NULL || + rtnetlink_links[idx][type].dumpit == NULL) + continue; ++ if (vz_security_proto_check(idx, 0, 0)) ++ continue; + if (idx > s_idx) + memset(&cb->args[0], 0, sizeof(cb->args)); + if (rtnetlink_links[idx][type].dumpit(skb, cb)) +@@ -501,7 +503,7 @@ rtnetlink_rcv_msg(struct sk_buff *skb, s + return 0; + + family = ((struct rtgenmsg*)NLMSG_DATA(nlh))->rtgen_family; +- if (family >= NPROTO) { ++ if (family >= NPROTO || vz_security_proto_check(family, 0, 0)) { + *errp = -EAFNOSUPPORT; + return -1; + } +diff -upr linux-2.6.16.orig/net/core/scm.c linux-2.6.16-026test015/net/core/scm.c +--- linux-2.6.16.orig/net/core/scm.c 2006-03-20 08:53:29.000000000 +0300 ++++ 
linux-2.6.16-026test015/net/core/scm.c 2006-07-04 14:41:38.000000000 +0400 +@@ -34,6 +34,7 @@ + #include <net/compat.h> + #include <net/scm.h> + ++#include <ub/ub_mem.h> + + /* + * Only allow a user to send credentials, that they could set with +@@ -42,7 +43,9 @@ + + static __inline__ int scm_check_creds(struct ucred *creds) + { +- if ((creds->pid == current->tgid || capable(CAP_SYS_ADMIN)) && ++ if ((creds->pid == virt_tgid(current) || ++ creds->pid == current->tgid || ++ capable(CAP_VE_SYS_ADMIN)) && + ((creds->uid == current->uid || creds->uid == current->euid || + creds->uid == current->suid) || capable(CAP_SETUID)) && + ((creds->gid == current->gid || creds->gid == current->egid || +@@ -69,7 +72,7 @@ static int scm_fp_copy(struct cmsghdr *c + + if (!fpl) + { +- fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL); ++ fpl = ub_kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL); + if (!fpl) + return -ENOMEM; + *fplp = fpl; +@@ -275,7 +278,7 @@ struct scm_fp_list *scm_fp_dup(struct sc + if (!fpl) + return NULL; + +- new_fpl = kmalloc(sizeof(*fpl), GFP_KERNEL); ++ new_fpl = ub_kmalloc(sizeof(*fpl), GFP_KERNEL); + if (new_fpl) { + for (i=fpl->count-1; i>=0; i--) + get_file(fpl->fp[i]); +diff -upr linux-2.6.16.orig/net/core/skbuff.c linux-2.6.16-026test015/net/core/skbuff.c +--- linux-2.6.16.orig/net/core/skbuff.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/core/skbuff.c 2006-07-04 14:41:38.000000000 +0400 +@@ -48,6 +48,7 @@ + #include <linux/in.h> + #include <linux/inet.h> + #include <linux/slab.h> ++#include <linux/kmem_cache.h> + #include <linux/netdevice.h> + #ifdef CONFIG_NET_CLS_ACT + #include <net/pkt_sched.h> +@@ -68,6 +69,8 @@ + #include <asm/uaccess.h> + #include <asm/system.h> + ++#include <ub/ub_net.h> ++ + static kmem_cache_t *skbuff_head_cache __read_mostly; + static kmem_cache_t *skbuff_fclone_cache __read_mostly; + +@@ -147,6 +150,9 @@ struct sk_buff *__alloc_skb(unsigned int + if (!skb) + goto out; + ++ if 
(ub_skb_alloc_bc(skb, gfp_mask & ~__GFP_DMA)) ++ goto nobc; ++ + /* Get the DATA. Size must match skb_add_mtu(). */ + size = SKB_DATA_ALIGN(size); + data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask); +@@ -160,6 +166,7 @@ struct sk_buff *__alloc_skb(unsigned int + skb->data = data; + skb->tail = data; + skb->end = data + size; ++ SET_VE_OWNER_SKB(skb, get_exec_env()); + /* make sure we initialize shinfo sequentially */ + shinfo = skb_shinfo(skb); + atomic_set(&shinfo->dataref, 1); +@@ -182,6 +189,8 @@ struct sk_buff *__alloc_skb(unsigned int + out: + return skb; + nodata: ++ ub_skb_free_bc(skb); ++nobc: + kmem_cache_free(cache, skb); + skb = NULL; + goto out; +@@ -214,6 +223,9 @@ struct sk_buff *alloc_skb_from_cache(kme + if (!skb) + goto out; + ++ if (ub_skb_alloc_bc(skb, gfp_mask & ~__GFP_DMA)) ++ goto nobc; ++ + /* Get the DATA. */ + size = SKB_DATA_ALIGN(size); + data = kmem_cache_alloc(cp, gfp_mask); +@@ -227,6 +239,7 @@ struct sk_buff *alloc_skb_from_cache(kme + skb->data = data; + skb->tail = data; + skb->end = data + size; ++ SET_VE_OWNER_SKB(skb, get_exec_env()); + + atomic_set(&(skb_shinfo(skb)->dataref), 1); + skb_shinfo(skb)->nr_frags = 0; +@@ -236,6 +249,8 @@ struct sk_buff *alloc_skb_from_cache(kme + out: + return skb; + nodata: ++ ub_skb_free_bc(skb); ++nobc: + kmem_cache_free(skbuff_head_cache, skb); + skb = NULL; + goto out; +@@ -290,6 +305,7 @@ void kfree_skbmem(struct sk_buff *skb) + atomic_t *fclone_ref; + + skb_release_data(skb); ++ ub_skb_free_bc(skb); + switch (skb->fclone) { + case SKB_FCLONE_UNAVAILABLE: + kmem_cache_free(skbuff_head_cache, skb); +@@ -331,6 +347,7 @@ void __kfree_skb(struct sk_buff *skb) + #ifdef CONFIG_XFRM + secpath_put(skb->sp); + #endif ++ ub_skb_uncharge(skb); + if (skb->destructor) { + WARN_ON(in_irq()); + skb->destructor(skb); +@@ -386,6 +403,11 @@ struct sk_buff *skb_clone(struct sk_buff + n->fclone = SKB_FCLONE_UNAVAILABLE; + } + ++ if (ub_skb_alloc_bc(n, gfp_mask)) { ++ 
kmem_cache_free(skbuff_head_cache, n); ++ return NULL; ++ } ++ + #define C(x) n->x = skb->x + + n->next = n->prev = NULL; +@@ -415,6 +437,7 @@ struct sk_buff *skb_clone(struct sk_buff + C(ipvs_property); + #endif + C(protocol); ++ SET_VE_OWNER_SKB(n, VE_OWNER_SKB(skb)); + n->destructor = NULL; + #ifdef CONFIG_NETFILTER + C(nfmark); +diff -upr linux-2.6.16.orig/net/core/sock.c linux-2.6.16-026test015/net/core/sock.c +--- linux-2.6.16.orig/net/core/sock.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/core/sock.c 2006-07-04 14:41:38.000000000 +0400 +@@ -108,6 +108,7 @@ + #include <linux/net.h> + #include <linux/mm.h> + #include <linux/slab.h> ++#include <linux/kmem_cache.h> + #include <linux/interrupt.h> + #include <linux/poll.h> + #include <linux/tcp.h> +@@ -124,6 +125,9 @@ + #include <net/xfrm.h> + #include <linux/ipsec.h> + ++#include <ub/ub_net.h> ++#include <ub/beancounter.h> ++ + #include <linux/filter.h> + + #ifdef CONFIG_INET +@@ -172,7 +176,7 @@ static void sock_warn_obsolete_bsdism(co + static char warncomm[TASK_COMM_LEN]; + if (strcmp(warncomm, current->comm) && warned < 5) { + strcpy(warncomm, current->comm); +- printk(KERN_WARNING "process `%s' is using obsolete " ++ ve_printk(VE_LOG, KERN_WARNING "process `%s' is using obsolete " + "%s SO_BSDCOMPAT\n", warncomm, name); + warned++; + } +@@ -404,8 +408,9 @@ set_rcvbuf: + if (!valbool) { + sk->sk_bound_dev_if = 0; + } else { +- if (optlen > IFNAMSIZ) +- optlen = IFNAMSIZ; ++ if (optlen > IFNAMSIZ - 1) ++ optlen = IFNAMSIZ - 1; ++ memset(devname, 0, sizeof(devname)); + if (copy_from_user(devname, optval, optlen)) { + ret = -EFAULT; + break; +@@ -659,6 +664,7 @@ struct sock *sk_alloc(int family, gfp_t + */ + sk->sk_prot = sk->sk_prot_creator = prot; + sock_lock_init(sk); ++ SET_VE_OWNER_SK(sk, get_exec_env()); + } + + if (security_sk_alloc(sk, family, priority)) +@@ -698,6 +704,7 @@ void sk_free(struct sock *sk) + __FUNCTION__, atomic_read(&sk->sk_omem_alloc)); + + security_sk_free(sk); 
++ ub_sock_uncharge(sk); + if (sk->sk_prot_creator->slab != NULL) + kmem_cache_free(sk->sk_prot_creator->slab, sk); + else +@@ -742,14 +749,11 @@ struct sock *sk_clone(const struct sock + if (filter != NULL) + sk_filter_charge(newsk, filter); + +- if (unlikely(xfrm_sk_clone_policy(newsk))) { +- /* It is still raw copy of parent, so invalidate +- * destructor and make plain sk_free() */ +- newsk->sk_destruct = NULL; +- sk_free(newsk); +- newsk = NULL; +- goto out; +- } ++ if (ub_sock_charge(newsk, newsk->sk_family, newsk->sk_type) < 0) ++ goto out_err; ++ ++ if (unlikely(xfrm_sk_clone_policy(newsk))) ++ goto out_err; + + newsk->sk_err = 0; + newsk->sk_priority = 0; +@@ -773,8 +777,15 @@ struct sock *sk_clone(const struct sock + if (newsk->sk_prot->sockets_allocated) + atomic_inc(newsk->sk_prot->sockets_allocated); + } +-out: + return newsk; ++ ++out_err: ++ /* It is still raw copy of parent, so invalidate ++ * destructor and make plain sk_free() */ ++ sock_reset_flag(newsk, SOCK_TIMESTAMP); ++ newsk->sk_destruct = NULL; ++ sk_free(newsk); ++ return NULL; + } + + EXPORT_SYMBOL_GPL(sk_clone); +@@ -934,14 +945,12 @@ static long sock_wait_for_wmem(struct so + /* + * Generic send/receive buffer handlers + */ +- +-static struct sk_buff *sock_alloc_send_pskb(struct sock *sk, +- unsigned long header_len, +- unsigned long data_len, +- int noblock, int *errcode) ++struct sk_buff *sock_alloc_send_skb2(struct sock *sk, unsigned long size, ++ unsigned long size2, int noblock, ++ int *errcode) + { + struct sk_buff *skb; +- gfp_t gfp_mask; ++ unsigned int gfp_mask; + long timeo; + int err; + +@@ -959,46 +968,35 @@ static struct sk_buff *sock_alloc_send_p + if (sk->sk_shutdown & SEND_SHUTDOWN) + goto failure; + +- if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { +- skb = alloc_skb(header_len, sk->sk_allocation); +- if (skb) { +- int npages; +- int i; +- +- /* No pages, we're done... 
*/ +- if (!data_len) +- break; +- +- npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; +- skb->truesize += data_len; +- skb_shinfo(skb)->nr_frags = npages; +- for (i = 0; i < npages; i++) { +- struct page *page; +- skb_frag_t *frag; +- +- page = alloc_pages(sk->sk_allocation, 0); +- if (!page) { +- err = -ENOBUFS; +- skb_shinfo(skb)->nr_frags = i; +- kfree_skb(skb); +- goto failure; +- } +- +- frag = &skb_shinfo(skb)->frags[i]; +- frag->page = page; +- frag->page_offset = 0; +- frag->size = (data_len >= PAGE_SIZE ? +- PAGE_SIZE : +- data_len); +- data_len -= PAGE_SIZE; +- } ++ if (ub_sock_getwres_other(sk, skb_charge_size(size))) { ++ if (size2 < size) { ++ size = size2; ++ continue; ++ } ++ set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); ++ err = -EAGAIN; ++ if (!timeo) ++ goto failure; ++ if (signal_pending(current)) ++ goto interrupted; ++ timeo = ub_sock_wait_for_space(sk, timeo, ++ skb_charge_size(size)); ++ continue; ++ } + ++ if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { ++ skb = alloc_skb(size, sk->sk_allocation); ++ if (skb) + /* Full success... 
*/ + break; +- } ++ ub_sock_retwres_other(sk, skb_charge_size(size), ++ SOCK_MIN_UBCSPACE_CH); + err = -ENOBUFS; + goto failure; + } ++ ub_sock_retwres_other(sk, ++ skb_charge_size(size), ++ SOCK_MIN_UBCSPACE_CH); + set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + err = -EAGAIN; +@@ -1009,6 +1007,7 @@ static struct sk_buff *sock_alloc_send_p + timeo = sock_wait_for_wmem(sk, timeo); + } + ++ ub_skb_set_charge(skb, sk, skb_charge_size(size), UB_OTHERSOCKBUF); + skb_set_owner_w(skb, sk); + return skb; + +@@ -1022,7 +1021,7 @@ failure: + struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, + int noblock, int *errcode) + { +- return sock_alloc_send_pskb(sk, size, 0, noblock, errcode); ++ return sock_alloc_send_skb2(sk, size, size, noblock, errcode); + } + + static void __lock_sock(struct sock *sk) +@@ -1462,7 +1461,8 @@ int proto_register(struct proto *prot, i + + if (alloc_slab) { + prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0, +- SLAB_HWCACHE_ALIGN, NULL, NULL); ++ SLAB_HWCACHE_ALIGN | SLAB_UBC, ++ NULL, NULL); + + if (prot->slab == NULL) { + printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n", +@@ -1478,9 +1478,11 @@ int proto_register(struct proto *prot, i + goto out_free_sock_slab; + + sprintf(request_sock_slab_name, mask, prot->name); +- prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name, +- prot->rsk_prot->obj_size, 0, +- SLAB_HWCACHE_ALIGN, NULL, NULL); ++ prot->rsk_prot->slab = ++ kmem_cache_create(request_sock_slab_name, ++ prot->rsk_prot->obj_size, 0, ++ SLAB_HWCACHE_ALIGN | SLAB_UBC, ++ NULL, NULL); + + if (prot->rsk_prot->slab == NULL) { + printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n", +@@ -1501,7 +1503,7 @@ int proto_register(struct proto *prot, i + prot->twsk_prot->twsk_slab = + kmem_cache_create(timewait_sock_slab_name, + prot->twsk_prot->twsk_obj_size, +- 0, SLAB_HWCACHE_ALIGN, ++ 0, SLAB_HWCACHE_ALIGN | SLAB_UBC, + NULL, NULL); + if 
(prot->twsk_prot->twsk_slab == NULL) + goto out_free_timewait_sock_slab_name; +diff -upr linux-2.6.16.orig/net/core/stream.c linux-2.6.16-026test015/net/core/stream.c +--- linux-2.6.16.orig/net/core/stream.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/core/stream.c 2006-07-04 14:41:37.000000000 +0400 +@@ -111,8 +111,9 @@ EXPORT_SYMBOL(sk_stream_wait_close); + * sk_stream_wait_memory - Wait for more memory for a socket + * @sk: socket to wait for memory + * @timeo_p: for how long ++ * @amount - amount of memory to wait for (in UB space!) + */ +-int sk_stream_wait_memory(struct sock *sk, long *timeo_p) ++int sk_stream_wait_memory(struct sock *sk, long *timeo_p, unsigned long amount) + { + int err = 0; + long vm_wait = 0; +@@ -134,8 +135,11 @@ int sk_stream_wait_memory(struct sock *s + if (signal_pending(current)) + goto do_interrupted; + clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); +- if (sk_stream_memory_free(sk) && !vm_wait) +- break; ++ if (amount == 0) { ++ if (sk_stream_memory_free(sk) && !vm_wait) ++ break; ++ } else ++ ub_sock_sndqueueadd_tcp(sk, amount); + + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + sk->sk_write_pending++; +@@ -144,6 +148,8 @@ int sk_stream_wait_memory(struct sock *s + sk_stream_memory_free(sk) && + vm_wait); + sk->sk_write_pending--; ++ if (amount > 0) ++ ub_sock_sndqueuedel(sk); + + if (vm_wait) { + vm_wait -= current_timeo; +diff -upr linux-2.6.16.orig/net/dccp/ipv6.c linux-2.6.16-026test015/net/dccp/ipv6.c +--- linux-2.6.16.orig/net/dccp/ipv6.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/dccp/ipv6.c 2006-07-04 14:41:37.000000000 +0400 +@@ -872,6 +872,8 @@ static struct sock *dccp_v6_request_recv + ip6_dst_store(newsk, dst, NULL); + newsk->sk_route_caps = dst->dev->features & + ~(NETIF_F_IP_CSUM | NETIF_F_TSO); ++ if (!sysctl_tcp_use_sg) ++ newsk->sk_route_caps &= ~NETIF_F_SG; + + newdp6 = (struct dccp6_sock *)newsk; + newinet = inet_sk(newsk); +diff -upr 
linux-2.6.16.orig/net/ipv4/af_inet.c linux-2.6.16-026test015/net/ipv4/af_inet.c +--- linux-2.6.16.orig/net/ipv4/af_inet.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/af_inet.c 2006-07-04 14:41:38.000000000 +0400 +@@ -114,6 +114,7 @@ + #ifdef CONFIG_IP_MROUTE + #include <linux/mroute.h> + #endif ++#include <ub/ub_net.h> + + DEFINE_SNMP_STAT(struct linux_mib, net_statistics) __read_mostly; + +@@ -298,6 +299,13 @@ lookup_protocol: + if (sk == NULL) + goto out; + ++ err = -ENOBUFS; ++ if (ub_sock_charge(sk, PF_INET, sock->type)) ++ goto out_sk_free; ++ /* if charge was successful, sock_init_data() MUST be called to ++ * set sk->sk_type. otherwise sk will be uncharged to wrong resource ++ */ ++ + err = 0; + sk->sk_no_check = answer_no_check; + if (INET_PROTOSW_REUSE & answer_flags) +@@ -355,6 +363,9 @@ out: + out_rcu_unlock: + rcu_read_unlock(); + goto out; ++out_sk_free: ++ sk_free(sk); ++ return err; + } + + +@@ -369,6 +380,9 @@ int inet_release(struct socket *sock) + + if (sk) { + long timeout; ++ struct ve_struct *saved_env; ++ ++ saved_env = set_exec_env(VE_OWNER_SK(sk)); + + /* Applications forget to leave groups before exiting */ + ip_mc_drop_socket(sk); +@@ -386,6 +400,8 @@ int inet_release(struct socket *sock) + timeout = sk->sk_lingertime; + sock->sk = NULL; + sk->sk_prot->close(sk, timeout); ++ ++ (void)set_exec_env(saved_env); + } + return 0; + } +@@ -1108,20 +1124,20 @@ static struct net_protocol icmp_protocol + + static int __init init_ipv4_mibs(void) + { +- net_statistics[0] = alloc_percpu(struct linux_mib); +- net_statistics[1] = alloc_percpu(struct linux_mib); +- ip_statistics[0] = alloc_percpu(struct ipstats_mib); +- ip_statistics[1] = alloc_percpu(struct ipstats_mib); +- icmp_statistics[0] = alloc_percpu(struct icmp_mib); +- icmp_statistics[1] = alloc_percpu(struct icmp_mib); +- tcp_statistics[0] = alloc_percpu(struct tcp_mib); +- tcp_statistics[1] = alloc_percpu(struct tcp_mib); +- udp_statistics[0] = alloc_percpu(struct 
udp_mib); +- udp_statistics[1] = alloc_percpu(struct udp_mib); ++ ve_net_statistics[0] = alloc_percpu(struct linux_mib); ++ ve_net_statistics[1] = alloc_percpu(struct linux_mib); ++ ve_ip_statistics[0] = alloc_percpu(struct ipstats_mib); ++ ve_ip_statistics[1] = alloc_percpu(struct ipstats_mib); ++ ve_icmp_statistics[0] = alloc_percpu(struct icmp_mib); ++ ve_icmp_statistics[1] = alloc_percpu(struct icmp_mib); ++ ve_tcp_statistics[0] = alloc_percpu(struct tcp_mib); ++ ve_tcp_statistics[1] = alloc_percpu(struct tcp_mib); ++ ve_udp_statistics[0] = alloc_percpu(struct udp_mib); ++ ve_udp_statistics[1] = alloc_percpu(struct udp_mib); + if (! +- (net_statistics[0] && net_statistics[1] && ip_statistics[0] +- && ip_statistics[1] && tcp_statistics[0] && tcp_statistics[1] +- && udp_statistics[0] && udp_statistics[1])) ++ (ve_net_statistics[0] && ve_net_statistics[1] && ve_ip_statistics[0] ++ && ve_ip_statistics[1] && ve_tcp_statistics[0] && ve_tcp_statistics[1] ++ && ve_udp_statistics[0] && ve_udp_statistics[1])) + return -ENOMEM; + + (void) tcp_mib_init(); +diff -upr linux-2.6.16.orig/net/ipv4/arp.c linux-2.6.16-026test015/net/ipv4/arp.c +--- linux-2.6.16.orig/net/ipv4/arp.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/arp.c 2006-07-04 14:41:39.000000000 +0400 +@@ -175,7 +175,7 @@ struct neigh_ops arp_broken_ops = { + .queue_xmit = dev_queue_xmit, + }; + +-struct neigh_table arp_tbl = { ++struct neigh_table global_arp_tbl = { + .family = AF_INET, + .entry_size = sizeof(struct neighbour) + 4, + .key_len = 4, +@@ -184,7 +184,7 @@ struct neigh_table arp_tbl = { + .proxy_redo = parp_redo, + .id = "arp_cache", + .parms = { +- .tbl = &arp_tbl, ++ .tbl = &global_arp_tbl, + .base_reachable_time = 30 * HZ, + .retrans_time = 1 * HZ, + .gc_staletime = 60 * HZ, +@@ -920,6 +920,9 @@ out: + + static void parp_redo(struct sk_buff *skb) + { ++#if defined(CONFIG_NETFILTER) && defined(CONFIG_NETFILTER_DEBUG) ++ skb->nf_debug = 0; ++#endif + arp_process(skb); + } 
+ +@@ -988,7 +991,7 @@ static int arp_req_set(struct arpreq *r, + return 0; + } + if (dev == NULL) { +- ipv4_devconf.proxy_arp = 1; ++ ve_ipv4_devconf.proxy_arp = 1; + return 0; + } + if (__in_dev_get_rtnl(dev)) { +@@ -1094,7 +1097,7 @@ static int arp_req_delete(struct arpreq + return pneigh_delete(&arp_tbl, &ip, dev); + if (mask == 0) { + if (dev == NULL) { +- ipv4_devconf.proxy_arp = 0; ++ ve_ipv4_devconf.proxy_arp = 0; + return 0; + } + if (__in_dev_get_rtnl(dev)) { +@@ -1240,7 +1243,9 @@ static int arp_proc_init(void); + + void __init arp_init(void) + { +- neigh_table_init(&arp_tbl); ++ get_ve0()->ve_arp_tbl = &global_arp_tbl; ++ if (neigh_table_init(&arp_tbl)) ++ panic("cannot initialize ARP tables\n"); + + dev_add_pack(&arp_packet_type); + arp_proc_init(); +@@ -1372,8 +1377,9 @@ static int arp_seq_open(struct inode *in + { + struct seq_file *seq; + int rc = -ENOMEM; +- struct neigh_seq_state *s = kmalloc(sizeof(*s), GFP_KERNEL); +- ++ struct neigh_seq_state *s; ++ ++ s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + goto out; + +@@ -1401,7 +1407,7 @@ static struct file_operations arp_seq_fo + + static int __init arp_proc_init(void) + { +- if (!proc_net_fops_create("arp", S_IRUGO, &arp_seq_fops)) ++ if (!proc_glob_fops_create("net/arp", S_IRUGO, &arp_seq_fops)) + return -ENOMEM; + return 0; + } +@@ -1421,8 +1427,55 @@ EXPORT_SYMBOL(arp_rcv); + EXPORT_SYMBOL(arp_create); + EXPORT_SYMBOL(arp_xmit); + EXPORT_SYMBOL(arp_send); +-EXPORT_SYMBOL(arp_tbl); ++EXPORT_SYMBOL(global_arp_tbl); + + #if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE) + EXPORT_SYMBOL(clip_tbl_hook); + #endif ++ ++int ve_arp_init(struct ve_struct *ve) ++{ ++ struct ve_struct *old_env; ++ int err; ++ ++ ve->ve_arp_tbl = kmalloc(sizeof(struct neigh_table), GFP_KERNEL); ++ if (ve->ve_arp_tbl == NULL) { ++ err = -ENOMEM; ++ goto out; ++ } ++ ++ *(ve->ve_arp_tbl) = global_arp_tbl; ++ ve->ve_arp_tbl->parms.tbl = ve->ve_arp_tbl; ++ old_env = set_exec_env(ve); ++ err = 
neigh_table_init(ve->ve_arp_tbl); ++ if (err) ++ goto out_free; ++#ifdef CONFIG_SYSCTL ++ neigh_sysctl_register(NULL, &arp_tbl.parms, NET_IPV4, ++ NET_IPV4_NEIGH, "ipv4", NULL, NULL); ++#endif ++ set_exec_env(old_env); ++ err = 0; ++ ++out: ++ return err; ++ ++out_free: ++ kfree(ve->ve_arp_tbl); ++ ve->ve_arp_tbl = NULL; ++ goto out; ++} ++EXPORT_SYMBOL(ve_arp_init); ++ ++void ve_arp_fini(struct ve_struct *ve) ++{ ++ if (ve->ve_arp_tbl) { ++#ifdef CONFIG_SYSCTL ++ neigh_sysctl_unregister(&ve->ve_arp_tbl->parms); ++#endif ++ neigh_table_clear(ve->ve_arp_tbl); ++ kfree(ve->ve_arp_tbl); ++ ve->ve_arp_tbl = NULL; ++ } ++} ++EXPORT_SYMBOL(ve_arp_fini); +diff -upr linux-2.6.16.orig/net/ipv4/devinet.c linux-2.6.16-026test015/net/ipv4/devinet.c +--- linux-2.6.16.orig/net/ipv4/devinet.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/devinet.c 2006-07-04 14:41:39.000000000 +0400 +@@ -71,7 +71,7 @@ struct ipv4_devconf ipv4_devconf = { + .shared_media = 1, + }; + +-static struct ipv4_devconf ipv4_devconf_dflt = { ++struct ipv4_devconf ipv4_devconf_dflt = { + .accept_redirects = 1, + .send_redirects = 1, + .secure_redirects = 1, +@@ -79,10 +79,16 @@ static struct ipv4_devconf ipv4_devconf_ + .accept_source_route = 1, + }; + ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++#define ve_ipv4_devconf_dflt (*(get_exec_env()->_ipv4_devconf_dflt)) ++#else ++#define ve_ipv4_devconf_dflt ipv4_devconf_dflt ++#endif ++ + static void rtmsg_ifa(int event, struct in_ifaddr *); + + static struct notifier_block *inetaddr_chain; +-static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, ++void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, + int destroy); + #ifdef CONFIG_SYSCTL + static void devinet_sysctl_register(struct in_device *in_dev, +@@ -92,7 +98,7 @@ static void devinet_sysctl_unregister(st + + /* Locks all the inet devices. 
*/ + +-static struct in_ifaddr *inet_alloc_ifa(void) ++struct in_ifaddr *inet_alloc_ifa(void) + { + struct in_ifaddr *ifa = kmalloc(sizeof(*ifa), GFP_KERNEL); + +@@ -103,6 +109,7 @@ static struct in_ifaddr *inet_alloc_ifa( + + return ifa; + } ++EXPORT_SYMBOL_GPL(inet_alloc_ifa); + + static void inet_rcu_free_ifa(struct rcu_head *head) + { +@@ -175,6 +182,7 @@ out_kfree: + in_dev = NULL; + goto out; + } ++EXPORT_SYMBOL_GPL(inetdev_init); + + static void in_dev_rcu_put(struct rcu_head *head) + { +@@ -190,7 +198,7 @@ static void inetdev_destroy(struct in_de + ASSERT_RTNL(); + + dev = in_dev->dev; +- if (dev == &loopback_dev) ++ if (dev == &ve0_loopback) + return; + + in_dev->dead = 1; +@@ -232,7 +240,7 @@ int inet_addr_onlink(struct in_device *i + return 0; + } + +-static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, ++void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, + int destroy) + { + struct in_ifaddr *promote = NULL; +@@ -320,7 +328,7 @@ static void inet_del_ifa(struct in_devic + } + } + +-static int inet_insert_ifa(struct in_ifaddr *ifa) ++int inet_insert_ifa(struct in_ifaddr *ifa) + { + struct in_device *in_dev = ifa->ifa_dev; + struct in_ifaddr *ifa1, **ifap, **last_primary; +@@ -370,6 +378,7 @@ static int inet_insert_ifa(struct in_ifa + + return 0; + } ++EXPORT_SYMBOL_GPL(inet_insert_ifa); + + static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa) + { +@@ -578,7 +587,7 @@ int devinet_ioctl(unsigned int cmd, void + + case SIOCSIFFLAGS: + ret = -EACCES; +- if (!capable(CAP_NET_ADMIN)) ++ if (!capable(CAP_VE_NET_ADMIN)) + goto out; + break; + case SIOCSIFADDR: /* Set interface address (and family) */ +@@ -586,7 +595,7 @@ int devinet_ioctl(unsigned int cmd, void + case SIOCSIFDSTADDR: /* Set the destination address */ + case SIOCSIFNETMASK: /* Set the netmask for the interface */ + ret = -EACCES; +- if (!capable(CAP_NET_ADMIN)) ++ if (!capable(CAP_VE_NET_ADMIN)) + goto out; + ret = -EINVAL; + if 
(sin->sin_family != AF_INET) +@@ -1163,10 +1172,10 @@ static struct rtnetlink_link inet_rtnetl + void inet_forward_change(void) + { + struct net_device *dev; +- int on = ipv4_devconf.forwarding; ++ int on = ve_ipv4_devconf.forwarding; + +- ipv4_devconf.accept_redirects = !on; +- ipv4_devconf_dflt.forwarding = on; ++ ve_ipv4_devconf.accept_redirects = !on; ++ ve_ipv4_devconf_dflt.forwarding = on; + + read_lock(&dev_base_lock); + for (dev = dev_base; dev; dev = dev->next) { +@@ -1191,9 +1200,9 @@ static int devinet_sysctl_forward(ctl_ta + int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); + + if (write && *valp != val) { +- if (valp == &ipv4_devconf.forwarding) ++ if (valp == &ve_ipv4_devconf.forwarding) + inet_forward_change(); +- else if (valp != &ipv4_devconf_dflt.forwarding) ++ else if (valp != &ve_ipv4_devconf_dflt.forwarding) + rt_cache_flush(0); + } + +@@ -1464,30 +1473,22 @@ static struct devinet_sysctl_table { + }, + }; + +-static void devinet_sysctl_register(struct in_device *in_dev, +- struct ipv4_devconf *p) ++static struct devinet_sysctl_table *__devinet_sysctl_register(char *dev_name, ++ int ifindex, struct ipv4_devconf *p) + { + int i; +- struct net_device *dev = in_dev ? 
in_dev->dev : NULL; +- struct devinet_sysctl_table *t = kmalloc(sizeof(*t), GFP_KERNEL); +- char *dev_name = NULL; ++ struct devinet_sysctl_table *t; + ++ t = kmalloc(sizeof(*t), GFP_KERNEL); + if (!t) +- return; ++ goto out; ++ + memcpy(t, &devinet_sysctl, sizeof(*t)); + for (i = 0; i < ARRAY_SIZE(t->devinet_vars) - 1; i++) { + t->devinet_vars[i].data += (char *)p - (char *)&ipv4_devconf; + t->devinet_vars[i].de = NULL; + } + +- if (dev) { +- dev_name = dev->name; +- t->devinet_dev[0].ctl_name = dev->ifindex; +- } else { +- dev_name = "default"; +- t->devinet_dev[0].ctl_name = NET_PROTO_CONF_DEFAULT; +- } +- + /* + * Make a copy of dev_name, because '.procname' is regarded as const + * by sysctl and we wouldn't want anyone to change it under our feet +@@ -1495,8 +1496,9 @@ static void devinet_sysctl_register(stru + */ + dev_name = kstrdup(dev_name, GFP_KERNEL); + if (!dev_name) +- goto free; ++ goto out_free_table; + ++ t->devinet_dev[0].ctl_name = ifindex; + t->devinet_dev[0].procname = dev_name; + t->devinet_dev[0].child = t->devinet_vars; + t->devinet_dev[0].de = NULL; +@@ -1509,17 +1511,38 @@ static void devinet_sysctl_register(stru + + t->sysctl_header = register_sysctl_table(t->devinet_root_dir, 0); + if (!t->sysctl_header) +- goto free_procname; ++ goto out_free_procname; + +- p->sysctl = t; +- return; ++ return t; + + /* error path */ +- free_procname: ++out_free_procname: + kfree(dev_name); +- free: ++out_free_table: + kfree(t); +- return; ++out: ++ printk(KERN_DEBUG "Can't register net/ipv4/conf sysctls.\n"); ++ return NULL; ++} ++ ++static void devinet_sysctl_register(struct in_device *in_dev, ++ struct ipv4_devconf *p) ++{ ++ struct net_device *dev; ++ char *dev_name; ++ int ifindex; ++ ++ dev = in_dev ? 
in_dev->dev : NULL; ++ ++ if (dev) { ++ dev_name = dev->name; ++ ifindex = dev->ifindex; ++ } else { ++ dev_name = "default"; ++ ifindex = NET_PROTO_CONF_DEFAULT; ++ } ++ ++ p->sysctl = __devinet_sysctl_register(dev_name, ifindex, p); + } + + static void devinet_sysctl_unregister(struct ipv4_devconf *p) +@@ -1532,7 +1555,170 @@ static void devinet_sysctl_unregister(st + kfree(t); + } + } ++ ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++static ctl_table net_sysctl_tables[] = { ++ /* 0: net */ ++ { ++ .ctl_name = CTL_NET, ++ .procname = "net", ++ .mode = 0555, ++ .child = &net_sysctl_tables[2], ++ }, ++ { .ctl_name = 0, }, ++ /* 2: net/ipv4 */ ++ { ++ .ctl_name = NET_IPV4, ++ .procname = "ipv4", ++ .mode = 0555, ++ .child = &net_sysctl_tables[4], ++ }, ++ { .ctl_name = 0, }, ++ /* 4, 5: net/ipv4/[vars] */ ++ { ++ .ctl_name = NET_IPV4_FORWARD, ++ .procname = "ip_forward", ++ .data = &ipv4_devconf.forwarding, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &ipv4_sysctl_forward, ++ .strategy = &ipv4_sysctl_forward_strategy, ++ }, ++ { ++ .ctl_name = NET_IPV4_ROUTE, ++ .procname = "route", ++ .maxlen = 0, ++ .mode = 0555, ++ .child = &net_sysctl_tables[7], ++ }, ++ { .ctl_name = 0 }, ++ /* 7: net/ipv4/route/flush */ ++ { ++ .ctl_name = NET_IPV4_ROUTE_FLUSH, ++ .procname = "flush", ++ .data = NULL, /* setuped below */ ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &ipv4_sysctl_rtcache_flush, ++ .strategy = &ipv4_sysctl_rtcache_flush_strategy, ++ }, ++ { .ctl_name = 0 }, ++}; ++ ++static int ip_forward_sysctl_register(struct ve_struct *ve, ++ struct ipv4_devconf *p) ++{ ++ struct ctl_table_header *hdr; ++ ctl_table *root; ++ ++ root = clone_sysctl_template(net_sysctl_tables, ++ sizeof(net_sysctl_tables) / sizeof(ctl_table)); ++ if (root == NULL) ++ goto out; ++ ++ root[4].data = &p->forwarding; ++ root[7].data = &ipv4_flush_delay; ++ ++ hdr = register_sysctl_table(root, 1); ++ if (hdr == NULL) ++ goto out_free; ++ ++ 
ve->forward_header = hdr; ++ ve->forward_table = root; ++ return 0; ++ ++out_free: ++ free_sysctl_clone(root); ++out: ++ return -ENOMEM; ++} ++ ++static inline void ip_forward_sysctl_unregister(struct ve_struct *ve) ++{ ++ unregister_sysctl_table(ve->forward_header); ++ ve->forward_header = NULL; ++} ++ ++static inline void ip_forward_sysctl_free(struct ve_struct *ve) ++{ ++ free_sysctl_clone(ve->forward_table); ++ ve->forward_table = NULL; ++} ++#endif ++#endif ++ ++int devinet_sysctl_init(struct ve_struct *ve) ++{ ++ int err = 0; ++#ifdef CONFIG_SYSCTL ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++ struct ipv4_devconf *conf, *conf_def; ++ ++ err = -ENOMEM; ++ ++ conf = kmalloc(sizeof(*conf), GFP_KERNEL); ++ if (!conf) ++ goto err1; ++ ++ memcpy(conf, &ipv4_devconf, sizeof(*conf)); ++ conf->sysctl = __devinet_sysctl_register("all", ++ NET_PROTO_CONF_ALL, conf); ++ if (!conf->sysctl) ++ goto err2; ++ ++ conf_def = kmalloc(sizeof(*conf_def), GFP_KERNEL); ++ if (!conf_def) ++ goto err3; ++ ++ memcpy(conf_def, &ipv4_devconf_dflt, sizeof(*conf_def)); ++ conf_def->sysctl = __devinet_sysctl_register("default", ++ NET_PROTO_CONF_DEFAULT, conf_def); ++ if (!conf_def->sysctl) ++ goto err4; ++ ++ err = ip_forward_sysctl_register(ve, conf); ++ if (err) ++ goto err5; ++ ++ ve->_ipv4_devconf = conf; ++ ve->_ipv4_devconf_dflt = conf_def; ++ return 0; ++ ++err5: ++ devinet_sysctl_unregister(conf_def); ++err4: ++ kfree(conf_def); ++err3: ++ devinet_sysctl_unregister(conf); ++err2: ++ kfree(conf); ++err1: + #endif ++#endif ++ return err; ++} ++ ++void devinet_sysctl_fini(struct ve_struct *ve) ++{ ++#ifdef CONFIG_SYSCTL ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++ ip_forward_sysctl_unregister(ve); ++ devinet_sysctl_unregister(ve->_ipv4_devconf); ++ devinet_sysctl_unregister(ve->_ipv4_devconf_dflt); ++#endif ++#endif ++} ++ ++void devinet_sysctl_free(struct ve_struct *ve) ++{ ++#ifdef CONFIG_SYSCTL ++#if defined(CONFIG_VE_NETDEV) || 
defined(CONFIG_VE_NETDEV_MODULE) ++ ip_forward_sysctl_free(ve); ++ kfree(ve->_ipv4_devconf); ++ kfree(ve->_ipv4_devconf_dflt); ++#endif ++#endif ++} + + void __init devinet_init(void) + { +@@ -1542,13 +1728,18 @@ void __init devinet_init(void) + #ifdef CONFIG_SYSCTL + devinet_sysctl.sysctl_header = + register_sysctl_table(devinet_sysctl.devinet_root_dir, 0); +- devinet_sysctl_register(NULL, &ipv4_devconf_dflt); ++ __devinet_sysctl_register("default", NET_PROTO_CONF_DEFAULT, ++ &ipv4_devconf_dflt); + #endif + } + + EXPORT_SYMBOL(devinet_ioctl); + EXPORT_SYMBOL(in_dev_finish_destroy); + EXPORT_SYMBOL(inet_select_addr); ++EXPORT_SYMBOL(inet_del_ifa); + EXPORT_SYMBOL(inetdev_by_index); ++EXPORT_SYMBOL(devinet_sysctl_init); ++EXPORT_SYMBOL(devinet_sysctl_fini); ++EXPORT_SYMBOL(devinet_sysctl_free); + EXPORT_SYMBOL(register_inetaddr_notifier); + EXPORT_SYMBOL(unregister_inetaddr_notifier); +diff -upr linux-2.6.16.orig/net/ipv4/fib_frontend.c linux-2.6.16-026test015/net/ipv4/fib_frontend.c +--- linux-2.6.16.orig/net/ipv4/fib_frontend.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/fib_frontend.c 2006-07-04 14:41:39.000000000 +0400 +@@ -53,14 +53,46 @@ + + #define RT_TABLE_MIN RT_TABLE_MAIN + ++#undef ip_fib_local_table ++#undef ip_fib_main_table + struct fib_table *ip_fib_local_table; + struct fib_table *ip_fib_main_table; ++void prepare_fib_tables(void) ++{ ++#ifdef CONFIG_VE ++ get_ve0()->_local_table = ip_fib_local_table; ++ ip_fib_local_table = (struct fib_table *)0x12345678; ++ get_ve0()->_main_table = ip_fib_main_table; ++ ip_fib_main_table = (struct fib_table *)0x12345678; ++#endif ++} ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++#define ip_fib_local_table get_exec_env()->_local_table ++#define ip_fib_main_table get_exec_env()->_main_table ++#endif + + #else + + #define RT_TABLE_MIN 1 + ++#undef fib_tables + struct fib_table *fib_tables[RT_TABLE_MAX+1]; ++void prepare_fib_tables(void) ++{ ++#if 
defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++ int i; ++ ++ BUG_ON(sizeof(fib_tables) != ++ sizeof(((struct ve_struct *)0)->_fib_tables)); ++ memcpy(get_ve0()->_fib_tables, fib_tables, sizeof(fib_tables)); ++ for (i = 0; i <= RT_TABLE_MAX; i++) ++ fib_tables[i] = (void *)0x12366678; ++#endif ++} ++ ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++#define fib_tables get_exec_env()->_fib_tables ++#endif + + struct fib_table *__fib_new_table(int id) + { +@@ -250,7 +282,7 @@ int ip_rt_ioctl(unsigned int cmd, void _ + switch (cmd) { + case SIOCADDRT: /* Add a route */ + case SIOCDELRT: /* Delete a route */ +- if (!capable(CAP_NET_ADMIN)) ++ if (!capable(CAP_VE_NET_ADMIN)) + return -EPERM; + if (copy_from_user(&r, arg, sizeof(struct rtentry))) + return -EFAULT; +@@ -653,6 +685,7 @@ static struct notifier_block fib_netdev_ + + void __init ip_fib_init(void) + { ++ prepare_fib_tables(); + #ifndef CONFIG_IP_MULTIPLE_TABLES + ip_fib_local_table = fib_hash_init(RT_TABLE_LOCAL); + ip_fib_main_table = fib_hash_init(RT_TABLE_MAIN); +diff -upr linux-2.6.16.orig/net/ipv4/fib_hash.c linux-2.6.16-026test015/net/ipv4/fib_hash.c +--- linux-2.6.16.orig/net/ipv4/fib_hash.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/fib_hash.c 2006-07-04 14:41:38.000000000 +0400 +@@ -36,6 +36,7 @@ + #include <linux/skbuff.h> + #include <linux/netlink.h> + #include <linux/init.h> ++#include <linux/ve.h> + + #include <net/ip.h> + #include <net/protocol.h> +@@ -73,11 +74,6 @@ struct fn_zone { + * can be cheaper than memory lookup, so that FZ_* macros are used. 
+ */ + +-struct fn_hash { +- struct fn_zone *fn_zones[33]; +- struct fn_zone *fn_zone_list; +-}; +- + static inline u32 fn_hash(u32 key, struct fn_zone *fz) + { + u32 h = ntohl(key)>>(32 - fz->fz_order); +@@ -623,7 +619,7 @@ fn_hash_delete(struct fib_table *tb, str + return -ESRCH; + } + +-static int fn_flush_list(struct fn_zone *fz, int idx) ++static int fn_flush_list(struct fn_zone *fz, int idx, int destroy) + { + struct hlist_head *head = &fz->fz_hash[idx]; + struct hlist_node *node, *n; +@@ -638,7 +634,9 @@ static int fn_flush_list(struct fn_zone + list_for_each_entry_safe(fa, fa_node, &f->fn_alias, fa_list) { + struct fib_info *fi = fa->fa_info; + +- if (fi && (fi->fib_flags&RTNH_F_DEAD)) { ++ if (fi == NULL) ++ continue; ++ if (destroy || (fi->fib_flags&RTNH_F_DEAD)) { + write_lock_bh(&fib_hash_lock); + list_del(&fa->fa_list); + if (list_empty(&f->fn_alias)) { +@@ -660,7 +658,7 @@ static int fn_flush_list(struct fn_zone + return found; + } + +-static int fn_hash_flush(struct fib_table *tb) ++static int __fn_hash_flush(struct fib_table *tb, int destroy) + { + struct fn_hash *table = (struct fn_hash *) tb->tb_data; + struct fn_zone *fz; +@@ -670,11 +668,84 @@ static int fn_hash_flush(struct fib_tabl + int i; + + for (i = fz->fz_divisor - 1; i >= 0; i--) +- found += fn_flush_list(fz, i); ++ found += fn_flush_list(fz, i, destroy); + } + return found; + } + ++static int fn_hash_flush(struct fib_table *tb) ++{ ++ return __fn_hash_flush(tb, 0); ++} ++ ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++void fib_hash_destroy(struct fib_table *tb) ++{ ++ __fn_hash_flush(tb, 1); ++ kfree(tb); ++} ++ ++/* ++ * Initialization of virtualized networking subsystem. 
++ */ ++int init_ve_route(struct ve_struct *ve) ++{ ++#ifdef CONFIG_IP_MULTIPLE_TABLES ++ if (fib_rules_create()) ++ return -ENOMEM; ++ ve->_fib_tables[RT_TABLE_LOCAL] = fib_hash_init(RT_TABLE_LOCAL); ++ if (!ve->_fib_tables[RT_TABLE_LOCAL]) ++ goto out_destroy; ++ ve->_fib_tables[RT_TABLE_MAIN] = fib_hash_init(RT_TABLE_MAIN); ++ if (!ve->_fib_tables[RT_TABLE_MAIN]) ++ goto out_destroy_local; ++ ++ return 0; ++ ++out_destroy_local: ++ fib_hash_destroy(ve->_fib_tables[RT_TABLE_LOCAL]); ++out_destroy: ++ fib_rules_destroy(); ++ ve->_local_rule = NULL; ++ return -ENOMEM; ++#else ++ ve->_local_table = fib_hash_init(RT_TABLE_LOCAL); ++ if (!ve->_local_table) ++ return -ENOMEM; ++ ve->_main_table = fib_hash_init(RT_TABLE_MAIN); ++ if (!ve->_main_table) { ++ fib_hash_destroy(ve->_local_table); ++ return -ENOMEM; ++ } ++ return 0; ++#endif ++} ++ ++void fini_ve_route(struct ve_struct *ve) ++{ ++#ifdef CONFIG_IP_MULTIPLE_TABLES ++ int i; ++ for (i=0; i<RT_TABLE_MAX+1; i++) ++ { ++ if (!ve->_fib_tables[i]) ++ continue; ++ fib_hash_destroy(ve->_fib_tables[i]); ++ } ++ fib_rules_destroy(); ++ ve->_local_rule = NULL; ++#else ++ fib_hash_destroy(ve->_local_table); ++ fib_hash_destroy(ve->_main_table); ++#endif ++ fib_hash_free(ve->_fib_info_hash, ve->_fib_hash_size); ++ fib_hash_free(ve->_fib_info_laddrhash, ve->_fib_hash_size); ++ ve->_fib_info_hash = ve->_fib_info_laddrhash = NULL; ++} ++ ++EXPORT_SYMBOL(init_ve_route); ++EXPORT_SYMBOL(fini_ve_route); ++#endif ++ + + static inline int + fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb, +@@ -766,7 +837,7 @@ static int fn_hash_dump(struct fib_table + return skb->len; + } + +-#ifdef CONFIG_IP_MULTIPLE_TABLES ++#if defined(CONFIG_IP_MULTIPLE_TABLES) || defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) + struct fib_table * fib_hash_init(int id) + #else + struct fib_table * __init fib_hash_init(int id) +@@ -1076,13 +1147,13 @@ static struct file_operations fib_seq_fo + + int __init fib_proc_init(void) 
+ { +- if (!proc_net_fops_create("route", S_IRUGO, &fib_seq_fops)) ++ if (!proc_glob_fops_create("net/route", S_IRUGO, &fib_seq_fops)) + return -ENOMEM; + return 0; + } + + void __init fib_proc_exit(void) + { +- proc_net_remove("route"); ++ remove_proc_glob_entry("net/route", NULL); + } + #endif /* CONFIG_PROC_FS */ +diff -upr linux-2.6.16.orig/net/ipv4/fib_lookup.h linux-2.6.16-026test015/net/ipv4/fib_lookup.h +--- linux-2.6.16.orig/net/ipv4/fib_lookup.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/fib_lookup.h 2006-07-04 14:41:38.000000000 +0400 +@@ -41,5 +41,6 @@ extern struct fib_alias *fib_find_alias( + extern int fib_detect_death(struct fib_info *fi, int order, + struct fib_info **last_resort, + int *last_idx, int *dflt); ++void fib_hash_free(struct hlist_head *hash, int bytes); + + #endif /* _FIB_LOOKUP_H */ +diff -upr linux-2.6.16.orig/net/ipv4/fib_rules.c linux-2.6.16-026test015/net/ipv4/fib_rules.c +--- linux-2.6.16.orig/net/ipv4/fib_rules.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/fib_rules.c 2006-07-04 14:41:39.000000000 +0400 +@@ -39,6 +39,7 @@ + #include <linux/proc_fs.h> + #include <linux/skbuff.h> + #include <linux/netlink.h> ++#include <linux/rtnetlink.h> + #include <linux/init.h> + + #include <net/ip.h> +@@ -99,9 +100,89 @@ static struct fib_rule local_rule = { + .r_action = RTN_UNICAST, + }; + +-static struct fib_rule *fib_rules = &local_rule; + static DEFINE_RWLOCK(fib_rules_lock); + ++void __init prepare_fib_rules(void) ++{ ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++ get_ve0()->_local_rule = &local_rule; ++ get_ve0()->_fib_rules = &local_rule; ++#endif ++} ++ ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++#define local_rule (*(get_exec_env()->_local_rule)) ++#define fib_rules (get_exec_env()->_fib_rules) ++#else ++static struct fib_rule *fib_rules = &local_rule; ++#endif ++ ++#if defined(CONFIG_VE_CALLS) || 
defined(CONFIG_VE_CALLS_MODULE) ++int fib_rules_create() ++{ ++ struct fib_rule *default_rule, *main_rule, *loc_rule; ++ ++ default_rule = kmalloc(sizeof(struct fib_rule), GFP_KERNEL); ++ if (default_rule == NULL) ++ goto out_def; ++ memset(default_rule, 0, sizeof(struct fib_rule)); ++ atomic_set(&default_rule->r_clntref, 1); ++ default_rule->r_preference = 0x7FFF; ++ default_rule->r_table = RT_TABLE_DEFAULT; ++ default_rule->r_action = RTN_UNICAST; ++ ++ main_rule = kmalloc(sizeof(struct fib_rule), GFP_KERNEL); ++ if (main_rule == NULL) ++ goto out_main; ++ memset(main_rule, 0, sizeof(struct fib_rule)); ++ atomic_set(&main_rule->r_clntref, 1); ++ main_rule->r_preference = 0x7FFE; ++ main_rule->r_table = RT_TABLE_MAIN; ++ main_rule->r_action = RTN_UNICAST; ++ main_rule->r_next = default_rule; ++ ++ loc_rule = kmalloc(sizeof(struct fib_rule), GFP_KERNEL); ++ if (loc_rule == NULL) ++ goto out_loc; ++ memset(loc_rule, 0, sizeof(struct fib_rule)); ++ atomic_set(&loc_rule->r_clntref, 1); ++ loc_rule->r_preference = 0; ++ loc_rule->r_table = RT_TABLE_LOCAL; ++ loc_rule->r_action = RTN_UNICAST; ++ loc_rule->r_next = main_rule; ++ ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++ get_exec_env()->_local_rule = loc_rule; ++ get_exec_env()->_fib_rules = loc_rule; ++#endif ++ ++ return 0; ++ ++out_loc: ++ kfree(main_rule); ++out_main: ++ kfree(default_rule); ++out_def: ++ return -1; ++} ++ ++void fib_rules_destroy() ++{ ++ struct fib_rule *r; ++ ++ rtnl_lock(); ++ write_lock_bh(&fib_rules_lock); ++ while(fib_rules != NULL) { ++ r = fib_rules; ++ fib_rules = fib_rules->r_next; ++ r->r_dead = 1; ++ fib_rule_put(r); ++ } ++ write_unlock_bh(&fib_rules_lock); ++ rtnl_unlock(); ++} ++#endif ++ + int inet_rtm_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) + { + struct rtattr **rta = arg; +@@ -435,5 +516,6 @@ int inet_dump_rules(struct sk_buff *skb, + + void __init fib_rules_init(void) + { ++ prepare_fib_rules(); + 
register_netdevice_notifier(&fib_rules_notifier); + } +diff -upr linux-2.6.16.orig/net/ipv4/fib_semantics.c linux-2.6.16-026test015/net/ipv4/fib_semantics.c +--- linux-2.6.16.orig/net/ipv4/fib_semantics.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/fib_semantics.c 2006-07-04 14:41:39.000000000 +0400 +@@ -33,6 +33,7 @@ + #include <linux/netdevice.h> + #include <linux/if_arp.h> + #include <linux/proc_fs.h> ++#include <linux/ve.h> + #include <linux/skbuff.h> + #include <linux/netlink.h> + #include <linux/init.h> +@@ -56,6 +57,24 @@ static struct hlist_head *fib_info_laddr + static unsigned int fib_hash_size; + static unsigned int fib_info_cnt; + ++void prepare_fib_info(void) ++{ ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++ get_ve0()->_fib_info_hash = fib_info_hash; ++ get_ve0()->_fib_info_laddrhash = fib_info_laddrhash; ++ get_ve0()->_fib_hash_size = fib_hash_size; ++ get_ve0()->_fib_info_cnt = fib_info_cnt; ++#endif ++} ++ ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++#define fib_info_hash (get_exec_env()->_fib_info_hash) ++#define fib_info_laddrhash (get_exec_env()->_fib_info_laddrhash) ++#define fib_hash_size (get_exec_env()->_fib_hash_size) ++#define fib_info_cnt (get_exec_env()->_fib_info_cnt) ++#endif ++ ++ + #define DEVINDEX_HASHBITS 8 + #define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS) + static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE]; +@@ -235,13 +254,15 @@ static struct fib_info *fib_find_info(co + return NULL; + } + +-static inline unsigned int fib_devindex_hashfn(unsigned int val) ++static inline unsigned int fib_devindex_hashfn(unsigned int val, ++ envid_t veid) + { + unsigned int mask = DEVINDEX_HASHSIZE - 1; + + return (val ^ + (val >> DEVINDEX_HASHBITS) ^ +- (val >> (DEVINDEX_HASHBITS * 2))) & mask; ++ (val >> (DEVINDEX_HASHBITS * 2)) ^ ++ (veid ^ (veid >> 16))) & mask; + } + + /* Check, that the gateway is already configured. 
+@@ -257,7 +278,7 @@ int ip_fib_check_default(u32 gw, struct + + read_lock(&fib_info_lock); + +- hash = fib_devindex_hashfn(dev->ifindex); ++ hash = fib_devindex_hashfn(dev->ifindex, VEID(dev->owner_env)); + head = &fib_info_devhash[hash]; + hlist_for_each_entry(nh, node, head, nh_hash) { + if (nh->nh_dev == dev && +@@ -580,7 +601,7 @@ static struct hlist_head *fib_hash_alloc + __get_free_pages(GFP_KERNEL, get_order(bytes)); + } + +-static void fib_hash_free(struct hlist_head *hash, int bytes) ++void fib_hash_free(struct hlist_head *hash, int bytes) + { + if (!hash) + return; +@@ -837,7 +858,8 @@ link_it: + + if (!nh->nh_dev) + continue; +- hash = fib_devindex_hashfn(nh->nh_dev->ifindex); ++ hash = fib_devindex_hashfn(nh->nh_dev->ifindex, ++ VEID(nh->nh_dev->owner_env)); + head = &fib_info_devhash[hash]; + hlist_add_head(&nh->nh_hash, head); + } endfor_nexthops(fi) +@@ -1184,7 +1206,8 @@ int fib_sync_down(u32 local, struct net_ + + if (dev) { + struct fib_info *prev_fi = NULL; +- unsigned int hash = fib_devindex_hashfn(dev->ifindex); ++ unsigned int hash = fib_devindex_hashfn(dev->ifindex, ++ VEID(dev->owner_env)); + struct hlist_head *head = &fib_info_devhash[hash]; + struct hlist_node *node; + struct fib_nh *nh; +@@ -1249,7 +1272,7 @@ int fib_sync_up(struct net_device *dev) + return 0; + + prev_fi = NULL; +- hash = fib_devindex_hashfn(dev->ifindex); ++ hash = fib_devindex_hashfn(dev->ifindex, VEID(dev->owner_env)); + head = &fib_info_devhash[hash]; + ret = 0; + +diff -upr linux-2.6.16.orig/net/ipv4/fib_trie.c linux-2.6.16-026test015/net/ipv4/fib_trie.c +--- linux-2.6.16.orig/net/ipv4/fib_trie.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/fib_trie.c 2006-07-04 14:41:36.000000000 +0400 +@@ -314,11 +314,6 @@ static void __leaf_free_rcu(struct rcu_h + kfree(container_of(head, struct leaf, rcu)); + } + +-static inline void free_leaf(struct leaf *leaf) +-{ +- call_rcu(&leaf->rcu, __leaf_free_rcu); +-} +- + static void 
__leaf_info_free_rcu(struct rcu_head *head) + { + kfree(container_of(head, struct leaf_info, rcu)); +@@ -357,7 +352,12 @@ static void __tnode_free_rcu(struct rcu_ + + static inline void tnode_free(struct tnode *tn) + { +- call_rcu(&tn->rcu, __tnode_free_rcu); ++ if(IS_LEAF(tn)) { ++ struct leaf *l = (struct leaf *) tn; ++ call_rcu_bh(&l->rcu, __leaf_free_rcu); ++ } ++ else ++ call_rcu(&tn->rcu, __tnode_free_rcu); + } + + static struct leaf *leaf_new(void) +diff -upr linux-2.6.16.orig/net/ipv4/igmp.c linux-2.6.16-026test015/net/ipv4/igmp.c +--- linux-2.6.16.orig/net/ipv4/igmp.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/igmp.c 2006-07-04 14:41:39.000000000 +0400 +@@ -2262,6 +2262,8 @@ static inline struct ip_mc_list *igmp_mc + state->dev; + state->dev = state->dev->next) { + struct in_device *in_dev; ++ if (unlikely(!ve_accessible_strict(state->dev->owner_env, get_exec_env()))) ++ continue; + in_dev = in_dev_get(state->dev); + if (!in_dev) + continue; +@@ -2291,6 +2293,8 @@ static struct ip_mc_list *igmp_mc_get_ne + state->in_dev = NULL; + break; + } ++ if (unlikely(!ve_accessible_strict(state->dev->owner_env, get_exec_env()))) ++ continue; + state->in_dev = in_dev_get(state->dev); + if (!state->in_dev) + continue; +@@ -2425,6 +2429,8 @@ static inline struct ip_sf_list *igmp_mc + state->dev; + state->dev = state->dev->next) { + struct in_device *idev; ++ if (unlikely(!ve_accessible_strict(state->dev->owner_env, get_exec_env()))) ++ continue; + idev = in_dev_get(state->dev); + if (unlikely(idev == NULL)) + continue; +@@ -2464,6 +2470,8 @@ static struct ip_sf_list *igmp_mcf_get_n + state->idev = NULL; + goto out; + } ++ if (unlikely(!ve_accessible_strict(state->dev->owner_env, get_exec_env()))) ++ continue; + state->idev = in_dev_get(state->dev); + if (!state->idev) + continue; +@@ -2584,8 +2592,8 @@ static struct file_operations igmp_mcf_s + + int __init igmp_mc_proc_init(void) + { +- proc_net_fops_create("igmp", S_IRUGO, 
&igmp_mc_seq_fops); +- proc_net_fops_create("mcfilter", S_IRUGO, &igmp_mcf_seq_fops); ++ proc_glob_fops_create("net/igmp", S_IRUGO, &igmp_mc_seq_fops); ++ proc_glob_fops_create("net/mcfilter", S_IRUGO, &igmp_mcf_seq_fops); + return 0; + } + #endif +diff -upr linux-2.6.16.orig/net/ipv4/inet_connection_sock.c linux-2.6.16-026test015/net/ipv4/inet_connection_sock.c +--- linux-2.6.16.orig/net/ipv4/inet_connection_sock.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/inet_connection_sock.c 2006-07-04 14:41:38.000000000 +0400 +@@ -25,6 +25,9 @@ + #include <net/tcp_states.h> + #include <net/xfrm.h> + ++#include <ub/ub_net.h> ++#include <ub/ub_orphan.h> ++ + #ifdef INET_CSK_DEBUG + const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; + EXPORT_SYMBOL(inet_csk_timer_bug_msg); +@@ -48,6 +51,7 @@ int inet_csk_bind_conflict(const struct + sk_for_each_bound(sk2, node, &tb->owners) { + if (sk != sk2 && + !inet_v6_ipv6only(sk2) && ++ !ve_accessible_strict(VE_OWNER_SK(sk), VE_OWNER_SK(sk2)) && + (!sk->sk_bound_dev_if || + !sk2->sk_bound_dev_if || + sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { +@@ -77,7 +81,9 @@ int inet_csk_get_port(struct inet_hashin + struct hlist_node *node; + struct inet_bind_bucket *tb; + int ret; ++ struct ve_struct *env; + ++ env = VE_OWNER_SK(sk); + local_bh_disable(); + if (!snum) { + int low = sysctl_local_port_range[0]; +@@ -86,11 +92,15 @@ int inet_csk_get_port(struct inet_hashin + int rover = net_random() % (high - low) + low; + + do { +- head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)]; ++ head = &hashinfo->bhash[inet_bhashfn(rover, ++ hashinfo->bhash_size, VEID(env))]; + spin_lock(&head->lock); +- inet_bind_bucket_for_each(tb, node, &head->chain) ++ inet_bind_bucket_for_each(tb, node, &head->chain) { ++ if (!ve_accessible_strict(VE_OWNER_TB(tb),env)) ++ continue; + if (tb->port == rover) + goto next; ++ } + break; + next: + spin_unlock(&head->lock); +@@ -113,11 +123,15 @@ int 
inet_csk_get_port(struct inet_hashin + */ + snum = rover; + } else { +- head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)]; ++ head = &hashinfo->bhash[inet_bhashfn(snum, ++ hashinfo->bhash_size, VEID(env))]; + spin_lock(&head->lock); +- inet_bind_bucket_for_each(tb, node, &head->chain) ++ inet_bind_bucket_for_each(tb, node, &head->chain) { ++ if (!ve_accessible_strict(VE_OWNER_TB(tb), env)) ++ continue; + if (tb->port == snum) + goto tb_found; ++ } + } + tb = NULL; + goto tb_not_found; +@@ -136,7 +150,7 @@ tb_found: + } + tb_not_found: + ret = 1; +- if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, head, snum)) == NULL) ++ if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, head, snum, env)) == NULL) + goto fail_unlock; + if (hlist_empty(&tb->owners)) { + if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) +@@ -541,7 +555,7 @@ void inet_csk_destroy_sock(struct sock * + + sk_refcnt_debug_release(sk); + +- atomic_dec(sk->sk_prot->orphan_count); ++ ub_dec_orphan_count(sk); + sock_put(sk); + } + +@@ -621,7 +635,7 @@ void inet_csk_listen_stop(struct sock *s + + sock_orphan(child); + +- atomic_inc(sk->sk_prot->orphan_count); ++ ub_inc_orphan_count(sk); + + inet_csk_destroy_sock(child); + +diff -upr linux-2.6.16.orig/net/ipv4/inet_diag.c linux-2.6.16-026test015/net/ipv4/inet_diag.c +--- linux-2.6.16.orig/net/ipv4/inet_diag.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/inet_diag.c 2006-07-04 14:41:38.000000000 +0400 +@@ -673,7 +673,9 @@ static int inet_diag_dump(struct sk_buff + struct inet_diag_req *r = NLMSG_DATA(cb->nlh); + const struct inet_diag_handler *handler; + struct inet_hashinfo *hashinfo; ++ struct ve_struct *ve; + ++ ve = get_exec_env(); + handler = inet_diag_table[cb->nlh->nlmsg_type]; + BUG_ON(handler == NULL); + hashinfo = handler->idiag_hashinfo; +@@ -694,6 +696,8 @@ static int inet_diag_dump(struct sk_buff + sk_for_each(sk, node, &hashinfo->listening_hash[i]) { + struct 
inet_sock *inet = inet_sk(sk); + ++ if (!ve_accessible(VE_OWNER_SK(sk), ve)) ++ continue; + if (num < s_num) { + num++; + continue; +@@ -754,6 +758,8 @@ skip_listen_ht: + sk_for_each(sk, node, &head->chain) { + struct inet_sock *inet = inet_sk(sk); + ++ if (!ve_accessible(VE_OWNER_SK(sk), ve)) ++ continue; + if (num < s_num) + goto next_normal; + if (!(r->idiag_states & (1 << sk->sk_state))) +@@ -778,6 +784,8 @@ next_normal: + inet_twsk_for_each(tw, node, + &hashinfo->ehash[i + hashinfo->ehash_size].chain) { + ++ if (!ve_accessible_veid(inet_twsk(sk)->tw_owner_env, VEID(ve))) ++ continue; + if (num < s_num) + goto next_dying; + if (r->id.idiag_sport != tw->tw_sport && +diff -upr linux-2.6.16.orig/net/ipv4/inet_hashtables.c linux-2.6.16-026test015/net/ipv4/inet_hashtables.c +--- linux-2.6.16.orig/net/ipv4/inet_hashtables.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/inet_hashtables.c 2006-07-04 14:41:38.000000000 +0400 +@@ -30,7 +30,8 @@ + */ + struct inet_bind_bucket *inet_bind_bucket_create(kmem_cache_t *cachep, + struct inet_bind_hashbucket *head, +- const unsigned short snum) ++ const unsigned short snum, ++ struct ve_struct *ve) + { + struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, SLAB_ATOMIC); + +@@ -38,6 +39,7 @@ struct inet_bind_bucket *inet_bind_bucke + tb->port = snum; + tb->fastreuse = 0; + INIT_HLIST_HEAD(&tb->owners); ++ SET_VE_OWNER_TB(tb, ve); + hlist_add_head(&tb->node, &head->chain); + } + return tb; +@@ -71,10 +73,13 @@ EXPORT_SYMBOL(inet_bind_hash); + */ + static void __inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk) + { +- const int bhash = inet_bhashfn(inet_sk(sk)->num, hashinfo->bhash_size); +- struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash]; ++ int bhash; ++ struct inet_bind_hashbucket *head; + struct inet_bind_bucket *tb; + ++ bhash = inet_bhashfn(inet_sk(sk)->num, hashinfo->bhash_size, ++ VEID(VE_OWNER_SK(sk))); ++ head = &hashinfo->bhash[bhash]; + spin_lock(&head->lock); + tb 
= inet_csk(sk)->icsk_bind_hash; + __sk_del_bind_node(sk); +@@ -130,7 +135,8 @@ EXPORT_SYMBOL(inet_listen_wlock); + * wildcarded during the search since they can never be otherwise. + */ + struct sock *__inet_lookup_listener(const struct hlist_head *head, const u32 daddr, +- const unsigned short hnum, const int dif) ++ const unsigned short hnum, const int dif, ++ struct ve_struct *env) + { + struct sock *result = NULL, *sk; + const struct hlist_node *node; +@@ -139,6 +145,8 @@ struct sock *__inet_lookup_listener(cons + sk_for_each(sk, node, head) { + const struct inet_sock *inet = inet_sk(sk); + ++ if (!ve_accessible_strict(VE_OWNER_SK(sk), env)) ++ continue; + if (inet->num == hnum && !ipv6_only_sock(sk)) { + const __u32 rcv_saddr = inet->rcv_saddr; + int score = sk->sk_family == PF_INET ? 1 : 0; +@@ -169,7 +177,8 @@ EXPORT_SYMBOL_GPL(__inet_lookup_listener + /* called with local bh disabled */ + static int __inet_check_established(struct inet_timewait_death_row *death_row, + struct sock *sk, __u16 lport, +- struct inet_timewait_sock **twp) ++ struct inet_timewait_sock **twp, ++ struct ve_struct *ve) + { + struct inet_hashinfo *hinfo = death_row->hashinfo; + struct inet_sock *inet = inet_sk(sk); +@@ -178,12 +187,15 @@ static int __inet_check_established(stru + int dif = sk->sk_bound_dev_if; + INET_ADDR_COOKIE(acookie, saddr, daddr) + const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport); +- unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport); +- struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); ++ unsigned int hash; ++ struct inet_ehash_bucket *head; + struct sock *sk2; + const struct hlist_node *node; + struct inet_timewait_sock *tw; + ++ hash = inet_ehashfn(daddr, lport, saddr, inet->dport, VEID(ve)); ++ head = inet_ehash_bucket(hinfo, hash); ++ + prefetch(head->chain.first); + write_lock(&head->lock); + +@@ -191,7 +203,8 @@ static int __inet_check_established(stru + sk_for_each(sk2, node, &(head + hinfo->ehash_size)->chain) { 
+ tw = inet_twsk(sk2); + +- if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) { ++ if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ++ ports, dif, ve)) { + if (twsk_unique(sk, sk2, twp)) + goto unique; + else +@@ -202,7 +215,8 @@ static int __inet_check_established(stru + + /* And established part... */ + sk_for_each(sk2, node, &head->chain) { +- if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) ++ if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ++ ports, dif, ve)) + goto not_unique; + } + +@@ -253,7 +267,9 @@ int inet_hash_connect(struct inet_timewa + struct inet_bind_hashbucket *head; + struct inet_bind_bucket *tb; + int ret; ++ struct ve_struct *ve; + ++ ve = VE_OWNER_SK(sk); + if (!snum) { + int low = sysctl_local_port_range[0]; + int high = sysctl_local_port_range[1]; +@@ -268,7 +284,8 @@ int inet_hash_connect(struct inet_timewa + local_bh_disable(); + for (i = 1; i <= range; i++) { + port = low + (i + offset) % range; +- head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)]; ++ head = &hinfo->bhash[inet_bhashfn(port, ++ hinfo->bhash_size, VEID(ve))]; + spin_lock(&head->lock); + + /* Does not bother with rcv_saddr checks, +@@ -282,13 +299,14 @@ int inet_hash_connect(struct inet_timewa + goto next_port; + if (!__inet_check_established(death_row, + sk, port, +- &tw)) ++ &tw, ve)) + goto ok; + goto next_port; + } + } + +- tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, head, port); ++ tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, ++ head, port, ve); + if (!tb) { + spin_unlock(&head->lock); + break; +@@ -323,7 +341,7 @@ ok: + goto out; + } + +- head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)]; ++ head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size, VEID(ve))]; + tb = inet_csk(sk)->icsk_bind_hash; + spin_lock_bh(&head->lock); + if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { +@@ -333,7 +351,7 @@ ok: + } else { + spin_unlock(&head->lock); + /* No definite answer... 
Walk to established hash table */ +- ret = __inet_check_established(death_row, sk, snum, NULL); ++ ret = __inet_check_established(death_row, sk, snum, NULL, ve); + out: + local_bh_enable(); + return ret; +diff -upr linux-2.6.16.orig/net/ipv4/inet_timewait_sock.c linux-2.6.16-026test015/net/ipv4/inet_timewait_sock.c +--- linux-2.6.16.orig/net/ipv4/inet_timewait_sock.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/inet_timewait_sock.c 2006-07-04 14:41:38.000000000 +0400 +@@ -32,7 +32,8 @@ void __inet_twsk_kill(struct inet_timewa + write_unlock(&ehead->lock); + + /* Disassociate with bind bucket. */ +- bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, hashinfo->bhash_size)]; ++ bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, ++ hashinfo->bhash_size, tw->tw_owner_env)]; + spin_lock(&bhead->lock); + tb = tw->tw_tb; + __hlist_del(&tw->tw_bind_node); +@@ -66,7 +67,8 @@ void __inet_twsk_hashdance(struct inet_t + Note, that any socket with inet->num != 0 MUST be bound in + binding cache, even if it is closed. 
+ */ +- bhead = &hashinfo->bhash[inet_bhashfn(inet->num, hashinfo->bhash_size)]; ++ bhead = &hashinfo->bhash[inet_bhashfn(inet->num, ++ hashinfo->bhash_size, tw->tw_owner_env)]; + spin_lock(&bhead->lock); + tw->tw_tb = icsk->icsk_bind_hash; + BUG_TRAP(icsk->icsk_bind_hash); +@@ -90,9 +92,14 @@ EXPORT_SYMBOL_GPL(__inet_twsk_hashdance) + + struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state) + { +- struct inet_timewait_sock *tw = +- kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab, +- SLAB_ATOMIC); ++ struct user_beancounter *ub; ++ struct inet_timewait_sock *tw; ++ ++ ub = set_exec_ub(sock_bc(sk)->ub); ++ tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab, ++ SLAB_ATOMIC); ++ (void)set_exec_ub(ub); ++ + if (tw != NULL) { + const struct inet_sock *inet = inet_sk(sk); + +diff -upr linux-2.6.16.orig/net/ipv4/ip_forward.c linux-2.6.16-026test015/net/ipv4/ip_forward.c +--- linux-2.6.16.orig/net/ipv4/ip_forward.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/ip_forward.c 2006-07-04 14:41:38.000000000 +0400 +@@ -87,6 +87,24 @@ int ip_forward(struct sk_buff *skb) + if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway) + goto sr_failed; + ++ /* ++ * We try to optimize forwarding of VE packets: ++ * do not decrement TTL (and so save skb_cow) ++ * during forwarding of outgoing pkts from VE. ++ * For incoming pkts we still do ttl decr, ++ * since such skb is not cloned and does not require ++ * actual cow. So, there is at least one place ++ * in pkts path with mandatory ttl decr, that is ++ * sufficient to prevent routing loops. ++ */ ++ iph = skb->nh.iph; ++ if ( ++#ifdef CONFIG_IP_ROUTE_NAT ++ (rt->rt_flags & RTCF_NAT) == 0 && /* no NAT mangling expected */ ++#endif /* and */ ++ (skb->dev->features & NETIF_F_VENET)) /* src is VENET device */ ++ goto no_ttl_decr; ++ + /* We are about to mangle packet. Copy it! 
*/ + if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+rt->u.dst.header_len)) + goto drop; +@@ -95,6 +113,8 @@ int ip_forward(struct sk_buff *skb) + /* Decrease ttl after skb cow done */ + ip_decrease_ttl(iph); + ++no_ttl_decr: ++ + /* + * We now generate an ICMP HOST REDIRECT giving the route + * we calculated. +diff -upr linux-2.6.16.orig/net/ipv4/ip_fragment.c linux-2.6.16-026test015/net/ipv4/ip_fragment.c +--- linux-2.6.16.orig/net/ipv4/ip_fragment.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/ip_fragment.c 2006-07-04 14:41:38.000000000 +0400 +@@ -44,6 +44,7 @@ + #include <linux/udp.h> + #include <linux/inet.h> + #include <linux/netfilter_ipv4.h> ++#include <linux/ve_owner.h> + + /* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6 + * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c +@@ -97,8 +98,12 @@ struct ipq { + int iif; + unsigned int rid; + struct inet_peer *peer; ++ struct ve_struct *owner_env; + }; + ++DCL_VE_OWNER_PROTO(IPQ, struct ipq, owner_env) ++DCL_VE_OWNER(IPQ, struct ipq, owner_env) ++ + /* Hash table. */ + + #define IPQ_HASHSZ 64 +@@ -182,7 +187,8 @@ static __inline__ void frag_free_queue(s + + static __inline__ struct ipq *frag_alloc_queue(void) + { +- struct ipq *qp = kmalloc(sizeof(struct ipq), GFP_ATOMIC); ++ struct ipq *qp = kmalloc(sizeof(struct ipq) + sizeof(void *), ++ GFP_ATOMIC); + + if(!qp) + return NULL; +@@ -278,6 +284,9 @@ static void ip_evictor(void) + static void ip_expire(unsigned long arg) + { + struct ipq *qp = (struct ipq *) arg; ++ struct ve_struct *envid; ++ ++ envid = set_exec_env(VE_OWNER_IPQ(qp)); + + spin_lock(&qp->lock); + +@@ -300,6 +309,8 @@ static void ip_expire(unsigned long arg) + out: + spin_unlock(&qp->lock); + ipq_put(qp, NULL); ++ ++ (void)set_exec_env(envid); + } + + /* Creation primitives. 
*/ +@@ -321,7 +332,8 @@ static struct ipq *ip_frag_intern(unsign + qp->saddr == qp_in->saddr && + qp->daddr == qp_in->daddr && + qp->protocol == qp_in->protocol && +- qp->user == qp_in->user) { ++ qp->user == qp_in->user && ++ qp->owner_env == get_exec_env()) { + atomic_inc(&qp->refcnt); + write_unlock(&ipfrag_lock); + qp_in->last_in |= COMPLETE; +@@ -371,6 +383,8 @@ static struct ipq *ip_frag_create(unsign + spin_lock_init(&qp->lock); + atomic_set(&qp->refcnt, 1); + ++ SET_VE_OWNER_IPQ(qp, get_exec_env()); ++ + return ip_frag_intern(hash, qp); + + out_nomem: +@@ -397,7 +411,8 @@ static inline struct ipq *ip_find(struct + qp->saddr == saddr && + qp->daddr == daddr && + qp->protocol == protocol && +- qp->user == user) { ++ qp->user == user && ++ qp->owner_env == get_exec_env()) { + atomic_inc(&qp->refcnt); + read_unlock(&ipfrag_lock); + return qp; +@@ -719,6 +734,9 @@ struct sk_buff *ip_defrag(struct sk_buff + qp->meat == qp->len) + ret = ip_frag_reasm(qp, dev); + ++ if (ret) ++ SET_VE_OWNER_SKB(ret, VE_OWNER_SKB(skb)); ++ + spin_unlock(&qp->lock); + ipq_put(qp, NULL); + return ret; +@@ -729,6 +747,51 @@ struct sk_buff *ip_defrag(struct sk_buff + return NULL; + } + ++#ifdef CONFIG_VE ++/* XXX */ ++void ip_fragment_cleanup(struct ve_struct *envid) ++{ ++ int i, progress; ++ ++ /* All operations with fragment queues are performed from NET_RX/TX ++ * soft interrupts or from timer context. 
--Den */ ++ local_bh_disable(); ++ do { ++ progress = 0; ++ for (i = 0; i < IPQ_HASHSZ; i++) { ++ struct ipq *qp; ++ struct hlist_node *p, *n; ++ ++ if (hlist_empty(&ipq_hash[i])) ++ continue; ++inner_restart: ++ read_lock(&ipfrag_lock); ++ hlist_for_each_entry_safe(qp, p, n, ++ &ipq_hash[i], list) { ++ if (!ve_accessible_strict( ++ VE_OWNER_IPQ(qp), ++ envid)) ++ continue; ++ atomic_inc(&qp->refcnt); ++ read_unlock(&ipfrag_lock); ++ ++ spin_lock(&qp->lock); ++ if (!(qp->last_in&COMPLETE)) ++ ipq_kill(qp); ++ spin_unlock(&qp->lock); ++ ++ ipq_put(qp, NULL); ++ progress = 1; ++ goto inner_restart; ++ } ++ read_unlock(&ipfrag_lock); ++ } ++ } while(progress); ++ local_bh_enable(); ++} ++EXPORT_SYMBOL(ip_fragment_cleanup); ++#endif ++ + void ipfrag_init(void) + { + ipfrag_hash_rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^ +diff -upr linux-2.6.16.orig/net/ipv4/ip_output.c linux-2.6.16-026test015/net/ipv4/ip_output.c +--- linux-2.6.16.orig/net/ipv4/ip_output.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/ip_output.c 2006-07-04 14:41:37.000000000 +0400 +@@ -86,8 +86,6 @@ + + int sysctl_ip_default_ttl = IPDEFTTL; + +-static int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)); +- + /* Generate a checksum for an outgoing IP datagram. */ + __inline__ void ip_send_check(struct iphdr *iph) + { +@@ -421,7 +419,7 @@ static void ip_copy_metadata(struct sk_b + * single device frame, and queue such a frame for sending. 
+ */ + +-static int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) ++int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) + { + struct iphdr *iph; + int raw = 0; +@@ -673,6 +671,8 @@ fail: + return err; + } + ++EXPORT_SYMBOL(ip_fragment); ++ + int + ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) + { +@@ -1249,11 +1249,7 @@ int ip_push_pending_frames(struct sock * + iph->tos = inet->tos; + iph->tot_len = htons(skb->len); + iph->frag_off = df; +- if (!df) { +- __ip_select_ident(iph, &rt->u.dst, 0); +- } else { +- iph->id = htons(inet->id++); +- } ++ ip_select_ident(iph, &rt->u.dst, sk); + iph->ttl = ttl; + iph->protocol = sk->sk_protocol; + iph->saddr = rt->rt_src; +@@ -1340,12 +1336,13 @@ void ip_send_reply(struct sock *sk, stru + char data[40]; + } replyopts; + struct ipcm_cookie ipc; +- u32 daddr; ++ u32 saddr, daddr; + struct rtable *rt = (struct rtable*)skb->dst; + + if (ip_options_echo(&replyopts.opt, skb)) + return; + ++ saddr = skb->nh.iph->daddr; + daddr = ipc.addr = rt->rt_src; + ipc.opt = NULL; + +@@ -1359,7 +1356,7 @@ void ip_send_reply(struct sock *sk, stru + { + struct flowi fl = { .nl_u = { .ip4_u = + { .daddr = daddr, +- .saddr = rt->rt_spec_dst, ++ .saddr = saddr, + .tos = RT_TOS(skb->nh.iph->tos) } }, + /* Not quite clean, but right. 
*/ + .uli_u = { .ports = +diff -upr linux-2.6.16.orig/net/ipv4/ipmr.c linux-2.6.16-026test015/net/ipv4/ipmr.c +--- linux-2.6.16.orig/net/ipv4/ipmr.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/ipmr.c 2006-07-04 14:41:38.000000000 +0400 +@@ -837,7 +837,7 @@ static void mrtsock_destruct(struct sock + { + rtnl_lock(); + if (sk == mroute_socket) { +- ipv4_devconf.mc_forwarding--; ++ ve_ipv4_devconf.mc_forwarding--; + + write_lock_bh(&mrt_lock); + mroute_socket=NULL; +@@ -888,7 +888,7 @@ int ip_mroute_setsockopt(struct sock *sk + mroute_socket=sk; + write_unlock_bh(&mrt_lock); + +- ipv4_devconf.mc_forwarding++; ++ ve_ipv4_devconf.mc_forwarding++; + } + rtnl_unlock(); + return ret; +diff -upr linux-2.6.16.orig/net/ipv4/ipvs/ip_vs_conn.c linux-2.6.16-026test015/net/ipv4/ipvs/ip_vs_conn.c +--- linux-2.6.16.orig/net/ipv4/ipvs/ip_vs_conn.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/ipvs/ip_vs_conn.c 2006-07-04 14:41:37.000000000 +0400 +@@ -902,7 +902,8 @@ int ip_vs_conn_init(void) + /* Allocate ip_vs_conn slab cache */ + ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn", + sizeof(struct ip_vs_conn), 0, +- SLAB_HWCACHE_ALIGN, NULL, NULL); ++ SLAB_HWCACHE_ALIGN | SLAB_UBC, ++ NULL, NULL); + if (!ip_vs_conn_cachep) { + vfree(ip_vs_conn_tab); + return -ENOMEM; +diff -upr linux-2.6.16.orig/net/ipv4/ipvs/ip_vs_core.c linux-2.6.16-026test015/net/ipv4/ipvs/ip_vs_core.c +--- linux-2.6.16.orig/net/ipv4/ipvs/ip_vs_core.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/ipvs/ip_vs_core.c 2006-07-04 14:41:38.000000000 +0400 +@@ -952,6 +952,10 @@ ip_vs_in(unsigned int hooknum, struct sk + * Big tappo: only PACKET_HOST (neither loopback nor mcasts) + * ... don't know why 1st test DOES NOT include 2nd (?) + */ ++ /* ++ * VZ: the question above is right. ++ * The second test is superfluous. 
++ */ + if (unlikely(skb->pkt_type != PACKET_HOST + || skb->dev == &loopback_dev || skb->sk)) { + IP_VS_DBG(12, "packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n", +diff -upr linux-2.6.16.orig/net/ipv4/netfilter/arp_tables.c linux-2.6.16-026test015/net/ipv4/netfilter/arp_tables.c +--- linux-2.6.16.orig/net/ipv4/netfilter/arp_tables.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/netfilter/arp_tables.c 2006-07-04 14:41:36.000000000 +0400 +@@ -941,7 +941,7 @@ static int do_add_counters(void __user * + + write_lock_bh(&t->lock); + private = t->private; +- if (private->number != paddc->num_counters) { ++ if (private->number != tmp.num_counters) { + ret = -EINVAL; + goto unlock_up_free; + } +diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_core.c linux-2.6.16-026test015/net/ipv4/netfilter/ip_conntrack_core.c +--- linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_core.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/netfilter/ip_conntrack_core.c 2006-07-04 14:41:39.000000000 +0400 +@@ -49,6 +49,7 @@ + #include <linux/netfilter_ipv4/ip_conntrack_helper.h> + #include <linux/netfilter_ipv4/ip_conntrack_core.h> + #include <linux/netfilter_ipv4/listhelp.h> ++#include <ub/ub_mem.h> + + #define IP_CONNTRACK_VERSION "2.4" + +@@ -60,22 +61,41 @@ + + DEFINE_RWLOCK(ip_conntrack_lock); + +-/* ip_conntrack_standalone needs this */ +-atomic_t ip_conntrack_count = ATOMIC_INIT(0); ++#ifdef CONFIG_VE_IPTABLES ++#define ve_ip_conntrack_helpers \ ++ (get_exec_env()->_ip_conntrack->_ip_conntrack_helpers) ++#define ve_ip_conntrack_max \ ++ (get_exec_env()->_ip_conntrack->_ip_conntrack_max) ++#define ve_ip_conntrack_count \ ++ (get_exec_env()->_ip_conntrack->_ip_conntrack_count) ++#define ve_ip_conntrack_unconfirmed \ ++ (get_exec_env()->_ip_conntrack->_ip_conntrack_unconfirmed) ++#else + + void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL; + LIST_HEAD(ip_conntrack_expect_list); + struct 
ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO]; + static LIST_HEAD(helpers); ++struct list_head *ip_conntrack_hash; ++static LIST_HEAD(unconfirmed); ++#define ve_ip_conntrack_count ip_conntrack_count ++#define ve_ip_conntrack_helpers helpers ++#define ve_ip_conntrack_max ip_conntrack_max ++#define ve_ip_conntrack_unconfirmed unconfirmed ++#endif ++ ++/* ip_conntrack_standalone needs this */ ++atomic_t ip_conntrack_count = ATOMIC_INIT(0); ++ + unsigned int ip_conntrack_htable_size = 0; + int ip_conntrack_max; +-struct list_head *ip_conntrack_hash; + static kmem_cache_t *ip_conntrack_cachep __read_mostly; + static kmem_cache_t *ip_conntrack_expect_cachep __read_mostly; + struct ip_conntrack ip_conntrack_untracked; + unsigned int ip_ct_log_invalid; +-static LIST_HEAD(unconfirmed); ++#ifndef CONFIG_VE_IPTABLES + static int ip_conntrack_vmalloc; ++#endif + + static unsigned int ip_conntrack_next_id = 1; + static unsigned int ip_conntrack_expect_next_id = 1; +@@ -105,6 +125,9 @@ void ip_ct_deliver_cached_events(const s + { + struct ip_conntrack_ecache *ecache; + ++ if (!ve_is_super(get_exec_env())) ++ return; ++ + local_bh_disable(); + ecache = &__get_cpu_var(ip_conntrack_ecache); + if (ecache->ct == ct) +@@ -133,6 +156,9 @@ static void ip_ct_event_cache_flush(void + struct ip_conntrack_ecache *ecache; + int cpu; + ++ if (!ve_is_super(get_exec_env())) ++ return; ++ + for_each_cpu(cpu) { + ecache = &per_cpu(ip_conntrack_ecache, cpu); + if (ecache->ct) +@@ -226,7 +252,7 @@ __ip_conntrack_expect_find(const struct + { + struct ip_conntrack_expect *i; + +- list_for_each_entry(i, &ip_conntrack_expect_list, list) { ++ list_for_each_entry(i, &ve_ip_conntrack_expect_list, list) { + if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) { + atomic_inc(&i->use); + return i; +@@ -255,7 +281,7 @@ find_expectation(const struct ip_conntra + { + struct ip_conntrack_expect *i; + +- list_for_each_entry(i, &ip_conntrack_expect_list, list) { ++ list_for_each_entry(i, 
&ve_ip_conntrack_expect_list, list) { + /* If master is not in hash table yet (ie. packet hasn't left + this machine yet), how can other end know about expected? + Hence these are not the droids you are looking for (if +@@ -284,7 +310,7 @@ void ip_ct_remove_expectations(struct ip + if (ct->expecting == 0) + return; + +- list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) { ++ list_for_each_entry_safe(i, tmp, &ve_ip_conntrack_expect_list, list) { + if (i->master == ct && del_timer(&i->timeout)) { + ip_ct_unlink_expect(i); + ip_conntrack_expect_put(i); +@@ -302,8 +328,10 @@ clean_from_lists(struct ip_conntrack *ct + + ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); +- LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]); +- LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]); ++ LIST_DELETE(&ve_ip_conntrack_hash[ho], ++ &ct->tuplehash[IP_CT_DIR_ORIGINAL]); ++ LIST_DELETE(&ve_ip_conntrack_hash[hr], ++ &ct->tuplehash[IP_CT_DIR_REPLY]); + + /* Destroy all pending expectations */ + ip_ct_remove_expectations(ct); +@@ -329,8 +357,8 @@ destroy_conntrack(struct nf_conntrack *n + if (proto && proto->destroy) + proto->destroy(ct); + +- if (ip_conntrack_destroyed) +- ip_conntrack_destroyed(ct); ++ if (ve_ip_conntrack_destroyed) ++ ve_ip_conntrack_destroyed(ct); + + write_lock_bh(&ip_conntrack_lock); + /* Expectations will have been removed in clean_from_lists, +@@ -358,7 +386,11 @@ destroy_conntrack(struct nf_conntrack *n + static void death_by_timeout(unsigned long ul_conntrack) + { + struct ip_conntrack *ct = (void *)ul_conntrack; ++#ifdef CONFIG_VE_IPTABLES ++ struct ve_struct *old; + ++ old = set_exec_env(VE_OWNER_CT(ct)); ++#endif + write_lock_bh(&ip_conntrack_lock); + /* Inside lock so preempt is disabled on module removal path. + * Otherwise we can get spurious warnings. 
*/ +@@ -366,6 +398,9 @@ static void death_by_timeout(unsigned lo + clean_from_lists(ct); + write_unlock_bh(&ip_conntrack_lock); + ip_conntrack_put(ct); ++#ifdef CONFIG_VE_IPTABLES ++ (void)set_exec_env(old); ++#endif + } + + static inline int +@@ -386,7 +421,7 @@ __ip_conntrack_find(const struct ip_conn + unsigned int hash = hash_conntrack(tuple); + + ASSERT_READ_LOCK(&ip_conntrack_lock); +- list_for_each_entry(h, &ip_conntrack_hash[hash], list) { ++ list_for_each_entry(h, &ve_ip_conntrack_hash[hash], list) { + if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) { + CONNTRACK_STAT_INC(found); + return h; +@@ -418,9 +453,9 @@ static void __ip_conntrack_hash_insert(s + unsigned int repl_hash) + { + ct->id = ++ip_conntrack_next_id; +- list_prepend(&ip_conntrack_hash[hash], ++ list_prepend(&ve_ip_conntrack_hash[hash], + &ct->tuplehash[IP_CT_DIR_ORIGINAL].list); +- list_prepend(&ip_conntrack_hash[repl_hash], ++ list_prepend(&ve_ip_conntrack_hash[repl_hash], + &ct->tuplehash[IP_CT_DIR_REPLY].list); + } + +@@ -471,11 +506,11 @@ __ip_conntrack_confirm(struct sk_buff ** + /* See if there's one in the list already, including reverse: + NAT could have grabbed it without realizing, since we're + not in the hash. If there is, we lost race. 
*/ +- if (!LIST_FIND(&ip_conntrack_hash[hash], ++ if (!LIST_FIND(&ve_ip_conntrack_hash[hash], + conntrack_tuple_cmp, + struct ip_conntrack_tuple_hash *, + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL) +- && !LIST_FIND(&ip_conntrack_hash[repl_hash], ++ && !LIST_FIND(&ve_ip_conntrack_hash[repl_hash], + conntrack_tuple_cmp, + struct ip_conntrack_tuple_hash *, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) { +@@ -569,7 +604,7 @@ static inline int helper_cmp(const struc + static struct ip_conntrack_helper * + __ip_conntrack_helper_find( const struct ip_conntrack_tuple *tuple) + { +- return LIST_FIND(&helpers, helper_cmp, ++ return LIST_FIND(&ve_ip_conntrack_helpers, helper_cmp, + struct ip_conntrack_helper *, + tuple); + } +@@ -605,7 +640,7 @@ void ip_conntrack_helper_put(struct ip_c + struct ip_conntrack_protocol * + __ip_conntrack_proto_find(u_int8_t protocol) + { +- return ip_ct_protos[protocol]; ++ return ve_ip_ct_protos[protocol]; + } + + /* this is guaranteed to always return a valid protocol helper, since +@@ -632,29 +667,32 @@ void ip_conntrack_proto_put(struct ip_co + } + + struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig, +- struct ip_conntrack_tuple *repl) ++ struct ip_conntrack_tuple *repl, struct user_beancounter *ub) + { + struct ip_conntrack *conntrack; ++ struct user_beancounter *old_ub; + + if (!ip_conntrack_hash_rnd_initted) { + get_random_bytes(&ip_conntrack_hash_rnd, 4); + ip_conntrack_hash_rnd_initted = 1; + } + +- if (ip_conntrack_max +- && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) { ++ if (ve_ip_conntrack_max ++ && atomic_read(&ve_ip_conntrack_count) >= ve_ip_conntrack_max) { + unsigned int hash = hash_conntrack(orig); + /* Try dropping from this hash chain. 
*/ +- if (!early_drop(&ip_conntrack_hash[hash])) { ++ if (!early_drop(&ve_ip_conntrack_hash[hash])) { + if (net_ratelimit()) +- printk(KERN_WARNING +- "ip_conntrack: table full, dropping" +- " packet.\n"); ++ ve_printk(VE_LOG_BOTH, KERN_WARNING ++ "ip_conntrack: VPS %d: table full, dropping" ++ " packet.\n", VEID(get_exec_env())); + return ERR_PTR(-ENOMEM); + } + } + ++ old_ub = set_exec_ub(ub); + conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC); ++ (void)set_exec_ub(old_ub); + if (!conntrack) { + DEBUGP("Can't allocate conntrack.\n"); + return ERR_PTR(-ENOMEM); +@@ -669,8 +707,11 @@ struct ip_conntrack *ip_conntrack_alloc( + init_timer(&conntrack->timeout); + conntrack->timeout.data = (unsigned long)conntrack; + conntrack->timeout.function = death_by_timeout; ++#ifdef CONFIG_VE_IPTABLES ++ SET_VE_OWNER_CT(conntrack, get_exec_env()); ++#endif + +- atomic_inc(&ip_conntrack_count); ++ atomic_inc(&ve_ip_conntrack_count); + + return conntrack; + } +@@ -678,7 +719,7 @@ struct ip_conntrack *ip_conntrack_alloc( + void + ip_conntrack_free(struct ip_conntrack *conntrack) + { +- atomic_dec(&ip_conntrack_count); ++ atomic_dec(&ve_ip_conntrack_count); + kmem_cache_free(ip_conntrack_cachep, conntrack); + } + +@@ -692,13 +733,22 @@ init_conntrack(struct ip_conntrack_tuple + struct ip_conntrack *conntrack; + struct ip_conntrack_tuple repl_tuple; + struct ip_conntrack_expect *exp; ++ struct user_beancounter *ub; + + if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) { + DEBUGP("Can't invert tuple.\n"); + return NULL; + } + +- conntrack = ip_conntrack_alloc(tuple, &repl_tuple); ++#ifdef CONFIG_USER_RESOURCE ++ if (skb->dev != NULL) /* received skb */ ++ ub = netdev_bc(skb->dev)->exec_ub; ++ else if (skb->sk != NULL) /* sent skb */ ++ ub = sock_bc(skb->sk)->ub; ++ else ++#endif ++ ub = NULL; ++ conntrack = ip_conntrack_alloc(tuple, &repl_tuple, ub); + if (conntrack == NULL || IS_ERR(conntrack)) + return (struct ip_conntrack_tuple_hash *)conntrack; + +@@ -733,7 
+783,8 @@ init_conntrack(struct ip_conntrack_tuple + } + + /* Overload tuple linked list to put us in unconfirmed list. */ +- list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed); ++ list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, ++ &ve_ip_conntrack_unconfirmed); + + write_unlock_bh(&ip_conntrack_lock); + +@@ -925,7 +976,7 @@ void ip_conntrack_unexpect_related(struc + + write_lock_bh(&ip_conntrack_lock); + /* choose the the oldest expectation to evict */ +- list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) { ++ list_for_each_entry_reverse(i, &ve_ip_conntrack_expect_list, list) { + if (expect_matches(i, exp) && del_timer(&i->timeout)) { + ip_ct_unlink_expect(i); + write_unlock_bh(&ip_conntrack_lock); +@@ -959,11 +1010,11 @@ void ip_conntrack_expect_put(struct ip_c + kmem_cache_free(ip_conntrack_expect_cachep, exp); + } + +-static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp) ++void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp) + { + atomic_inc(&exp->use); + exp->master->expecting++; +- list_add(&exp->list, &ip_conntrack_expect_list); ++ list_add(&exp->list, &ve_ip_conntrack_expect_list); + + init_timer(&exp->timeout); + exp->timeout.data = (unsigned long)exp; +@@ -975,13 +1026,14 @@ static void ip_conntrack_expect_insert(s + atomic_inc(&exp->use); + CONNTRACK_STAT_INC(expect_create); + } ++EXPORT_SYMBOL_GPL(ip_conntrack_expect_insert); + + /* Race with expectations being used means we could have none to find; OK. 
*/ + static void evict_oldest_expect(struct ip_conntrack *master) + { + struct ip_conntrack_expect *i; + +- list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) { ++ list_for_each_entry_reverse(i, &ve_ip_conntrack_expect_list, list) { + if (i->master == master) { + if (del_timer(&i->timeout)) { + ip_ct_unlink_expect(i); +@@ -1012,7 +1064,7 @@ int ip_conntrack_expect_related(struct i + DEBUGP("mask: "); DUMP_TUPLE(&expect->mask); + + write_lock_bh(&ip_conntrack_lock); +- list_for_each_entry(i, &ip_conntrack_expect_list, list) { ++ list_for_each_entry(i, &ve_ip_conntrack_expect_list, list) { + if (expect_matches(i, expect)) { + /* Refresh timer: if it's dying, ignore.. */ + if (refresh_timer(i)) { +@@ -1060,18 +1112,48 @@ int ip_conntrack_helper_register(struct + { + BUG_ON(me->timeout == 0); + write_lock_bh(&ip_conntrack_lock); +- list_prepend(&helpers, me); ++ list_prepend(&ve_ip_conntrack_helpers, me); + write_unlock_bh(&ip_conntrack_lock); + + return 0; + } + ++int virt_ip_conntrack_helper_register(struct ip_conntrack_helper *me) ++{ ++ int ret; ++ struct module *mod = me->me; ++ ++ if (!ve_is_super(get_exec_env())) { ++ struct ip_conntrack_helper *tmp; ++ __module_get(mod); ++ ret = -ENOMEM; ++ tmp = kmalloc(sizeof(struct ip_conntrack_helper), GFP_KERNEL); ++ if (!tmp) ++ goto nomem; ++ memcpy(tmp, me, sizeof(struct ip_conntrack_helper)); ++ me = tmp; ++ } ++ ++ ret = ip_conntrack_helper_register(me); ++ if (ret) ++ goto out; ++ ++ return 0; ++out: ++ if (!ve_is_super(get_exec_env())){ ++ kfree(me); ++nomem: ++ module_put(mod); ++ } ++ return ret; ++} ++ + struct ip_conntrack_helper * + __ip_conntrack_helper_find_byname(const char *name) + { + struct ip_conntrack_helper *h; + +- list_for_each_entry(h, &helpers, list) { ++ list_for_each_entry(h, &ve_ip_conntrack_helpers, list) { + if (!strcmp(h->name, name)) + return h; + } +@@ -1096,19 +1178,20 @@ void ip_conntrack_helper_unregister(stru + + /* Need write lock here, to delete helper. 
*/ + write_lock_bh(&ip_conntrack_lock); +- LIST_DELETE(&helpers, me); ++ LIST_DELETE(&ve_ip_conntrack_helpers, me); + + /* Get rid of expectations */ +- list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) { ++ list_for_each_entry_safe(exp, tmp, &ve_ip_conntrack_expect_list, list) { + if (exp->master->helper == me && del_timer(&exp->timeout)) { + ip_ct_unlink_expect(exp); + ip_conntrack_expect_put(exp); + } + } + /* Get rid of expecteds, set helpers to NULL. */ +- LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me); ++ LIST_FIND_W(&ve_ip_conntrack_unconfirmed, unhelp, ++ struct ip_conntrack_tuple_hash*, me); + for (i = 0; i < ip_conntrack_htable_size; i++) +- LIST_FIND_W(&ip_conntrack_hash[i], unhelp, ++ LIST_FIND_W(&ve_ip_conntrack_hash[i], unhelp, + struct ip_conntrack_tuple_hash *, me); + write_unlock_bh(&ip_conntrack_lock); + +@@ -1116,6 +1199,25 @@ void ip_conntrack_helper_unregister(stru + synchronize_net(); + } + ++void virt_ip_conntrack_helper_unregister(struct ip_conntrack_helper *me) ++{ ++ ++ if (!ve_is_super(get_exec_env())) { ++ read_lock_bh(&ip_conntrack_lock); ++ me = list_named_find(&ve_ip_conntrack_helpers, me->name); ++ read_unlock_bh(&ip_conntrack_lock); ++ if (!me) ++ return; ++ } ++ ++ ip_conntrack_helper_unregister(me); ++ ++ if (!ve_is_super(get_exec_env())) { ++ module_put(me->me); ++ kfree(me); ++ } ++} ++ + /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */ + void __ip_ct_refresh_acct(struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, +@@ -1246,13 +1348,13 @@ get_next_corpse(int (*iter)(struct ip_co + + write_lock_bh(&ip_conntrack_lock); + for (; *bucket < ip_conntrack_htable_size; (*bucket)++) { +- h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter, ++ h = LIST_FIND_W(&ve_ip_conntrack_hash[*bucket], do_iter, + struct ip_conntrack_tuple_hash *, iter, data); + if (h) + break; + } + if (!h) +- h = LIST_FIND_W(&unconfirmed, do_iter, ++ h = 
LIST_FIND_W(&ve_ip_conntrack_unconfirmed, do_iter, + struct ip_conntrack_tuple_hash *, iter, data); + if (h) + atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use); +@@ -1289,6 +1391,11 @@ getorigdst(struct sock *sk, int optval, + struct ip_conntrack_tuple_hash *h; + struct ip_conntrack_tuple tuple; + ++#ifdef CONFIG_VE_IPTABLES ++ if (!get_exec_env()->_ip_conntrack) ++ return -ENOPROTOOPT; ++#endif ++ + IP_CT_TUPLE_U_BLANK(&tuple); + tuple.src.ip = inet->rcv_saddr; + tuple.src.u.tcp.port = inet->sport; +@@ -1318,6 +1425,7 @@ getorigdst(struct sock *sk, int optval, + .tuple.dst.u.tcp.port; + sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.dst.ip; ++ memset(sin.sin_zero, 0, sizeof(sin.sin_zero)); + + DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n", + NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port)); +@@ -1359,12 +1467,17 @@ static void free_conntrack_hash(struct l + get_order(sizeof(struct list_head) * size)); + } + ++static void ip_conntrack_cache_free(void) ++{ ++ kmem_cache_destroy(ip_conntrack_expect_cachep); ++ kmem_cache_destroy(ip_conntrack_cachep); ++ nf_unregister_sockopt(&so_getorigdst); ++} ++ + /* Mishearing the voices in his head, our hero wonders how he's + supposed to kill the mall. */ + void ip_conntrack_cleanup(void) + { +- ip_ct_attach = NULL; +- + /* This makes sure all current packets have passed through + netfilter framework. Roll on, two-stage module + delete... 
*/ +@@ -1373,19 +1486,32 @@ void ip_conntrack_cleanup(void) + ip_ct_event_cache_flush(); + i_see_dead_people: + ip_conntrack_flush(); +- if (atomic_read(&ip_conntrack_count) != 0) { ++ if (atomic_read(&ve_ip_conntrack_count) != 0) { + schedule(); + goto i_see_dead_people; + } +- /* wait until all references to ip_conntrack_untracked are dropped */ +- while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1) +- schedule(); +- +- kmem_cache_destroy(ip_conntrack_cachep); +- kmem_cache_destroy(ip_conntrack_expect_cachep); +- free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc, ++ if (ve_is_super(get_exec_env())) { ++ /* wait until all references to ip_conntrack_untracked are ++ * dropped */ ++ while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1) ++ schedule(); ++ ip_ct_attach = NULL; ++ ip_conntrack_cache_free(); ++ } ++ free_conntrack_hash(ve_ip_conntrack_hash, ve_ip_conntrack_vmalloc, + ip_conntrack_htable_size); +- nf_unregister_sockopt(&so_getorigdst); ++ ve_ip_conntrack_hash = NULL; ++ INIT_LIST_HEAD(&ve_ip_conntrack_unconfirmed); ++ INIT_LIST_HEAD(&ve_ip_conntrack_expect_list); ++ INIT_LIST_HEAD(&ve_ip_conntrack_helpers); ++ atomic_set(&ve_ip_conntrack_count, 0); ++ ve_ip_conntrack_max = 0; ++#ifdef CONFIG_VE_IPTABLES ++ kfree(ve_ip_ct_protos); ++ ve_ip_ct_protos = NULL; ++ kfree(get_exec_env()->_ip_conntrack); ++ get_exec_env()->_ip_conntrack = NULL; ++#endif + } + + static struct list_head *alloc_hashtable(int size, int *vmalloced) +@@ -1394,13 +1520,13 @@ static struct list_head *alloc_hashtable + unsigned int i; + + *vmalloced = 0; +- hash = (void*)__get_free_pages(GFP_KERNEL, ++ hash = (void*)__get_free_pages(GFP_KERNEL_UBC, + get_order(sizeof(struct list_head) + * size)); + if (!hash) { + *vmalloced = 1; + printk(KERN_WARNING"ip_conntrack: falling back to vmalloc.\n"); +- hash = vmalloc(sizeof(struct list_head) * size); ++ hash = ub_vmalloc(sizeof(struct list_head) * size); + } + + if (hash) +@@ -1436,8 +1562,8 @@ static int 
set_hashsize(const char *val, + + write_lock_bh(&ip_conntrack_lock); + for (i = 0; i < ip_conntrack_htable_size; i++) { +- while (!list_empty(&ip_conntrack_hash[i])) { +- h = list_entry(ip_conntrack_hash[i].next, ++ while (!list_empty(&ve_ip_conntrack_hash[i])) { ++ h = list_entry(ve_ip_conntrack_hash[i].next, + struct ip_conntrack_tuple_hash, list); + list_del(&h->list); + bucket = __hash_conntrack(&h->tuple, hashsize, rnd); +@@ -1445,12 +1571,12 @@ static int set_hashsize(const char *val, + } + } + old_size = ip_conntrack_htable_size; +- old_vmalloced = ip_conntrack_vmalloc; +- old_hash = ip_conntrack_hash; ++ old_vmalloced = ve_ip_conntrack_vmalloc; ++ old_hash = ve_ip_conntrack_hash; + + ip_conntrack_htable_size = hashsize; +- ip_conntrack_vmalloc = vmalloced; +- ip_conntrack_hash = hash; ++ ve_ip_conntrack_vmalloc = vmalloced; ++ ve_ip_conntrack_hash = hash; + ip_conntrack_hash_rnd = rnd; + write_unlock_bh(&ip_conntrack_lock); + +@@ -1461,9 +1587,8 @@ static int set_hashsize(const char *val, + module_param_call(hashsize, set_hashsize, param_get_uint, + &ip_conntrack_htable_size, 0600); + +-int __init ip_conntrack_init(void) ++static int ip_conntrack_cache_create(void) + { +- unsigned int i; + int ret; + + /* Idea from tcp.c: use 1/16384 of memory. 
On i386: 32MB +@@ -1477,70 +1602,127 @@ int __init ip_conntrack_init(void) + if (ip_conntrack_htable_size < 16) + ip_conntrack_htable_size = 16; + } +- ip_conntrack_max = 8 * ip_conntrack_htable_size; ++ ve_ip_conntrack_max = 8 * ip_conntrack_htable_size; + + printk("ip_conntrack version %s (%u buckets, %d max)" + " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION, +- ip_conntrack_htable_size, ip_conntrack_max, ++ ip_conntrack_htable_size, ve_ip_conntrack_max, + sizeof(struct ip_conntrack)); + + ret = nf_register_sockopt(&so_getorigdst); + if (ret != 0) { + printk(KERN_ERR "Unable to register netfilter socket option\n"); +- return ret; +- } +- +- ip_conntrack_hash = alloc_hashtable(ip_conntrack_htable_size, +- &ip_conntrack_vmalloc); +- if (!ip_conntrack_hash) { +- printk(KERN_ERR "Unable to create ip_conntrack_hash\n"); +- goto err_unreg_sockopt; ++ goto out_sockopt; + } + ++ ret = -ENOMEM; + ip_conntrack_cachep = kmem_cache_create("ip_conntrack", + sizeof(struct ip_conntrack), 0, +- 0, NULL, NULL); ++ SLAB_UBC, NULL, NULL); + if (!ip_conntrack_cachep) { + printk(KERN_ERR "Unable to create ip_conntrack slab cache\n"); +- goto err_free_hash; ++ goto err_unreg_sockopt; + } + + ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect", + sizeof(struct ip_conntrack_expect), +- 0, 0, NULL, NULL); ++ 0, SLAB_UBC, NULL, NULL); + if (!ip_conntrack_expect_cachep) { + printk(KERN_ERR "Unable to create ip_expect slab cache\n"); + goto err_free_conntrack_slab; + } + ++ return 0; ++ ++err_free_conntrack_slab: ++ kmem_cache_destroy(ip_conntrack_cachep); ++err_unreg_sockopt: ++ nf_unregister_sockopt(&so_getorigdst); ++out_sockopt: ++ return ret; ++} ++ ++int ip_conntrack_init(void) ++{ ++ struct ve_struct *env; ++ unsigned int i; ++ int ret; ++ ++ env = get_exec_env(); ++#ifdef CONFIG_VE_IPTABLES ++ ret = -ENOMEM; ++ env->_ip_conntrack = ++ kmalloc(sizeof(struct ve_ip_conntrack), GFP_KERNEL); ++ if (!env->_ip_conntrack) ++ goto out; ++ memset(env->_ip_conntrack, 0, 
sizeof(struct ve_ip_conntrack)); ++ if (ve_is_super(env)) { ++ ret = ip_conntrack_cache_create(); ++ if (ret) ++ goto cache_fail; ++ } else ++ ve_ip_conntrack_max = 8 * ip_conntrack_htable_size; ++#else /* CONFIG_VE_IPTABLES */ ++ ret = ip_conntrack_cache_create(); ++ if (ret) ++ goto out; ++#endif ++ ++ ret = -ENOMEM; ++ ve_ip_conntrack_hash = alloc_hashtable(ip_conntrack_htable_size, ++ &ve_ip_conntrack_vmalloc); ++ if (!ve_ip_conntrack_hash) { ++ printk(KERN_ERR "Unable to create ip_conntrack_hash\n"); ++ goto err_free_cache; ++ } ++ ++#ifdef CONFIG_VE_IPTABLES ++ ve_ip_ct_protos = (struct ip_conntrack_protocol **) ++ ub_kmalloc(sizeof(void *)*MAX_IP_CT_PROTO, GFP_KERNEL); ++ if (!ve_ip_ct_protos) ++ goto err_free_hash; ++#endif + /* Don't NEED lock here, but good form anyway. */ + write_lock_bh(&ip_conntrack_lock); + for (i = 0; i < MAX_IP_CT_PROTO; i++) +- ip_ct_protos[i] = &ip_conntrack_generic_protocol; ++ ve_ip_ct_protos[i] = &ip_conntrack_generic_protocol; + /* Sew in builtin protocols. 
*/ +- ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp; +- ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp; +- ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp; ++ ve_ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp; ++ ve_ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp; ++ ve_ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp; + write_unlock_bh(&ip_conntrack_lock); + +- /* For use by ipt_REJECT */ +- ip_ct_attach = ip_conntrack_attach; +- +- /* Set up fake conntrack: +- - to never be deleted, not in any hashes */ +- atomic_set(&ip_conntrack_untracked.ct_general.use, 1); +- /* - and look it like as a confirmed connection */ +- set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status); ++ INIT_LIST_HEAD(&ve_ip_conntrack_unconfirmed); ++ INIT_LIST_HEAD(&ve_ip_conntrack_expect_list); ++ INIT_LIST_HEAD(&ve_ip_conntrack_helpers); ++ ++ if (ve_is_super(env)) { ++ /* For use by ipt_REJECT */ ++ ip_ct_attach = ip_conntrack_attach; ++ ++ /* Set up fake conntrack: ++ - to never be deleted, not in any hashes */ ++ atomic_set(&ip_conntrack_untracked.ct_general.use, 1); ++ /* - and look it like as a confirmed connection */ ++ set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status); ++ } + +- return ret; ++ return 0; + +-err_free_conntrack_slab: +- kmem_cache_destroy(ip_conntrack_cachep); ++#ifdef CONFIG_VE_IPTABLES + err_free_hash: +- free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc, ++#endif ++ free_conntrack_hash(ve_ip_conntrack_hash, ve_ip_conntrack_vmalloc, + ip_conntrack_htable_size); +-err_unreg_sockopt: +- nf_unregister_sockopt(&so_getorigdst); +- +- return -ENOMEM; ++ ve_ip_conntrack_hash = NULL; ++err_free_cache: ++ if (ve_is_super(env)) ++ ip_conntrack_cache_free(); ++#ifdef CONFIG_VE_IPTABLES ++cache_fail: ++ kfree(env->_ip_conntrack); ++ env->_ip_conntrack = NULL; ++#endif ++out: ++ return ret; + } +diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_ftp.c 
linux-2.6.16-026test015/net/ipv4/netfilter/ip_conntrack_ftp.c +--- linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_ftp.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/netfilter/ip_conntrack_ftp.c 2006-07-04 14:41:39.000000000 +0400 +@@ -15,6 +15,7 @@ + #include <linux/ctype.h> + #include <net/checksum.h> + #include <net/tcp.h> ++#include <linux/nfcalls.h> + + #include <linux/netfilter_ipv4/ip_conntrack_helper.h> + #include <linux/netfilter_ipv4/ip_conntrack_ftp.h> +@@ -425,8 +426,8 @@ static int help(struct sk_buff **pskb, + + /* Now, NAT might want to mangle the packet, and register the + * (possibly changed) expectation itself. */ +- if (ip_nat_ftp_hook) +- ret = ip_nat_ftp_hook(pskb, ctinfo, search[i].ftptype, ++ if (ve_ip_nat_ftp_hook) ++ ret = ve_ip_nat_ftp_hook(pskb, ctinfo, search[i].ftptype, + matchoff, matchlen, exp, &seq); + else { + /* Can't expect this? Best to drop packet now. */ +@@ -452,16 +453,39 @@ out_update_nl: + static struct ip_conntrack_helper ftp[MAX_PORTS]; + static char ftp_names[MAX_PORTS][sizeof("ftp-65535")]; + +-/* Not __exit: called from init() */ +-static void fini(void) ++void fini_iptable_ftp(void) + { + int i; + for (i = 0; i < ports_c; i++) { + DEBUGP("ip_ct_ftp: unregistering helper for port %d\n", + ports[i]); +- ip_conntrack_helper_unregister(&ftp[i]); ++ virt_ip_conntrack_helper_unregister(&ftp[i]); + } ++} ++ ++int init_iptable_ftp(void) ++{ ++ int i, ret; + ++ for (i = 0; i < ports_c; i++) { ++ DEBUGP("ip_ct_ftp: registering helper for port %d\n", ++ ports[i]); ++ ret = virt_ip_conntrack_helper_register(&ftp[i]); ++ if (ret) { ++ fini_iptable_ftp(); ++ return ret; ++ } ++ } ++ return 0; ++} ++ ++/* Not __exit: called from init() */ ++static void fini(void) ++{ ++ KSYMMODUNRESOLVE(ip_conntrack_ftp); ++ KSYMUNRESOLVE(init_iptable_ftp); ++ KSYMUNRESOLVE(fini_iptable_ftp); ++ fini_iptable_ftp(); + kfree(ftp_buffer); + } + +@@ -496,13 +520,17 @@ static int __init init(void) + + DEBUGP("ip_ct_ftp: 
registering helper for port %d\n", + ports[i]); +- ret = ip_conntrack_helper_register(&ftp[i]); ++ ret = virt_ip_conntrack_helper_register(&ftp[i]); + + if (ret) { + fini(); + return ret; + } + } ++ ++ KSYMRESOLVE(init_iptable_ftp); ++ KSYMRESOLVE(fini_iptable_ftp); ++ KSYMMODRESOLVE(ip_conntrack_ftp); + return 0; + } + +diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_irc.c linux-2.6.16-026test015/net/ipv4/netfilter/ip_conntrack_irc.c +--- linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_irc.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/netfilter/ip_conntrack_irc.c 2006-07-04 14:41:39.000000000 +0400 +@@ -28,6 +28,7 @@ + #include <linux/ip.h> + #include <net/checksum.h> + #include <net/tcp.h> ++#include <linux/nfcalls.h> + + #include <linux/netfilter_ipv4/ip_conntrack_helper.h> + #include <linux/netfilter_ipv4/ip_conntrack_irc.h> +@@ -244,6 +245,33 @@ static char irc_names[MAX_PORTS][sizeof( + + static void fini(void); + ++void fini_iptable_irc(void) ++{ ++ int i; ++ for (i = 0; i < ports_c; i++) { ++ DEBUGP("unregistering port %d\n", ++ ports[i]); ++ virt_ip_conntrack_helper_unregister(&irc_helpers[i]); ++ } ++} ++ ++int init_iptable_irc(void) ++{ ++ int i, ret; ++ ++ for (i = 0; i < ports_c; i++) { ++ DEBUGP("port #%d: %d\n", i, ports[i]); ++ ret = virt_ip_conntrack_helper_register(&irc_helpers[i]); ++ if (ret) { ++ printk("ip_conntrack_irc: ERROR registering port %d\n", ++ ports[i]); ++ fini_iptable_irc(); ++ return -EBUSY; ++ } ++ } ++ return 0; ++} ++ + static int __init init(void) + { + int i, ret; +@@ -283,7 +311,7 @@ static int __init init(void) + + DEBUGP("port #%d: %d\n", i, ports[i]); + +- ret = ip_conntrack_helper_register(hlpr); ++ ret = virt_ip_conntrack_helper_register(hlpr); + + if (ret) { + printk("ip_conntrack_irc: ERROR registering port %d\n", +@@ -292,6 +320,10 @@ static int __init init(void) + return -EBUSY; + } + } ++ ++ KSYMRESOLVE(init_iptable_irc); ++ KSYMRESOLVE(fini_iptable_irc); ++ 
KSYMMODRESOLVE(ip_conntrack_irc); + return 0; + } + +@@ -299,12 +331,10 @@ static int __init init(void) + * it is needed by the init function */ + static void fini(void) + { +- int i; +- for (i = 0; i < ports_c; i++) { +- DEBUGP("unregistering port %d\n", +- ports[i]); +- ip_conntrack_helper_unregister(&irc_helpers[i]); +- } ++ KSYMMODUNRESOLVE(ip_conntrack_irc); ++ KSYMUNRESOLVE(init_iptable_irc); ++ KSYMUNRESOLVE(fini_iptable_irc); ++ fini_iptable_irc(); + kfree(irc_buffer); + } + +diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_netlink.c linux-2.6.16-026test015/net/ipv4/netfilter/ip_conntrack_netlink.c +--- linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_netlink.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/netfilter/ip_conntrack_netlink.c 2006-07-04 14:41:39.000000000 +0400 +@@ -29,6 +29,7 @@ + #include <linux/spinlock.h> + #include <linux/interrupt.h> + #include <linux/notifier.h> ++#include <net/sock.h> + + #include <linux/netfilter.h> + #include <linux/netfilter_ipv4/ip_conntrack.h> +@@ -39,6 +40,8 @@ + + #include <linux/netfilter/nfnetlink.h> + #include <linux/netfilter/nfnetlink_conntrack.h> ++#include <ub/beancounter.h> ++#include <ub/ub_sk.h> + + MODULE_LICENSE("GPL"); + +@@ -403,7 +406,7 @@ ctnetlink_dump_table(struct sk_buff *skb + + read_lock_bh(&ip_conntrack_lock); + for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) { +- list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) { ++ list_for_each_prev(i, &ve_ip_conntrack_hash[cb->args[0]]) { + h = (struct ip_conntrack_tuple_hash *) i; + if (DIRECTION(h) != IP_CT_DIR_ORIGINAL) + continue; +@@ -440,7 +443,7 @@ ctnetlink_dump_table_w(struct sk_buff *s + + write_lock_bh(&ip_conntrack_lock); + for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) { +- list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) { ++ list_for_each_prev(i, &ve_ip_conntrack_hash[cb->args[0]]) { + h = (struct ip_conntrack_tuple_hash *) i; + if 
(DIRECTION(h) != IP_CT_DIR_ORIGINAL) + continue; +@@ -1003,14 +1006,15 @@ ctnetlink_change_conntrack(struct ip_con + static int + ctnetlink_create_conntrack(struct nfattr *cda[], + struct ip_conntrack_tuple *otuple, +- struct ip_conntrack_tuple *rtuple) ++ struct ip_conntrack_tuple *rtuple, ++ struct user_beancounter *ub) + { + struct ip_conntrack *ct; + int err = -EINVAL; + + DEBUGP("entered %s\n", __FUNCTION__); + +- ct = ip_conntrack_alloc(otuple, rtuple); ++ ct = ip_conntrack_alloc(otuple, rtuple, ub); + if (ct == NULL || IS_ERR(ct)) + return -ENOMEM; + +@@ -1087,8 +1091,16 @@ ctnetlink_new_conntrack(struct sock *ctn + write_unlock_bh(&ip_conntrack_lock); + DEBUGP("no such conntrack, create new\n"); + err = -ENOENT; +- if (nlh->nlmsg_flags & NLM_F_CREATE) +- err = ctnetlink_create_conntrack(cda, &otuple, &rtuple); ++ if (nlh->nlmsg_flags & NLM_F_CREATE) { ++#ifdef CONFIG_USER_RESOURCE ++ if (skb->sk) ++ err = ctnetlink_create_conntrack(cda, &otuple, ++ &rtuple, sock_bc(skb->sk)->ub); ++ else ++#endif ++ err = ctnetlink_create_conntrack(cda, ++ &otuple, &rtuple, NULL); ++ } + return err; + } + /* implicit 'else' */ +@@ -1249,7 +1261,7 @@ ctnetlink_exp_dump_table(struct sk_buff + DEBUGP("entered %s, last id=%llu\n", __FUNCTION__, *id); + + read_lock_bh(&ip_conntrack_lock); +- list_for_each_prev(i, &ip_conntrack_expect_list) { ++ list_for_each_prev(i, &ve_ip_conntrack_expect_list) { + exp = (struct ip_conntrack_expect *) i; + if (exp->id <= *id) + continue; +@@ -1395,7 +1407,7 @@ ctnetlink_del_expect(struct sock *ctnl, + write_unlock_bh(&ip_conntrack_lock); + return -EINVAL; + } +- list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, ++ list_for_each_entry_safe(exp, tmp, &ve_ip_conntrack_expect_list, + list) { + if (exp->master->helper == h + && del_timer(&exp->timeout)) { +@@ -1407,7 +1419,7 @@ ctnetlink_del_expect(struct sock *ctnl, + } else { + /* This basically means we have to flush everything*/ + write_lock_bh(&ip_conntrack_lock); +- 
list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, ++ list_for_each_entry_safe(exp, tmp, &ve_ip_conntrack_expect_list, + list) { + if (del_timer(&exp->timeout)) { + ip_ct_unlink_expect(exp); +@@ -1619,7 +1631,7 @@ static void __exit ctnetlink_exit(void) + printk("ctnetlink: unregistering from nfnetlink.\n"); + + #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS +- ip_conntrack_unregister_notifier(&ctnl_notifier_exp); ++ ip_conntrack_expect_unregister_notifier(&ctnl_notifier_exp); + ip_conntrack_unregister_notifier(&ctnl_notifier); + #endif + +diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_proto_generic.c linux-2.6.16-026test015/net/ipv4/netfilter/ip_conntrack_proto_generic.c +--- linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_proto_generic.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/netfilter/ip_conntrack_proto_generic.c 2006-07-04 14:41:39.000000000 +0400 +@@ -52,7 +52,7 @@ static int packet(struct ip_conntrack *c + const struct sk_buff *skb, + enum ip_conntrack_info ctinfo) + { +- ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_generic_timeout); ++ ip_ct_refresh_acct(conntrack, ctinfo, skb, ve_ip_ct_generic_timeout); + return NF_ACCEPT; + } + +diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_proto_icmp.c linux-2.6.16-026test015/net/ipv4/netfilter/ip_conntrack_proto_icmp.c +--- linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_proto_icmp.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/netfilter/ip_conntrack_proto_icmp.c 2006-07-04 14:41:39.000000000 +0400 +@@ -104,7 +104,7 @@ static int icmp_packet(struct ip_conntra + } else { + atomic_inc(&ct->proto.icmp.count); + ip_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); +- ip_ct_refresh_acct(ct, ctinfo, skb, ip_ct_icmp_timeout); ++ ip_ct_refresh_acct(ct, ctinfo, skb, ve_ip_ct_icmp_timeout); + } + + return NF_ACCEPT; +diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_proto_sctp.c 
linux-2.6.16-026test015/net/ipv4/netfilter/ip_conntrack_proto_sctp.c +--- linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_proto_sctp.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/netfilter/ip_conntrack_proto_sctp.c 2006-07-04 14:41:36.000000000 +0400 +@@ -235,12 +235,15 @@ static int do_basic_checks(struct ip_con + flag = 1; + } + +- /* Cookie Ack/Echo chunks not the first OR +- Init / Init Ack / Shutdown compl chunks not the only chunks */ +- if ((sch->type == SCTP_CID_COOKIE_ACK ++ /* ++ * Cookie Ack/Echo chunks not the first OR ++ * Init / Init Ack / Shutdown compl chunks not the only chunks ++ * OR zero-length. ++ */ ++ if (((sch->type == SCTP_CID_COOKIE_ACK + || sch->type == SCTP_CID_COOKIE_ECHO + || flag) +- && count !=0 ) { ++ && count !=0) || !sch->length) { + DEBUGP("Basic checks failed\n"); + return 1; + } +@@ -251,7 +254,7 @@ static int do_basic_checks(struct ip_con + } + + DEBUGP("Basic checks passed\n"); +- return 0; ++ return count == 0; + } + + static int new_state(enum ip_conntrack_dir dir, +diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_proto_tcp.c linux-2.6.16-026test015/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +--- linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_proto_tcp.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/netfilter/ip_conntrack_proto_tcp.c 2006-07-04 14:41:39.000000000 +0400 +@@ -98,7 +98,7 @@ unsigned int ip_ct_tcp_timeout_close = + to ~13-30min depending on RTO. 
*/ + unsigned int ip_ct_tcp_timeout_max_retrans = 5 MINS; + +-static const unsigned int * tcp_timeouts[] ++const unsigned int * tcp_timeouts[] + = { NULL, /* TCP_CONNTRACK_NONE */ + &ip_ct_tcp_timeout_syn_sent, /* TCP_CONNTRACK_SYN_SENT, */ + &ip_ct_tcp_timeout_syn_recv, /* TCP_CONNTRACK_SYN_RECV, */ +@@ -762,7 +762,7 @@ static int tcp_in_window(struct ip_ct_tc + : "SEQ is under the lower bound (already ACKed data retransmitted)" + : "SEQ is over the upper bound (over the window of the receiver)"); + +- res = ip_ct_tcp_be_liberal; ++ res = ve_ip_ct_tcp_be_liberal; + } + + DEBUGP("tcp_in_window: res=%i sender end=%u maxend=%u maxwin=%u " +@@ -1033,9 +1033,11 @@ static int tcp_packet(struct ip_conntrac + && (new_state == TCP_CONNTRACK_FIN_WAIT + || new_state == TCP_CONNTRACK_CLOSE)) + conntrack->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT; +- timeout = conntrack->proto.tcp.retrans >= ip_ct_tcp_max_retrans +- && *tcp_timeouts[new_state] > ip_ct_tcp_timeout_max_retrans +- ? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state]; ++ timeout = conntrack->proto.tcp.retrans >= ve_ip_ct_tcp_max_retrans && ++ ve_ip_ct_tcp_timeouts[new_state] > ++ ve_ip_ct_tcp_timeout_max_retrans ++ ? ve_ip_ct_tcp_timeout_max_retrans : ++ ve_ip_ct_tcp_timeouts[new_state]; + write_unlock_bh(&tcp_lock); + + ip_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); +@@ -1110,7 +1112,7 @@ static int tcp_new(struct ip_conntrack * + conntrack->proto.tcp.seen[1].flags = 0; + conntrack->proto.tcp.seen[0].loose = + conntrack->proto.tcp.seen[1].loose = 0; +- } else if (ip_ct_tcp_loose == 0) { ++ } else if (ve_ip_ct_tcp_loose == 0) { + /* Don't try to pick up connections. 
*/ + return 0; + } else { +@@ -1134,7 +1136,7 @@ static int tcp_new(struct ip_conntrack * + conntrack->proto.tcp.seen[0].flags = + conntrack->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM; + conntrack->proto.tcp.seen[0].loose = +- conntrack->proto.tcp.seen[1].loose = ip_ct_tcp_loose; ++ conntrack->proto.tcp.seen[1].loose = ve_ip_ct_tcp_loose; + } + + conntrack->proto.tcp.seen[1].td_end = 0; +diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_proto_udp.c linux-2.6.16-026test015/net/ipv4/netfilter/ip_conntrack_proto_udp.c +--- linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_proto_udp.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/netfilter/ip_conntrack_proto_udp.c 2006-07-04 14:41:39.000000000 +0400 +@@ -71,12 +71,12 @@ static int udp_packet(struct ip_conntrac + stream. Extend timeout. */ + if (test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) { + ip_ct_refresh_acct(conntrack, ctinfo, skb, +- ip_ct_udp_timeout_stream); ++ ve_ip_ct_udp_timeout_stream); + /* Also, more likely to be important, and not a probe */ + if (!test_and_set_bit(IPS_ASSURED_BIT, &conntrack->status)) + ip_conntrack_event_cache(IPCT_STATUS, skb); + } else +- ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_udp_timeout); ++ ip_ct_refresh_acct(conntrack, ctinfo, skb, ve_ip_ct_udp_timeout); + + return NF_ACCEPT; + } +diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_standalone.c linux-2.6.16-026test015/net/ipv4/netfilter/ip_conntrack_standalone.c +--- linux-2.6.16.orig/net/ipv4/netfilter/ip_conntrack_standalone.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/netfilter/ip_conntrack_standalone.c 2006-07-04 14:41:39.000000000 +0400 +@@ -28,6 +28,7 @@ + #include <net/checksum.h> + #include <net/ip.h> + #include <net/route.h> ++#include <linux/nfcalls.h> + + #define ASSERT_READ_LOCK(x) + #define ASSERT_WRITE_LOCK(x) +@@ -46,9 +47,31 @@ + + MODULE_LICENSE("GPL"); + ++int ip_conntrack_disable_ve0 = 0; 
++module_param(ip_conntrack_disable_ve0, int, 0440); ++ + extern atomic_t ip_conntrack_count; ++#ifdef CONFIG_VE_IPTABLES ++#include <linux/sched.h> ++#define ve_ip_conntrack_count \ ++ (get_exec_env()->_ip_conntrack->_ip_conntrack_count) ++#else ++#define ve_ip_conntrack_count ip_conntrack_count ++#endif + DECLARE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat); + ++/* Prior to 2.6.15, we had a ip_conntrack_enable_ve0 param. */ ++static int warn_set(const char *val, struct kernel_param *kp) ++{ ++ printk(KERN_INFO KBUILD_MODNAME ++ ": parameter ip_conntrack_enable_ve0 is obsoleted. In ovzkernel" ++ " >= 2.6.15 connection tracking on hardware node is enabled by " ++ "default, use ip_conntrack_disable_ve0=1 parameter to " ++ "disable.\n"); ++ return 0; ++} ++module_param_call(ip_conntrack_enable_ve0, warn_set, NULL, NULL, 0); ++ + static int kill_proto(struct ip_conntrack *i, void *data) + { + return (i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum == +@@ -89,8 +112,8 @@ static struct list_head *ct_get_first(st + for (st->bucket = 0; + st->bucket < ip_conntrack_htable_size; + st->bucket++) { +- if (!list_empty(&ip_conntrack_hash[st->bucket])) +- return ip_conntrack_hash[st->bucket].next; ++ if (!list_empty(&ve_ip_conntrack_hash[st->bucket])) ++ return ve_ip_conntrack_hash[st->bucket].next; + } + return NULL; + } +@@ -100,10 +123,10 @@ static struct list_head *ct_get_next(str + struct ct_iter_state *st = seq->private; + + head = head->next; +- while (head == &ip_conntrack_hash[st->bucket]) { ++ while (head == &ve_ip_conntrack_hash[st->bucket]) { + if (++st->bucket >= ip_conntrack_htable_size) + return NULL; +- head = ip_conntrack_hash[st->bucket].next; ++ head = ve_ip_conntrack_hash[st->bucket].next; + } + return head; + } +@@ -234,7 +257,7 @@ static struct file_operations ct_file_op + /* expects */ + static void *exp_seq_start(struct seq_file *s, loff_t *pos) + { +- struct list_head *e = &ip_conntrack_expect_list; ++ struct list_head *e = 
&ve_ip_conntrack_expect_list; + loff_t i; + + /* strange seq_file api calls stop even if we fail, +@@ -246,7 +269,7 @@ static void *exp_seq_start(struct seq_fi + + for (i = 0; i <= *pos; i++) { + e = e->next; +- if (e == &ip_conntrack_expect_list) ++ if (e == &ve_ip_conntrack_expect_list) + return NULL; + } + return e; +@@ -259,7 +282,7 @@ static void *exp_seq_next(struct seq_fil + ++*pos; + e = e->next; + +- if (e == &ip_conntrack_expect_list) ++ if (e == &ve_ip_conntrack_expect_list) + return NULL; + + return e; +@@ -344,7 +367,7 @@ static void ct_cpu_seq_stop(struct seq_f + + static int ct_cpu_seq_show(struct seq_file *seq, void *v) + { +- unsigned int nr_conntracks = atomic_read(&ip_conntrack_count); ++ unsigned int nr_conntracks = atomic_read(&ve_ip_conntrack_count); + struct ip_conntrack_stat *st = v; + + if (v == SEQ_START_TOKEN) { +@@ -541,6 +564,28 @@ static struct nf_hook_ops ip_conntrack_l + + /* From ip_conntrack_core.c */ + extern int ip_conntrack_max; ++#ifdef CONFIG_VE_IPTABLES ++#define ve_ip_conntrack_max \ ++ (get_exec_env()->_ip_conntrack->_ip_conntrack_max) ++#define ve_ip_ct_sysctl_header \ ++ (get_exec_env()->_ip_conntrack->_ip_ct_sysctl_header) ++#define ve_ip_ct_net_table \ ++ (get_exec_env()->_ip_conntrack->_ip_ct_net_table) ++#define ve_ip_ct_ipv4_table \ ++ (get_exec_env()->_ip_conntrack->_ip_ct_ipv4_table) ++#define ve_ip_ct_netfilter_table \ ++ (get_exec_env()->_ip_conntrack->_ip_ct_netfilter_table) ++#define ve_ip_ct_sysctl_table \ ++ (get_exec_env()->_ip_conntrack->_ip_ct_sysctl_table) ++#else ++#define ve_ip_conntrack_max ip_conntrack_max ++static struct ctl_table_header *ip_ct_sysctl_header; ++#define ve_ip_ct_sysctl_header ip_ct_sysctl_header ++#define ve_ip_ct_net_table ip_ct_net_table ++#define ve_ip_ct_ipv4_table ip_ct_ipv4_table ++#define ve_ip_ct_netfilter_table ip_ct_netfilter_table ++#define ve_ip_ct_sysctl_table ip_ct_sysctl_table ++#endif + extern unsigned int ip_conntrack_htable_size; + + /* From ip_conntrack_proto_tcp.c 
*/ +@@ -571,8 +616,6 @@ extern unsigned int ip_ct_generic_timeou + static int log_invalid_proto_min = 0; + static int log_invalid_proto_max = 255; + +-static struct ctl_table_header *ip_ct_sysctl_header; +- + static ctl_table ip_ct_sysctl_table[] = { + { + .ctl_name = NET_IPV4_NF_CONNTRACK_MAX, +@@ -781,6 +824,112 @@ static ctl_table ip_ct_net_table[] = { + }; + + EXPORT_SYMBOL(ip_ct_log_invalid); ++ ++#ifdef CONFIG_VE_IPTABLES ++static void ip_conntrack_sysctl_cleanup(void) ++{ ++ if (!ve_is_super(get_exec_env())) { ++ kfree(ve_ip_ct_net_table); ++ kfree(ve_ip_ct_ipv4_table); ++ kfree(ve_ip_ct_netfilter_table); ++ kfree(ve_ip_ct_sysctl_table); ++ } ++ ve_ip_ct_net_table = NULL; ++ ve_ip_ct_ipv4_table = NULL; ++ ve_ip_ct_netfilter_table = NULL; ++ ve_ip_ct_sysctl_table = NULL; ++} ++ ++#define ALLOC_ENVCTL(field,k,label) \ ++ if ( !(field = kmalloc(k*sizeof(ctl_table), GFP_KERNEL)) ) \ ++ goto label; ++static int ip_conntrack_sysctl_init(void) ++{ ++ int i, ret = 0; ++ ++ ret = -ENOMEM; ++ if (ve_is_super(get_exec_env())) { ++ ve_ip_ct_net_table = ip_ct_net_table; ++ ve_ip_ct_ipv4_table = ip_ct_ipv4_table; ++ ve_ip_ct_netfilter_table = ip_ct_netfilter_table; ++ ve_ip_ct_sysctl_table = ip_ct_sysctl_table; ++ } else { ++ /* allocate structures in ve_struct */ ++ ALLOC_ENVCTL(ve_ip_ct_net_table, 2, out); ++ ALLOC_ENVCTL(ve_ip_ct_ipv4_table, 2, nomem_1); ++ ALLOC_ENVCTL(ve_ip_ct_netfilter_table, 3, nomem_2); ++ ALLOC_ENVCTL(ve_ip_ct_sysctl_table, 21, nomem_3); ++ ++ memcpy(ve_ip_ct_net_table, ip_ct_net_table, ++ 2*sizeof(ctl_table)); ++ memcpy(ve_ip_ct_ipv4_table, ip_ct_ipv4_table, ++ 2*sizeof(ctl_table)); ++ memcpy(ve_ip_ct_netfilter_table, ip_ct_netfilter_table, ++ 3*sizeof(ctl_table)); ++ memcpy(ve_ip_ct_sysctl_table, ip_ct_sysctl_table, ++ 21*sizeof(ctl_table)); ++ ++ ve_ip_ct_net_table[0].child = ve_ip_ct_ipv4_table; ++ ve_ip_ct_ipv4_table[0].child = ve_ip_ct_netfilter_table; ++ ve_ip_ct_netfilter_table[0].child = ve_ip_ct_sysctl_table; ++ } ++ 
ve_ip_ct_sysctl_table[0].data = &ve_ip_conntrack_max; ++ ve_ip_ct_netfilter_table[1].data = &ve_ip_conntrack_max; ++ ve_ip_ct_sysctl_table[1].data = &ve_ip_conntrack_count; ++ /* skip ve_ip_ct_sysctl_table[2].data as it is read-only and common ++ * for all environments */ ++ ve_ip_ct_tcp_timeouts[1] = ip_ct_tcp_timeout_syn_sent; ++ ve_ip_ct_sysctl_table[3].data = &ve_ip_ct_tcp_timeouts[1]; ++ ve_ip_ct_tcp_timeouts[2] = ip_ct_tcp_timeout_syn_recv; ++ ve_ip_ct_sysctl_table[4].data = &ve_ip_ct_tcp_timeouts[2]; ++ ve_ip_ct_tcp_timeouts[3] = ip_ct_tcp_timeout_established; ++ ve_ip_ct_sysctl_table[5].data = &ve_ip_ct_tcp_timeouts[3]; ++ ve_ip_ct_tcp_timeouts[4] = ip_ct_tcp_timeout_fin_wait; ++ ve_ip_ct_sysctl_table[6].data = &ve_ip_ct_tcp_timeouts[4]; ++ ve_ip_ct_tcp_timeouts[5] = ip_ct_tcp_timeout_close_wait; ++ ve_ip_ct_sysctl_table[7].data = &ve_ip_ct_tcp_timeouts[5]; ++ ve_ip_ct_tcp_timeouts[6] = ip_ct_tcp_timeout_last_ack; ++ ve_ip_ct_sysctl_table[8].data = &ve_ip_ct_tcp_timeouts[6]; ++ ve_ip_ct_tcp_timeouts[7] = ip_ct_tcp_timeout_time_wait; ++ ve_ip_ct_sysctl_table[9].data = &ve_ip_ct_tcp_timeouts[7]; ++ ve_ip_ct_tcp_timeouts[8] = ip_ct_tcp_timeout_close; ++ ve_ip_ct_sysctl_table[10].data = &ve_ip_ct_tcp_timeouts[8]; ++ ve_ip_ct_udp_timeout = ip_ct_udp_timeout; ++ ve_ip_ct_sysctl_table[11].data = &ve_ip_ct_udp_timeout; ++ ve_ip_ct_udp_timeout_stream = ip_ct_udp_timeout_stream; ++ ve_ip_ct_sysctl_table[12].data = &ve_ip_ct_udp_timeout_stream; ++ ve_ip_ct_icmp_timeout = ip_ct_icmp_timeout; ++ ve_ip_ct_sysctl_table[13].data = &ve_ip_ct_icmp_timeout; ++ ve_ip_ct_generic_timeout = ip_ct_generic_timeout; ++ ve_ip_ct_sysctl_table[14].data = &ve_ip_ct_generic_timeout; ++ ve_ip_ct_log_invalid = ip_ct_log_invalid; ++ ve_ip_ct_sysctl_table[15].data = &ve_ip_ct_log_invalid; ++ ve_ip_ct_tcp_timeout_max_retrans = ip_ct_tcp_timeout_max_retrans; ++ ve_ip_ct_sysctl_table[16].data = &ve_ip_ct_tcp_timeout_max_retrans; ++ ve_ip_ct_tcp_loose = ip_ct_tcp_loose; ++ 
ve_ip_ct_sysctl_table[17].data = &ve_ip_ct_tcp_loose; ++ ve_ip_ct_tcp_be_liberal = ip_ct_tcp_be_liberal; ++ ve_ip_ct_sysctl_table[18].data = &ve_ip_ct_tcp_be_liberal; ++ ve_ip_ct_tcp_max_retrans = ip_ct_tcp_max_retrans; ++ ve_ip_ct_sysctl_table[19].data = &ve_ip_ct_tcp_max_retrans; ++ for (i = 0; i < 20; i++) ++ ve_ip_ct_sysctl_table[i].owner_env = get_exec_env(); ++ ve_ip_ct_netfilter_table[1].owner_env = get_exec_env(); ++ return 0; ++ ++nomem_3: ++ kfree(ve_ip_ct_netfilter_table); ++ ve_ip_ct_netfilter_table = NULL; ++nomem_2: ++ kfree(ve_ip_ct_ipv4_table); ++ ve_ip_ct_ipv4_table = NULL; ++nomem_1: ++ kfree(ve_ip_ct_net_table); ++ ve_ip_ct_net_table = NULL; ++out: ++ return ret; ++} ++#endif /*CONFIG_VE*/ + #endif /* CONFIG_SYSCTL */ + + static int init_or_cleanup(int init) +@@ -792,9 +941,16 @@ static int init_or_cleanup(int init) + + if (!init) goto cleanup; + ++ ret = -ENOENT; ++ if (!ve_is_super(get_exec_env())) ++ __module_get(THIS_MODULE); ++ + ret = ip_conntrack_init(); + if (ret < 0) +- goto cleanup_nothing; ++ goto cleanup_unget; ++ ++ if (ve_is_super(get_exec_env()) && ip_conntrack_disable_ve0) ++ return 0; + + #ifdef CONFIG_PROC_FS + ret = -ENOMEM; +@@ -804,98 +960,115 @@ static int init_or_cleanup(int init) + proc_exp = proc_net_fops_create("ip_conntrack_expect", 0440, + &exp_file_ops); + if (!proc_exp) goto cleanup_proc; ++ proc_exp->proc_fops = &exp_file_ops; + +- proc_stat = create_proc_entry("ip_conntrack", S_IRUGO, proc_net_stat); +- if (!proc_stat) +- goto cleanup_proc_exp; ++ if (ve_is_super(get_exec_env())) { ++ proc_stat = create_proc_entry("ip_conntrack", S_IRUGO, proc_net_stat); ++ if (!proc_stat) ++ goto cleanup_proc_exp; + +- proc_stat->proc_fops = &ct_cpu_seq_fops; +- proc_stat->owner = THIS_MODULE; ++ proc_stat->proc_fops = &ct_cpu_seq_fops; ++ proc_stat->owner = THIS_MODULE; ++ } + #endif + +- ret = nf_register_hook(&ip_conntrack_defrag_ops); ++ ret = virt_nf_register_hook(&ip_conntrack_defrag_ops); + if (ret < 0) { + 
printk("ip_conntrack: can't register pre-routing defrag hook.\n"); + goto cleanup_proc_stat; + } +- ret = nf_register_hook(&ip_conntrack_defrag_local_out_ops); ++ ret = virt_nf_register_hook(&ip_conntrack_defrag_local_out_ops); + if (ret < 0) { + printk("ip_conntrack: can't register local_out defrag hook.\n"); + goto cleanup_defragops; + } +- ret = nf_register_hook(&ip_conntrack_in_ops); ++ ret = virt_nf_register_hook(&ip_conntrack_in_ops); + if (ret < 0) { + printk("ip_conntrack: can't register pre-routing hook.\n"); + goto cleanup_defraglocalops; + } +- ret = nf_register_hook(&ip_conntrack_local_out_ops); ++ ret = virt_nf_register_hook(&ip_conntrack_local_out_ops); + if (ret < 0) { + printk("ip_conntrack: can't register local out hook.\n"); + goto cleanup_inops; + } +- ret = nf_register_hook(&ip_conntrack_helper_in_ops); ++ ret = virt_nf_register_hook(&ip_conntrack_helper_in_ops); + if (ret < 0) { + printk("ip_conntrack: can't register local in helper hook.\n"); + goto cleanup_inandlocalops; + } +- ret = nf_register_hook(&ip_conntrack_helper_out_ops); ++ ret = virt_nf_register_hook(&ip_conntrack_helper_out_ops); + if (ret < 0) { + printk("ip_conntrack: can't register postrouting helper hook.\n"); + goto cleanup_helperinops; + } +- ret = nf_register_hook(&ip_conntrack_out_ops); ++ ret = virt_nf_register_hook(&ip_conntrack_out_ops); + if (ret < 0) { + printk("ip_conntrack: can't register post-routing hook.\n"); + goto cleanup_helperoutops; + } +- ret = nf_register_hook(&ip_conntrack_local_in_ops); ++ ret = virt_nf_register_hook(&ip_conntrack_local_in_ops); + if (ret < 0) { + printk("ip_conntrack: can't register local in hook.\n"); + goto cleanup_inoutandlocalops; + } + #ifdef CONFIG_SYSCTL +- ip_ct_sysctl_header = register_sysctl_table(ip_ct_net_table, 0); +- if (ip_ct_sysctl_header == NULL) { ++#ifdef CONFIG_VE_IPTABLES ++ ret = ip_conntrack_sysctl_init(); ++ if (ret < 0) ++ goto cleanup_sysctl; ++#endif ++ ret = -ENOMEM; ++ ve_ip_ct_sysctl_header = 
register_sysctl_table(ve_ip_ct_net_table, 0); ++ if (ve_ip_ct_sysctl_header == NULL) { + printk("ip_conntrack: can't register to sysctl.\n"); +- ret = -ENOMEM; +- goto cleanup_localinops; ++ goto cleanup_sysctl2; + } + #endif + +- return ret; ++ return 0; + + cleanup: ++ if (ve_is_super(get_exec_env()) && ip_conntrack_disable_ve0) ++ goto cleanup_init; + synchronize_net(); + #ifdef CONFIG_SYSCTL +- unregister_sysctl_table(ip_ct_sysctl_header); +- cleanup_localinops: ++ unregister_sysctl_table(ve_ip_ct_sysctl_header); ++ cleanup_sysctl2: ++#ifdef CONFIG_VE_IPTABLES ++ ip_conntrack_sysctl_cleanup(); ++ cleanup_sysctl: ++#endif + #endif +- nf_unregister_hook(&ip_conntrack_local_in_ops); ++ virt_nf_unregister_hook(&ip_conntrack_local_in_ops); + cleanup_inoutandlocalops: +- nf_unregister_hook(&ip_conntrack_out_ops); ++ virt_nf_unregister_hook(&ip_conntrack_out_ops); + cleanup_helperoutops: +- nf_unregister_hook(&ip_conntrack_helper_out_ops); ++ virt_nf_unregister_hook(&ip_conntrack_helper_out_ops); + cleanup_helperinops: +- nf_unregister_hook(&ip_conntrack_helper_in_ops); ++ virt_nf_unregister_hook(&ip_conntrack_helper_in_ops); + cleanup_inandlocalops: +- nf_unregister_hook(&ip_conntrack_local_out_ops); ++ virt_nf_unregister_hook(&ip_conntrack_local_out_ops); + cleanup_inops: +- nf_unregister_hook(&ip_conntrack_in_ops); ++ virt_nf_unregister_hook(&ip_conntrack_in_ops); + cleanup_defraglocalops: +- nf_unregister_hook(&ip_conntrack_defrag_local_out_ops); ++ virt_nf_unregister_hook(&ip_conntrack_defrag_local_out_ops); + cleanup_defragops: +- nf_unregister_hook(&ip_conntrack_defrag_ops); ++ virt_nf_unregister_hook(&ip_conntrack_defrag_ops); + cleanup_proc_stat: + #ifdef CONFIG_PROC_FS +- remove_proc_entry("ip_conntrack", proc_net_stat); ++ if (ve_is_super(get_exec_env())) ++ remove_proc_entry("ip_conntrack", proc_net_stat); + cleanup_proc_exp: + proc_net_remove("ip_conntrack_expect"); + cleanup_proc: + proc_net_remove("ip_conntrack"); +- cleanup_init: + #endif /* 
CONFIG_PROC_FS */ ++ cleanup_init: + ip_conntrack_cleanup(); +- cleanup_nothing: ++ cleanup_unget: ++ if (!ve_is_super(get_exec_env())) ++ module_put(THIS_MODULE); + return ret; + } + +@@ -906,11 +1079,11 @@ int ip_conntrack_protocol_register(struc + int ret = 0; + + write_lock_bh(&ip_conntrack_lock); +- if (ip_ct_protos[proto->proto] != &ip_conntrack_generic_protocol) { ++ if (ve_ip_ct_protos[proto->proto] != &ip_conntrack_generic_protocol) { + ret = -EBUSY; + goto out; + } +- ip_ct_protos[proto->proto] = proto; ++ ve_ip_ct_protos[proto->proto] = proto; + out: + write_unlock_bh(&ip_conntrack_lock); + return ret; +@@ -919,7 +1092,7 @@ int ip_conntrack_protocol_register(struc + void ip_conntrack_protocol_unregister(struct ip_conntrack_protocol *proto) + { + write_lock_bh(&ip_conntrack_lock); +- ip_ct_protos[proto->proto] = &ip_conntrack_generic_protocol; ++ ve_ip_ct_protos[proto->proto] = &ip_conntrack_generic_protocol; + write_unlock_bh(&ip_conntrack_lock); + + /* Somebody could be still looking at the proto in bh. */ +@@ -929,17 +1102,39 @@ void ip_conntrack_protocol_unregister(st + ip_ct_iterate_cleanup(kill_proto, &proto->proto); + } + +-static int __init init(void) ++int init_iptable_conntrack(void) + { + return init_or_cleanup(1); + } + +-static void __exit fini(void) ++void fini_iptable_conntrack(void) + { + init_or_cleanup(0); + } + +-module_init(init); ++static int __init init(void) ++{ ++ int err; ++ ++ err = init_iptable_conntrack(); ++ if (err < 0) ++ return err; ++ ++ KSYMRESOLVE(init_iptable_conntrack); ++ KSYMRESOLVE(fini_iptable_conntrack); ++ KSYMMODRESOLVE(ip_conntrack); ++ return 0; ++} ++ ++static void __exit fini(void) ++{ ++ KSYMMODUNRESOLVE(ip_conntrack); ++ KSYMUNRESOLVE(init_iptable_conntrack); ++ KSYMUNRESOLVE(fini_iptable_conntrack); ++ fini_iptable_conntrack(); ++} ++ ++subsys_initcall(init); + module_exit(fini); + + /* Some modules need us, but don't depend directly on any symbol. 
+@@ -956,15 +1151,20 @@ EXPORT_SYMBOL_GPL(ip_conntrack_unregiste + EXPORT_SYMBOL_GPL(__ip_ct_event_cache_init); + EXPORT_PER_CPU_SYMBOL_GPL(ip_conntrack_ecache); + #endif ++EXPORT_SYMBOL(ip_conntrack_disable_ve0); + EXPORT_SYMBOL(ip_conntrack_protocol_register); + EXPORT_SYMBOL(ip_conntrack_protocol_unregister); + EXPORT_SYMBOL(ip_ct_get_tuple); + EXPORT_SYMBOL(invert_tuplepr); + EXPORT_SYMBOL(ip_conntrack_alter_reply); ++#ifndef CONFIG_VE_IPTABLES + EXPORT_SYMBOL(ip_conntrack_destroyed); ++#endif + EXPORT_SYMBOL(need_conntrack); + EXPORT_SYMBOL(ip_conntrack_helper_register); + EXPORT_SYMBOL(ip_conntrack_helper_unregister); ++EXPORT_SYMBOL(virt_ip_conntrack_helper_register); ++EXPORT_SYMBOL(virt_ip_conntrack_helper_unregister); + EXPORT_SYMBOL(ip_ct_iterate_cleanup); + EXPORT_SYMBOL(__ip_ct_refresh_acct); + +@@ -974,14 +1174,18 @@ EXPORT_SYMBOL_GPL(__ip_conntrack_expect_ + EXPORT_SYMBOL_GPL(ip_conntrack_expect_find); + EXPORT_SYMBOL(ip_conntrack_expect_related); + EXPORT_SYMBOL(ip_conntrack_unexpect_related); ++#ifndef CONFIG_VE_IPTABLES + EXPORT_SYMBOL_GPL(ip_conntrack_expect_list); ++#endif + EXPORT_SYMBOL_GPL(ip_ct_unlink_expect); + + EXPORT_SYMBOL(ip_conntrack_tuple_taken); + EXPORT_SYMBOL(ip_ct_gather_frags); + EXPORT_SYMBOL(ip_conntrack_htable_size); + EXPORT_SYMBOL(ip_conntrack_lock); ++#ifndef CONFIG_VE_IPTABLES + EXPORT_SYMBOL(ip_conntrack_hash); ++#endif + EXPORT_SYMBOL(ip_conntrack_untracked); + EXPORT_SYMBOL_GPL(ip_conntrack_find_get); + #ifdef CONFIG_IP_NF_NAT_NEEDED +diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_nat_core.c linux-2.6.16-026test015/net/ipv4/netfilter/ip_nat_core.c +--- linux-2.6.16.orig/net/ipv4/netfilter/ip_nat_core.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/netfilter/ip_nat_core.c 2006-07-04 14:41:39.000000000 +0400 +@@ -21,6 +21,8 @@ + #include <linux/icmp.h> + #include <linux/udp.h> + #include <linux/jhash.h> ++#include <linux/nfcalls.h> ++#include <ub/ub_mem.h> + + #define ASSERT_READ_LOCK(x) + 
#define ASSERT_WRITE_LOCK(x) +@@ -46,15 +48,24 @@ DEFINE_RWLOCK(ip_nat_lock); + /* Calculated at init based on memory size */ + static unsigned int ip_nat_htable_size; + +-static struct list_head *bysource; +- + #define MAX_IP_NAT_PROTO 256 ++ ++#ifdef CONFIG_VE_IPTABLES ++#define ve_ip_nat_bysource \ ++ (get_exec_env()->_ip_conntrack->_ip_nat_bysource) ++#define ve_ip_nat_protos \ ++ (get_exec_env()->_ip_conntrack->_ip_nat_protos) ++#else ++static struct list_head *bysource; ++#define ve_ip_nat_bysource bysource + static struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO]; ++#define ve_ip_nat_protos ip_nat_protos ++#endif + + static inline struct ip_nat_protocol * + __ip_nat_proto_find(u_int8_t protonum) + { +- return ip_nat_protos[protonum]; ++ return ve_ip_nat_protos[protonum]; + } + + struct ip_nat_protocol * +@@ -177,7 +188,7 @@ find_appropriate_src(const struct ip_con + struct ip_conntrack *ct; + + read_lock_bh(&ip_nat_lock); +- list_for_each_entry(ct, &bysource[h], nat.info.bysource) { ++ list_for_each_entry(ct, &ve_ip_nat_bysource[h], nat.info.bysource) { + if (same_src(ct, tuple)) { + /* Copy source part from reply tuple. 
*/ + invert_tuplepr(result, +@@ -291,13 +302,22 @@ get_unique_tuple(struct ip_conntrack_tup + ip_nat_proto_put(proto); + } + ++void ip_nat_hash_conntrack(struct ip_conntrack *conntrack) ++{ ++ unsigned int srchash ++ = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple); ++ write_lock_bh(&ip_nat_lock); ++ list_add(&conntrack->nat.info.bysource, &ve_ip_nat_bysource[srchash]); ++ write_unlock_bh(&ip_nat_lock); ++} ++EXPORT_SYMBOL_GPL(ip_nat_hash_conntrack); ++ + unsigned int + ip_nat_setup_info(struct ip_conntrack *conntrack, + const struct ip_nat_range *range, + unsigned int hooknum) + { + struct ip_conntrack_tuple curr_tuple, new_tuple; +- struct ip_nat_info *info = &conntrack->nat.info; + int have_to_hash = !(conntrack->status & IPS_NAT_DONE_MASK); + enum ip_nat_manip_type maniptype = HOOK2MANIP(hooknum); + +@@ -332,14 +352,8 @@ ip_nat_setup_info(struct ip_conntrack *c + } + + /* Place in source hash if this is the first time. */ +- if (have_to_hash) { +- unsigned int srchash +- = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL] +- .tuple); +- write_lock_bh(&ip_nat_lock); +- list_add(&info->bysource, &bysource[srchash]); +- write_unlock_bh(&ip_nat_lock); +- } ++ if (have_to_hash) ++ ip_nat_hash_conntrack(conntrack); + + /* It's done. 
*/ + if (maniptype == IP_NAT_MANIP_DST) +@@ -521,11 +535,11 @@ int ip_nat_protocol_register(struct ip_n + int ret = 0; + + write_lock_bh(&ip_nat_lock); +- if (ip_nat_protos[proto->protonum] != &ip_nat_unknown_protocol) { ++ if (ve_ip_nat_protos[proto->protonum] != &ip_nat_unknown_protocol) { + ret = -EBUSY; + goto out; + } +- ip_nat_protos[proto->protonum] = proto; ++ ve_ip_nat_protos[proto->protonum] = proto; + out: + write_unlock_bh(&ip_nat_lock); + return ret; +@@ -536,7 +550,7 @@ EXPORT_SYMBOL(ip_nat_protocol_register); + void ip_nat_protocol_unregister(struct ip_nat_protocol *proto) + { + write_lock_bh(&ip_nat_lock); +- ip_nat_protos[proto->protonum] = &ip_nat_unknown_protocol; ++ ve_ip_nat_protos[proto->protonum] = &ip_nat_unknown_protocol; + write_unlock_bh(&ip_nat_lock); + + /* Someone could be still looking at the proto in a bh. */ +@@ -589,38 +603,55 @@ EXPORT_SYMBOL_GPL(ip_nat_port_nfattr_to_ + EXPORT_SYMBOL_GPL(ip_nat_port_range_to_nfattr); + #endif + +-static int __init ip_nat_init(void) ++static int ip_nat_init(void) + { + size_t i; ++ int ret; + +- /* Leave them the same for the moment. */ +- ip_nat_htable_size = ip_conntrack_htable_size; ++ if (ve_is_super(get_exec_env())) ++ ip_nat_htable_size = ip_conntrack_htable_size; + + /* One vmalloc for both hash tables */ +- bysource = vmalloc(sizeof(struct list_head) * ip_nat_htable_size); +- if (!bysource) +- return -ENOMEM; ++ ret = -ENOMEM; ++ ve_ip_nat_bysource = ++ ub_vmalloc(sizeof(struct list_head)*ip_nat_htable_size*2); ++ if (!ve_ip_nat_bysource) ++ goto nomem; ++ ++#ifdef CONFIG_VE_IPTABLES ++ ve_ip_nat_protos = ++ ub_kmalloc(sizeof(void *)*MAX_IP_NAT_PROTO, GFP_KERNEL); ++ if (!ve_ip_nat_protos) ++ goto nomem2; ++#endif + + /* Sew in builtin protocols. 
*/ + write_lock_bh(&ip_nat_lock); + for (i = 0; i < MAX_IP_NAT_PROTO; i++) +- ip_nat_protos[i] = &ip_nat_unknown_protocol; +- ip_nat_protos[IPPROTO_TCP] = &ip_nat_protocol_tcp; +- ip_nat_protos[IPPROTO_UDP] = &ip_nat_protocol_udp; +- ip_nat_protos[IPPROTO_ICMP] = &ip_nat_protocol_icmp; ++ ve_ip_nat_protos[i] = &ip_nat_unknown_protocol; ++ ve_ip_nat_protos[IPPROTO_TCP] = &ip_nat_protocol_tcp; ++ ve_ip_nat_protos[IPPROTO_UDP] = &ip_nat_protocol_udp; ++ ve_ip_nat_protos[IPPROTO_ICMP] = &ip_nat_protocol_icmp; + write_unlock_bh(&ip_nat_lock); + + for (i = 0; i < ip_nat_htable_size; i++) { +- INIT_LIST_HEAD(&bysource[i]); ++ INIT_LIST_HEAD(&ve_ip_nat_bysource[i]); + } + + /* FIXME: Man, this is a hack. <SIGH> */ + IP_NF_ASSERT(ip_conntrack_destroyed == NULL); +- ip_conntrack_destroyed = &ip_nat_cleanup_conntrack; ++ ve_ip_conntrack_destroyed = &ip_nat_cleanup_conntrack; + +- /* Initialize fake conntrack so that NAT will skip it */ +- ip_conntrack_untracked.status |= IPS_NAT_DONE_MASK; ++ if (ve_is_super(get_exec_env())) ++ /* Initialize fake conntrack so that NAT will skip it */ ++ ip_conntrack_untracked.status |= IPS_NAT_DONE_MASK; + return 0; ++#ifdef CONFIG_VE_IPTABLES ++nomem2: ++#endif ++ vfree(ve_ip_nat_bysource); ++nomem: ++ return ret; + } + + /* Clear NAT section of all conntracks, in case we're loaded again. 
*/ +@@ -631,14 +662,41 @@ static int clean_nat(struct ip_conntrack + return 0; + } + +-static void __exit ip_nat_cleanup(void) ++static void ip_nat_cleanup(void) + { + ip_ct_iterate_cleanup(&clean_nat, NULL); +- ip_conntrack_destroyed = NULL; +- vfree(bysource); ++ ve_ip_conntrack_destroyed = NULL; ++ vfree(ve_ip_nat_bysource); ++ ve_ip_nat_bysource = NULL; ++#ifdef CONFIG_VE_IPTABLES ++ kfree(ve_ip_nat_protos); ++ ve_ip_nat_protos = NULL; ++#endif ++} ++ ++static int __init init(void) ++{ ++ int err; ++ ++ err = ip_nat_init(); ++ if (err < 0) ++ return err; ++ ++ KSYMRESOLVE(ip_nat_init); ++ KSYMRESOLVE(ip_nat_cleanup); ++ KSYMMODRESOLVE(ip_nat); ++ return 0; ++} ++ ++static void __exit fini(void) ++{ ++ KSYMMODUNRESOLVE(ip_nat); ++ KSYMUNRESOLVE(ip_nat_cleanup); ++ KSYMUNRESOLVE(ip_nat_init); ++ ip_nat_cleanup(); + } + + MODULE_LICENSE("GPL"); + +-module_init(ip_nat_init); +-module_exit(ip_nat_cleanup); ++fs_initcall(init); ++module_exit(fini); +diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_nat_ftp.c linux-2.6.16-026test015/net/ipv4/netfilter/ip_nat_ftp.c +--- linux-2.6.16.orig/net/ipv4/netfilter/ip_nat_ftp.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/netfilter/ip_nat_ftp.c 2006-07-04 14:41:39.000000000 +0400 +@@ -19,6 +19,7 @@ + #include <linux/netfilter_ipv4/ip_nat_rule.h> + #include <linux/netfilter_ipv4/ip_conntrack_ftp.h> + #include <linux/netfilter_ipv4/ip_conntrack_helper.h> ++#include <linux/nfcalls.h> + + MODULE_LICENSE("GPL"); + MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>"); +@@ -154,18 +155,43 @@ static unsigned int ip_nat_ftp(struct sk + return NF_ACCEPT; + } + +-static void __exit fini(void) ++#ifdef CONFIG_VE_IPTABLES ++#undef ve_ip_nat_ftp_hook ++#define ve_ip_nat_ftp_hook \ ++ (get_exec_env()->_ip_conntrack->_ip_nat_ftp_hook) ++#endif ++int init_iptable_nat_ftp(void) + { +- ip_nat_ftp_hook = NULL; ++ BUG_ON(ve_ip_nat_ftp_hook); ++#ifdef CONFIG_VE_IPTABLES ++ ve_ip_nat_ftp_hook = 
(ip_nat_helper_func)ip_nat_ftp; ++#else ++ ve_ip_nat_ftp_hook = ip_nat_ftp; ++#endif ++ return 0; ++} ++ ++void fini_iptable_nat_ftp(void) ++{ ++ ve_ip_nat_ftp_hook = NULL; + /* Make sure noone calls it, meanwhile. */ + synchronize_net(); + } + ++static void __exit fini(void) ++{ ++ KSYMMODUNRESOLVE(ip_nat_ftp); ++ KSYMUNRESOLVE(init_iptable_nat_ftp); ++ KSYMUNRESOLVE(fini_iptable_nat_ftp); ++ fini_iptable_nat_ftp(); ++} ++ + static int __init init(void) + { +- BUG_ON(ip_nat_ftp_hook); +- ip_nat_ftp_hook = ip_nat_ftp; +- return 0; ++ KSYMRESOLVE(init_iptable_nat_ftp); ++ KSYMRESOLVE(fini_iptable_nat_ftp); ++ KSYMMODRESOLVE(ip_nat_ftp); ++ return init_iptable_nat_ftp(); + } + + /* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */ +diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_nat_irc.c linux-2.6.16-026test015/net/ipv4/netfilter/ip_nat_irc.c +--- linux-2.6.16.orig/net/ipv4/netfilter/ip_nat_irc.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/netfilter/ip_nat_irc.c 2006-07-04 14:41:39.000000000 +0400 +@@ -23,6 +23,7 @@ + #include <linux/netfilter_ipv4/ip_conntrack_irc.h> + #include <linux/netfilter_ipv4/ip_conntrack_helper.h> + #include <linux/moduleparam.h> ++#include <linux/nfcalls.h> + + #if 0 + #define DEBUGP printk +@@ -96,18 +97,44 @@ static unsigned int help(struct sk_buff + return ret; + } + +-static void __exit fini(void) ++#ifdef CONFIG_VE_IPTABLES ++#undef ve_ip_nat_irc_hook ++#define ve_ip_nat_irc_hook \ ++ (get_exec_env()->_ip_conntrack->_ip_nat_irc_hook) ++#endif ++ ++int init_iptable_nat_irc(void) ++{ ++ BUG_ON(ve_ip_nat_irc_hook); ++#ifdef CONFIG_VE_IPTABLES ++ ve_ip_nat_irc_hook = (ip_nat_helper_func)help; ++#else ++ ve_ip_nat_irc_hook = help; ++#endif ++ return 0; ++} ++ ++void fini_iptable_nat_irc(void) + { +- ip_nat_irc_hook = NULL; ++ ve_ip_nat_irc_hook = NULL; + /* Make sure noone calls it, meanwhile. 
*/ + synchronize_net(); + } + ++static void __exit fini(void) ++{ ++ KSYMMODUNRESOLVE(ip_nat_irc); ++ KSYMUNRESOLVE(init_iptable_nat_irc); ++ KSYMUNRESOLVE(fini_iptable_nat_irc); ++ fini_iptable_nat_irc(); ++} ++ + static int __init init(void) + { +- BUG_ON(ip_nat_irc_hook); +- ip_nat_irc_hook = help; +- return 0; ++ KSYMRESOLVE(init_iptable_nat_irc); ++ KSYMRESOLVE(fini_iptable_nat_irc); ++ KSYMMODRESOLVE(ip_nat_irc); ++ return init_iptable_nat_irc(); + } + + /* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */ +diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_nat_rule.c linux-2.6.16-026test015/net/ipv4/netfilter/ip_nat_rule.c +--- linux-2.6.16.orig/net/ipv4/netfilter/ip_nat_rule.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/netfilter/ip_nat_rule.c 2006-07-04 14:41:39.000000000 +0400 +@@ -34,6 +34,13 @@ + #define DEBUGP(format, args...) + #endif + ++#ifdef CONFIG_VE_IPTABLES ++#define ve_ip_nat_table \ ++ (get_exec_env()->_ip_conntrack->_ip_nat_table) ++#else ++#define ve_ip_nat_table &nat_table ++#endif ++ + #define NAT_VALID_HOOKS ((1<<NF_IP_PRE_ROUTING) | (1<<NF_IP_POST_ROUTING) | (1<<NF_IP_LOCAL_OUT)) + + static struct +@@ -41,7 +48,7 @@ static struct + struct ipt_replace repl; + struct ipt_standard entries[3]; + struct ipt_error term; +-} nat_initial_table __initdata ++} nat_initial_table + = { { "nat", NAT_VALID_HOOKS, 4, + sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error), + { [NF_IP_PRE_ROUTING] = 0, +@@ -235,6 +242,93 @@ static int ipt_dnat_checkentry(const cha + return 1; + } + ++#ifdef CONFIG_COMPAT ++static int compat_to_user(void *target, void **dstptr, ++ int *size, int off) ++{ ++ struct ipt_entry_target *pt; ++ struct ip_nat_multi_range_compat *pinfo; ++ struct compat_ip_nat_multi_range info; ++ u_int16_t tsize; ++ ++ pt = (struct ipt_entry_target *)target; ++ tsize = pt->u.user.target_size; ++ if (__copy_to_user(*dstptr, pt, sizeof(struct ipt_entry_target))) ++ return -EFAULT; ++ 
pinfo = (struct ip_nat_multi_range_compat *)pt->data; ++ memset(&info, 0, sizeof(struct compat_ip_nat_multi_range)); ++ info.rangesize = pinfo->rangesize; ++ info.range[0].flags = pinfo->range[0].flags; ++ info.range[0].min_ip = pinfo->range[0].min_ip; ++ info.range[0].max_ip = pinfo->range[0].max_ip; ++ info.range[0].min = pinfo->range[0].min; ++ info.range[0].max = pinfo->range[0].max; ++ if (__copy_to_user(*dstptr + sizeof(struct ipt_entry_target), ++ &info, sizeof(struct compat_ip_nat_multi_range))) ++ return -EFAULT; ++ tsize -= off; ++ if (put_user(tsize, (u_int16_t *)*dstptr)) ++ return -EFAULT; ++ *size -= off; ++ *dstptr += tsize; ++ return 0; ++} ++ ++static int compat_from_user(void *target, void **dstptr, ++ int *size, int off) ++{ ++ struct compat_ipt_entry_target *pt; ++ struct ipt_entry_target *dstpt; ++ struct compat_ip_nat_multi_range *pinfo; ++ struct ip_nat_multi_range_compat info; ++ u_int16_t tsize; ++ ++ pt = (struct compat_ipt_entry_target *)target; ++ dstpt = (struct ipt_entry_target *)*dstptr; ++ tsize = pt->u.user.target_size; ++ memcpy(*dstptr, pt, sizeof(struct compat_ipt_entry_target)); ++ pinfo = (struct compat_ip_nat_multi_range *)pt->data; ++ memset(&info, 0, sizeof(struct ip_nat_multi_range_compat)); ++ info.rangesize = pinfo->rangesize; ++ info.range[0].flags = pinfo->range[0].flags; ++ info.range[0].min_ip = pinfo->range[0].min_ip; ++ info.range[0].max_ip = pinfo->range[0].max_ip; ++ info.range[0].min = pinfo->range[0].min; ++ info.range[0].max = pinfo->range[0].max; ++ memcpy(*dstptr + sizeof(struct compat_ipt_entry_target), ++ &info, sizeof(struct ip_nat_multi_range_compat)); ++ tsize += off; ++ dstpt->u.user.target_size = tsize; ++ *size += off; ++ *dstptr += tsize; ++ return 0; ++} ++ ++static int compat(void *target, void **dstptr, int *size, int convert) ++{ ++ int ret, off; ++ ++ off = IPT_ALIGN(sizeof(struct ip_nat_multi_range_compat)) - ++ COMPAT_IPT_ALIGN(sizeof(struct compat_ip_nat_multi_range)); ++ switch (convert) { 
++ case COMPAT_TO_USER: ++ ret = compat_to_user(target, dstptr, size, off); ++ break; ++ case COMPAT_FROM_USER: ++ ret = compat_from_user(target, dstptr, size, off); ++ break; ++ case COMPAT_CALC_SIZE: ++ *size += off; ++ ret = 0; ++ break; ++ default: ++ ret = -ENOPROTOOPT; ++ break; ++ } ++ return ret; ++} ++#endif ++ + inline unsigned int + alloc_null_binding(struct ip_conntrack *conntrack, + struct ip_nat_info *info, +@@ -286,7 +380,7 @@ int ip_nat_rule_find(struct sk_buff **ps + { + int ret; + +- ret = ipt_do_table(pskb, hooknum, in, out, &nat_table, NULL); ++ ret = ipt_do_table(pskb, hooknum, in, out, ve_ip_nat_table, NULL); + + if (ret == NF_ACCEPT) { + if (!ip_nat_initialized(ct, HOOK2MANIP(hooknum))) +@@ -300,21 +394,33 @@ static struct ipt_target ipt_snat_reg = + .name = "SNAT", + .target = ipt_snat_target, + .checkentry = ipt_snat_checkentry, ++#ifdef CONFIG_COMPAT ++ .compat = &compat, ++#endif + }; + + static struct ipt_target ipt_dnat_reg = { + .name = "DNAT", + .target = ipt_dnat_target, + .checkentry = ipt_dnat_checkentry, ++#ifdef CONFIG_COMPAT ++ .compat = &compat, ++#endif + }; + +-int __init ip_nat_rule_init(void) ++int ip_nat_rule_init(void) + { + int ret; ++ struct ipt_table *tmp_table; ++ ++ tmp_table = ipt_register_table(&nat_table, ++ &nat_initial_table.repl); ++ if (IS_ERR(tmp_table)) ++ return PTR_ERR(tmp_table); ++#ifdef CONFIG_VE_IPTABLES ++ ve_ip_nat_table = tmp_table; ++#endif + +- ret = ipt_register_table(&nat_table, &nat_initial_table.repl); +- if (ret != 0) +- return ret; + ret = ipt_register_target(&ipt_snat_reg); + if (ret != 0) + goto unregister_table; +@@ -328,7 +434,10 @@ int __init ip_nat_rule_init(void) + unregister_snat: + ipt_unregister_target(&ipt_snat_reg); + unregister_table: +- ipt_unregister_table(&nat_table); ++ ipt_unregister_table(ve_ip_nat_table); ++#ifdef CONFIG_VE_IPTABLES ++ ve_ip_nat_table = NULL; ++#endif + + return ret; + } +@@ -337,5 +446,8 @@ void ip_nat_rule_cleanup(void) + { + 
ipt_unregister_target(&ipt_dnat_reg); + ipt_unregister_target(&ipt_snat_reg); +- ipt_unregister_table(&nat_table); ++ ipt_unregister_table(ve_ip_nat_table); ++#ifdef CONFIG_VE_IPTABLES ++ ve_ip_nat_table = NULL; ++#endif + } +diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_nat_snmp_basic.c linux-2.6.16-026test015/net/ipv4/netfilter/ip_nat_snmp_basic.c +--- linux-2.6.16.orig/net/ipv4/netfilter/ip_nat_snmp_basic.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/netfilter/ip_nat_snmp_basic.c 2006-07-04 14:41:36.000000000 +0400 +@@ -1000,12 +1000,12 @@ static unsigned char snmp_trap_decode(st + + return 1; + ++err_addr_free: ++ kfree((unsigned long *)trap->ip_address); ++ + err_id_free: + kfree(trap->id); + +-err_addr_free: +- kfree((unsigned long *)trap->ip_address); +- + return 0; + } + +@@ -1123,11 +1123,10 @@ static int snmp_parse_mangle(unsigned ch + struct snmp_v1_trap trap; + unsigned char ret = snmp_trap_decode(&ctx, &trap, map, check); + +- /* Discard trap allocations regardless */ +- kfree(trap.id); +- kfree((unsigned long *)trap.ip_address); +- +- if (!ret) ++ if (ret) { ++ kfree(trap.id); ++ kfree((unsigned long *)trap.ip_address); ++ } else + return ret; + + } else { +diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_nat_standalone.c linux-2.6.16-026test015/net/ipv4/netfilter/ip_nat_standalone.c +--- linux-2.6.16.orig/net/ipv4/netfilter/ip_nat_standalone.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/netfilter/ip_nat_standalone.c 2006-07-04 14:41:39.000000000 +0400 +@@ -30,6 +30,7 @@ + #include <net/ip.h> + #include <net/checksum.h> + #include <linux/spinlock.h> ++#include <linux/nfcalls.h> + + #define ASSERT_READ_LOCK(x) + #define ASSERT_WRITE_LOCK(x) +@@ -358,45 +359,45 @@ static int init_or_cleanup(int init) + { + int ret = 0; + +- need_conntrack(); +- + if (!init) goto cleanup; + +-#ifdef CONFIG_XFRM +- BUG_ON(ip_nat_decode_session != NULL); +- ip_nat_decode_session = nat_decode_session; +-#endif 
++ if (!ve_is_super(get_exec_env())) ++ __module_get(THIS_MODULE); ++ + ret = ip_nat_rule_init(); + if (ret < 0) { + printk("ip_nat_init: can't setup rules.\n"); +- goto cleanup_decode_session; ++ goto cleanup_modput; + } +- ret = nf_register_hook(&ip_nat_in_ops); ++ if (ve_is_super(get_exec_env()) && ip_conntrack_disable_ve0) ++ return 0; ++ ++ ret = virt_nf_register_hook(&ip_nat_in_ops); + if (ret < 0) { + printk("ip_nat_init: can't register in hook.\n"); + goto cleanup_rule_init; + } +- ret = nf_register_hook(&ip_nat_out_ops); ++ ret = virt_nf_register_hook(&ip_nat_out_ops); + if (ret < 0) { + printk("ip_nat_init: can't register out hook.\n"); + goto cleanup_inops; + } +- ret = nf_register_hook(&ip_nat_adjust_in_ops); ++ ret = virt_nf_register_hook(&ip_nat_adjust_in_ops); + if (ret < 0) { + printk("ip_nat_init: can't register adjust in hook.\n"); + goto cleanup_outops; + } +- ret = nf_register_hook(&ip_nat_adjust_out_ops); ++ ret = virt_nf_register_hook(&ip_nat_adjust_out_ops); + if (ret < 0) { + printk("ip_nat_init: can't register adjust out hook.\n"); + goto cleanup_adjustin_ops; + } +- ret = nf_register_hook(&ip_nat_local_out_ops); ++ ret = virt_nf_register_hook(&ip_nat_local_out_ops); + if (ret < 0) { + printk("ip_nat_init: can't register local out hook.\n"); + goto cleanup_adjustout_ops;; + } +- ret = nf_register_hook(&ip_nat_local_in_ops); ++ ret = virt_nf_register_hook(&ip_nat_local_in_ops); + if (ret < 0) { + printk("ip_nat_init: can't register local in hook.\n"); + goto cleanup_localoutops; +@@ -404,38 +405,76 @@ static int init_or_cleanup(int init) + return ret; + + cleanup: +- nf_unregister_hook(&ip_nat_local_in_ops); ++ if (ve_is_super(get_exec_env()) && ip_conntrack_disable_ve0) ++ goto cleanup_rule_init; ++ virt_nf_unregister_hook(&ip_nat_local_in_ops); + cleanup_localoutops: +- nf_unregister_hook(&ip_nat_local_out_ops); ++ virt_nf_unregister_hook(&ip_nat_local_out_ops); + cleanup_adjustout_ops: +- nf_unregister_hook(&ip_nat_adjust_out_ops); ++ 
virt_nf_unregister_hook(&ip_nat_adjust_out_ops); + cleanup_adjustin_ops: +- nf_unregister_hook(&ip_nat_adjust_in_ops); ++ virt_nf_unregister_hook(&ip_nat_adjust_in_ops); + cleanup_outops: +- nf_unregister_hook(&ip_nat_out_ops); ++ virt_nf_unregister_hook(&ip_nat_out_ops); + cleanup_inops: +- nf_unregister_hook(&ip_nat_in_ops); ++ virt_nf_unregister_hook(&ip_nat_in_ops); + cleanup_rule_init: + ip_nat_rule_cleanup(); +- cleanup_decode_session: +-#ifdef CONFIG_XFRM +- ip_nat_decode_session = NULL; +- synchronize_net(); +-#endif ++ cleanup_modput: ++ if (!ve_is_super(get_exec_env())) ++ module_put(THIS_MODULE); + return ret; + } + +-static int __init init(void) ++int init_iptable_nat(void) + { + return init_or_cleanup(1); + } + +-static void __exit fini(void) ++void fini_iptable_nat(void) + { + init_or_cleanup(0); + } + +-module_init(init); ++static int __init init(void) ++{ ++ int err; ++ ++ need_conntrack(); ++ ++#ifdef CONFIG_XFRM ++ BUG_ON(ip_nat_decode_session != NULL); ++ ip_nat_decode_session = nat_decode_session; ++#endif ++ ++ err = init_iptable_nat(); ++ if (err < 0) { ++#ifdef CONFIG_XFRM ++ ip_nat_decode_session = NULL; ++ synchronize_net(); ++#endif ++ return err; ++ } ++ ++ KSYMRESOLVE(init_iptable_nat); ++ KSYMRESOLVE(fini_iptable_nat); ++ KSYMMODRESOLVE(iptable_nat); ++ return 0; ++} ++ ++static void __exit fini(void) ++{ ++ KSYMMODUNRESOLVE(iptable_nat); ++ KSYMUNRESOLVE(init_iptable_nat); ++ KSYMUNRESOLVE(fini_iptable_nat); ++ fini_iptable_nat(); ++#ifdef CONFIG_XFRM ++ ip_nat_decode_session = NULL; ++ synchronize_net(); ++#endif ++} ++ ++fs_initcall(init); + module_exit(fini); + + MODULE_LICENSE("GPL"); +diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_queue.c linux-2.6.16-026test015/net/ipv4/netfilter/ip_queue.c +--- linux-2.6.16.orig/net/ipv4/netfilter/ip_queue.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/netfilter/ip_queue.c 2006-07-04 14:41:39.000000000 +0400 +@@ -542,8 +542,17 @@ ipq_rcv_sk(struct sock *sk, int 
len) + down(&ipqnl_sem); + + for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) { ++#ifdef CONFIG_VE ++ struct ve_struct *env; ++#endif + skb = skb_dequeue(&sk->sk_receive_queue); ++#ifdef CONFIG_VE ++ env = set_exec_env(VE_OWNER_SKB(skb)); + ipq_rcv_skb(skb); ++ (void)set_exec_env(env); ++#else ++ ipq_rcv_skb(skb); ++#endif + kfree_skb(skb); + } + +diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ip_tables.c linux-2.6.16-026test015/net/ipv4/netfilter/ip_tables.c +--- linux-2.6.16.orig/net/ipv4/netfilter/ip_tables.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/netfilter/ip_tables.c 2006-07-04 14:41:39.000000000 +0400 +@@ -24,14 +24,17 @@ + #include <linux/module.h> + #include <linux/icmp.h> + #include <net/ip.h> ++#include <net/compat.h> + #include <asm/uaccess.h> + #include <asm/semaphore.h> + #include <linux/proc_fs.h> + #include <linux/err.h> + #include <linux/cpumask.h> ++#include <ub/ub_mem.h> + + #include <linux/netfilter/x_tables.h> + #include <linux/netfilter_ipv4/ip_tables.h> ++#include <linux/nfcalls.h> + + MODULE_LICENSE("GPL"); + MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); +@@ -70,6 +73,14 @@ do { \ + #define inline + #endif + ++#ifdef CONFIG_VE_IPTABLES ++/* include ve.h and define get_exec_env */ ++#include <linux/sched.h> ++#define ve_ipt_standard_target (get_exec_env()->_ipt_standard_target) ++#else ++#define ve_ipt_standard_target &ipt_standard_target ++#endif ++ + /* + We keep a set of rules for each CPU, so we can avoid write-locking + them in the softirq when updating the counters and therefore +@@ -480,7 +491,7 @@ standard_check(const struct ipt_entry_ta + if (t->u.target_size + != IPT_ALIGN(sizeof(struct ipt_standard_target))) { + duprintf("standard_check: target size %u != %u\n", +- t->u.target_size, ++ t->u.target_size, (unsigned int) + IPT_ALIGN(sizeof(struct ipt_standard_target))); + return 0; + } +@@ -565,7 +576,7 @@ check_entry(struct ipt_entry *e, const c + } + 
t->u.kernel.target = target; + +- if (t->u.kernel.target == &ipt_standard_target) { ++ if (t->u.kernel.target == ve_ipt_standard_target) { + if (!standard_check(t, size)) { + ret = -EINVAL; + goto cleanup_matches; +@@ -790,32 +801,45 @@ get_counters(const struct xt_table_info + } + } + +-static int +-copy_entries_to_user(unsigned int total_size, +- struct ipt_table *table, +- void __user *userptr) ++static inline struct xt_counters * alloc_counters(struct ipt_table *table) + { +- unsigned int off, num, countersize; +- struct ipt_entry *e; ++ unsigned int countersize; + struct xt_counters *counters; + struct xt_table_info *private = table->private; +- int ret = 0; +- void *loc_cpu_entry; + + /* We need atomic snapshot of counters: rest doesn't change + (other than comefrom, which userspace doesn't care + about). */ + countersize = sizeof(struct xt_counters) * private->number; +- counters = vmalloc_node(countersize, numa_node_id()); ++ counters = ub_vmalloc_node(countersize, numa_node_id()); + + if (counters == NULL) +- return -ENOMEM; ++ return ERR_PTR(-ENOMEM); + + /* First, sum counters... */ + write_lock_bh(&table->lock); + get_counters(private, counters); + write_unlock_bh(&table->lock); + ++ return counters; ++} ++ ++static int ++copy_entries_to_user(unsigned int total_size, ++ struct ipt_table *table, ++ void __user *userptr) ++{ ++ unsigned int off, num; ++ struct ipt_entry *e; ++ struct xt_counters *counters; ++ struct xt_table_info *private = table->private; ++ int ret = 0; ++ void *loc_cpu_entry; ++ ++ counters = alloc_counters(table); ++ if (IS_ERR(counters)) ++ return PTR_ERR(counters); ++ + /* choose the copy that is on our node/cpu, ... 
+ * This choice is lazy (because current thread is + * allowed to migrate to another cpu) +@@ -875,25 +899,391 @@ copy_entries_to_user(unsigned int total_ + return ret; + } + ++#ifdef CONFIG_COMPAT ++static DECLARE_MUTEX(compat_ipt_mutex); ++ ++struct compat_delta { ++ struct compat_delta *next; ++ u_int16_t offset; ++ short delta; ++}; ++ ++static struct compat_delta *compat_offsets = NULL; ++ ++static int compat_add_offset(u_int16_t offset, short delta) ++{ ++ struct compat_delta *tmp; ++ ++ tmp = kmalloc(sizeof(struct compat_delta), GFP_KERNEL); ++ if (!tmp) ++ return -ENOMEM; ++ tmp->offset = offset; ++ tmp->delta = delta; ++ if (compat_offsets) { ++ tmp->next = compat_offsets->next; ++ compat_offsets->next = tmp; ++ } else { ++ compat_offsets = tmp; ++ tmp->next = NULL; ++ } ++ return 0; ++} ++ ++static void compat_flush_offsets(void) ++{ ++ struct compat_delta *tmp, *next; ++ ++ if (compat_offsets) { ++ for(tmp = compat_offsets; tmp; tmp = next) { ++ next = tmp->next; ++ kfree(tmp); ++ } ++ compat_offsets = NULL; ++ } ++} ++ ++static short compat_calc_jump(u_int16_t offset) ++{ ++ struct compat_delta *tmp; ++ short delta; ++ ++ for(tmp = compat_offsets, delta = 0; tmp; tmp = tmp->next) ++ if (tmp->offset < offset) ++ delta += tmp->delta; ++ return delta; ++} ++ ++struct compat_ipt_standard_target ++{ ++ struct compat_ipt_entry_target target; ++ compat_int_t verdict; ++}; ++ ++#define IPT_ST_OFFSET (sizeof(struct ipt_standard_target) - \ ++ sizeof(struct compat_ipt_standard_target)) ++ ++struct compat_ipt_standard ++{ ++ struct compat_ipt_entry entry; ++ struct compat_ipt_standard_target target; ++}; ++ ++static int compat_ipt_standard_fn(void *target, ++ void **dstptr, int *size, int convert) ++{ ++ struct compat_ipt_standard_target compat_st, *pcompat_st; ++ struct ipt_standard_target st, *pst; ++ int ret; ++ ++ ret = 0; ++ switch (convert) { ++ case COMPAT_TO_USER: ++ pst = (struct ipt_standard_target *)target; ++ memcpy(&compat_st.target, &pst->target, ++ 
sizeof(struct ipt_entry_target)); ++ compat_st.verdict = pst->verdict; ++ if (compat_st.verdict > 0) ++ compat_st.verdict -= ++ compat_calc_jump(compat_st.verdict); ++ compat_st.target.u.user.target_size = ++ sizeof(struct compat_ipt_standard_target); ++ if (__copy_to_user(*dstptr, &compat_st, ++ sizeof(struct compat_ipt_standard_target))) ++ ret = -EFAULT; ++ *size -= IPT_ST_OFFSET; ++ *dstptr += sizeof(struct compat_ipt_standard_target); ++ break; ++ case COMPAT_FROM_USER: ++ pcompat_st = ++ (struct compat_ipt_standard_target *)target; ++ memcpy(&st.target, &pcompat_st->target, ++ sizeof(struct ipt_entry_target)); ++ st.verdict = pcompat_st->verdict; ++ if (st.verdict > 0) ++ st.verdict += compat_calc_jump(st.verdict); ++ st.target.u.user.target_size = ++ sizeof(struct ipt_standard_target); ++ memcpy(*dstptr, &st, ++ sizeof(struct ipt_standard_target)); ++ *size += IPT_ST_OFFSET; ++ *dstptr += sizeof(struct ipt_standard_target); ++ break; ++ case COMPAT_CALC_SIZE: ++ *size += IPT_ST_OFFSET; ++ break; ++ default: ++ ret = -ENOPROTOOPT; ++ break; ++ } ++ return ret; ++} ++ ++int ipt_target_align_compat(void *target, void **dstptr, ++ int *size, int off, int convert) ++{ ++ struct compat_ipt_entry_target *pcompat; ++ struct ipt_entry_target *pt; ++ u_int16_t tsize; ++ int ret; ++ ++ ret = 0; ++ switch (convert) { ++ case COMPAT_TO_USER: ++ pt = (struct ipt_entry_target *)target; ++ tsize = pt->u.user.target_size; ++ if (__copy_to_user(*dstptr, pt, tsize)) { ++ ret = -EFAULT; ++ break; ++ } ++ tsize -= off; ++ if (put_user(tsize, (u_int16_t *)*dstptr)) ++ ret = -EFAULT; ++ *size -= off; ++ *dstptr += tsize; ++ break; ++ case COMPAT_FROM_USER: ++ pcompat = (struct compat_ipt_entry_target *)target; ++ pt = (struct ipt_entry_target *)*dstptr; ++ tsize = pcompat->u.user.target_size; ++ memcpy(pt, pcompat, tsize); ++ tsize += off; ++ pt->u.user.target_size = tsize; ++ *size += off; ++ *dstptr += tsize; ++ break; ++ case COMPAT_CALC_SIZE: ++ *size += off; ++ break; ++ 
default: ++ ret = -ENOPROTOOPT; ++ break; ++ } ++ return ret; ++} ++ ++int ipt_match_align_compat(void *match, void **dstptr, ++ int *size, int off, int convert) ++{ ++ struct compat_ipt_entry_match *pcompat_m; ++ struct ipt_entry_match *pm; ++ u_int16_t msize; ++ int ret; ++ ++ ret = 0; ++ switch (convert) { ++ case COMPAT_TO_USER: ++ pm = (struct ipt_entry_match *)match; ++ msize = pm->u.user.match_size; ++ if (__copy_to_user(*dstptr, pm, msize)) { ++ ret = -EFAULT; ++ break; ++ } ++ msize -= off; ++ if (put_user(msize, (u_int16_t *)*dstptr)) ++ ret = -EFAULT; ++ *size -= off; ++ *dstptr += msize; ++ break; ++ case COMPAT_FROM_USER: ++ pcompat_m = (struct compat_ipt_entry_match *)match; ++ pm = (struct ipt_entry_match *)*dstptr; ++ msize = pcompat_m->u.user.match_size; ++ memcpy(pm, pcompat_m, msize); ++ msize += off; ++ pm->u.user.match_size = msize; ++ *size += off; ++ *dstptr += msize; ++ break; ++ case COMPAT_CALC_SIZE: ++ *size += off; ++ break; ++ default: ++ ret = -ENOPROTOOPT; ++ break; ++ } ++ return ret; ++} ++ ++static int icmp_compat(void *match, ++ void **dstptr, int *size, int convert) ++{ ++ int off; ++ ++ off = IPT_ALIGN(sizeof(struct ipt_icmp)) - ++ COMPAT_IPT_ALIGN(sizeof(struct ipt_icmp)); ++ return ipt_match_align_compat(match, dstptr, size, off, convert); ++} ++ ++static inline int ++compat_calc_match(struct ipt_entry_match *m, int * size) ++{ ++ if (m->u.kernel.match->compat) ++ m->u.kernel.match->compat(m, NULL, size, COMPAT_CALC_SIZE); ++ return 0; ++} ++ ++static int compat_calc_entry(struct ipt_entry *e, struct xt_table_info *info, ++ void *base, struct xt_table_info *newinfo) ++{ ++ struct ipt_entry_target *t; ++ u_int16_t entry_offset; ++ int off, i, ret; ++ ++ off = 0; ++ entry_offset = (void *)e - base; ++ IPT_MATCH_ITERATE(e, compat_calc_match, &off); ++ t = ipt_get_target(e); ++ if (t->u.kernel.target->compat) ++ t->u.kernel.target->compat(t, NULL, &off, COMPAT_CALC_SIZE); ++ newinfo->size -= off; ++ ret = 
compat_add_offset(entry_offset, off); ++ if (ret) ++ return ret; ++ ++ for (i = 0; i< NF_IP_NUMHOOKS; i++) { ++ if (info->hook_entry[i] && (e < (struct ipt_entry *) ++ (base + info->hook_entry[i]))) ++ newinfo->hook_entry[i] -= off; ++ if (info->underflow[i] && (e < (struct ipt_entry *) ++ (base + info->underflow[i]))) ++ newinfo->underflow[i] -= off; ++ } ++ return 0; ++} ++ ++static int compat_table_info(struct xt_table_info *info, ++ struct xt_table_info *newinfo) ++{ ++ void *loc_cpu_entry; ++ int i; ++ ++ if (!newinfo || !info) ++ return -EINVAL; ++ ++ memset(newinfo, 0, sizeof(struct xt_table_info)); ++ newinfo->size = info->size; ++ for (i = 0; i < NF_IP_NUMHOOKS; i++) { ++ newinfo->hook_entry[i] = info->hook_entry[i]; ++ newinfo->underflow[i] = info->underflow[i]; ++ } ++ loc_cpu_entry = info->entries[raw_smp_processor_id()]; ++ return IPT_ENTRY_ITERATE(loc_cpu_entry, info->size, ++ compat_calc_entry, info, loc_cpu_entry, newinfo); ++} ++#endif ++ ++static int get_info(void __user *user, int *len) ++{ ++ char name[IPT_TABLE_MAXNAMELEN]; ++ struct ipt_table *t; ++ int ret, size; ++ ++#ifdef CONFIG_COMPAT ++ if (is_current_32bits()) ++ size = sizeof(struct compat_ipt_getinfo); ++ else ++#endif ++ size = sizeof(struct ipt_getinfo); ++ ++ if (*len != size) { ++ duprintf("length %u != %u\n", *len, ++ (unsigned int)sizeof(struct ipt_getinfo)); ++ return -EINVAL; ++ } ++ ++ if (copy_from_user(name, user, sizeof(name)) != 0) ++ return -EFAULT; ++ ++ name[IPT_TABLE_MAXNAMELEN-1] = '\0'; ++#ifdef CONFIG_COMPAT ++ down(&compat_ipt_mutex); ++#endif ++ t = try_then_request_module(xt_find_table_lock(AF_INET, name), ++ "iptable_%s", name); ++ if (t && !IS_ERR(t)) { ++ struct ipt_getinfo info; ++ struct xt_table_info *private = t->private; ++#ifdef CONFIG_COMPAT ++ struct compat_ipt_getinfo compat_info; ++#endif ++ void *pinfo; ++ ++#ifdef CONFIG_COMPAT ++ if (is_current_32bits()) { ++ struct xt_table_info tmp; ++ ret = compat_table_info(private, &tmp); ++ 
compat_flush_offsets(); ++ memcpy(compat_info.hook_entry, tmp.hook_entry, ++ sizeof(compat_info.hook_entry)); ++ memcpy(compat_info.underflow, tmp.underflow, ++ sizeof(compat_info.underflow)); ++ compat_info.valid_hooks = t->valid_hooks; ++ compat_info.num_entries = private->number; ++ compat_info.size = tmp.size; ++ strcpy(compat_info.name, name); ++ pinfo = (void *)&compat_info; ++ } else ++#endif ++ { ++ info.valid_hooks = t->valid_hooks; ++ memcpy(info.hook_entry, private->hook_entry, ++ sizeof(info.hook_entry)); ++ memcpy(info.underflow, private->underflow, ++ sizeof(info.underflow)); ++ info.num_entries = private->number; ++ info.size = private->size; ++ strcpy(info.name, name); ++ pinfo = (void *)&info; ++ } ++ ++ if (copy_to_user(user, pinfo, *len) != 0) ++ ret = -EFAULT; ++ else ++ ret = 0; ++ ++ xt_table_unlock(t); ++ module_put(t->me); ++ } else ++ ret = t ? PTR_ERR(t) : -ENOENT; ++#ifdef CONFIG_COMPAT ++ up(&compat_ipt_mutex); ++#endif ++ return ret; ++} ++ + static int +-get_entries(const struct ipt_get_entries *entries, +- struct ipt_get_entries __user *uptr) ++get_entries(struct ipt_get_entries __user *uptr, int *len) + { + int ret; ++ struct ipt_get_entries get; + struct ipt_table *t; + +- t = xt_find_table_lock(AF_INET, entries->name); ++ if (*len < sizeof(get)) { ++ duprintf("get_entries: %u < %d\n", *len, ++ (unsigned int)sizeof(get)); ++ return -EINVAL; ++ } ++ if (copy_from_user(&get, uptr, sizeof(get)) != 0) ++ return -EFAULT; ++ if (*len != sizeof(struct ipt_get_entries) + get.size) { ++ duprintf("get_entries: %u != %u\n", *len, ++ (unsigned int)(sizeof(struct ipt_get_entries) + ++ get.size)); ++ return -EINVAL; ++ } ++ ++ t = xt_find_table_lock(AF_INET, get.name); + if (t && !IS_ERR(t)) { + struct xt_table_info *private = t->private; + duprintf("t->private->number = %u\n", + private->number); +- if (entries->size == private->size) ++ if (get.size == private->size) + ret = copy_entries_to_user(private->size, + t, uptr->entrytable); + else { + 
duprintf("get_entries: I've got %u not %u!\n", + private->size, +- entries->size); ++ get.size); + ret = -EINVAL; + } + module_put(t->me); +@@ -905,71 +1295,39 @@ get_entries(const struct ipt_get_entries + } + + static int +-do_replace(void __user *user, unsigned int len) ++__do_replace(const char *name, unsigned int valid_hooks, ++ struct xt_table_info *newinfo, unsigned int num_counters, ++ void __user *counters_ptr) + { + int ret; +- struct ipt_replace tmp; + struct ipt_table *t; +- struct xt_table_info *newinfo, *oldinfo; ++ struct xt_table_info *oldinfo; + struct xt_counters *counters; +- void *loc_cpu_entry, *loc_cpu_old_entry; +- +- if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) +- return -EFAULT; +- +- /* Hack: Causes ipchains to give correct error msg --RR */ +- if (len != sizeof(tmp) + tmp.size) +- return -ENOPROTOOPT; +- +- /* overflow check */ +- if (tmp.size >= (INT_MAX - sizeof(struct xt_table_info)) / NR_CPUS - +- SMP_CACHE_BYTES) +- return -ENOMEM; +- if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) +- return -ENOMEM; +- +- newinfo = xt_alloc_table_info(tmp.size); +- if (!newinfo) +- return -ENOMEM; +- +- /* choose the copy that is our node/cpu */ +- loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; +- if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), +- tmp.size) != 0) { +- ret = -EFAULT; +- goto free_newinfo; +- } ++ void *loc_cpu_old_entry; + +- counters = vmalloc(tmp.num_counters * sizeof(struct xt_counters)); ++ ret = 0; ++ counters = ub_vmalloc_best(num_counters * sizeof(struct xt_counters)); + if (!counters) { + ret = -ENOMEM; +- goto free_newinfo; ++ goto out; + } + +- ret = translate_table(tmp.name, tmp.valid_hooks, +- newinfo, loc_cpu_entry, tmp.size, tmp.num_entries, +- tmp.hook_entry, tmp.underflow); +- if (ret != 0) +- goto free_newinfo_counters; +- +- duprintf("ip_tables: Translated table\n"); +- +- t = try_then_request_module(xt_find_table_lock(AF_INET, tmp.name), +- "iptable_%s", tmp.name); ++ t = 
try_then_request_module(xt_find_table_lock(AF_INET, name), ++ "iptable_%s", name); + if (!t || IS_ERR(t)) { + ret = t ? PTR_ERR(t) : -ENOENT; + goto free_newinfo_counters_untrans; + } + + /* You lied! */ +- if (tmp.valid_hooks != t->valid_hooks) { ++ if (valid_hooks != t->valid_hooks) { + duprintf("Valid hook crap: %08X vs %08X\n", +- tmp.valid_hooks, t->valid_hooks); ++ valid_hooks, t->valid_hooks); + ret = -EINVAL; + goto put_module; + } + +- oldinfo = xt_replace_table(t, tmp.num_counters, newinfo, &ret); ++ oldinfo = xt_replace_table(t, num_counters, newinfo, &ret); + if (!oldinfo) + goto put_module; + +@@ -989,8 +1347,8 @@ do_replace(void __user *user, unsigned i + loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; + IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL); + xt_free_table_info(oldinfo); +- if (copy_to_user(tmp.counters, counters, +- sizeof(struct xt_counters) * tmp.num_counters) != 0) ++ if (copy_to_user(counters_ptr, counters, ++ sizeof(struct xt_counters) * num_counters) != 0) + ret = -EFAULT; + vfree(counters); + xt_table_unlock(t); +@@ -1000,9 +1358,62 @@ do_replace(void __user *user, unsigned i + module_put(t->me); + xt_table_unlock(t); + free_newinfo_counters_untrans: +- IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry,NULL); +- free_newinfo_counters: + vfree(counters); ++ out: ++ return ret; ++} ++ ++static int ++do_replace(void __user *user, unsigned int len) ++{ ++ int ret; ++ struct ipt_replace tmp; ++ struct xt_table_info *newinfo; ++ void *loc_cpu_entry; ++ ++ if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) ++ return -EFAULT; ++ ++ /* Hack: Causes ipchains to give correct error msg --RR */ ++ if (len != sizeof(tmp) + tmp.size) ++ return -ENOPROTOOPT; ++ ++ /* overflow check */ ++ if (tmp.size >= (INT_MAX - sizeof(struct xt_table_info)) / NR_CPUS - ++ SMP_CACHE_BYTES) ++ return -ENOMEM; ++ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) ++ return -ENOMEM; ++ ++ newinfo = 
xt_alloc_table_info(tmp.size); ++ if (!newinfo) ++ return -ENOMEM; ++ ++ /* choose the copy that is our node/cpu */ ++ loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; ++ if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), ++ tmp.size) != 0) { ++ ret = -EFAULT; ++ goto free_newinfo; ++ } ++ ++ ret = translate_table(tmp.name, tmp.valid_hooks, ++ newinfo, loc_cpu_entry, tmp.size, tmp.num_entries, ++ tmp.hook_entry, tmp.underflow); ++ if (ret != 0) ++ goto free_newinfo; ++ ++ duprintf("ip_tables: Translated table\n"); ++ ++ ret = __do_replace(tmp.name, tmp.valid_hooks, ++ newinfo, tmp.num_counters, ++ tmp.counters); ++ if (ret) ++ goto free_newinfo_untrans; ++ return 0; ++ ++ free_newinfo_untrans: ++ IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry,NULL); + free_newinfo: + xt_free_table_info(newinfo); + return ret; +@@ -1034,28 +1445,56 @@ static int + do_add_counters(void __user *user, unsigned int len) + { + unsigned int i; +- struct xt_counters_info tmp, *paddc; ++ struct xt_counters_info tmp; ++ struct xt_counters *paddc; ++ unsigned int num_counters; ++ char *name; ++ int size; ++ void *ptmp; + struct ipt_table *t; + struct xt_table_info *private; + int ret = 0; + void *loc_cpu_entry; ++#ifdef CONFIG_COMPAT ++ struct compat_xt_counters_info compat_tmp; + +- if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) ++ if (is_current_32bits()) { ++ ptmp = &compat_tmp; ++ size = sizeof(struct compat_xt_counters_info); ++ } else ++#endif ++ { ++ ptmp = &tmp; ++ size = sizeof(struct xt_counters_info); ++ } ++ ++ if (copy_from_user(ptmp, user, size) != 0) + return -EFAULT; + +- if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct xt_counters)) ++#ifdef CONFIG_COMPAT ++ if (is_current_32bits()) { ++ num_counters = compat_tmp.num_counters; ++ name = compat_tmp.name; ++ } else ++#endif ++ { ++ num_counters = tmp.num_counters; ++ name = tmp.name; ++ } ++ ++ if (len != size + num_counters * sizeof(struct xt_counters)) + return -EINVAL; + +- paddc = 
vmalloc_node(len, numa_node_id()); ++ paddc = ub_vmalloc_node(len - size, numa_node_id()); + if (!paddc) + return -ENOMEM; + +- if (copy_from_user(paddc, user, len) != 0) { ++ if (copy_from_user(paddc, user + size, len - size) != 0) { + ret = -EFAULT; + goto free; + } + +- t = xt_find_table_lock(AF_INET, tmp.name); ++ t = xt_find_table_lock(AF_INET, name); + if (!t || IS_ERR(t)) { + ret = t ? PTR_ERR(t) : -ENOENT; + goto free; +@@ -1063,7 +1502,7 @@ do_add_counters(void __user *user, unsig + + write_lock_bh(&t->lock); + private = t->private; +- if (private->number != paddc->num_counters) { ++ if (private->number != num_counters) { + ret = -EINVAL; + goto unlock_up_free; + } +@@ -1074,7 +1513,7 @@ do_add_counters(void __user *user, unsig + IPT_ENTRY_ITERATE(loc_cpu_entry, + private->size, + add_counter_to_entry, +- paddc->counters, ++ paddc, + &i); + unlock_up_free: + write_unlock_bh(&t->lock); +@@ -1086,14 +1525,590 @@ do_add_counters(void __user *user, unsig + return ret; + } + ++#ifdef CONFIG_COMPAT ++struct compat_ipt_replace { ++ char name[IPT_TABLE_MAXNAMELEN]; ++ u32 valid_hooks; ++ u32 num_entries; ++ u32 size; ++ u32 hook_entry[NF_IP_NUMHOOKS]; ++ u32 underflow[NF_IP_NUMHOOKS]; ++ u32 num_counters; ++ compat_uptr_t counters; /* struct ipt_counters * */ ++ struct compat_ipt_entry entries[0]; ++}; ++ ++static inline int compat_copy_match_to_user(struct ipt_entry_match *m, ++ void __user **dstptr, compat_uint_t *size) ++{ ++ if (m->u.kernel.match->compat) ++ m->u.kernel.match->compat(m, dstptr, size, COMPAT_TO_USER); ++ else { ++ if (__copy_to_user(*dstptr, m, m->u.match_size)) ++ return -EFAULT; ++ *dstptr += m->u.match_size; ++ } ++ return 0; ++} ++ ++static int compat_copy_entry_to_user(struct ipt_entry *e, ++ void __user **dstptr, compat_uint_t *size) ++{ ++ struct ipt_entry_target __user *t; ++ struct compat_ipt_entry __user *ce; ++ u_int16_t target_offset, next_offset; ++ compat_uint_t origsize; ++ int ret; ++ ++ ret = -EFAULT; ++ origsize = *size; ++ ce 
= (struct compat_ipt_entry __user *)*dstptr; ++ if (__copy_to_user(ce, e, sizeof(struct ipt_entry))) ++ goto out; ++ ++ *dstptr += sizeof(struct compat_ipt_entry); ++ ret = IPT_MATCH_ITERATE(e, compat_copy_match_to_user, dstptr, size); ++ target_offset = e->target_offset - (origsize - *size); ++ if (ret) ++ goto out; ++ t = ipt_get_target(e); ++ if (t->u.kernel.target->compat) { ++ ret = t->u.kernel.target->compat(t, ++ dstptr, size, COMPAT_TO_USER); ++ if (ret) ++ goto out; ++ } else { ++ ret = -EFAULT; ++ if (__copy_to_user(*dstptr, t, t->u.target_size)) ++ goto out; ++ *dstptr += t->u.target_size; ++ } ++ ret = -EFAULT; ++ next_offset = e->next_offset - (origsize - *size); ++ if (__put_user(target_offset, &ce->target_offset)) ++ goto out; ++ if (__put_user(next_offset, &ce->next_offset)) ++ goto out; ++ return 0; ++out: ++ return ret; ++} ++ ++static inline int ++compat_check_calc_match(struct ipt_entry_match *m, ++ const char *name, ++ const struct ipt_ip *ip, ++ unsigned int hookmask, ++ int *size, int *i) ++{ ++ struct ipt_match *match; ++ ++ match = try_then_request_module(xt_find_match(AF_INET, m->u.user.name, ++ m->u.user.revision), ++ "ipt_%s", m->u.user.name); ++ if (IS_ERR(match) || !match) { ++ duprintf("compat_check_calc_match: `%s' not found\n", ++ m->u.user.name); ++ return match ? 
PTR_ERR(match) : -ENOENT; ++ } ++ m->u.kernel.match = match; ++ ++ if (m->u.kernel.match->compat) ++ m->u.kernel.match->compat(m, NULL, size, COMPAT_CALC_SIZE); ++ ++ (*i)++; ++ return 0; ++} ++ ++static inline int ++check_compat_entry_size_and_hooks(struct ipt_entry *e, ++ struct xt_table_info *newinfo, ++ unsigned int *size, ++ unsigned char *base, ++ unsigned char *limit, ++ unsigned int *hook_entries, ++ unsigned int *underflows, ++ unsigned int *i, ++ const char *name) ++{ ++ struct ipt_entry_target *t; ++ struct ipt_target *target; ++ u_int16_t entry_offset; ++ int ret, off, h, j; ++ ++ duprintf("check_compat_entry_size_and_hooks %p\n", e); ++ if ((unsigned long)e % __alignof__(struct compat_ipt_entry) != 0 ++ || (unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit) { ++ duprintf("Bad offset %p, limit = %p\n", e, limit); ++ return -EINVAL; ++ } ++ ++ if (e->next_offset < sizeof(struct compat_ipt_entry) + ++ sizeof(struct compat_ipt_entry_target)) { ++ duprintf("checking: element %p size %u\n", ++ e, e->next_offset); ++ return -EINVAL; ++ } ++ ++ if (!ip_checkentry(&e->ip)) { ++ duprintf("ip_tables: ip check failed %p %s.\n", e, name); ++ return -EINVAL; ++ } ++ ++ off = 0; ++ entry_offset = (void *)e - (void *)base; ++ j = 0; ++ ret = IPT_MATCH_ITERATE(e, compat_check_calc_match, name, &e->ip, ++ e->comefrom, &off, &j); ++ if (ret != 0) ++ goto out; ++ ++ t = ipt_get_target(e); ++ target = try_then_request_module(xt_find_target(AF_INET, ++ t->u.user.name, ++ t->u.user.revision), ++ "ipt_%s", t->u.user.name); ++ if (IS_ERR(target) || !target) { ++ duprintf("check_entry: `%s' not found\n", t->u.user.name); ++ ret = target ? 
PTR_ERR(target) : -ENOENT; ++ goto out; ++ } ++ t->u.kernel.target = target; ++ ++ if (t->u.kernel.target->compat) ++ t->u.kernel.target->compat(t, NULL, &off, COMPAT_CALC_SIZE); ++ *size += off; ++ ret = compat_add_offset(entry_offset, off); ++ if (ret) ++ goto out; ++ ++ /* Check hooks & underflows */ ++ for (h = 0; h < NF_IP_NUMHOOKS; h++) { ++ if ((unsigned char *)e - base == hook_entries[h]) ++ newinfo->hook_entry[h] = hook_entries[h]; ++ if ((unsigned char *)e - base == underflows[h]) ++ newinfo->underflow[h] = underflows[h]; ++ } ++ ++ /* Clear counters and comefrom */ ++ e->counters = ((struct ipt_counters) { 0, 0 }); ++ e->comefrom = 0; ++ ++ (*i)++; ++ return 0; ++out: ++ IPT_MATCH_ITERATE(e, cleanup_match, &j); ++ return ret; ++} ++ ++static inline int compat_copy_match_from_user(struct ipt_entry_match *m, ++ void **dstptr, compat_uint_t *size, const char *name, ++ const struct ipt_ip *ip, unsigned int hookmask) ++{ ++ struct ipt_entry_match *dm; ++ ++ dm = (struct ipt_entry_match *)*dstptr; ++ if (m->u.kernel.match->compat) ++ m->u.kernel.match->compat(m, dstptr, size, COMPAT_FROM_USER); ++ else { ++ memcpy(*dstptr, m, m->u.match_size); ++ *dstptr += m->u.match_size; ++ } ++ ++ if (dm->u.kernel.match->checkentry ++ && !dm->u.kernel.match->checkentry(name, ip, dm->data, ++ dm->u.match_size - sizeof(*dm), ++ hookmask)) { ++ module_put(dm->u.kernel.match->me); ++ duprintf("ip_tables: check failed for `%s'.\n", ++ dm->u.kernel.match->name); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static int compat_copy_entry_from_user(struct ipt_entry *e, void **dstptr, ++ unsigned int *size, const char *name, ++ struct xt_table_info *newinfo, unsigned char *base) ++{ ++ struct ipt_entry_target *t; ++ struct ipt_entry *de; ++ unsigned int origsize; ++ int ret, h; ++ ++ ret = 0; ++ origsize = *size; ++ de = (struct ipt_entry *)*dstptr; ++ memcpy(de, e, sizeof(struct ipt_entry)); ++ ++ *dstptr += sizeof(struct compat_ipt_entry); ++ ret = IPT_MATCH_ITERATE(e, 
compat_copy_match_from_user, dstptr, size, ++ name, &de->ip, de->comefrom); ++ if (ret) ++ goto out; ++ de->target_offset = e->target_offset - (origsize - *size); ++ t = ipt_get_target(e); ++ if (t->u.kernel.target->compat) ++ t->u.kernel.target->compat(t, ++ dstptr, size, COMPAT_FROM_USER); ++ else { ++ memcpy(*dstptr, t, t->u.target_size); ++ *dstptr += t->u.target_size; ++ } ++ ++ de->next_offset = e->next_offset - (origsize - *size); ++ for (h = 0; h < NF_IP_NUMHOOKS; h++) { ++ if ((unsigned char *)de - base < newinfo->hook_entry[h]) ++ newinfo->hook_entry[h] -= origsize - *size; ++ if ((unsigned char *)de - base < newinfo->underflow[h]) ++ newinfo->underflow[h] -= origsize - *size; ++ } ++ ++ ret = -EINVAL; ++ t = ipt_get_target(de); ++ if (t->u.kernel.target == &ipt_standard_target) { ++ if (!standard_check(t, *size)) ++ goto out; ++ } else if (t->u.kernel.target->checkentry ++ && !t->u.kernel.target->checkentry(name, de, t->data, ++ t->u.target_size ++ - sizeof(*t), ++ de->comefrom)) { ++ module_put(t->u.kernel.target->me); ++ duprintf("ip_tables: compat: check failed for `%s'.\n", ++ t->u.kernel.target->name); ++ goto out; ++ } ++ ret = 0; ++out: ++ return ret; ++} ++ ++static int ++translate_compat_table(const char *name, ++ unsigned int valid_hooks, ++ struct xt_table_info **pinfo, ++ void **pentry0, ++ unsigned int total_size, ++ unsigned int number, ++ unsigned int *hook_entries, ++ unsigned int *underflows) ++{ ++ unsigned int i; ++ struct xt_table_info *newinfo, *info; ++ void *pos, *entry0, *entry1; ++ unsigned int size; ++ int ret; ++ ++ info = *pinfo; ++ entry0 = *pentry0; ++ size = total_size; ++ info->number = number; ++ ++ /* Init all hooks to impossible value. */ ++ for (i = 0; i < NF_IP_NUMHOOKS; i++) { ++ info->hook_entry[i] = 0xFFFFFFFF; ++ info->underflow[i] = 0xFFFFFFFF; ++ } ++ ++ duprintf("translate_compat_table: size %u\n", info->size); ++ i = 0; ++ down(&compat_ipt_mutex); ++ /* Walk through entries, checking offsets. 
*/ ++ ret = IPT_ENTRY_ITERATE(entry0, total_size, ++ check_compat_entry_size_and_hooks, ++ info, &size, entry0, ++ entry0 + total_size, ++ hook_entries, underflows, &i, name); ++ if (ret != 0) ++ goto out_unlock; ++ ++ ret = -EINVAL; ++ if (i != number) { ++ duprintf("translate_compat_table: %u not %u entries\n", ++ i, number); ++ goto out_unlock; ++ } ++ ++ /* Check hooks all assigned */ ++ for (i = 0; i < NF_IP_NUMHOOKS; i++) { ++ /* Only hooks which are valid */ ++ if (!(valid_hooks & (1 << i))) ++ continue; ++ if (info->hook_entry[i] == 0xFFFFFFFF) { ++ duprintf("Invalid hook entry %u %u\n", ++ i, hook_entries[i]); ++ goto out_unlock; ++ } ++ if (info->underflow[i] == 0xFFFFFFFF) { ++ duprintf("Invalid underflow %u %u\n", ++ i, underflows[i]); ++ goto out_unlock; ++ } ++ } ++ ++ ret = -ENOMEM; ++ newinfo = xt_alloc_table_info(size); ++ if (!newinfo) ++ goto out_unlock; ++ ++ newinfo->number = number; ++ for (i = 0; i < NF_IP_NUMHOOKS; i++) { ++ newinfo->hook_entry[i] = info->hook_entry[i]; ++ newinfo->underflow[i] = info->underflow[i]; ++ } ++ entry1 = newinfo->entries[raw_smp_processor_id()]; ++ pos = entry1; ++ size = total_size; ++ ret = IPT_ENTRY_ITERATE(entry0, total_size, ++ compat_copy_entry_from_user, &pos, &size, ++ name, newinfo, entry1); ++ compat_flush_offsets(); ++ up(&compat_ipt_mutex); ++ if (ret) ++ goto free_newinfo; ++ ++ ret = -ELOOP; ++ if (!mark_source_chains(newinfo, valid_hooks, entry1)) ++ goto free_newinfo; ++ ++ /* And one copy for every other CPU */ ++ for_each_cpu(i) ++ if (newinfo->entries[i] && newinfo->entries[i] != entry1) ++ memcpy(newinfo->entries[i], entry1, newinfo->size); ++ ++ *pinfo = newinfo; ++ *pentry0 = entry1; ++ xt_free_table_info(info); ++ return 0; ++ ++free_newinfo: ++ xt_free_table_info(newinfo); ++out: ++ return ret; ++out_unlock: ++ up(&compat_ipt_mutex); ++ goto out; ++} ++ ++static int ++compat_do_replace(void __user *user, unsigned int len) ++{ ++ int ret; ++ struct compat_ipt_replace tmp; ++ struct 
xt_table_info *newinfo; ++ void *loc_cpu_entry; ++ ++ if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) ++ return -EFAULT; ++ ++ /* Hack: Causes ipchains to give correct error msg --RR */ ++ if (len != sizeof(tmp) + tmp.size) ++ return -ENOPROTOOPT; ++ ++ /* overflow check */ ++ if (tmp.size >= (INT_MAX - sizeof(struct xt_table_info)) / NR_CPUS - ++ SMP_CACHE_BYTES) ++ return -ENOMEM; ++ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) ++ return -ENOMEM; ++ ++ newinfo = xt_alloc_table_info(tmp.size); ++ if (!newinfo) ++ return -ENOMEM; ++ ++ /* choose the copy that is our node/cpu */ ++ loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; ++ if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), ++ tmp.size) != 0) { ++ ret = -EFAULT; ++ goto free_newinfo; ++ } ++ ++ ret = translate_compat_table(tmp.name, tmp.valid_hooks, ++ &newinfo, &loc_cpu_entry, tmp.size, ++ tmp.num_entries, tmp.hook_entry, tmp.underflow); ++ if (ret != 0) ++ goto free_newinfo; ++ ++ duprintf("compat_do_replace: Translated table\n"); ++ ++ ret = __do_replace(tmp.name, tmp.valid_hooks, ++ newinfo, tmp.num_counters, ++ compat_ptr(tmp.counters)); ++ if (ret) ++ goto free_newinfo_untrans; ++ return 0; ++ ++ free_newinfo_untrans: ++ IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry,NULL); ++ free_newinfo: ++ xt_free_table_info(newinfo); ++ return ret; ++} ++ ++struct compat_ipt_get_entries ++{ ++ char name[IPT_TABLE_MAXNAMELEN]; ++ compat_uint_t size; ++ struct compat_ipt_entry entrytable[0]; ++}; ++ ++static int compat_copy_entries_to_user(unsigned int total_size, ++ struct ipt_table *table, void __user *userptr) ++{ ++ unsigned int off, num; ++ struct compat_ipt_entry e; ++ struct xt_counters *counters; ++ struct xt_table_info *private = table->private; ++ void __user *pos; ++ unsigned int size; ++ int ret = 0; ++ void *loc_cpu_entry; ++ ++ counters = alloc_counters(table); ++ if (IS_ERR(counters)) ++ return PTR_ERR(counters); ++ ++ /* choose the copy that is on our 
node/cpu, ... ++ * This choice is lazy (because current thread is ++ * allowed to migrate to another cpu) ++ */ ++ loc_cpu_entry = private->entries[raw_smp_processor_id()]; ++ pos = userptr; ++ size = total_size; ++ ret = IPT_ENTRY_ITERATE(loc_cpu_entry, total_size, ++ compat_copy_entry_to_user, &pos, &size); ++ if (ret) ++ goto free_counters; ++ ++ /* ... then go back and fix counters and names */ ++ for (off = 0, num = 0; off < size; off += e.next_offset, num++) { ++ unsigned int i; ++ struct ipt_entry_match m; ++ struct ipt_entry_target t; ++ ++ ret = -EFAULT; ++ if (copy_from_user(&e, userptr + off, ++ sizeof(struct compat_ipt_entry))) ++ goto free_counters; ++ if (copy_to_user(userptr + off + ++ offsetof(struct compat_ipt_entry, counters), ++ &counters[num], sizeof(counters[num]))) ++ goto free_counters; ++ ++ for (i = sizeof(struct compat_ipt_entry); ++ i < e.target_offset; i += m.u.match_size) { ++ if (copy_from_user(&m, userptr + off + i, ++ sizeof(struct ipt_entry_match))) ++ goto free_counters; ++ if (copy_to_user(userptr + off + i + ++ offsetof(struct ipt_entry_match, u.user.name), ++ m.u.kernel.match->name, ++ strlen(m.u.kernel.match->name) + 1)) ++ goto free_counters; ++ } ++ ++ if (copy_from_user(&t, userptr + off + e.target_offset, ++ sizeof(struct ipt_entry_target))) ++ goto free_counters; ++ if (copy_to_user(userptr + off + e.target_offset + ++ offsetof(struct ipt_entry_target, u.user.name), ++ t.u.kernel.target->name, ++ strlen(t.u.kernel.target->name) + 1)) ++ goto free_counters; ++ } ++ ret = 0; ++free_counters: ++ vfree(counters); ++ return ret; ++} ++ ++static int ++compat_get_entries(struct compat_ipt_get_entries __user *uptr, int *len) ++{ ++ int ret; ++ struct compat_ipt_get_entries get; ++ struct ipt_table *t; ++ ++ ++ if (*len < sizeof(get)) { ++ duprintf("compat_get_entries: %u < %u\n", ++ *len, (unsigned int)sizeof(get)); ++ return -EINVAL; ++ } ++ ++ if (copy_from_user(&get, uptr, sizeof(get)) != 0) ++ return -EFAULT; ++ ++ if (*len != 
sizeof(struct compat_ipt_get_entries) + get.size) { ++ duprintf("compat_get_entries: %u != %u\n", *len, ++ (unsigned int)(sizeof(struct compat_ipt_get_entries) + ++ get.size)); ++ return -EINVAL; ++ } ++ ++ down(&compat_ipt_mutex); ++ t = xt_find_table_lock(AF_INET, get.name); ++ if (t && !IS_ERR(t)) { ++ struct xt_table_info *private = t->private; ++ struct xt_table_info info; ++ duprintf("t->private->number = %u\n", ++ private->number); ++ ret = compat_table_info(private, &info); ++ if (!ret && get.size == info.size) { ++ ret = compat_copy_entries_to_user(private->size, ++ t, uptr->entrytable); ++ } else if (!ret) { ++ duprintf("compat_get_entries: I've got %u not %u!\n", ++ private->size, ++ get.size); ++ ret = -EINVAL; ++ } ++ compat_flush_offsets(); ++ module_put(t->me); ++ xt_table_unlock(t); ++ } else ++ ret = t ? PTR_ERR(t) : -ENOENT; ++ ++ up(&compat_ipt_mutex); ++ return ret; ++} ++ ++static int ++compat_do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) ++{ ++ int ret; ++ ++ switch (cmd) { ++ case IPT_SO_GET_INFO: ++ ret = get_info(user, len); ++ break; ++ case IPT_SO_GET_ENTRIES: ++ ret = compat_get_entries(user, len); ++ break; ++ default: ++ duprintf("compat_do_ipt_get_ctl: unknown request %i\n", cmd); ++ ret = -EINVAL; ++ } ++ return ret; ++} ++#endif ++ + static int + do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) + { + int ret; + +- if (!capable(CAP_NET_ADMIN)) ++ if (!capable(CAP_VE_NET_ADMIN)) + return -EPERM; + ++#ifdef CONFIG_COMPAT ++ if (is_current_32bits() && (cmd == IPT_SO_SET_REPLACE)) ++ return compat_do_replace(user, len); ++#endif ++ + switch (cmd) { + case IPT_SO_SET_REPLACE: + ret = do_replace(user, len); +@@ -1116,69 +2131,22 @@ do_ipt_get_ctl(struct sock *sk, int cmd, + { + int ret; + +- if (!capable(CAP_NET_ADMIN)) ++ if (!capable(CAP_VE_NET_ADMIN)) + return -EPERM; + +- switch (cmd) { +- case IPT_SO_GET_INFO: { +- char name[IPT_TABLE_MAXNAMELEN]; +- struct ipt_table *t; +- +- if 
(*len != sizeof(struct ipt_getinfo)) { +- duprintf("length %u != %u\n", *len, +- sizeof(struct ipt_getinfo)); +- ret = -EINVAL; +- break; +- } +- +- if (copy_from_user(name, user, sizeof(name)) != 0) { +- ret = -EFAULT; +- break; +- } +- name[IPT_TABLE_MAXNAMELEN-1] = '\0'; +- +- t = try_then_request_module(xt_find_table_lock(AF_INET, name), +- "iptable_%s", name); +- if (t && !IS_ERR(t)) { +- struct ipt_getinfo info; +- struct xt_table_info *private = t->private; +- +- info.valid_hooks = t->valid_hooks; +- memcpy(info.hook_entry, private->hook_entry, +- sizeof(info.hook_entry)); +- memcpy(info.underflow, private->underflow, +- sizeof(info.underflow)); +- info.num_entries = private->number; +- info.size = private->size; +- memcpy(info.name, name, sizeof(info.name)); +- +- if (copy_to_user(user, &info, *len) != 0) +- ret = -EFAULT; +- else +- ret = 0; +- xt_table_unlock(t); +- module_put(t->me); +- } else +- ret = t ? PTR_ERR(t) : -ENOENT; +- } +- break; ++#ifdef CONFIG_COMPAT ++ if (is_current_32bits()) ++ return compat_do_ipt_get_ctl(sk, cmd, user, len); ++#endif + +- case IPT_SO_GET_ENTRIES: { +- struct ipt_get_entries get; ++ switch (cmd) { ++ case IPT_SO_GET_INFO: ++ ret = get_info(user, len); ++ break; + +- if (*len < sizeof(get)) { +- duprintf("get_entries: %u < %u\n", *len, sizeof(get)); +- ret = -EINVAL; +- } else if (copy_from_user(&get, user, sizeof(get)) != 0) { +- ret = -EFAULT; +- } else if (*len != sizeof(struct ipt_get_entries) + get.size) { +- duprintf("get_entries: %u != %u\n", *len, +- sizeof(struct ipt_get_entries) + get.size); +- ret = -EINVAL; +- } else +- ret = get_entries(&get, user); ++ case IPT_SO_GET_ENTRIES: ++ ret = get_entries(user, len); + break; +- } + + case IPT_SO_GET_REVISION_MATCH: + case IPT_SO_GET_REVISION_TARGET: { +@@ -1214,7 +2182,8 @@ do_ipt_get_ctl(struct sock *sk, int cmd, + return ret; + } + +-int ipt_register_table(struct xt_table *table, const struct ipt_replace *repl) ++struct xt_table *ipt_register_table(struct 
xt_table *table, ++ const struct ipt_replace *repl) + { + int ret; + struct xt_table_info *newinfo; +@@ -1224,7 +2193,7 @@ int ipt_register_table(struct xt_table * + + newinfo = xt_alloc_table_info(repl->size); + if (!newinfo) +- return -ENOMEM; ++ return ERR_PTR(-ENOMEM); + + /* choose the copy on our node/cpu + * but dont care of preemption +@@ -1239,15 +2208,14 @@ int ipt_register_table(struct xt_table * + repl->underflow); + if (ret != 0) { + xt_free_table_info(newinfo); +- return ret; ++ return ERR_PTR(ret); + } + +- if (xt_register_table(table, &bootstrap, newinfo) != 0) { ++ table = virt_xt_register_table(table, &bootstrap, newinfo); ++ if (IS_ERR(table)) + xt_free_table_info(newinfo); +- return ret; +- } + +- return 0; ++ return table; + } + + void ipt_unregister_table(struct ipt_table *table) +@@ -1255,7 +2223,7 @@ void ipt_unregister_table(struct ipt_tab + struct xt_table_info *private; + void *loc_cpu_entry; + +- private = xt_unregister_table(table); ++ private = virt_xt_unregister_table(table); + + /* Decrease module usage counts and free resources */ + loc_cpu_entry = private->entries[raw_smp_processor_id()]; +@@ -1263,6 +2231,29 @@ void ipt_unregister_table(struct ipt_tab + xt_free_table_info(private); + } + ++void ipt_flush_table(struct xt_table *table) ++{ ++ struct xt_table *t; ++ void *loc_cpu_entry; ++ ++ if (table == NULL) ++ return; ++ ++ t = xt_find_table_lock(AF_INET, table->name); ++ if (t && !IS_ERR(t)) { ++ struct xt_table_info *private; ++ private = t->private; ++ loc_cpu_entry = private->entries[raw_smp_processor_id()]; ++ IPT_ENTRY_ITERATE(loc_cpu_entry, private->size, ++ cleanup_entry, NULL); ++ if (private->number > private->initial_entries) ++ module_put(t->me); ++ private->size = 0; ++ xt_table_unlock(t); ++ module_put(t->me); ++ } ++} ++ + /* Returns 1 if the type and code is matched by the range, 0 otherwise */ + static inline int + icmp_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code, +@@ -1327,6 +2318,9 
@@ icmp_checkentry(const char *tablename, + /* The built-in targets: standard (NULL) and error. */ + static struct ipt_target ipt_standard_target = { + .name = IPT_STANDARD_TARGET, ++#ifdef CONFIG_COMPAT ++ .compat = &compat_ipt_standard_fn, ++#endif + }; + + static struct ipt_target ipt_error_target = { +@@ -1348,43 +2342,107 @@ static struct ipt_match icmp_matchstruct + .name = "icmp", + .match = &icmp_match, + .checkentry = &icmp_checkentry, ++#ifdef CONFIG_COMPAT ++ .compat = &icmp_compat, ++#endif + }; + +-static int __init init(void) ++static int init_iptables(void) + { + int ret; + +- xt_proto_init(AF_INET); ++ if (ve_ipt_standard_target != NULL) ++ return -EEXIST; ++ ++ ret = xt_register_target(AF_INET, &ipt_standard_target); ++ if (ret) ++ goto out; ++#ifdef CONFIG_VE_IPTABLES ++ ve_ipt_standard_target = xt_find_target(AF_INET, IPT_STANDARD_TARGET, 0); ++ if (IS_ERR(ve_ipt_standard_target)) ++ goto out_standard; ++#endif ++ ret = xt_register_target(AF_INET, &ipt_error_target); ++ if (ret) ++ goto out_error; ++ ret = xt_register_match(AF_INET, &icmp_matchstruct); ++ if (ret) ++ goto out_icmp; ++ ret = xt_proto_init(AF_INET); ++ if (ret) ++ goto out_proc; ++ return 0; ++ ++out_proc: ++ xt_unregister_match(AF_INET, &icmp_matchstruct); ++out_icmp: ++ xt_unregister_target(AF_INET, &ipt_error_target); ++out_error: ++#ifdef CONFIG_VE_IPTABLES ++ ve_ipt_standard_target = NULL; ++out_standard: ++#endif ++ xt_unregister_target(AF_INET, &ipt_standard_target); ++out: ++ return ret; ++} ++ ++static void fini_iptables(void) ++{ ++ xt_proto_fini(AF_INET); ++ xt_unregister_match(AF_INET, &icmp_matchstruct); ++ xt_unregister_target(AF_INET, &ipt_error_target); ++#ifdef CONFIG_VE_IPTABLES ++ ve_ipt_standard_target = NULL; ++#endif ++ xt_unregister_target(AF_INET, &ipt_standard_target); ++} + +- /* Noone else will be downing sem now, so we won't sleep */ +- xt_register_target(AF_INET, &ipt_standard_target); +- xt_register_target(AF_INET, &ipt_error_target); +- 
xt_register_match(AF_INET, &icmp_matchstruct); ++static int __init init(void) ++{ ++ int ret; ++ ++ ret = init_iptables(); ++ if (ret) ++ goto out; + + /* Register setsockopt */ + ret = nf_register_sockopt(&ipt_sockopts); + if (ret < 0) { + duprintf("Unable to register sockopts.\n"); +- return ret; ++ goto out_sockopts; + } + ++ KSYMRESOLVE(init_iptables); ++ KSYMRESOLVE(fini_iptables); ++ KSYMRESOLVE(ipt_flush_table); ++ KSYMMODRESOLVE(ip_tables); + printk("ip_tables: (C) 2000-2006 Netfilter Core Team\n"); + return 0; ++ ++out_sockopts: ++ fini_iptables(); ++out: ++ return ret; + } + + static void __exit fini(void) + { ++ KSYMMODUNRESOLVE(ip_tables); ++ KSYMUNRESOLVE(init_iptables); ++ KSYMUNRESOLVE(fini_iptables); ++ KSYMUNRESOLVE(ipt_flush_table); + nf_unregister_sockopt(&ipt_sockopts); +- +- xt_unregister_match(AF_INET, &icmp_matchstruct); +- xt_unregister_target(AF_INET, &ipt_error_target); +- xt_unregister_target(AF_INET, &ipt_standard_target); +- +- xt_proto_fini(AF_INET); ++ fini_iptables(); + } + + EXPORT_SYMBOL(ipt_register_table); + EXPORT_SYMBOL(ipt_unregister_table); + EXPORT_SYMBOL(ipt_do_table); +-module_init(init); ++#ifdef CONFIG_COMPAT ++EXPORT_SYMBOL(ipt_match_align_compat); ++EXPORT_SYMBOL(ipt_target_align_compat); ++#endif ++EXPORT_SYMBOL(ipt_flush_table); ++subsys_initcall(init); + module_exit(fini); +diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ipt_LOG.c linux-2.6.16-026test015/net/ipv4/netfilter/ipt_LOG.c +--- linux-2.6.16.orig/net/ipv4/netfilter/ipt_LOG.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/netfilter/ipt_LOG.c 2006-07-04 14:41:39.000000000 +0400 +@@ -18,6 +18,7 @@ + #include <net/udp.h> + #include <net/tcp.h> + #include <net/route.h> ++#include <linux/nfcalls.h> + + #include <linux/netfilter.h> + #include <linux/netfilter_ipv4/ip_tables.h> +@@ -463,10 +464,25 @@ static int ipt_log_checkentry(const char + return 1; + } + ++#ifdef CONFIG_COMPAT ++static int ipt_log_compat(void *target, ++ void **dstptr, 
int *size, int convert) ++{ ++ int off; ++ ++ off = IPT_ALIGN(sizeof(struct ipt_log_info)) - ++ COMPAT_IPT_ALIGN(sizeof(struct ipt_log_info)); ++ return ipt_target_align_compat(target, dstptr, size, off, convert); ++} ++#endif ++ + static struct ipt_target ipt_log_reg = { + .name = "LOG", + .target = ipt_log_target, + .checkentry = ipt_log_checkentry, ++#ifdef CONFIG_COMPAT ++ .compat = ipt_log_compat, ++#endif + .me = THIS_MODULE, + }; + +@@ -476,24 +492,44 @@ static struct nf_logger ipt_log_logger = + .me = THIS_MODULE, + }; + ++int init_iptable_LOG(void) ++{ ++ return ipt_register_target(&ipt_log_reg); ++} ++ ++void fini_iptable_LOG(void) ++{ ++ ipt_unregister_target(&ipt_log_reg); ++} ++ + static int __init init(void) + { +- if (ipt_register_target(&ipt_log_reg)) +- return -EINVAL; ++ int err; ++ ++ err = init_iptable_LOG(); ++ if (err < 0) ++ return err; + if (nf_log_register(PF_INET, &ipt_log_logger) < 0) { +- printk(KERN_WARNING "ipt_LOG: not logging via system console " ++ ve_printk(VE_LOG, KERN_WARNING "ipt_LOG: not logging via system console " + "since somebody else already registered for PF_INET\n"); + /* we cannot make module load fail here, since otherwise + * iptables userspace would abort */ + } + ++ ++ KSYMRESOLVE(init_iptable_LOG); ++ KSYMRESOLVE(fini_iptable_LOG); ++ KSYMMODRESOLVE(ipt_LOG); + return 0; + } + + static void __exit fini(void) + { ++ KSYMMODUNRESOLVE(ipt_LOG); ++ KSYMUNRESOLVE(init_iptable_LOG); ++ KSYMUNRESOLVE(fini_iptable_LOG); + nf_log_unregister_logger(&ipt_log_logger); +- ipt_unregister_target(&ipt_log_reg); ++ fini_iptable_LOG(); + } + + module_init(init); +diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ipt_MASQUERADE.c linux-2.6.16-026test015/net/ipv4/netfilter/ipt_MASQUERADE.c +--- linux-2.6.16.orig/net/ipv4/netfilter/ipt_MASQUERADE.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/netfilter/ipt_MASQUERADE.c 2006-07-04 14:41:39.000000000 +0400 +@@ -120,6 +120,7 @@ masquerade_target(struct sk_buff 
**pskb, + return ip_nat_setup_info(ct, &newrange, hooknum); + } + ++#if 0 + static inline int + device_cmp(struct ip_conntrack *i, void *ifindex) + { +@@ -175,6 +176,7 @@ static struct notifier_block masq_dev_no + static struct notifier_block masq_inet_notifier = { + .notifier_call = masq_inet_event, + }; ++#endif + + static struct ipt_target masquerade = { + .name = "MASQUERADE", +@@ -189,12 +191,16 @@ static int __init init(void) + + ret = ipt_register_target(&masquerade); + ++#if 0 ++/* These notifiers are unnecessary and may ++ lead to oops in virtual environments */ + if (ret == 0) { + /* Register for device down reports */ + register_netdevice_notifier(&masq_dev_notifier); + /* Register IP address change reports */ + register_inetaddr_notifier(&masq_inet_notifier); + } ++#endif + + return ret; + } +@@ -202,8 +208,8 @@ static int __init init(void) + static void __exit fini(void) + { + ipt_unregister_target(&masquerade); +- unregister_netdevice_notifier(&masq_dev_notifier); +- unregister_inetaddr_notifier(&masq_inet_notifier); ++/* unregister_netdevice_notifier(&masq_dev_notifier); ++ unregister_inetaddr_notifier(&masq_inet_notifier); */ + } + + module_init(init); +diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ipt_REDIRECT.c linux-2.6.16-026test015/net/ipv4/netfilter/ipt_REDIRECT.c +--- linux-2.6.16.orig/net/ipv4/netfilter/ipt_REDIRECT.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/netfilter/ipt_REDIRECT.c 2006-07-04 14:41:39.000000000 +0400 +@@ -17,6 +17,7 @@ + #include <linux/inetdevice.h> + #include <net/protocol.h> + #include <net/checksum.h> ++#include <linux/nfcalls.h> + #include <linux/netfilter_ipv4.h> + #include <linux/netfilter_ipv4/ip_nat_rule.h> + +@@ -25,7 +26,7 @@ MODULE_AUTHOR("Netfilter Core Team <core + MODULE_DESCRIPTION("iptables REDIRECT target module"); + + #if 0 +-#define DEBUGP printk ++#define DEBUGP ve_printk + #else + #define DEBUGP(format, args...) 
+ #endif +@@ -94,8 +95,14 @@ redirect_target(struct sk_buff **pskb, + + rcu_read_lock(); + indev = __in_dev_get_rcu((*pskb)->dev); +- if (indev && (ifa = indev->ifa_list)) ++ if (indev && (ifa = indev->ifa_list)) { ++ /* because of venet device specific, we should use ++ * second ifa in the list */ ++ if (IN_LOOPBACK(ntohl(ifa->ifa_local)) && ++ ifa->ifa_next) ++ ifa = ifa->ifa_next; + newdst = ifa->ifa_local; ++ } + rcu_read_unlock(); + + if (!newdst) +@@ -119,15 +126,37 @@ static struct ipt_target redirect_reg = + .me = THIS_MODULE, + }; + +-static int __init init(void) ++int init_iptable_REDIRECT(void) + { + return ipt_register_target(&redirect_reg); + } + +-static void __exit fini(void) ++void fini_iptable_REDIRECT(void) + { + ipt_unregister_target(&redirect_reg); + } + ++static int __init init(void) ++{ ++ int err; ++ ++ err = init_iptable_REDIRECT(); ++ if (err < 0) ++ return err; ++ ++ KSYMRESOLVE(init_iptable_REDIRECT); ++ KSYMRESOLVE(fini_iptable_REDIRECT); ++ KSYMMODRESOLVE(ipt_REDIRECT); ++ return 0; ++} ++ ++static void __exit fini(void) ++{ ++ KSYMMODUNRESOLVE(ipt_REDIRECT); ++ KSYMUNRESOLVE(init_iptable_REDIRECT); ++ KSYMUNRESOLVE(fini_iptable_REDIRECT); ++ fini_iptable_REDIRECT(); ++} ++ + module_init(init); + module_exit(fini); +diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ipt_REJECT.c linux-2.6.16-026test015/net/ipv4/netfilter/ipt_REJECT.c +--- linux-2.6.16.orig/net/ipv4/netfilter/ipt_REJECT.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/netfilter/ipt_REJECT.c 2006-07-04 14:41:39.000000000 +0400 +@@ -22,6 +22,7 @@ + #include <net/ip.h> + #include <net/tcp.h> + #include <net/route.h> ++#include <linux/nfcalls.h> + #include <net/dst.h> + #include <linux/netfilter_ipv4/ip_tables.h> + #include <linux/netfilter_ipv4/ipt_REJECT.h> +@@ -322,22 +323,59 @@ static int check(const char *tablename, + return 1; + } + ++#ifdef CONFIG_COMPAT ++static int compat(void *target, ++ void **dstptr, int *size, int convert) ++{ ++ int off; 
++ ++ off = IPT_ALIGN(sizeof(struct ipt_reject_info)) - ++ COMPAT_IPT_ALIGN(sizeof(struct ipt_reject_info)); ++ return ipt_target_align_compat(target, dstptr, size, off, convert); ++} ++#endif ++ + static struct ipt_target ipt_reject_reg = { + .name = "REJECT", + .target = reject, + .checkentry = check, ++#ifdef CONFIG_COMPAT ++ .compat = compat, ++#endif + .me = THIS_MODULE, + }; + +-static int __init init(void) ++int init_iptable_REJECT(void) + { + return ipt_register_target(&ipt_reject_reg); + } + +-static void __exit fini(void) ++void fini_iptable_REJECT(void) + { + ipt_unregister_target(&ipt_reject_reg); + } + ++static int __init init(void) ++{ ++ int err; ++ ++ err = init_iptable_REJECT(); ++ if (err < 0) ++ return err; ++ ++ KSYMRESOLVE(init_iptable_REJECT); ++ KSYMRESOLVE(fini_iptable_REJECT); ++ KSYMMODRESOLVE(ipt_REJECT); ++ return 0; ++} ++ ++static void __exit fini(void) ++{ ++ KSYMMODUNRESOLVE(ipt_REJECT); ++ KSYMUNRESOLVE(init_iptable_REJECT); ++ KSYMUNRESOLVE(fini_iptable_REJECT); ++ fini_iptable_REJECT(); ++} ++ + module_init(init); + module_exit(fini); +diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ipt_TCPMSS.c linux-2.6.16-026test015/net/ipv4/netfilter/ipt_TCPMSS.c +--- linux-2.6.16.orig/net/ipv4/netfilter/ipt_TCPMSS.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/netfilter/ipt_TCPMSS.c 2006-07-04 14:41:39.000000000 +0400 +@@ -13,6 +13,7 @@ + + #include <linux/ip.h> + #include <net/tcp.h> ++#include <linux/nfcalls.h> + + #include <linux/netfilter_ipv4/ip_tables.h> + #include <linux/netfilter_ipv4/ipt_TCPMSS.h> +@@ -242,22 +243,59 @@ ipt_tcpmss_checkentry(const char *tablen + return 0; + } + ++#ifdef CONFIG_COMPAT ++static int ipt_tcpmss_compat(void *target, ++ void **dstptr, int *size, int convert) ++{ ++ int off; ++ ++ off = IPT_ALIGN(sizeof(struct ipt_tcpmss_info)) - ++ COMPAT_IPT_ALIGN(sizeof(struct ipt_tcpmss_info)); ++ return ipt_target_align_compat(target, dstptr, size, off, convert); ++} ++#endif ++ + static 
struct ipt_target ipt_tcpmss_reg = { + .name = "TCPMSS", + .target = ipt_tcpmss_target, + .checkentry = ipt_tcpmss_checkentry, ++#ifdef CONFIG_COMPAT ++ .compat = ipt_tcpmss_compat, ++#endif + .me = THIS_MODULE, + }; + +-static int __init init(void) ++int init_iptable_TCPMSS(void) + { + return ipt_register_target(&ipt_tcpmss_reg); + } + +-static void __exit fini(void) ++void fini_iptable_TCPMSS(void) + { + ipt_unregister_target(&ipt_tcpmss_reg); + } + ++static int __init init(void) ++{ ++ int err; ++ ++ err = init_iptable_TCPMSS(); ++ if (err < 0) ++ return err; ++ ++ KSYMRESOLVE(init_iptable_TCPMSS); ++ KSYMRESOLVE(fini_iptable_TCPMSS); ++ KSYMMODRESOLVE(ipt_TCPMSS); ++ return 0; ++} ++ ++static void __exit fini(void) ++{ ++ KSYMMODUNRESOLVE(ipt_TCPMSS); ++ KSYMUNRESOLVE(init_iptable_TCPMSS); ++ KSYMUNRESOLVE(fini_iptable_TCPMSS); ++ fini_iptable_TCPMSS(); ++} ++ + module_init(init); + module_exit(fini); +diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ipt_TOS.c linux-2.6.16-026test015/net/ipv4/netfilter/ipt_TOS.c +--- linux-2.6.16.orig/net/ipv4/netfilter/ipt_TOS.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/netfilter/ipt_TOS.c 2006-07-04 14:41:39.000000000 +0400 +@@ -15,6 +15,7 @@ + + #include <linux/netfilter_ipv4/ip_tables.h> + #include <linux/netfilter_ipv4/ipt_TOS.h> ++#include <linux/nfcalls.h> + + MODULE_LICENSE("GPL"); + MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); +@@ -83,22 +84,59 @@ checkentry(const char *tablename, + return 1; + } + ++#ifdef CONFIG_COMPAT ++static int compat(void *target, ++ void **dstptr, int *size, int convert) ++{ ++ int off; ++ ++ off = IPT_ALIGN(sizeof(struct ipt_tos_target_info)) - ++ COMPAT_IPT_ALIGN(sizeof(struct ipt_tos_target_info)); ++ return ipt_target_align_compat(target, dstptr, size, off, convert); ++} ++#endif ++ + static struct ipt_target ipt_tos_reg = { + .name = "TOS", + .target = target, + .checkentry = checkentry, ++#ifdef CONFIG_COMPAT ++ .compat = compat, ++#endif + 
.me = THIS_MODULE, + }; + +-static int __init init(void) ++int init_iptable_TOS(void) + { + return ipt_register_target(&ipt_tos_reg); + } + +-static void __exit fini(void) ++void fini_iptable_TOS(void) + { + ipt_unregister_target(&ipt_tos_reg); + } + ++static int __init init(void) ++{ ++ int err; ++ ++ err = init_iptable_TOS(); ++ if (err < 0) ++ return err; ++ ++ KSYMRESOLVE(init_iptable_TOS); ++ KSYMRESOLVE(fini_iptable_TOS); ++ KSYMMODRESOLVE(ipt_TOS); ++ return 0; ++} ++ ++static void __exit fini(void) ++{ ++ KSYMMODUNRESOLVE(ipt_TOS); ++ KSYMUNRESOLVE(init_iptable_TOS); ++ KSYMUNRESOLVE(fini_iptable_TOS); ++ fini_iptable_TOS(); ++} ++ + module_init(init); + module_exit(fini); +diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ipt_multiport.c linux-2.6.16-026test015/net/ipv4/netfilter/ipt_multiport.c +--- linux-2.6.16.orig/net/ipv4/netfilter/ipt_multiport.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/netfilter/ipt_multiport.c 2006-07-04 14:41:39.000000000 +0400 +@@ -13,6 +13,7 @@ + #include <linux/types.h> + #include <linux/udp.h> + #include <linux/skbuff.h> ++#include <linux/nfcalls.h> + + #include <linux/netfilter_ipv4/ipt_multiport.h> + #include <linux/netfilter_ipv4/ip_tables.h> +@@ -21,6 +22,13 @@ MODULE_LICENSE("GPL"); + MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); + MODULE_DESCRIPTION("iptables multiple port match module"); + ++#ifdef CONFIG_VE_IPTABLES ++#include <linux/sched.h> ++#define ve_multiport_match (*(get_exec_env()->_multiport_match)) ++#else ++#define ve_multiport_match multiport_match ++#endif ++ + #if 0 + #define duprintf(format, args...) 
printk(format , ## args) + #else +@@ -174,11 +182,36 @@ checkentry_v1(const char *tablename, + return (matchsize == IPT_ALIGN(sizeof(struct ipt_multiport_v1))); + } + ++#ifdef CONFIG_COMPAT ++static int compat(void *match, ++ void **dstptr, int *size, int convert) ++{ ++ int off; ++ ++ off = IPT_ALIGN(sizeof(struct ipt_multiport)) - ++ COMPAT_IPT_ALIGN(sizeof(struct ipt_multiport)); ++ return ipt_match_align_compat(match, dstptr, size, off, convert); ++} ++ ++static int compat_v1(void *match, ++ void **dstptr, int *size, int convert) ++{ ++ int off; ++ ++ off = IPT_ALIGN(sizeof(struct ipt_multiport_v1)) - ++ COMPAT_IPT_ALIGN(sizeof(struct ipt_multiport_v1)); ++ return ipt_match_align_compat(match, dstptr, size, off, convert); ++} ++#endif ++ + static struct ipt_match multiport_match = { + .name = "multiport", + .revision = 0, + .match = &match, + .checkentry = &checkentry, ++#ifdef CONFIG_COMPAT ++ .compat = &compat, ++#endif + .me = THIS_MODULE, + }; + +@@ -187,10 +220,13 @@ static struct ipt_match multiport_match_ + .revision = 1, + .match = &match_v1, + .checkentry = &checkentry_v1, ++#ifdef CONFIG_COMPAT ++ .compat = &compat_v1, ++#endif + .me = THIS_MODULE, + }; + +-static int __init init(void) ++int init_iptable_multiport(void) + { + int err; + +@@ -204,11 +240,33 @@ static int __init init(void) + return err; + } + +-static void __exit fini(void) ++void fini_iptable_multiport(void) + { + ipt_unregister_match(&multiport_match); + ipt_unregister_match(&multiport_match_v1); + } + ++static int __init init(void) ++{ ++ int err; ++ ++ err = init_iptable_multiport(); ++ if (err < 0) ++ return err; ++ ++ KSYMRESOLVE(init_iptable_multiport); ++ KSYMRESOLVE(fini_iptable_multiport); ++ KSYMMODRESOLVE(ipt_multiport); ++ return 0; ++} ++ ++static void __exit fini(void) ++{ ++ KSYMMODUNRESOLVE(ipt_multiport); ++ KSYMUNRESOLVE(init_iptable_multiport); ++ KSYMUNRESOLVE(fini_iptable_multiport); ++ fini_iptable_multiport(); ++} ++ + module_init(init); + module_exit(fini); 
+diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ipt_tos.c linux-2.6.16-026test015/net/ipv4/netfilter/ipt_tos.c +--- linux-2.6.16.orig/net/ipv4/netfilter/ipt_tos.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/netfilter/ipt_tos.c 2006-07-04 14:41:39.000000000 +0400 +@@ -10,6 +10,7 @@ + + #include <linux/module.h> + #include <linux/skbuff.h> ++#include <linux/nfcalls.h> + + #include <linux/netfilter_ipv4/ipt_tos.h> + #include <linux/netfilter_ipv4/ip_tables.h> +@@ -17,6 +18,13 @@ + MODULE_LICENSE("GPL"); + MODULE_DESCRIPTION("iptables TOS match module"); + ++#ifdef CONFIG_VE_IPTABLES ++#include <linux/sched.h> ++#define ve_tos_match (*(get_exec_env()->_tos_match)) ++#else ++#define ve_tos_match tos_match ++#endif ++ + static int + match(const struct sk_buff *skb, + const struct net_device *in, +@@ -44,22 +52,59 @@ checkentry(const char *tablename, + return 1; + } + ++#ifdef CONFIG_COMPAT ++static int compat(void *match, ++ void **dstptr, int *size, int convert) ++{ ++ int off; ++ ++ off = IPT_ALIGN(sizeof(struct ipt_tos_info)) - ++ COMPAT_IPT_ALIGN(sizeof(struct ipt_tos_info)); ++ return ipt_match_align_compat(match, dstptr, size, off, convert); ++} ++#endif ++ + static struct ipt_match tos_match = { + .name = "tos", + .match = &match, + .checkentry = &checkentry, ++#ifdef CONFIG_COMPAT ++ .compat = &compat, ++#endif + .me = THIS_MODULE, + }; + +-static int __init init(void) ++int init_iptable_tos(void) + { + return ipt_register_match(&tos_match); + } + +-static void __exit fini(void) ++void fini_iptable_tos(void) + { + ipt_unregister_match(&tos_match); + } + ++static int __init init(void) ++{ ++ int err; ++ ++ err = init_iptable_tos(); ++ if (err < 0) ++ return err; ++ ++ KSYMRESOLVE(init_iptable_tos); ++ KSYMRESOLVE(fini_iptable_tos); ++ KSYMMODRESOLVE(ipt_tos); ++ return 0; ++} ++ ++static void __exit fini(void) ++{ ++ KSYMMODUNRESOLVE(ipt_tos); ++ KSYMUNRESOLVE(init_iptable_tos); ++ KSYMUNRESOLVE(fini_iptable_tos); ++ 
fini_iptable_tos(); ++} ++ + module_init(init); + module_exit(fini); +diff -upr linux-2.6.16.orig/net/ipv4/netfilter/ipt_ttl.c linux-2.6.16-026test015/net/ipv4/netfilter/ipt_ttl.c +--- linux-2.6.16.orig/net/ipv4/netfilter/ipt_ttl.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/netfilter/ipt_ttl.c 2006-07-04 14:41:39.000000000 +0400 +@@ -11,6 +11,7 @@ + + #include <linux/module.h> + #include <linux/skbuff.h> ++#include <linux/nfcalls.h> + + #include <linux/netfilter_ipv4/ipt_ttl.h> + #include <linux/netfilter_ipv4/ip_tables.h> +@@ -57,22 +58,58 @@ static int checkentry(const char *tablen + return 1; + } + ++#ifdef CONFIG_COMPAT ++static int compat(void *match, ++ void **dstptr, int *size, int convert) ++{ ++ int off; ++ ++ off = IPT_ALIGN(sizeof(struct ipt_ttl_info)) - ++ COMPAT_IPT_ALIGN(sizeof(struct ipt_ttl_info)); ++ return ipt_match_align_compat(match, dstptr, size, off, convert); ++} ++#endif ++ + static struct ipt_match ttl_match = { + .name = "ttl", + .match = &match, + .checkentry = &checkentry, ++#ifdef CONFIG_COMPAT ++ .compat = &compat, ++#endif + .me = THIS_MODULE, + }; + +-static int __init init(void) ++int init_iptable_ttl(void) + { + return ipt_register_match(&ttl_match); + } + +-static void __exit fini(void) ++void fini_iptable_ttl(void) + { + ipt_unregister_match(&ttl_match); ++} + ++static int __init init(void) ++{ ++ int err; ++ ++ err = init_iptable_ttl(); ++ if (err < 0) ++ return err; ++ ++ KSYMRESOLVE(init_iptable_ttl); ++ KSYMRESOLVE(fini_iptable_ttl); ++ KSYMMODRESOLVE(ipt_ttl); ++ return 0; ++} ++ ++static void __exit fini(void) ++{ ++ KSYMMODUNRESOLVE(ipt_ttl); ++ KSYMUNRESOLVE(init_iptable_ttl); ++ KSYMUNRESOLVE(fini_iptable_ttl); ++ fini_iptable_ttl(); + } + + module_init(init); +diff -upr linux-2.6.16.orig/net/ipv4/netfilter/iptable_filter.c linux-2.6.16-026test015/net/ipv4/netfilter/iptable_filter.c +--- linux-2.6.16.orig/net/ipv4/netfilter/iptable_filter.c 2006-03-20 08:53:29.000000000 +0300 ++++ 
linux-2.6.16-026test015/net/ipv4/netfilter/iptable_filter.c 2006-07-04 14:41:39.000000000 +0400 +@@ -12,12 +12,20 @@ + + #include <linux/module.h> + #include <linux/moduleparam.h> ++#include <linux/nfcalls.h> + #include <linux/netfilter_ipv4/ip_tables.h> + + MODULE_LICENSE("GPL"); + MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); + MODULE_DESCRIPTION("iptables filter table"); + ++#ifdef CONFIG_VE_IPTABLES ++#include <linux/sched.h> ++#define ve_packet_filter (get_exec_env()->_ve_ipt_filter_pf) ++#else ++#define ve_packet_filter &packet_filter ++#endif ++ + #define FILTER_VALID_HOOKS ((1 << NF_IP_LOCAL_IN) | (1 << NF_IP_FORWARD) | (1 << NF_IP_LOCAL_OUT)) + + static struct +@@ -25,7 +33,7 @@ static struct + struct ipt_replace repl; + struct ipt_standard entries[3]; + struct ipt_error term; +-} initial_table __initdata ++} initial_table + = { { "filter", FILTER_VALID_HOOKS, 4, + sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error), + { [NF_IP_LOCAL_IN] = 0, +@@ -90,7 +98,7 @@ ipt_hook(unsigned int hook, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) + { +- return ipt_do_table(pskb, hook, in, out, &packet_filter, NULL); ++ return ipt_do_table(pskb, hook, in, out, ve_packet_filter, NULL); + } + + static unsigned int +@@ -108,7 +116,7 @@ ipt_local_out_hook(unsigned int hook, + return NF_ACCEPT; + } + +- return ipt_do_table(pskb, hook, in, out, &packet_filter, NULL); ++ return ipt_do_table(pskb, hook, in, out, ve_packet_filter, NULL); + } + + static struct nf_hook_ops ipt_ops[] = { +@@ -139,56 +147,89 @@ static struct nf_hook_ops ipt_ops[] = { + static int forward = NF_ACCEPT; + module_param(forward, bool, 0000); + +-static int __init init(void) ++int init_iptable_filter(void) + { + int ret; +- +- if (forward < 0 || forward > NF_MAX_VERDICT) { +- printk("iptables forward must be 0 or 1\n"); +- return -EINVAL; +- } +- +- /* Entry 1 is the FORWARD hook */ +- initial_table.entries[1].target.verdict = -forward - 1; ++ struct ipt_table 
*tmp_filter; + + /* Register table */ +- ret = ipt_register_table(&packet_filter, &initial_table.repl); +- if (ret < 0) +- return ret; ++ tmp_filter = ipt_register_table(&packet_filter, ++ &initial_table.repl); ++ if (IS_ERR(tmp_filter)) ++ return PTR_ERR(tmp_filter); ++#ifdef CONFIG_VE_IPTABLES ++ ve_packet_filter = tmp_filter; ++#endif + + /* Register hooks */ +- ret = nf_register_hook(&ipt_ops[0]); ++ ret = virt_nf_register_hook(&ipt_ops[0]); + if (ret < 0) + goto cleanup_table; + +- ret = nf_register_hook(&ipt_ops[1]); ++ ret = virt_nf_register_hook(&ipt_ops[1]); + if (ret < 0) + goto cleanup_hook0; + +- ret = nf_register_hook(&ipt_ops[2]); ++ ret = virt_nf_register_hook(&ipt_ops[2]); + if (ret < 0) + goto cleanup_hook1; + + return ret; + + cleanup_hook1: +- nf_unregister_hook(&ipt_ops[1]); ++ virt_nf_unregister_hook(&ipt_ops[1]); + cleanup_hook0: +- nf_unregister_hook(&ipt_ops[0]); ++ virt_nf_unregister_hook(&ipt_ops[0]); + cleanup_table: +- ipt_unregister_table(&packet_filter); ++ ipt_unregister_table(ve_packet_filter); ++#ifdef CONFIG_VE_IPTABLES ++ ve_packet_filter = NULL; ++#endif + + return ret; + } + +-static void __exit fini(void) ++void fini_iptable_filter(void) + { + unsigned int i; + + for (i = 0; i < sizeof(ipt_ops)/sizeof(struct nf_hook_ops); i++) +- nf_unregister_hook(&ipt_ops[i]); ++ virt_nf_unregister_hook(&ipt_ops[i]); + +- ipt_unregister_table(&packet_filter); ++ ipt_unregister_table(ve_packet_filter); ++#ifdef CONFIG_VE_IPTABLES ++ ve_packet_filter = NULL; ++#endif ++} ++ ++static int __init init(void) ++{ ++ int err; ++ ++ if (forward < 0 || forward > NF_MAX_VERDICT) { ++ printk("iptables forward must be 0 or 1\n"); ++ return -EINVAL; ++ } ++ ++ /* Entry 1 is the FORWARD hook */ ++ initial_table.entries[1].target.verdict = -forward - 1; ++ ++ err = init_iptable_filter(); ++ if (err < 0) ++ return err; ++ ++ KSYMRESOLVE(init_iptable_filter); ++ KSYMRESOLVE(fini_iptable_filter); ++ KSYMMODRESOLVE(iptable_filter); ++ return 0; ++} ++ ++static 
void __exit fini(void) ++{ ++ KSYMMODUNRESOLVE(iptable_filter); ++ KSYMUNRESOLVE(init_iptable_filter); ++ KSYMUNRESOLVE(fini_iptable_filter); ++ fini_iptable_filter(); + } + + module_init(init); +diff -upr linux-2.6.16.orig/net/ipv4/netfilter/iptable_mangle.c linux-2.6.16-026test015/net/ipv4/netfilter/iptable_mangle.c +--- linux-2.6.16.orig/net/ipv4/netfilter/iptable_mangle.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/netfilter/iptable_mangle.c 2006-07-04 14:41:39.000000000 +0400 +@@ -17,6 +17,7 @@ + #include <linux/skbuff.h> + #include <net/sock.h> + #include <net/route.h> ++#include <linux/nfcalls.h> + #include <linux/ip.h> + + MODULE_LICENSE("GPL"); +@@ -35,7 +36,7 @@ static struct + struct ipt_replace repl; + struct ipt_standard entries[5]; + struct ipt_error term; +-} initial_table __initdata ++} initial_table + = { { "mangle", MANGLE_VALID_HOOKS, 6, + sizeof(struct ipt_standard) * 5 + sizeof(struct ipt_error), + { [NF_IP_PRE_ROUTING] = 0, +@@ -112,6 +113,13 @@ static struct ipt_table packet_mangler = + .af = AF_INET, + }; + ++#ifdef CONFIG_VE_IPTABLES ++#include <linux/sched.h> ++#define ve_packet_mangler (get_exec_env()->_ipt_mangle_table) ++#else ++#define ve_packet_mangler &packet_mangler ++#endif ++ + /* The work comes in here from netfilter.c. */ + static unsigned int + ipt_route_hook(unsigned int hook, +@@ -120,7 +128,7 @@ ipt_route_hook(unsigned int hook, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) + { +- return ipt_do_table(pskb, hook, in, out, &packet_mangler, NULL); ++ return ipt_do_table(pskb, hook, in, out, ve_packet_mangler, NULL); + } + + static unsigned int +@@ -149,7 +157,8 @@ ipt_local_hook(unsigned int hook, + daddr = (*pskb)->nh.iph->daddr; + tos = (*pskb)->nh.iph->tos; + +- ret = ipt_do_table(pskb, hook, in, out, &packet_mangler, NULL); ++ ret = ipt_do_table(pskb, hook, in, out, ve_packet_mangler, NULL); ++ + /* Reroute for ANY change. 
*/ + if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE + && ((*pskb)->nh.iph->saddr != saddr +@@ -201,60 +210,103 @@ static struct nf_hook_ops ipt_ops[] = { + }, + }; + +-static int __init init(void) ++static int mangle_init(struct nf_hook_ops ipt_ops[]) + { + int ret; ++ struct ipt_table *tmp_mangler; + + /* Register table */ +- ret = ipt_register_table(&packet_mangler, &initial_table.repl); +- if (ret < 0) +- return ret; ++ tmp_mangler = ipt_register_table(&packet_mangler, ++ &initial_table.repl); ++ if (IS_ERR(tmp_mangler)) ++ return PTR_ERR(tmp_mangler); ++#ifdef CONFIG_VE_IPTABLES ++ ve_packet_mangler = tmp_mangler; ++#endif + + /* Register hooks */ +- ret = nf_register_hook(&ipt_ops[0]); ++ ret = virt_nf_register_hook(&ipt_ops[0]); + if (ret < 0) + goto cleanup_table; + +- ret = nf_register_hook(&ipt_ops[1]); ++ ret = virt_nf_register_hook(&ipt_ops[1]); + if (ret < 0) + goto cleanup_hook0; + +- ret = nf_register_hook(&ipt_ops[2]); ++ ret = virt_nf_register_hook(&ipt_ops[2]); + if (ret < 0) + goto cleanup_hook1; + +- ret = nf_register_hook(&ipt_ops[3]); ++ ret = virt_nf_register_hook(&ipt_ops[3]); + if (ret < 0) + goto cleanup_hook2; + +- ret = nf_register_hook(&ipt_ops[4]); ++ ret = virt_nf_register_hook(&ipt_ops[4]); + if (ret < 0) + goto cleanup_hook3; + + return ret; + + cleanup_hook3: +- nf_unregister_hook(&ipt_ops[3]); ++ virt_nf_unregister_hook(&ipt_ops[3]); + cleanup_hook2: +- nf_unregister_hook(&ipt_ops[2]); ++ virt_nf_unregister_hook(&ipt_ops[2]); + cleanup_hook1: +- nf_unregister_hook(&ipt_ops[1]); ++ virt_nf_unregister_hook(&ipt_ops[1]); + cleanup_hook0: +- nf_unregister_hook(&ipt_ops[0]); ++ virt_nf_unregister_hook(&ipt_ops[0]); + cleanup_table: +- ipt_unregister_table(&packet_mangler); ++ ipt_unregister_table(ve_packet_mangler); ++#ifdef CONFIG_VE_IPTABLES ++ ve_packet_mangler = NULL; ++#endif + + return ret; + } + +-static void __exit fini(void) ++static void mangle_fini(struct nf_hook_ops ipt_ops[]) + { + unsigned int i; + +- for (i = 
0; i < sizeof(ipt_ops)/sizeof(struct nf_hook_ops); i++) +- nf_unregister_hook(&ipt_ops[i]); ++ for (i = 0; i < 5; i++) ++ virt_nf_unregister_hook(&ipt_ops[i]); ++ ++ ipt_unregister_table(ve_packet_mangler); ++#ifdef CONFIG_VE_IPTABLES ++ ve_packet_mangler = NULL; ++#endif ++} ++ ++int init_iptable_mangle(void) ++{ ++ return mangle_init(ipt_ops); ++} ++ ++void fini_iptable_mangle(void) ++{ ++ mangle_fini(ipt_ops); ++} ++ ++static int __init init(void) ++{ ++ int err; ++ ++ err = init_iptable_mangle(); ++ if (err < 0) ++ return err; ++ ++ KSYMRESOLVE(init_iptable_mangle); ++ KSYMRESOLVE(fini_iptable_mangle); ++ KSYMMODRESOLVE(iptable_mangle); ++ return 0; ++} + +- ipt_unregister_table(&packet_mangler); ++static void __exit fini(void) ++{ ++ KSYMMODUNRESOLVE(iptable_mangle); ++ KSYMUNRESOLVE(init_iptable_mangle); ++ KSYMUNRESOLVE(fini_iptable_mangle); ++ fini_iptable_mangle(); + } + + module_init(init); +diff -upr linux-2.6.16.orig/net/ipv4/netfilter/iptable_raw.c linux-2.6.16-026test015/net/ipv4/netfilter/iptable_raw.c +--- linux-2.6.16.orig/net/ipv4/netfilter/iptable_raw.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/netfilter/iptable_raw.c 2006-07-04 14:41:39.000000000 +0400 +@@ -118,12 +118,13 @@ static struct nf_hook_ops ipt_ops[] = { + + static int __init init(void) + { ++ struct ipt_table *tmp; + int ret; + + /* Register table */ +- ret = ipt_register_table(&packet_raw, &initial_table.repl); +- if (ret < 0) +- return ret; ++ tmp = ipt_register_table(&packet_raw, &initial_table.repl); ++ if (IS_ERR(tmp)) ++ return PTR_ERR(tmp); + + /* Register hooks */ + ret = nf_register_hook(&ipt_ops[0]); +diff -upr linux-2.6.16.orig/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c linux-2.6.16-026test015/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +--- linux-2.6.16.orig/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c 2006-07-04 
14:41:36.000000000 +0400 +@@ -354,6 +354,7 @@ getorigdst(struct sock *sk, int optval, + .tuple.dst.u.tcp.port; + sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.dst.u3.ip; ++ memset(sin.sin_zero, 0, sizeof(sin.sin_zero)); + + DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n", + NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port)); +diff -upr linux-2.6.16.orig/net/ipv4/proc.c linux-2.6.16-026test015/net/ipv4/proc.c +--- linux-2.6.16.orig/net/ipv4/proc.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/proc.c 2006-07-04 14:41:39.000000000 +0400 +@@ -258,11 +258,12 @@ static int snmp_seq_show(struct seq_file + seq_printf(seq, " %s", snmp4_ipstats_list[i].name); + + seq_printf(seq, "\nIp: %d %d", +- ipv4_devconf.forwarding ? 1 : 2, sysctl_ip_default_ttl); ++ ve_ipv4_devconf.forwarding ? 1 : 2, ++ sysctl_ip_default_ttl); + + for (i = 0; snmp4_ipstats_list[i].name != NULL; i++) + seq_printf(seq, " %lu", +- fold_field((void **) ip_statistics, ++ fold_field((void **) ve_ip_statistics, + snmp4_ipstats_list[i].entry)); + + seq_puts(seq, "\nIcmp:"); +@@ -272,7 +273,7 @@ static int snmp_seq_show(struct seq_file + seq_puts(seq, "\nIcmp:"); + for (i = 0; snmp4_icmp_list[i].name != NULL; i++) + seq_printf(seq, " %lu", +- fold_field((void **) icmp_statistics, ++ fold_field((void **) ve_icmp_statistics, + snmp4_icmp_list[i].entry)); + + seq_puts(seq, "\nTcp:"); +@@ -284,11 +285,11 @@ static int snmp_seq_show(struct seq_file + /* MaxConn field is signed, RFC 2012 */ + if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN) + seq_printf(seq, " %ld", +- fold_field((void **) tcp_statistics, ++ fold_field((void **) ve_tcp_statistics, + snmp4_tcp_list[i].entry)); + else + seq_printf(seq, " %lu", +- fold_field((void **) tcp_statistics, ++ fold_field((void **) ve_tcp_statistics, + snmp4_tcp_list[i].entry)); + } + +@@ -299,7 +300,7 @@ static int snmp_seq_show(struct seq_file + seq_puts(seq, "\nUdp:"); + for (i = 0; snmp4_udp_list[i].name != NULL; i++) + 
seq_printf(seq, " %lu", +- fold_field((void **) udp_statistics, ++ fold_field((void **) ve_udp_statistics, + snmp4_udp_list[i].entry)); + + seq_putc(seq, '\n'); +@@ -333,7 +334,7 @@ static int netstat_seq_show(struct seq_f + seq_puts(seq, "\nTcpExt:"); + for (i = 0; snmp4_net_list[i].name != NULL; i++) + seq_printf(seq, " %lu", +- fold_field((void **) net_statistics, ++ fold_field((void **) ve_net_statistics, + snmp4_net_list[i].entry)); + + seq_putc(seq, '\n'); +@@ -357,10 +358,10 @@ int __init ip_misc_proc_init(void) + { + int rc = 0; + +- if (!proc_net_fops_create("netstat", S_IRUGO, &netstat_seq_fops)) ++ if (!proc_glob_fops_create("net/netstat", S_IRUGO, &netstat_seq_fops)) + goto out_netstat; + +- if (!proc_net_fops_create("snmp", S_IRUGO, &snmp_seq_fops)) ++ if (!proc_glob_fops_create("net/snmp", S_IRUGO, &snmp_seq_fops)) + goto out_snmp; + + if (!proc_net_fops_create("sockstat", S_IRUGO, &sockstat_seq_fops)) +@@ -368,9 +369,9 @@ int __init ip_misc_proc_init(void) + out: + return rc; + out_sockstat: +- proc_net_remove("snmp"); ++ remove_proc_glob_entry("net/snmp", NULL); + out_snmp: +- proc_net_remove("netstat"); ++ remove_proc_glob_entry("net/netstat", NULL); + out_netstat: + rc = -ENOMEM; + goto out; +diff -upr linux-2.6.16.orig/net/ipv4/raw.c linux-2.6.16-026test015/net/ipv4/raw.c +--- linux-2.6.16.orig/net/ipv4/raw.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/raw.c 2006-07-04 14:41:38.000000000 +0400 +@@ -114,7 +114,8 @@ struct sock *__raw_v4_lookup(struct sock + if (inet->num == num && + !(inet->daddr && inet->daddr != raddr) && + !(inet->rcv_saddr && inet->rcv_saddr != laddr) && +- !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)) ++ !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) && ++ ve_accessible_strict(VE_OWNER_SK(sk), get_exec_env())) + goto found; /* gotcha */ + } + sk = NULL; +@@ -753,8 +754,12 @@ static struct sock *raw_get_first(struct + struct hlist_node *node; + + sk_for_each(sk, node, 
&raw_v4_htable[state->bucket]) +- if (sk->sk_family == PF_INET) ++ if (sk->sk_family == PF_INET) { ++ if (!ve_accessible(VE_OWNER_SK(sk), ++ get_exec_env())) ++ continue; + goto found; ++ } + } + sk = NULL; + found: +@@ -768,8 +773,14 @@ static struct sock *raw_get_next(struct + do { + sk = sk_next(sk); + try_again: +- ; +- } while (sk && sk->sk_family != PF_INET); ++ if (!sk) ++ break; ++ if (sk->sk_family != PF_INET) ++ continue; ++ if (ve_accessible(VE_OWNER_SK(sk), ++ get_exec_env())) ++ break; ++ } while (1); + + if (!sk && ++state->bucket < RAWV4_HTABLE_SIZE) { + sk = sk_head(&raw_v4_htable[state->bucket]); +@@ -886,13 +897,13 @@ static struct file_operations raw_seq_fo + + int __init raw_proc_init(void) + { +- if (!proc_net_fops_create("raw", S_IRUGO, &raw_seq_fops)) ++ if (!proc_glob_fops_create("net/raw", S_IRUGO, &raw_seq_fops)) + return -ENOMEM; + return 0; + } + + void __init raw_proc_exit(void) + { +- proc_net_remove("raw"); ++ remove_proc_glob_entry("net/raw", NULL); + } + #endif /* CONFIG_PROC_FS */ +diff -upr linux-2.6.16.orig/net/ipv4/route.c linux-2.6.16-026test015/net/ipv4/route.c +--- linux-2.6.16.orig/net/ipv4/route.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/route.c 2006-07-04 14:41:39.000000000 +0400 +@@ -114,6 +114,8 @@ + + #define RT_GC_TIMEOUT (300*HZ) + ++int ip_rt_src_check = 1; ++ + static int ip_rt_min_delay = 2 * HZ; + static int ip_rt_max_delay = 10 * HZ; + static int ip_rt_max_size; +@@ -253,11 +255,28 @@ static unsigned int rt_hash_code(u32 dad + & rt_hash_mask); + } + ++void prepare_rt_cache(void) ++{ ++#ifdef CONFIG_VE ++ struct rtable *r; ++ int i; ++ ++ for (i = rt_hash_mask; i >= 0; i--) { ++ spin_lock_bh(rt_hash_lock_addr(i)); ++ for (r = rt_hash_table[i].chain; r; r = r->u.rt_next) { ++ r->fl.owner_env = get_ve0(); ++ } ++ spin_unlock_bh(rt_hash_lock_addr(i)); ++ } ++#endif ++} ++ + #ifdef CONFIG_PROC_FS + struct rt_cache_iter_state { + int bucket; + }; + ++static struct rtable 
*rt_cache_get_next(struct seq_file *seq, struct rtable *r); + static struct rtable *rt_cache_get_first(struct seq_file *seq) + { + struct rtable *r = NULL; +@@ -270,6 +289,8 @@ static struct rtable *rt_cache_get_first + break; + rcu_read_unlock_bh(); + } ++ if (r && !ve_accessible_strict(r->fl.owner_env, get_exec_env())) ++ r = rt_cache_get_next(seq, r); + return r; + } + +@@ -277,14 +298,19 @@ static struct rtable *rt_cache_get_next( + { + struct rt_cache_iter_state *st = rcu_dereference(seq->private); + +- r = r->u.rt_next; ++start: ++ do { ++ r = r->u.rt_next; ++ } while (r && !ve_accessible_strict(r->fl.owner_env, get_exec_env())); + while (!r) { + rcu_read_unlock_bh(); + if (--st->bucket < 0) +- break; ++ goto out; + rcu_read_lock_bh(); + r = rt_hash_table[st->bucket].chain; + } ++ goto start; ++out: + return r; + } + +@@ -556,7 +582,8 @@ static inline int compare_keys(struct fl + { + return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 && + fl1->oif == fl2->oif && +- fl1->iif == fl2->iif; ++ fl1->iif == fl2->iif && ++ ve_accessible_strict(fl1->owner_env, fl2->owner_env); + } + + #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED +@@ -670,26 +697,105 @@ static void rt_check_expire(unsigned lon + mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval); + } + ++typedef unsigned long rt_flush_gen_t; ++ ++#ifdef CONFIG_VE ++ ++static rt_flush_gen_t rt_flush_gen; ++ ++/* called under rt_flush_lock */ ++static void set_rt_flush_required(struct ve_struct *env) ++{ ++ /* ++ * If the global generation rt_flush_gen is equal to G, then ++ * the pass considering entries labelled by G is yet to come. 
++ */ ++ env->rt_flush_required = rt_flush_gen; ++} ++ ++static spinlock_t rt_flush_lock; ++static rt_flush_gen_t reset_rt_flush_required(void) ++{ ++ rt_flush_gen_t g; ++ ++ spin_lock_bh(&rt_flush_lock); ++ g = rt_flush_gen++; ++ spin_unlock_bh(&rt_flush_lock); ++ return g; ++} ++ ++static int check_rt_flush_required(struct ve_struct *env, rt_flush_gen_t gen) ++{ ++ /* can be checked without the lock */ ++ return env->rt_flush_required >= gen; ++} ++ ++#else ++ ++static void set_rt_flush_required(struct ve_struct *env) ++{ ++} ++ ++static rt_flush_gen_t reset_rt_flush_required(void) ++{ ++ return 0; ++} ++ ++#endif ++ + /* This can run from both BH and non-BH contexts, the latter + * in the case of a forced flush event. + */ + static void rt_run_flush(unsigned long dummy) + { + int i; +- struct rtable *rth, *next; ++ struct rtable * rth, * next; ++ struct rtable * tail; ++ rt_flush_gen_t gen; + + rt_deadline = 0; + + get_random_bytes(&rt_hash_rnd, 4); + ++ gen = reset_rt_flush_required(); ++ + for (i = rt_hash_mask; i >= 0; i--) { ++#ifdef CONFIG_VE ++ struct rtable ** prev, * p; ++ ++ spin_lock_bh(rt_hash_lock_addr(i)); ++ rth = rt_hash_table[i].chain; ++ ++ /* defer releasing the head of the list after spin_unlock */ ++ for (tail = rth; tail; tail = tail->u.rt_next) ++ if (!check_rt_flush_required(tail->fl.owner_env, gen)) ++ break; ++ if (rth != tail) ++ rt_hash_table[i].chain = tail; ++ ++ /* call rt_free on entries after the tail requiring flush */ ++ prev = &rt_hash_table[i].chain; ++ for (p = *prev; p; p = next) { ++ next = p->u.rt_next; ++ if (!check_rt_flush_required(p->fl.owner_env, gen)) { ++ prev = &p->u.rt_next; ++ } else { ++ *prev = next; ++ rt_free(p); ++ } ++ } ++ ++#else + spin_lock_bh(rt_hash_lock_addr(i)); + rth = rt_hash_table[i].chain; + if (rth) + rt_hash_table[i].chain = NULL; ++ tail = NULL; ++ ++#endif + spin_unlock_bh(rt_hash_lock_addr(i)); + +- for (; rth; rth = next) { ++ for (; rth != tail; rth = next) { + next = rth->u.rt_next; + 
rt_free(rth); + } +@@ -728,6 +834,8 @@ void rt_cache_flush(int delay) + delay = tmo; + } + ++ set_rt_flush_required(get_exec_env()); ++ + if (delay <= 0) { + spin_unlock_bh(&rt_flush_lock); + rt_run_flush(0); +@@ -743,9 +851,30 @@ void rt_cache_flush(int delay) + + static void rt_secret_rebuild(unsigned long dummy) + { ++ int i; ++ struct rtable *rth, *next; + unsigned long now = jiffies; + +- rt_cache_flush(0); ++ spin_lock_bh(&rt_flush_lock); ++ del_timer(&rt_flush_timer); ++ spin_unlock_bh(&rt_flush_lock); ++ ++ rt_deadline = 0; ++ get_random_bytes(&rt_hash_rnd, 4); ++ ++ for (i = rt_hash_mask; i >= 0; i--) { ++ spin_lock_bh(rt_hash_lock_addr(i)); ++ rth = rt_hash_table[i].chain; ++ if (rth) ++ rt_hash_table[i].chain = NULL; ++ spin_unlock_bh(rt_hash_lock_addr(i)); ++ ++ for (; rth; rth = next) { ++ next = rth->u.rt_next; ++ rt_free(rth); ++ } ++ } ++ + mod_timer(&rt_secret_timer, now + ip_rt_secret_interval); + } + +@@ -1118,7 +1247,9 @@ void ip_rt_redirect(u32 old_gw, u32 dadd + struct rtable *rth, **rthp; + u32 skeys[2] = { saddr, 0 }; + int ikeys[2] = { dev->ifindex, 0 }; ++ struct ve_struct *ve; + ++ ve = get_exec_env(); + tos &= IPTOS_RT_MASK; + + if (!in_dev) +@@ -1154,6 +1285,10 @@ void ip_rt_redirect(u32 old_gw, u32 dadd + rth->fl.fl4_src != skeys[i] || + rth->fl.fl4_tos != tos || + rth->fl.oif != ikeys[k] || ++#ifdef CONFIG_VE ++ !ve_accessible_strict(rth->fl.owner_env, ++ ve) || ++#endif + rth->fl.iif != 0) { + rthp = &rth->u.rt_next; + continue; +@@ -1192,6 +1327,9 @@ void ip_rt_redirect(u32 old_gw, u32 dadd + rt->u.dst.neighbour = NULL; + rt->u.dst.hh = NULL; + rt->u.dst.xfrm = NULL; ++#ifdef CONFIG_VE ++ rt->fl.owner_env = ve; ++#endif + + rt->rt_flags |= RTCF_REDIRECTED; + +@@ -1631,6 +1769,9 @@ static int ip_route_input_mc(struct sk_b + #ifdef CONFIG_IP_ROUTE_FWMARK + rth->fl.fl4_fwmark= skb->nfmark; + #endif ++#ifdef CONFIG_VE ++ rth->fl.owner_env = get_exec_env(); ++#endif + rth->fl.fl4_src = saddr; + rth->rt_src = saddr; + #ifdef 
CONFIG_NET_CLS_ROUTE +@@ -1776,6 +1917,9 @@ static inline int __mkroute_input(struct + #ifdef CONFIG_IP_ROUTE_FWMARK + rth->fl.fl4_fwmark= skb->nfmark; + #endif ++#ifdef CONFIG_VE ++ rth->fl.owner_env = get_exec_env(); ++#endif + rth->fl.fl4_src = saddr; + rth->rt_src = saddr; + rth->rt_gateway = daddr; +@@ -2021,6 +2165,9 @@ local_input: + #ifdef CONFIG_IP_ROUTE_FWMARK + rth->fl.fl4_fwmark= skb->nfmark; + #endif ++#ifdef CONFIG_VE ++ rth->fl.owner_env = get_exec_env(); ++#endif + rth->fl.fl4_src = saddr; + rth->rt_src = saddr; + #ifdef CONFIG_NET_CLS_ROUTE +@@ -2100,6 +2247,9 @@ int ip_route_input(struct sk_buff *skb, + #ifdef CONFIG_IP_ROUTE_FWMARK + rth->fl.fl4_fwmark == skb->nfmark && + #endif ++#ifdef CONFIG_VE ++ rth->fl.owner_env == get_exec_env() && ++#endif + rth->fl.fl4_tos == tos) { + rth->u.dst.lastuse = jiffies; + dst_hold(&rth->u.dst); +@@ -2226,6 +2376,9 @@ static inline int __mkroute_output(struc + #ifdef CONFIG_IP_ROUTE_FWMARK + rth->fl.fl4_fwmark= oldflp->fl4_fwmark; + #endif ++#ifdef CONFIG_VE ++ rth->fl.owner_env = get_exec_env(); ++#endif + rth->rt_dst = fl->fl4_dst; + rth->rt_src = fl->fl4_src; + rth->rt_iif = oldflp->oif ? : dev_out->ifindex; +@@ -2399,10 +2552,13 @@ static int ip_route_output_slow(struct r + ZERONET(oldflp->fl4_src)) + goto out; + +- /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ +- dev_out = ip_dev_find(oldflp->fl4_src); +- if (dev_out == NULL) +- goto out; ++ if (ip_rt_src_check) { ++ /* It is equivalent to ++ inet_addr_type(saddr) == RTN_LOCAL */ ++ dev_out = ip_dev_find(oldflp->fl4_src); ++ if (dev_out == NULL) ++ goto out; ++ } + + /* I removed check for oif == dev_out->oif here. + It was wrong for two reasons: +@@ -2429,6 +2585,12 @@ static int ip_route_output_slow(struct r + Luckily, this hack is good workaround. 
+ */ + ++ if (dev_out == NULL) { ++ dev_out = ip_dev_find(oldflp->fl4_src); ++ if (dev_out == NULL) ++ goto out; ++ } ++ + fl.oif = dev_out->ifindex; + goto make_route; + } +@@ -2575,6 +2737,7 @@ int __ip_route_output_key(struct rtable + #ifdef CONFIG_IP_ROUTE_FWMARK + rth->fl.fl4_fwmark == flp->fl4_fwmark && + #endif ++ ve_accessible_strict(rth->fl.owner_env, get_exec_env()) && + !((rth->fl.fl4_tos ^ flp->fl4_tos) & + (IPTOS_RT_MASK | RTO_ONLINK))) { + +@@ -2705,7 +2868,7 @@ static int rt_fill_info(struct sk_buff * + u32 dst = rt->rt_dst; + + if (MULTICAST(dst) && !LOCAL_MCAST(dst) && +- ipv4_devconf.mc_forwarding) { ++ ve_ipv4_devconf.mc_forwarding) { + int err = ipmr_get_route(skb, r, nowait); + if (err <= 0) { + if (!nowait) { +@@ -2750,7 +2913,10 @@ int inet_rtm_getroute(struct sk_buff *in + /* Reserve room for dummy headers, this skb can pass + through good chunk of routing engine. + */ +- skb->mac.raw = skb->data; ++ skb->mac.raw = skb->nh.raw = skb->data; ++ ++ /* Bugfix: need to give ip_route_input enough of an IP header to not gag. 
*/ ++ skb->nh.iph->protocol = IPPROTO_ICMP; + skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr)); + + if (rta[RTA_SRC - 1]) +@@ -2853,22 +3019,22 @@ void ip_rt_multicast_event(struct in_dev + } + + #ifdef CONFIG_SYSCTL +-static int flush_delay; ++int ipv4_flush_delay; + +-static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, ++int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, + struct file *filp, void __user *buffer, + size_t *lenp, loff_t *ppos) + { + if (write) { + proc_dointvec(ctl, write, filp, buffer, lenp, ppos); +- rt_cache_flush(flush_delay); ++ rt_cache_flush(ipv4_flush_delay); + return 0; + } + + return -EINVAL; + } + +-static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, ++int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, + int __user *name, + int nlen, + void __user *oldval, +@@ -2890,7 +3056,7 @@ ctl_table ipv4_route_table[] = { + { + .ctl_name = NET_IPV4_ROUTE_FLUSH, + .procname = "flush", +- .data = &flush_delay, ++ .data = &ipv4_flush_delay, + .maxlen = sizeof(int), + .mode = 0200, + .proc_handler = &ipv4_sysctl_rtcache_flush, +@@ -3184,15 +3350,18 @@ int __init ip_rt_init(void) + #ifdef CONFIG_PROC_FS + { + struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */ +- if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) || +- !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO, +- proc_net_stat))) { ++ ++ if (!proc_glob_fops_create("net/rt_cache", ++ S_IRUGO, &rt_cache_seq_fops)) ++ return -ENOMEM; ++ ++ if (!(rtstat_pde = create_proc_glob_entry("net/stat/rt_cache", ++ S_IRUGO, NULL))) + return -ENOMEM; +- } + rtstat_pde->proc_fops = &rt_cpu_seq_fops; + } + #ifdef CONFIG_NET_CLS_ROUTE +- create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL); ++ create_proc_read_entry("net/rt_acct", 0, NULL, ip_rt_acct_read, NULL); + #endif + #endif + #ifdef CONFIG_XFRM +diff -upr linux-2.6.16.orig/net/ipv4/sysctl_net_ipv4.c linux-2.6.16-026test015/net/ipv4/sysctl_net_ipv4.c +--- 
linux-2.6.16.orig/net/ipv4/sysctl_net_ipv4.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/sysctl_net_ipv4.c 2006-07-04 14:41:39.000000000 +0400 +@@ -22,6 +22,9 @@ + /* From af_inet.c */ + extern int sysctl_ip_nonlocal_bind; + ++int sysctl_tcp_use_sg = 1; ++EXPORT_SYMBOL(sysctl_tcp_use_sg); ++ + #ifdef CONFIG_SYSCTL + static int zero; + static int tcp_retr1_max = 255; +@@ -33,22 +36,21 @@ struct ipv4_config ipv4_config; + + #ifdef CONFIG_SYSCTL + +-static + int ipv4_sysctl_forward(ctl_table *ctl, int write, struct file * filp, + void __user *buffer, size_t *lenp, loff_t *ppos) + { +- int val = ipv4_devconf.forwarding; ++ int val = ve_ipv4_devconf.forwarding; + int ret; + + ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); + +- if (write && ipv4_devconf.forwarding != val) ++ if (write && ve_ipv4_devconf.forwarding != val) + inet_forward_change(); + + return ret; + } + +-static int ipv4_sysctl_forward_strategy(ctl_table *table, ++int ipv4_sysctl_forward_strategy(ctl_table *table, + int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, +@@ -664,6 +666,14 @@ ctl_table ipv4_table[] = { + .mode = 0644, + .proc_handler = &proc_dointvec, + }, ++ { ++ .ctl_name = NET_TCP_USE_SG, ++ .procname = "tcp_use_sg", ++ .data = &sysctl_tcp_use_sg, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec, ++ }, + + { .ctl_name = 0 } + }; +diff -upr linux-2.6.16.orig/net/ipv4/tcp.c linux-2.6.16-026test015/net/ipv4/tcp.c +--- linux-2.6.16.orig/net/ipv4/tcp.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/tcp.c 2006-07-04 14:41:39.000000000 +0400 +@@ -248,6 +248,7 @@ + */ + + #include <linux/config.h> ++#include <linux/kmem_cache.h> + #include <linux/module.h> + #include <linux/types.h> + #include <linux/fcntl.h> +@@ -263,6 +264,9 @@ + #include <net/xfrm.h> + #include <net/ip.h> + ++#include <ub/ub_orphan.h> ++#include <ub/ub_net.h> ++#include 
<ub/ub_tcp.h> + + #include <asm/uaccess.h> + #include <asm/ioctls.h> +@@ -321,6 +325,7 @@ unsigned int tcp_poll(struct file *file, + unsigned int mask; + struct sock *sk = sock->sk; + struct tcp_sock *tp = tcp_sk(sk); ++ int check_send_space; + + poll_wait(file, sk->sk_sleep, wait); + if (sk->sk_state == TCP_LISTEN) +@@ -335,6 +340,21 @@ unsigned int tcp_poll(struct file *file, + if (sk->sk_err) + mask = POLLERR; + ++ check_send_space = 1; ++#ifdef CONFIG_USER_RESOURCE ++ if (!(sk->sk_shutdown & SEND_SHUTDOWN) && sock_has_ubc(sk)) { ++ unsigned long size; ++ size = MAX_TCP_HEADER + tp->mss_cache; ++ if (size > SOCK_MIN_UBCSPACE) ++ size = SOCK_MIN_UBCSPACE; ++ size = skb_charge_size(size); ++ if (ub_sock_makewres_tcp(sk, size)) { ++ check_send_space = 0; ++ ub_sock_sndqueueadd_tcp(sk, size); ++ } ++ } ++#endif ++ + /* + * POLLHUP is certainly not done right. But poll() doesn't + * have a notion of HUP in just one direction, and for a +@@ -378,7 +398,7 @@ unsigned int tcp_poll(struct file *file, + sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data)) + mask |= POLLIN | POLLRDNORM; + +- if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { ++ if (check_send_space && !(sk->sk_shutdown & SEND_SHUTDOWN)) { + if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { + mask |= POLLOUT | POLLWRNORM; + } else { /* send SIGIO later */ +@@ -528,16 +548,23 @@ static ssize_t do_tcp_sendpages(struct s + int copy, i, can_coalesce; + int offset = poffset % PAGE_SIZE; + int size = min_t(size_t, psize, PAGE_SIZE - offset); ++ unsigned long chargesize = 0; + + if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) { + new_segment: ++ chargesize = 0; + if (!sk_stream_memory_free(sk)) + goto wait_for_sndbuf; + ++ chargesize = skb_charge_size(MAX_TCP_HEADER + ++ tp->mss_cache); ++ if (ub_sock_getwres_tcp(sk, chargesize) < 0) ++ goto wait_for_ubspace; + skb = sk_stream_alloc_pskb(sk, 0, 0, + sk->sk_allocation); + if (!skb) + goto wait_for_memory; ++ ub_skb_set_charge(skb, sk, chargesize, 
UB_TCPSNDBUF); + + skb_entail(sk, tp, skb); + copy = size_goal; +@@ -593,10 +620,14 @@ new_segment: + wait_for_sndbuf: + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + wait_for_memory: ++ ub_sock_retwres_tcp(sk, chargesize, ++ skb_charge_size(MAX_TCP_HEADER + tp->mss_cache)); ++ chargesize = 0; ++wait_for_ubspace: + if (copied) + tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); + +- if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) ++ if ((err = sk_stream_wait_memory(sk, &timeo, chargesize)) != 0) + goto do_error; + + mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); +@@ -699,6 +730,7 @@ int tcp_sendmsg(struct kiocb *iocb, stru + while (--iovlen >= 0) { + int seglen = iov->iov_len; + unsigned char __user *from = iov->iov_base; ++ unsigned long chargesize = 0; + + iov++; + +@@ -709,18 +741,26 @@ int tcp_sendmsg(struct kiocb *iocb, stru + + if (!sk->sk_send_head || + (copy = size_goal - skb->len) <= 0) { ++ unsigned long size; + + new_segment: + /* Allocate new segment. If the interface is SG, + * allocate skb fitting to single page. + */ ++ chargesize = 0; + if (!sk_stream_memory_free(sk)) + goto wait_for_sndbuf; +- +- skb = sk_stream_alloc_pskb(sk, select_size(sk, tp), +- 0, sk->sk_allocation); ++ size = select_size(sk, tp); ++ chargesize = skb_charge_size(MAX_TCP_HEADER + ++ size); ++ if (ub_sock_getwres_tcp(sk, chargesize) < 0) ++ goto wait_for_ubspace; ++ skb = sk_stream_alloc_pskb(sk, size, 0, ++ sk->sk_allocation); + if (!skb) + goto wait_for_memory; ++ ub_skb_set_charge(skb, sk, chargesize, ++ UB_TCPSNDBUF); + + /* + * Check whether we can use HW checksum. +@@ -768,6 +808,7 @@ new_segment: + } else if (page) { + if (off == PAGE_SIZE) { + put_page(page); ++ ub_sock_tcp_detachpage(sk); + TCP_PAGE(sk) = page = NULL; + off = 0; + } +@@ -781,6 +822,9 @@ new_segment: + goto wait_for_memory; + + if (!page) { ++ chargesize = PAGE_SIZE; ++ if (ub_sock_tcp_chargepage(sk) < 0) ++ goto wait_for_ubspace; + /* Allocate new cache page. 
*/ + if (!(page = sk_stream_alloc_page(sk))) + goto wait_for_memory; +@@ -812,7 +856,8 @@ new_segment: + } else if (off + copy < PAGE_SIZE) { + get_page(page); + TCP_PAGE(sk) = page; +- } ++ } else ++ ub_sock_tcp_detachpage(sk); + } + + TCP_OFF(sk) = off + copy; +@@ -843,10 +888,15 @@ new_segment: + wait_for_sndbuf: + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + wait_for_memory: ++ ub_sock_retwres_tcp(sk, chargesize, ++ skb_charge_size(MAX_TCP_HEADER+tp->mss_cache)); ++ chargesize = 0; ++wait_for_ubspace: + if (copied) + tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); + +- if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) ++ if ((err = sk_stream_wait_memory(sk, &timeo, ++ chargesize)) != 0) + goto do_error; + + mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); +@@ -944,7 +994,18 @@ static void cleanup_rbuf(struct sock *sk + #if TCP_DEBUG + struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); + +- BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)); ++ if (!(skb==NULL || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq))) { ++ printk("KERNEL: assertion: skb==NULL || " ++ "before(tp->copied_seq, skb->end_seq)\n"); ++ printk("VE%u pid %d comm %.16s\n", ++ (get_exec_env() ? VEID(get_exec_env()) : 0), ++ current->pid, current->comm); ++ printk("copied=%d, copied_seq=%d, rcv_nxt=%d\n", copied, ++ tp->copied_seq, tp->rcv_nxt); ++ printk("skb->len=%d, skb->seq=%d, skb->end_seq=%d\n", ++ skb->len, TCP_SKB_CB(skb)->seq, ++ TCP_SKB_CB(skb)->end_seq); ++ } + #endif + + if (inet_csk_ack_scheduled(sk)) { +@@ -1168,7 +1229,22 @@ int tcp_recvmsg(struct kiocb *iocb, stru + goto found_ok_skb; + if (skb->h.th->fin) + goto found_fin_ok; +- BUG_TRAP(flags & MSG_PEEK); ++ if (!(flags & MSG_PEEK)) { ++ printk("KERNEL: assertion: flags&MSG_PEEK\n"); ++ printk("VE%u pid %d comm %.16s\n", ++ (get_exec_env() ? 
++ VEID(get_exec_env()) : 0), ++ current->pid, current->comm); ++ printk("flags=0x%x, len=%d, copied_seq=%d, " ++ "rcv_nxt=%d\n", flags, len, ++ tp->copied_seq, tp->rcv_nxt); ++ printk("skb->len=%d, *seq=%d, skb->seq=%d, " ++ "skb->end_seq=%d, offset=%d\n", ++ skb->len, *seq, ++ TCP_SKB_CB(skb)->seq, ++ TCP_SKB_CB(skb)->end_seq, ++ offset); ++ } + skb = skb->next; + } while (skb != (struct sk_buff *)&sk->sk_receive_queue); + +@@ -1231,8 +1307,18 @@ int tcp_recvmsg(struct kiocb *iocb, stru + + tp->ucopy.len = len; + +- BUG_TRAP(tp->copied_seq == tp->rcv_nxt || +- (flags & (MSG_PEEK | MSG_TRUNC))); ++ if (!(tp->copied_seq == tp->rcv_nxt || ++ (flags&(MSG_PEEK|MSG_TRUNC)))) { ++ printk("KERNEL: assertion: tp->copied_seq == " ++ "tp->rcv_nxt || ...\n"); ++ printk("VE%u pid %d comm %.16s\n", ++ (get_exec_env() ? ++ VEID(get_exec_env()) : 0), ++ current->pid, current->comm); ++ printk("flags=0x%x, len=%d, copied_seq=%d, " ++ "rcv_nxt=%d\n", flags, len, ++ tp->copied_seq, tp->rcv_nxt); ++ } + + /* Ugly... 
If prequeue is not empty, we have to + * process it before releasing socket, otherwise +@@ -1583,7 +1669,7 @@ adjudge_to_death: + if (tmo > TCP_TIMEWAIT_LEN) { + inet_csk_reset_keepalive_timer(sk, tcp_fin_time(sk)); + } else { +- atomic_inc(sk->sk_prot->orphan_count); ++ ub_inc_orphan_count(sk); + tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); + goto out; + } +@@ -1591,9 +1677,7 @@ adjudge_to_death: + } + if (sk->sk_state != TCP_CLOSE) { + sk_stream_mem_reclaim(sk); +- if (atomic_read(sk->sk_prot->orphan_count) > sysctl_tcp_max_orphans || +- (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && +- atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) { ++ if (ub_too_many_orphans(sk, ub_get_orphan_count(sk))) { + if (net_ratelimit()) + printk(KERN_INFO "TCP: too many of orphaned " + "sockets\n"); +@@ -1602,7 +1686,7 @@ adjudge_to_death: + NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY); + } + } +- atomic_inc(sk->sk_prot->orphan_count); ++ ub_inc_orphan_count(sk); + + if (sk->sk_state == TCP_CLOSE) + inet_csk_destroy_sock(sk); +@@ -2051,7 +2135,7 @@ void __init tcp_init(void) + tcp_hashinfo.bind_bucket_cachep = + kmem_cache_create("tcp_bind_bucket", + sizeof(struct inet_bind_bucket), 0, +- SLAB_HWCACHE_ALIGN, NULL, NULL); ++ SLAB_HWCACHE_ALIGN | SLAB_UBC, NULL, NULL); + if (!tcp_hashinfo.bind_bucket_cachep) + panic("tcp_init: Cannot alloc tcp_bind_bucket cache."); + +diff -upr linux-2.6.16.orig/net/ipv4/tcp_input.c linux-2.6.16-026test015/net/ipv4/tcp_input.c +--- linux-2.6.16.orig/net/ipv4/tcp_input.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/tcp_input.c 2006-07-04 14:41:37.000000000 +0400 +@@ -72,6 +72,8 @@ + #include <linux/ipsec.h> + #include <asm/unaligned.h> + ++#include <ub/ub_tcp.h> ++ + int sysctl_tcp_timestamps = 1; + int sysctl_tcp_window_scaling = 1; + int sysctl_tcp_sack = 1; +@@ -252,7 +254,7 @@ static void tcp_grow_window(struct sock + /* Check #1 */ + if (tp->rcv_ssthresh < tp->window_clamp && + (int)tp->rcv_ssthresh < tcp_space(sk) && +- 
!tcp_memory_pressure) { ++ ub_tcp_rmem_allows_expand(sk)) { + int incr; + + /* Check #2. Increase window, if skb with such overhead +@@ -321,6 +323,8 @@ static void tcp_init_buffer_space(struct + + tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp); + tp->snd_cwnd_stamp = tcp_time_stamp; ++ ++ ub_tcp_update_maxadvmss(sk); + } + + /* 5. Recalculate window clamp after socket hit its memory bounds. */ +@@ -332,7 +336,7 @@ static void tcp_clamp_window(struct sock + + if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && + !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && +- !tcp_memory_pressure && ++ !ub_tcp_memory_pressure(sk) && + atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { + sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), + sysctl_tcp_rmem[2]); +@@ -3118,7 +3122,7 @@ queue_and_out: + !sk_stream_rmem_schedule(sk, skb))) { + if (tcp_prune_queue(sk) < 0 || + !sk_stream_rmem_schedule(sk, skb)) +- goto drop; ++ goto drop_part; + } + sk_stream_set_owner_r(skb, sk); + __skb_queue_tail(&sk->sk_receive_queue, skb); +@@ -3162,6 +3166,12 @@ out_of_window: + drop: + __kfree_skb(skb); + return; ++ ++drop_part: ++ if (after(tp->copied_seq, tp->rcv_nxt)) ++ tp->rcv_nxt = tp->copied_seq; ++ __kfree_skb(skb); ++ return; + } + + /* Out of window. F.e. zero window probe. 
*/ +@@ -3333,6 +3343,10 @@ tcp_collapse(struct sock *sk, struct sk_ + nskb = alloc_skb(copy+header, GFP_ATOMIC); + if (!nskb) + return; ++ if (ub_tcprcvbuf_charge_forced(skb->sk, nskb) < 0) { ++ kfree_skb(nskb); ++ return; ++ } + skb_reserve(nskb, header); + memcpy(nskb->head, skb->head, header); + nskb->nh.raw = nskb->head + (skb->nh.raw-skb->head); +@@ -3429,7 +3443,7 @@ static int tcp_prune_queue(struct sock * + + if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) + tcp_clamp_window(sk, tp); +- else if (tcp_memory_pressure) ++ else if (ub_tcp_memory_pressure(sk)) + tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); + + tcp_collapse_ofo_queue(sk); +@@ -3505,7 +3519,7 @@ static int tcp_should_expand_sndbuf(stru + return 0; + + /* If we are under global TCP memory pressure, do not expand. */ +- if (tcp_memory_pressure) ++ if (ub_tcp_memory_pressure(sk)) + return 0; + + /* If we are under soft global TCP memory pressure, do not expand. */ +@@ -3898,6 +3912,10 @@ int tcp_rcv_established(struct sock *sk, + + if ((int)skb->truesize > sk->sk_forward_alloc) + goto step5; ++ /* This is OK not to try to free memory here. ++ * Do this below on slow path. 
Den */ ++ if (ub_tcprcvbuf_charge(sk, skb) < 0) ++ goto step5; + + NET_INC_STATS_BH(LINUX_MIB_TCPHPHITS); + +diff -upr linux-2.6.16.orig/net/ipv4/tcp_ipv4.c linux-2.6.16-026test015/net/ipv4/tcp_ipv4.c +--- linux-2.6.16.orig/net/ipv4/tcp_ipv4.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/tcp_ipv4.c 2006-07-04 14:41:39.000000000 +0400 +@@ -72,6 +72,8 @@ + #include <net/timewait_sock.h> + #include <net/xfrm.h> + ++#include <ub/ub_tcp.h> ++ + #include <linux/inet.h> + #include <linux/ipv6.h> + #include <linux/stddef.h> +@@ -705,6 +707,7 @@ struct request_sock_ops tcp_request_sock + .destructor = tcp_v4_reqsk_destructor, + .send_reset = tcp_v4_send_reset, + }; ++EXPORT_SYMBOL_GPL(tcp_request_sock_ops); + + static struct timewait_sock_ops tcp_timewait_sock_ops = { + .twsk_obj_size = sizeof(struct tcp_timewait_sock), +@@ -979,12 +982,15 @@ static int tcp_v4_checksum_init(struct s + */ + int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) + { ++ struct user_beancounter *ub; ++ ++ ub = set_exec_ub(sock_bc(sk)->ub); + if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ + TCP_CHECK_TIMER(sk); + if (tcp_rcv_established(sk, skb, skb->h.th, skb->len)) + goto reset; + TCP_CHECK_TIMER(sk); +- return 0; ++ goto restore_context; + } + + if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb)) +@@ -998,7 +1004,7 @@ int tcp_v4_do_rcv(struct sock *sk, struc + if (nsk != sk) { + if (tcp_child_process(sk, nsk, skb)) + goto reset; +- return 0; ++ goto restore_context; + } + } + +@@ -1006,6 +1012,9 @@ int tcp_v4_do_rcv(struct sock *sk, struc + if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len)) + goto reset; + TCP_CHECK_TIMER(sk); ++ ++restore_context: ++ (void)set_exec_ub(ub); + return 0; + + reset: +@@ -1017,7 +1026,7 @@ discard: + * might be destroyed here. This current version compiles correctly, + * but you have been warned. 
+ */ +- return 0; ++ goto restore_context; + + csum_err: + TCP_INC_STATS_BH(TCP_MIB_INERRS); +@@ -1302,6 +1311,8 @@ int tcp_v4_destroy_sock(struct sock *sk) + * If sendmsg cached page exists, toss it. + */ + if (sk->sk_sndmsg_page) { ++ /* queue is empty, uncharge */ ++ ub_sock_tcp_detachpage(sk); + __free_page(sk->sk_sndmsg_page); + sk->sk_sndmsg_page = NULL; + } +@@ -1316,16 +1327,34 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock); + #ifdef CONFIG_PROC_FS + /* Proc filesystem TCP sock list dumping. */ + +-static inline struct inet_timewait_sock *tw_head(struct hlist_head *head) ++static inline struct inet_timewait_sock *tw_head(struct hlist_head *head, ++ envid_t veid) + { +- return hlist_empty(head) ? NULL : +- list_entry(head->first, struct inet_timewait_sock, tw_node); ++ struct inet_timewait_sock *tw; ++ struct hlist_node *pos; ++ ++ if (hlist_empty(head)) ++ return NULL; ++ hlist_for_each_entry(tw, pos, head, tw_node) { ++ if (!ve_accessible_veid(tw->tw_owner_env, veid)) ++ continue; ++ return tw; ++ } ++ return NULL; + } + +-static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw) ++static inline struct inet_timewait_sock * ++ tw_next(struct inet_timewait_sock *tw, envid_t veid) + { +- return tw->tw_node.next ? 
+- hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL; ++ while (1) { ++ if (tw->tw_node.next == NULL) ++ return NULL; ++ tw = hlist_entry(tw->tw_node.next, typeof(*tw), tw_node); ++ if (!ve_accessible_veid(tw->tw_owner_env, veid)) ++ continue; ++ return tw; ++ } ++ return NULL; /* make compiler happy */ + } + + static void *listening_get_next(struct seq_file *seq, void *cur) +@@ -1334,7 +1363,9 @@ static void *listening_get_next(struct s + struct hlist_node *node; + struct sock *sk = cur; + struct tcp_iter_state* st = seq->private; ++ struct ve_struct *ve; + ++ ve = get_exec_env(); + if (!sk) { + st->bucket = 0; + sk = sk_head(&tcp_hashinfo.listening_hash[0]); +@@ -1374,6 +1405,8 @@ get_req: + } + get_sk: + sk_for_each_from(sk, node) { ++ if (!ve_accessible(VE_OWNER_SK(sk), ve)) ++ continue; + if (sk->sk_family == st->family) { + cur = sk; + goto out; +@@ -1414,7 +1447,9 @@ static void *established_get_first(struc + { + struct tcp_iter_state* st = seq->private; + void *rc = NULL; ++ struct ve_struct *ve; + ++ ve = get_exec_env(); + for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) { + struct sock *sk; + struct hlist_node *node; +@@ -1425,6 +1460,8 @@ static void *established_get_first(struc + + read_lock(&tcp_hashinfo.ehash[st->bucket].lock); + sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { ++ if (!ve_accessible(VE_OWNER_SK(sk), ve)) ++ continue; + if (sk->sk_family != st->family) { + continue; + } +@@ -1434,6 +1471,8 @@ static void *established_get_first(struc + st->state = TCP_SEQ_STATE_TIME_WAIT; + inet_twsk_for_each(tw, node, + &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) { ++ if (!ve_accessible_veid(tw->tw_owner_env, VEID(ve))) ++ continue; + if (tw->tw_family != st->family) { + continue; + } +@@ -1453,16 +1492,17 @@ static void *established_get_next(struct + struct inet_timewait_sock *tw; + struct hlist_node *node; + struct tcp_iter_state* st = seq->private; ++ struct ve_struct *ve; + ++ ve = 
get_exec_env(); + ++st->num; + + if (st->state == TCP_SEQ_STATE_TIME_WAIT) { + tw = cur; +- tw = tw_next(tw); ++ tw = tw_next(tw, VEID(ve)); + get_tw: +- while (tw && tw->tw_family != st->family) { +- tw = tw_next(tw); +- } ++ while (tw && tw->tw_family != st->family) ++ tw = tw_next(tw, VEID(ve)); + if (tw) { + cur = tw; + goto out; +@@ -1484,12 +1524,15 @@ get_tw: + sk = sk_next(sk); + + sk_for_each_from(sk, node) { ++ if (!ve_accessible(VE_OWNER_SK(sk), ve)) ++ continue; + if (sk->sk_family == st->family) + goto found; + } + + st->state = TCP_SEQ_STATE_TIME_WAIT; +- tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain); ++ tw = tw_head(&tcp_hashinfo.ehash[st->bucket + ++ tcp_hashinfo.ehash_size].chain, VEID(ve)); + goto get_tw; + found: + cur = sk; +@@ -1635,7 +1678,7 @@ int tcp_proc_register(struct tcp_seq_afi + afinfo->seq_fops->llseek = seq_lseek; + afinfo->seq_fops->release = seq_release_private; + +- p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops); ++ p = proc_glob_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops); + if (p) + p->data = afinfo; + else +@@ -1647,7 +1690,8 @@ void tcp_proc_unregister(struct tcp_seq_ + { + if (!afinfo) + return; +- proc_net_remove(afinfo->name); ++ ++ remove_proc_glob_entry(afinfo->name, NULL); + memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops)); + } + +@@ -1777,7 +1821,7 @@ out: + static struct file_operations tcp4_seq_fops; + static struct tcp_seq_afinfo tcp4_seq_afinfo = { + .owner = THIS_MODULE, +- .name = "tcp", ++ .name = "net/tcp", + .family = AF_INET, + .seq_show = tcp4_seq_show, + .seq_fops = &tcp4_seq_fops, +@@ -1844,6 +1888,86 @@ void __init tcp_v4_init(struct net_proto + tcp_socket->sk->sk_prot->unhash(tcp_socket->sk); + } + ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++static void tcp_kill_ve_onesk(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ /* Check the assumed state of the socket. 
*/ ++ if (!sock_flag(sk, SOCK_DEAD)) { ++ static int printed; ++invalid: ++ if (!printed) ++ printk(KERN_DEBUG "Killing sk: dead %d, state %d, " ++ "wrseq %u unseq %u, wrqu %d.\n", ++ sock_flag(sk, SOCK_DEAD), sk->sk_state, ++ tp->write_seq, tp->snd_una, ++ !skb_queue_empty(&sk->sk_write_queue)); ++ printed = 1; ++ return; ++ } ++ ++ tcp_send_active_reset(sk, GFP_ATOMIC); ++ switch (sk->sk_state) { ++ case TCP_FIN_WAIT1: ++ case TCP_CLOSING: ++ /* In these 2 states the peer may want us to retransmit ++ * some data and/or FIN. Entering "resetting mode" ++ * instead. ++ */ ++ tcp_time_wait(sk, TCP_CLOSE, 0); ++ break; ++ case TCP_FIN_WAIT2: ++ /* By some reason the socket may stay in this state ++ * without turning into a TW bucket. Fix it. ++ */ ++ tcp_time_wait(sk, TCP_FIN_WAIT2, 0); ++ break; ++ case TCP_LAST_ACK: ++ /* Just jump into CLOSED state. */ ++ tcp_done(sk); ++ break; ++ default: ++ /* The socket must be already close()d. */ ++ goto invalid; ++ } ++} ++ ++void tcp_v4_kill_ve_sockets(struct ve_struct *envid) ++{ ++ struct inet_ehash_bucket *head; ++ int i; ++ ++ /* alive */ ++ local_bh_disable(); ++ head = tcp_hashinfo.ehash; ++ for (i = 0; i < tcp_hashinfo.ehash_size; i++) { ++ struct sock *sk; ++ struct hlist_node *node; ++more_work: ++ write_lock(&head[i].lock); ++ sk_for_each(sk, node, &head[i].chain) { ++ if (ve_accessible_strict(VE_OWNER_SK(sk), envid)) { ++ sock_hold(sk); ++ write_unlock(&head[i].lock); ++ ++ bh_lock_sock(sk); ++ /* sk might have disappeared from the hash before ++ * we got the lock */ ++ if (sk->sk_state != TCP_CLOSE) ++ tcp_kill_ve_onesk(sk); ++ bh_unlock_sock(sk); ++ sock_put(sk); ++ goto more_work; ++ } ++ } ++ write_unlock(&head[i].lock); ++ } ++ local_bh_enable(); ++} ++EXPORT_SYMBOL(tcp_v4_kill_ve_sockets); ++#endif ++ + EXPORT_SYMBOL(ipv4_specific); + EXPORT_SYMBOL(tcp_hashinfo); + EXPORT_SYMBOL(tcp_prot); +diff -upr linux-2.6.16.orig/net/ipv4/tcp_minisocks.c linux-2.6.16-026test015/net/ipv4/tcp_minisocks.c +--- 
linux-2.6.16.orig/net/ipv4/tcp_minisocks.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/tcp_minisocks.c 2006-07-04 14:41:39.000000000 +0400 +@@ -29,6 +29,8 @@ + #include <net/inet_common.h> + #include <net/xfrm.h> + ++#include <ub/ub_net.h> ++ + #ifdef CONFIG_SYSCTL + #define SYNC_INIT 0 /* let the user enable it */ + #else +@@ -307,6 +309,8 @@ void tcp_time_wait(struct sock *sk, int + tw->tw_ipv6only = np->ipv6only; + } + #endif ++ tw->tw_owner_env = VEID(VE_OWNER_SK(sk)); ++ + /* Linkage updates. */ + __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); + +@@ -355,6 +359,8 @@ struct sock *tcp_create_openreq_child(st + struct tcp_sock *newtp; + + /* Now setup tcp_sock */ ++ SET_VE_OWNER_SK(newsk, VE_OWNER_SK(sk)); ++ + newtp = tcp_sk(newsk); + newtp->pred_flags = 0; + newtp->rcv_nxt = treq->rcv_isn + 1; +diff -upr linux-2.6.16.orig/net/ipv4/tcp_output.c linux-2.6.16-026test015/net/ipv4/tcp_output.c +--- linux-2.6.16.orig/net/ipv4/tcp_output.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/tcp_output.c 2006-07-04 14:41:37.000000000 +0400 +@@ -42,6 +42,9 @@ + #include <linux/module.h> + #include <linux/smp_lock.h> + ++#include <ub/ub_net.h> ++#include <ub/ub_tcp.h> ++ + /* People can turn this off for buggy TCP's found in printers etc. */ + int sysctl_tcp_retrans_collapse = 1; + +@@ -528,16 +531,26 @@ int tcp_fragment(struct sock *sk, struct + if (nsize < 0) + nsize = 0; + +- if (skb_cloned(skb) && +- skb_is_nonlinear(skb) && +- pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) +- return -ENOMEM; ++ if (skb_cloned(skb) && skb_is_nonlinear(skb)) { ++ unsigned long chargesize; ++ chargesize = skb_bc(skb)->charged; ++ if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) ++ return -ENOMEM; ++ ub_sock_retwres_tcp(sk, chargesize, chargesize); ++ ub_tcpsndbuf_charge_forced(sk, skb); ++ } + + /* Get a new skb... force flag on. 
*/ + buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC); + if (buff == NULL) + return -ENOMEM; /* We'll just try again later. */ +- sk_charge_skb(sk, buff); ++ if (ub_tcpsndbuf_charge(sk, buff) < 0) { ++ kfree_skb(buff); ++ return -ENOMEM; ++ } ++ ++ buff->truesize = skb->len - len; ++ skb->truesize -= buff->truesize; + + /* Correct the sequence numbers. */ + TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; +@@ -978,6 +991,11 @@ static int tso_fragment(struct sock *sk, + if (unlikely(buff == NULL)) + return -ENOMEM; + ++ if (ub_tcpsndbuf_charge(sk, buff) < 0) { ++ kfree_skb(buff); ++ return -ENOMEM; ++ } ++ + buff->truesize = nlen; + skb->truesize -= nlen; + +@@ -1281,7 +1299,7 @@ u32 __tcp_select_window(struct sock *sk) + if (free_space < full_space/2) { + icsk->icsk_ack.quick = 0; + +- if (tcp_memory_pressure) ++ if (ub_tcp_shrink_rcvbuf(sk)) + tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss); + + if (free_space < mss) +@@ -1708,6 +1726,7 @@ void tcp_send_fin(struct sock *sk) + break; + yield(); + } ++ ub_tcpsndbuf_charge_forced(sk, skb); + + /* Reserve space for headers and prepare control bits. */ + skb_reserve(skb, MAX_TCP_HEADER); +@@ -1777,6 +1796,10 @@ int tcp_send_synack(struct sock *sk) + struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC); + if (nskb == NULL) + return -ENOMEM; ++ if (ub_tcpsndbuf_charge(sk, skb) < 0) { ++ kfree_skb(nskb); ++ return -ENOMEM; ++ } + __skb_unlink(skb, &sk->sk_write_queue); + skb_header_release(nskb); + __skb_queue_head(&sk->sk_write_queue, nskb); +@@ -1928,6 +1951,10 @@ int tcp_connect(struct sock *sk) + buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation); + if (unlikely(buff == NULL)) + return -ENOBUFS; ++ if (ub_tcpsndbuf_charge(sk, buff) < 0) { ++ kfree_skb(buff); ++ return -ENOBUFS; ++ } + + /* Reserve space for headers. 
*/ + skb_reserve(buff, MAX_TCP_HEADER); +diff -upr linux-2.6.16.orig/net/ipv4/tcp_timer.c linux-2.6.16-026test015/net/ipv4/tcp_timer.c +--- linux-2.6.16.orig/net/ipv4/tcp_timer.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/tcp_timer.c 2006-07-04 14:41:39.000000000 +0400 +@@ -22,6 +22,8 @@ + + #include <linux/module.h> + #include <net/tcp.h> ++#include <ub/ub_orphan.h> ++#include <ub/ub_tcp.h> + + int sysctl_tcp_syn_retries = TCP_SYN_RETRIES; + int sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; +@@ -67,7 +69,7 @@ static void tcp_write_err(struct sock *s + static int tcp_out_of_resources(struct sock *sk, int do_reset) + { + struct tcp_sock *tp = tcp_sk(sk); +- int orphans = atomic_read(&tcp_orphan_count); ++ int orphans = ub_get_orphan_count(sk); + + /* If peer does not open window for long time, or did not transmit + * anything for long time, penalize it. */ +@@ -78,9 +80,7 @@ static int tcp_out_of_resources(struct s + if (sk->sk_err_soft) + orphans <<= 1; + +- if (orphans >= sysctl_tcp_max_orphans || +- (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && +- atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) { ++ if (ub_too_many_orphans(sk, orphans)) { + if (net_ratelimit()) + printk(KERN_INFO "Out of socket memory\n"); + +@@ -173,9 +173,12 @@ static int tcp_write_timeout(struct sock + static void tcp_delack_timer(unsigned long data) + { + struct sock *sk = (struct sock*)data; ++ struct ve_struct *env; + struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + ++ env = set_exec_env(VE_OWNER_SK(sk)); ++ + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + /* Try again later. 
*/ +@@ -224,11 +227,12 @@ static void tcp_delack_timer(unsigned lo + TCP_CHECK_TIMER(sk); + + out: +- if (tcp_memory_pressure) ++ if (ub_tcp_memory_pressure(sk)) + sk_stream_mem_reclaim(sk); + out_unlock: + bh_unlock_sock(sk); + sock_put(sk); ++ (void)set_exec_env(env); + } + + static void tcp_probe_timer(struct sock *sk) +@@ -283,8 +287,11 @@ static void tcp_probe_timer(struct sock + static void tcp_retransmit_timer(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); ++ struct ve_struct *env; + struct inet_connection_sock *icsk = inet_csk(sk); + ++ env = set_exec_env(VE_OWNER_SK(sk)); ++ + if (!tp->packets_out) + goto out; + +@@ -381,15 +388,19 @@ out_reset_timer: + if (icsk->icsk_retransmits > sysctl_tcp_retries1) + __sk_dst_reset(sk); + +-out:; ++out: ++ (void)set_exec_env(env); + } + + static void tcp_write_timer(unsigned long data) + { + struct sock *sk = (struct sock*)data; ++ struct ve_struct *env; + struct inet_connection_sock *icsk = inet_csk(sk); + int event; + ++ env = set_exec_env(VE_OWNER_SK(sk)); ++ + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + /* Try again later */ +@@ -423,6 +434,7 @@ out: + out_unlock: + bh_unlock_sock(sk); + sock_put(sk); ++ (void)set_exec_env(env); + } + + /* +@@ -450,10 +462,13 @@ void tcp_set_keepalive(struct sock *sk, + static void tcp_keepalive_timer (unsigned long data) + { + struct sock *sk = (struct sock *) data; ++ struct ve_struct *env; + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + __u32 elapsed; + ++ env = set_exec_env(VE_OWNER_SK(sk)); ++ + /* Only process if socket is not in use. 
*/ + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { +@@ -525,4 +540,5 @@ death: + out: + bh_unlock_sock(sk); + sock_put(sk); ++ (void)set_exec_env(env); + } +diff -upr linux-2.6.16.orig/net/ipv4/udp.c linux-2.6.16-026test015/net/ipv4/udp.c +--- linux-2.6.16.orig/net/ipv4/udp.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv4/udp.c 2006-07-04 14:41:39.000000000 +0400 +@@ -127,7 +127,9 @@ static int udp_v4_get_port(struct sock * + struct hlist_node *node; + struct sock *sk2; + struct inet_sock *inet = inet_sk(sk); ++ struct ve_struct *env; + ++ env = VE_OWNER_SK(sk); + write_lock_bh(&udp_hash_lock); + if (snum == 0) { + int best_size_so_far, best, result, i; +@@ -141,7 +143,7 @@ static int udp_v4_get_port(struct sock * + struct hlist_head *list; + int size; + +- list = &udp_hash[result & (UDP_HTABLE_SIZE - 1)]; ++ list = &udp_hash[udp_hashfn(result, VEID(env))]; + if (hlist_empty(list)) { + if (result > sysctl_local_port_range[1]) + result = sysctl_local_port_range[0] + +@@ -163,7 +165,7 @@ static int udp_v4_get_port(struct sock * + result = sysctl_local_port_range[0] + + ((result - sysctl_local_port_range[0]) & + (UDP_HTABLE_SIZE - 1)); +- if (!udp_lport_inuse(result)) ++ if (!udp_lport_inuse(result, env)) + break; + } + if (i >= (1 << 16) / UDP_HTABLE_SIZE) +@@ -172,11 +174,12 @@ gotit: + udp_port_rover = snum = result; + } else { + sk_for_each(sk2, node, +- &udp_hash[snum & (UDP_HTABLE_SIZE - 1)]) { ++ &udp_hash[udp_hashfn(snum, VEID(env))]) { + struct inet_sock *inet2 = inet_sk(sk2); + + if (inet2->num == snum && + sk2 != sk && ++ ve_accessible_strict(VE_OWNER_SK(sk2), env) && + !ipv6_only_sock(sk2) && + (!sk2->sk_bound_dev_if || + !sk->sk_bound_dev_if || +@@ -190,7 +193,7 @@ gotit: + } + inet->num = snum; + if (sk_unhashed(sk)) { +- struct hlist_head *h = &udp_hash[snum & (UDP_HTABLE_SIZE - 1)]; ++ struct hlist_head *h = &udp_hash[udp_hashfn(snum, VEID(env))]; + + sk_add_node(sk, h); + sock_prot_inc_use(sk->sk_prot); +@@ -228,11 
+231,15 @@ static struct sock *udp_v4_lookup_longwa + struct hlist_node *node; + unsigned short hnum = ntohs(dport); + int badness = -1; ++ struct ve_struct *env; + +- sk_for_each(sk, node, &udp_hash[hnum & (UDP_HTABLE_SIZE - 1)]) { ++ env = get_exec_env(); ++ sk_for_each(sk, node, &udp_hash[udp_hashfn(hnum, VEID(env))]) { + struct inet_sock *inet = inet_sk(sk); + +- if (inet->num == hnum && !ipv6_only_sock(sk)) { ++ if (inet->num == hnum && ++ ve_accessible_strict(VE_OWNER_SK(sk), env) && ++ !ipv6_only_sock(sk)) { + int score = (sk->sk_family == PF_INET ? 1 : 0); + if (inet->rcv_saddr) { + if (inet->rcv_saddr != daddr) +@@ -1049,7 +1056,8 @@ static int udp_v4_mcast_deliver(struct s + int dif; + + read_lock(&udp_hash_lock); +- sk = sk_head(&udp_hash[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)]); ++ sk = sk_head(&udp_hash[udp_hashfn(ntohs(uh->dest), ++ VEID(VE_OWNER_SKB(skb)))]); + dif = skb->dev->ifindex; + sk = udp_v4_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif); + if (sk) { +@@ -1367,10 +1375,14 @@ static struct sock *udp_get_first(struct + { + struct sock *sk; + struct udp_iter_state *state = seq->private; ++ struct ve_struct *env; + ++ env = get_exec_env(); + for (state->bucket = 0; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) { + struct hlist_node *node; + sk_for_each(sk, node, &udp_hash[state->bucket]) { ++ if (!ve_accessible(VE_OWNER_SK(sk), env)) ++ continue; + if (sk->sk_family == state->family) + goto found; + } +@@ -1387,8 +1399,13 @@ static struct sock *udp_get_next(struct + do { + sk = sk_next(sk); + try_again: +- ; +- } while (sk && sk->sk_family != state->family); ++ if (!sk) ++ break; ++ if (sk->sk_family != state->family) ++ continue; ++ if (ve_accessible(VE_OWNER_SK(sk), get_exec_env())) ++ break; ++ } while (1); + + if (!sk && ++state->bucket < UDP_HTABLE_SIZE) { + sk = sk_head(&udp_hash[state->bucket]); +@@ -1474,7 +1491,7 @@ int udp_proc_register(struct udp_seq_afi + afinfo->seq_fops->llseek = seq_lseek; + afinfo->seq_fops->release 
= seq_release_private; + +- p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops); ++ p = proc_glob_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops); + if (p) + p->data = afinfo; + else +@@ -1486,7 +1503,8 @@ void udp_proc_unregister(struct udp_seq_ + { + if (!afinfo) + return; +- proc_net_remove(afinfo->name); ++ ++ remove_proc_glob_entry(afinfo->name, NULL); + memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops)); + } + +@@ -1529,7 +1547,7 @@ static int udp4_seq_show(struct seq_file + static struct file_operations udp4_seq_fops; + static struct udp_seq_afinfo udp4_seq_afinfo = { + .owner = THIS_MODULE, +- .name = "udp", ++ .name = "net/udp", + .family = AF_INET, + .seq_show = udp4_seq_show, + .seq_fops = &udp4_seq_fops, +diff -upr linux-2.6.16.orig/net/ipv6/addrconf.c linux-2.6.16-026test015/net/ipv6/addrconf.c +--- linux-2.6.16.orig/net/ipv6/addrconf.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv6/addrconf.c 2006-07-04 14:41:39.000000000 +0400 +@@ -100,6 +100,7 @@ + #define TIME_DELTA(a,b) ((unsigned long)((long)(a) - (long)(b))) + + #ifdef CONFIG_SYSCTL ++static struct addrconf_sysctl_table * __addrconf_sysctl_register(struct inet6_dev *idev, char *devname, int ifindex, struct ipv6_devconf *p); + static void addrconf_sysctl_register(struct inet6_dev *idev, struct ipv6_devconf *p); + static void addrconf_sysctl_unregister(struct ipv6_devconf *p); + #endif +@@ -133,8 +134,6 @@ static DEFINE_SPINLOCK(addrconf_verify_l + static void addrconf_join_anycast(struct inet6_ifaddr *ifp); + static void addrconf_leave_anycast(struct inet6_ifaddr *ifp); + +-static int addrconf_ifdown(struct net_device *dev, int how); +- + static void addrconf_dad_start(struct inet6_ifaddr *ifp, u32 flags); + static void addrconf_dad_timer(unsigned long data); + static void addrconf_dad_completed(struct inet6_ifaddr *ifp); +@@ -149,7 +148,7 @@ static int ipv6_chk_same_addr(const stru + + static struct notifier_block *inet6addr_chain; + +-struct 
ipv6_devconf ipv6_devconf = { ++struct ipv6_devconf global_ipv6_devconf = { + .forwarding = 0, + .hop_limit = IPV6_DEFAULT_HOPLIMIT, + .mtu6 = IPV6_MIN_MTU, +@@ -171,7 +170,7 @@ struct ipv6_devconf ipv6_devconf = { + .max_addresses = IPV6_MAX_ADDRESSES, + }; + +-static struct ipv6_devconf ipv6_devconf_dflt = { ++struct ipv6_devconf global_ipv6_devconf_dflt = { + .forwarding = 0, + .hop_limit = IPV6_DEFAULT_HOPLIMIT, + .mtu6 = IPV6_MIN_MTU, +@@ -192,6 +191,12 @@ static struct ipv6_devconf ipv6_devconf_ + .max_addresses = IPV6_MAX_ADDRESSES, + }; + ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++#define ipv6_devconf_dflt (*(get_exec_env()->_ipv6_devconf_dflt)) ++#else ++#define ipv6_devconf_dflt global_ipv6_devconf_dflt ++#endif ++ + /* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */ + #if 0 + const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT; +@@ -463,8 +468,8 @@ static void addrconf_forward_change(void + read_lock(&addrconf_lock); + idev = __in6_dev_get(dev); + if (idev) { +- int changed = (!idev->cnf.forwarding) ^ (!ipv6_devconf.forwarding); +- idev->cnf.forwarding = ipv6_devconf.forwarding; ++ int changed = (!idev->cnf.forwarding) ^ (!ve_ipv6_devconf.forwarding); ++ idev->cnf.forwarding = ve_ipv6_devconf.forwarding; + if (changed) + dev_forward_change(idev); + } +@@ -1148,9 +1153,10 @@ int ipv6_chk_addr(struct in6_addr *addr, + read_lock_bh(&addrconf_hash_lock); + for(ifp = inet6_addr_lst[hash]; ifp; ifp=ifp->lst_next) { + if (ipv6_addr_equal(&ifp->addr, addr) && +- !(ifp->flags&IFA_F_TENTATIVE)) { ++ !(ifp->flags&IFA_F_TENTATIVE) && ++ ve_accessible_strict(ifp->idev->dev->owner_env, get_exec_env())) { + if (dev == NULL || ifp->idev->dev == dev || +- !(ifp->scope&(IFA_LINK|IFA_HOST) || strict)) ++ !((ifp->scope&(IFA_LINK|IFA_HOST)) || strict)) + break; + } + } +@@ -1166,7 +1172,9 @@ int ipv6_chk_same_addr(const struct in6_ + + for(ifp = inet6_addr_lst[hash]; ifp; ifp=ifp->lst_next) { + if (ipv6_addr_equal(&ifp->addr, 
addr)) { +- if (dev == NULL || ifp->idev->dev == dev) ++ if ((dev == NULL && ++ ve_accessible_strict(ifp->idev->dev->owner_env, get_exec_env())) ++ || ifp->idev->dev == dev) + break; + } + } +@@ -1180,9 +1188,10 @@ struct inet6_ifaddr * ipv6_get_ifaddr(st + + read_lock_bh(&addrconf_hash_lock); + for(ifp = inet6_addr_lst[hash]; ifp; ifp=ifp->lst_next) { +- if (ipv6_addr_equal(&ifp->addr, addr)) { ++ if (ipv6_addr_equal(&ifp->addr, addr) && ++ ve_accessible_strict(ifp->idev->dev->owner_env, get_exec_env())) { + if (dev == NULL || ifp->idev->dev == dev || +- !(ifp->scope&(IFA_LINK|IFA_HOST) || strict)) { ++ !((ifp->scope&(IFA_LINK|IFA_HOST)) || strict)) { + in6_ifa_hold(ifp); + break; + } +@@ -1842,7 +1851,7 @@ err_exit: + /* + * Manual configuration of address on an interface + */ +-static int inet6_addr_add(int ifindex, struct in6_addr *pfx, int plen) ++int inet6_addr_add(int ifindex, struct in6_addr *pfx, int plen) + { + struct inet6_ifaddr *ifp; + struct inet6_dev *idev; +@@ -1871,6 +1880,7 @@ static int inet6_addr_add(int ifindex, s + + return PTR_ERR(ifp); + } ++EXPORT_SYMBOL_GPL(inet6_addr_add); + + static int inet6_addr_del(int ifindex, struct in6_addr *pfx, int plen) + { +@@ -1911,7 +1921,7 @@ int addrconf_add_ifaddr(void __user *arg + struct in6_ifreq ireq; + int err; + +- if (!capable(CAP_NET_ADMIN)) ++ if (!capable(CAP_VE_NET_ADMIN)) + return -EPERM; + + if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) +@@ -1928,7 +1938,7 @@ int addrconf_del_ifaddr(void __user *arg + struct in6_ifreq ireq; + int err; + +- if (!capable(CAP_NET_ADMIN)) ++ if (!capable(CAP_VE_NET_ADMIN)) + return -EPERM; + + if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) +@@ -2270,7 +2280,7 @@ static struct notifier_block ipv6_dev_no + .priority = 0 + }; + +-static int addrconf_ifdown(struct net_device *dev, int how) ++int addrconf_ifdown(struct net_device *dev, int how) + { + struct inet6_dev *idev; + struct inet6_ifaddr *ifa, **bifa; +@@ -2278,7 +2288,7 @@ static int 
addrconf_ifdown(struct net_de + + ASSERT_RTNL(); + +- if (dev == &loopback_dev && how == 1) ++ if (dev == get_ve0()->_loopback_dev && how == 1) + how = 0; + + rt6_ifdown(dev); +@@ -2386,10 +2396,12 @@ static int addrconf_ifdown(struct net_de + } + return 0; + } ++EXPORT_SYMBOL_GPL(addrconf_ifdown); + + static void addrconf_rs_timer(unsigned long data) + { + struct inet6_ifaddr *ifp = (struct inet6_ifaddr *) data; ++ struct ve_struct *old_env = set_exec_env(ifp->idev->dev->owner_env); + + if (ifp->idev->cnf.forwarding) + goto out; +@@ -2428,6 +2440,7 @@ static void addrconf_rs_timer(unsigned l + + out: + in6_ifa_put(ifp); ++ set_exec_env(old_env); + } + + /* +@@ -2495,6 +2508,7 @@ static void addrconf_dad_timer(unsigned + struct inet6_dev *idev = ifp->idev; + struct in6_addr unspec; + struct in6_addr mcaddr; ++ struct ve_struct *old_env = set_exec_env(ifp->idev->dev->owner_env); + + read_lock_bh(&idev->lock); + if (idev->dead) { +@@ -2527,6 +2541,7 @@ static void addrconf_dad_timer(unsigned + ndisc_send_ns(ifp->idev->dev, NULL, &ifp->addr, &mcaddr, &unspec); + out: + in6_ifa_put(ifp); ++ set_exec_env(old_env); + } + + static void addrconf_dad_completed(struct inet6_ifaddr *ifp) +@@ -2594,8 +2609,11 @@ static struct inet6_ifaddr *if6_get_firs + + for (state->bucket = 0; state->bucket < IN6_ADDR_HSIZE; ++state->bucket) { + ifa = inet6_addr_lst[state->bucket]; +- if (ifa) +- break; ++ while (ifa) { ++ if (ve_accessible_strict(ifa->idev->dev->owner_env, get_exec_env())) ++ return ifa; ++ ifa = ifa->lst_next; ++ } + } + return ifa; + } +@@ -2606,6 +2624,11 @@ static struct inet6_ifaddr *if6_get_next + + ifa = ifa->lst_next; + try_again: ++ while (ifa) { ++ if (ve_accessible_strict(ifa->idev->dev->owner_env, get_exec_env())) ++ break; ++ ifa = ifa->lst_next; ++ } + if (!ifa && ++state->bucket < IN6_ADDR_HSIZE) { + ifa = inet6_addr_lst[state->bucket]; + goto try_again; +@@ -2697,14 +2720,14 @@ static struct file_operations if6_fops = + + int __init if6_proc_init(void) + { 
+- if (!proc_net_fops_create("if_inet6", S_IRUGO, &if6_fops)) ++ if (!proc_glob_fops_create("net/if_inet6", S_IRUGO, &if6_fops)) + return -ENOMEM; + return 0; + } + + void if6_proc_exit(void) + { +- proc_net_remove("if_inet6"); ++ remove_proc_glob_entry("net/if_inet6", NULL); + } + #endif /* CONFIG_PROC_FS */ + +@@ -2717,6 +2740,7 @@ static void addrconf_verify(unsigned lon + struct inet6_ifaddr *ifp; + unsigned long now, next; + int i; ++ struct ve_struct *old_env; + + spin_lock_bh(&addrconf_verify_lock); + now = jiffies; +@@ -2737,6 +2761,8 @@ restart: + if (ifp->flags & IFA_F_PERMANENT) + continue; + ++ old_env = set_exec_env(ifp->idev->dev->owner_env); ++ + spin_lock(&ifp->lock); + age = (now - ifp->tstamp) / HZ; + +@@ -2751,6 +2777,7 @@ restart: + in6_ifa_hold(ifp); + read_unlock(&addrconf_hash_lock); + ipv6_del_addr(ifp); ++ set_exec_env(old_env); + goto restart; + } else if (age >= ifp->prefered_lft) { + /* jiffies - ifp->tsamp > age >= ifp->prefered_lft */ +@@ -2772,6 +2799,7 @@ restart: + + ipv6_ifa_notify(0, ifp); + in6_ifa_put(ifp); ++ set_exec_env(old_env); + goto restart; + } + #ifdef CONFIG_IPV6_PRIVACY +@@ -2793,6 +2821,7 @@ restart: + ipv6_create_tempaddr(ifpub, ifp); + in6_ifa_put(ifpub); + in6_ifa_put(ifp); ++ set_exec_env(old_env); + goto restart; + } + } else if (time_before(ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ, next)) +@@ -2805,6 +2834,7 @@ restart: + next = ifp->tstamp + ifp->prefered_lft * HZ; + spin_unlock(&ifp->lock); + } ++ set_exec_env(old_env); + } + read_unlock(&addrconf_hash_lock); + } +@@ -3360,7 +3390,7 @@ int addrconf_sysctl_forward(ctl_table *c + ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); + + if (write && valp != &ipv6_devconf_dflt.forwarding) { +- if (valp != &ipv6_devconf.forwarding) { ++ if (valp != &ve_ipv6_devconf.forwarding) { + if ((!*valp) ^ (!val)) { + struct inet6_dev *idev = (struct inet6_dev *)ctl->extra1; + if (idev == NULL) +@@ -3368,7 +3398,7 @@ int 
addrconf_sysctl_forward(ctl_table *c + dev_forward_change(idev); + } + } else { +- ipv6_devconf_dflt.forwarding = ipv6_devconf.forwarding; ++ ipv6_devconf_dflt.forwarding = ve_ipv6_devconf.forwarding; + addrconf_forward_change(); + } + if (*valp) +@@ -3411,7 +3441,7 @@ static int addrconf_sysctl_forward_strat + } + + if (valp != &ipv6_devconf_dflt.forwarding) { +- if (valp != &ipv6_devconf.forwarding) { ++ if (valp != &ve_ipv6_devconf.forwarding) { + struct inet6_dev *idev = (struct inet6_dev *)table->extra1; + int changed; + if (unlikely(idev == NULL)) +@@ -3447,7 +3477,7 @@ static struct addrconf_sysctl_table + { + .ctl_name = NET_IPV6_FORWARDING, + .procname = "forwarding", +- .data = &ipv6_devconf.forwarding, ++ .data = &global_ipv6_devconf.forwarding, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &addrconf_sysctl_forward, +@@ -3456,7 +3486,7 @@ static struct addrconf_sysctl_table + { + .ctl_name = NET_IPV6_HOP_LIMIT, + .procname = "hop_limit", +- .data = &ipv6_devconf.hop_limit, ++ .data = &global_ipv6_devconf.hop_limit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, +@@ -3464,7 +3494,7 @@ static struct addrconf_sysctl_table + { + .ctl_name = NET_IPV6_MTU, + .procname = "mtu", +- .data = &ipv6_devconf.mtu6, ++ .data = &global_ipv6_devconf.mtu6, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, +@@ -3472,7 +3502,7 @@ static struct addrconf_sysctl_table + { + .ctl_name = NET_IPV6_ACCEPT_RA, + .procname = "accept_ra", +- .data = &ipv6_devconf.accept_ra, ++ .data = &global_ipv6_devconf.accept_ra, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, +@@ -3480,7 +3510,7 @@ static struct addrconf_sysctl_table + { + .ctl_name = NET_IPV6_ACCEPT_REDIRECTS, + .procname = "accept_redirects", +- .data = &ipv6_devconf.accept_redirects, ++ .data = &global_ipv6_devconf.accept_redirects, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, +@@ -3488,7 +3518,7 @@ static 
struct addrconf_sysctl_table + { + .ctl_name = NET_IPV6_AUTOCONF, + .procname = "autoconf", +- .data = &ipv6_devconf.autoconf, ++ .data = &global_ipv6_devconf.autoconf, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, +@@ -3496,7 +3526,7 @@ static struct addrconf_sysctl_table + { + .ctl_name = NET_IPV6_DAD_TRANSMITS, + .procname = "dad_transmits", +- .data = &ipv6_devconf.dad_transmits, ++ .data = &global_ipv6_devconf.dad_transmits, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, +@@ -3504,7 +3534,7 @@ static struct addrconf_sysctl_table + { + .ctl_name = NET_IPV6_RTR_SOLICITS, + .procname = "router_solicitations", +- .data = &ipv6_devconf.rtr_solicits, ++ .data = &global_ipv6_devconf.rtr_solicits, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, +@@ -3512,7 +3542,7 @@ static struct addrconf_sysctl_table + { + .ctl_name = NET_IPV6_RTR_SOLICIT_INTERVAL, + .procname = "router_solicitation_interval", +- .data = &ipv6_devconf.rtr_solicit_interval, ++ .data = &global_ipv6_devconf.rtr_solicit_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, +@@ -3521,7 +3551,7 @@ static struct addrconf_sysctl_table + { + .ctl_name = NET_IPV6_RTR_SOLICIT_DELAY, + .procname = "router_solicitation_delay", +- .data = &ipv6_devconf.rtr_solicit_delay, ++ .data = &global_ipv6_devconf.rtr_solicit_delay, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, +@@ -3530,7 +3560,7 @@ static struct addrconf_sysctl_table + { + .ctl_name = NET_IPV6_FORCE_MLD_VERSION, + .procname = "force_mld_version", +- .data = &ipv6_devconf.force_mld_version, ++ .data = &global_ipv6_devconf.force_mld_version, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, +@@ -3539,7 +3569,7 @@ static struct addrconf_sysctl_table + { + .ctl_name = NET_IPV6_USE_TEMPADDR, + .procname = "use_tempaddr", +- .data = &ipv6_devconf.use_tempaddr, ++ .data = 
&global_ipv6_devconf.use_tempaddr, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, +@@ -3547,7 +3577,7 @@ static struct addrconf_sysctl_table + { + .ctl_name = NET_IPV6_TEMP_VALID_LFT, + .procname = "temp_valid_lft", +- .data = &ipv6_devconf.temp_valid_lft, ++ .data = &global_ipv6_devconf.temp_valid_lft, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, +@@ -3555,7 +3585,7 @@ static struct addrconf_sysctl_table + { + .ctl_name = NET_IPV6_TEMP_PREFERED_LFT, + .procname = "temp_prefered_lft", +- .data = &ipv6_devconf.temp_prefered_lft, ++ .data = &global_ipv6_devconf.temp_prefered_lft, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, +@@ -3563,7 +3593,7 @@ static struct addrconf_sysctl_table + { + .ctl_name = NET_IPV6_REGEN_MAX_RETRY, + .procname = "regen_max_retry", +- .data = &ipv6_devconf.regen_max_retry, ++ .data = &global_ipv6_devconf.regen_max_retry, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, +@@ -3571,7 +3601,7 @@ static struct addrconf_sysctl_table + { + .ctl_name = NET_IPV6_MAX_DESYNC_FACTOR, + .procname = "max_desync_factor", +- .data = &ipv6_devconf.max_desync_factor, ++ .data = &global_ipv6_devconf.max_desync_factor, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, +@@ -3580,7 +3610,7 @@ static struct addrconf_sysctl_table + { + .ctl_name = NET_IPV6_MAX_ADDRESSES, + .procname = "max_addresses", +- .data = &ipv6_devconf.max_addresses, ++ .data = &global_ipv6_devconf.max_addresses, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, +@@ -3635,29 +3665,22 @@ static struct addrconf_sysctl_table + }, + }; + +-static void addrconf_sysctl_register(struct inet6_dev *idev, struct ipv6_devconf *p) ++static struct addrconf_sysctl_table * ++__addrconf_sysctl_register(struct inet6_dev *idev, char *dev_name, int ifindex, struct ipv6_devconf *p) + { + int i; +- struct net_device *dev = idev ? 
idev->dev : NULL; + struct addrconf_sysctl_table *t; +- char *dev_name = NULL; + + t = kmalloc(sizeof(*t), GFP_KERNEL); + if (t == NULL) +- return; ++ return NULL; ++ + memcpy(t, &addrconf_sysctl, sizeof(*t)); + for (i=0; t->addrconf_vars[i].data; i++) { +- t->addrconf_vars[i].data += (char*)p - (char*)&ipv6_devconf; ++ t->addrconf_vars[i].data += (char*)p - (char*)&global_ipv6_devconf; + t->addrconf_vars[i].de = NULL; + t->addrconf_vars[i].extra1 = idev; /* embedded; no ref */ + } +- if (dev) { +- dev_name = dev->name; +- t->addrconf_dev[0].ctl_name = dev->ifindex; +- } else { +- dev_name = "default"; +- t->addrconf_dev[0].ctl_name = NET_PROTO_CONF_DEFAULT; +- } + + /* + * Make a copy of dev_name, because '.procname' is regarded as const +@@ -3668,6 +3691,7 @@ static void addrconf_sysctl_register(str + if (!dev_name) + goto free; + ++ t->addrconf_dev[0].ctl_name = ifindex; + t->addrconf_dev[0].procname = dev_name; + + t->addrconf_dev[0].child = t->addrconf_vars; +@@ -3682,9 +3706,7 @@ static void addrconf_sysctl_register(str + t->sysctl_header = register_sysctl_table(t->addrconf_root_dir, 0); + if (t->sysctl_header == NULL) + goto free_procname; +- else +- p->sysctl = t; +- return; ++ return t; + + /* error path */ + free_procname: +@@ -3692,7 +3714,26 @@ static void addrconf_sysctl_register(str + free: + kfree(t); + +- return; ++ return NULL; ++} ++ ++static void addrconf_sysctl_register(struct inet6_dev *idev, struct ipv6_devconf *p) ++{ ++ struct net_device *dev; ++ char *dev_name; ++ int ifindex; ++ ++ dev = idev ? 
idev->dev : NULL; ++ ++ if (dev) { ++ dev_name = dev->name; ++ ifindex = dev->ifindex; ++ } else { ++ dev_name = "default"; ++ ifindex = NET_PROTO_CONF_DEFAULT; ++ } ++ ++ p->sysctl = __addrconf_sysctl_register(idev, dev_name, ifindex, p); + } + + static void addrconf_sysctl_unregister(struct ipv6_devconf *p) +@@ -3706,6 +3747,73 @@ static void addrconf_sysctl_unregister(s + } + } + ++int addrconf_sysctl_init(struct ve_struct *ve) ++{ ++ int err = 0; ++#ifdef CONFIG_SYSCTL ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++ struct ipv6_devconf *conf, *conf_def; ++ ++ err = -ENOMEM; ++ ++ conf = kmalloc(sizeof(*conf), GFP_KERNEL); ++ if (!conf) ++ goto err1; ++ ++ memcpy(conf, &global_ipv6_devconf, sizeof(*conf)); ++ conf->sysctl = __addrconf_sysctl_register(NULL, "all", ++ NET_PROTO_CONF_ALL, conf); ++ if (!conf->sysctl) ++ goto err2; ++ ++ conf_def = kmalloc(sizeof(*conf_def), GFP_KERNEL); ++ if (!conf_def) ++ goto err3; ++ ++ memcpy(conf_def, &global_ipv6_devconf_dflt, sizeof(*conf_def)); ++ conf_def->sysctl = __addrconf_sysctl_register(NULL, "default", ++ NET_PROTO_CONF_DEFAULT, conf_def); ++ if (!conf_def->sysctl) ++ goto err4; ++ ++ ve->_ipv6_devconf = conf; ++ ve->_ipv6_devconf_dflt = conf_def; ++ return 0; ++ ++err4: ++ kfree(conf_def); ++err3: ++ addrconf_sysctl_unregister(conf); ++err2: ++ kfree(conf); ++err1: ++#endif ++#endif ++ return err; ++} ++EXPORT_SYMBOL(addrconf_sysctl_init); ++ ++void addrconf_sysctl_fini(struct ve_struct *ve) ++{ ++#ifdef CONFIG_SYSCTL ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++ addrconf_sysctl_unregister(ve->_ipv6_devconf); ++ addrconf_sysctl_unregister(ve->_ipv6_devconf_dflt); ++#endif ++#endif ++} ++EXPORT_SYMBOL(addrconf_sysctl_fini); ++ ++void addrconf_sysctl_free(struct ve_struct *ve) ++{ ++#ifdef CONFIG_SYSCTL ++#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE) ++ kfree(ve->_ipv6_devconf); ++ kfree(ve->_ipv6_devconf_dflt); ++#endif ++#endif ++} 
++EXPORT_SYMBOL(addrconf_sysctl_free); + + #endif + +@@ -3731,6 +3839,11 @@ int __init addrconf_init(void) + { + int err = 0; + ++#ifdef CONFIG_VE ++ get_ve0()->_ipv6_devconf = &global_ipv6_devconf; ++ get_ve0()->_ipv6_devconf_dflt = &global_ipv6_devconf_dflt; ++#endif ++ + /* The addrconf netdev notifier requires that loopback_dev + * has it's ipv6 private information allocated and setup + * before it can bring up and give link-local addresses +@@ -3772,7 +3885,7 @@ int __init addrconf_init(void) + #ifdef CONFIG_SYSCTL + addrconf_sysctl.sysctl_header = + register_sysctl_table(addrconf_sysctl.addrconf_root_dir, 0); +- addrconf_sysctl_register(NULL, &ipv6_devconf_dflt); ++ __addrconf_sysctl_register(NULL, "default", NET_PROTO_CONF_DEFAULT, &global_ipv6_devconf_dflt); + #endif + + return 0; +@@ -3789,8 +3902,8 @@ void __exit addrconf_cleanup(void) + + rtnetlink_links[PF_INET6] = NULL; + #ifdef CONFIG_SYSCTL +- addrconf_sysctl_unregister(&ipv6_devconf_dflt); +- addrconf_sysctl_unregister(&ipv6_devconf); ++ addrconf_sysctl_unregister(&global_ipv6_devconf_dflt); ++ addrconf_sysctl_unregister(&global_ipv6_devconf); + #endif + + rtnl_lock(); +@@ -3835,6 +3948,6 @@ void __exit addrconf_cleanup(void) + #endif + + #ifdef CONFIG_PROC_FS +- proc_net_remove("if_inet6"); ++ remove_proc_glob_entry("net/if_inet6", NULL); + #endif + } +diff -upr linux-2.6.16.orig/net/ipv6/af_inet6.c linux-2.6.16-026test015/net/ipv6/af_inet6.c +--- linux-2.6.16.orig/net/ipv6/af_inet6.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv6/af_inet6.c 2006-07-04 14:41:39.000000000 +0400 +@@ -60,6 +60,7 @@ + #ifdef CONFIG_IPV6_TUNNEL + #include <net/ip6_tunnel.h> + #endif ++#include <ub/ub_net.h> + + #include <asm/uaccess.h> + #include <asm/system.h> +@@ -160,6 +161,13 @@ lookup_protocol: + if (sk == NULL) + goto out; + ++ err = -ENOBUFS; ++ if (ub_sock_charge(sk, PF_INET6, sock->type)) ++ goto out_sk_free; ++ /* if charge was successful, sock_init_data() MUST be called to ++ * set 
sk->sk_type. otherwise sk will be uncharged to wrong resource ++ */ ++ + sock_init_data(sock, sk); + + err = 0; +@@ -234,6 +242,9 @@ out: + out_rcu_unlock: + rcu_read_unlock(); + goto out; ++out_sk_free: ++ sk_free(sk); ++ return err; + } + + +@@ -650,6 +661,8 @@ int inet6_sk_rebuild_header(struct sock + ip6_dst_store(sk, dst, NULL); + sk->sk_route_caps = dst->dev->features & + ~(NETIF_F_IP_CSUM | NETIF_F_TSO); ++ if (!sysctl_tcp_use_sg) ++ sk->sk_route_caps &= ~NETIF_F_SG; + } + + return 0; +@@ -715,21 +728,21 @@ snmp6_mib_free(void *ptr[2]) + + static int __init init_ipv6_mibs(void) + { +- if (snmp6_mib_init((void **)ipv6_statistics, sizeof (struct ipstats_mib), ++ if (snmp6_mib_init((void **)ve_ipv6_statistics, sizeof (struct ipstats_mib), + __alignof__(struct ipstats_mib)) < 0) + goto err_ip_mib; +- if (snmp6_mib_init((void **)icmpv6_statistics, sizeof (struct icmpv6_mib), ++ if (snmp6_mib_init((void **)ve_icmpv6_statistics, sizeof (struct icmpv6_mib), + __alignof__(struct icmpv6_mib)) < 0) + goto err_icmp_mib; +- if (snmp6_mib_init((void **)udp_stats_in6, sizeof (struct udp_mib), ++ if (snmp6_mib_init((void **)ve_udp_stats_in6, sizeof (struct udp_mib), + __alignof__(struct udp_mib)) < 0) + goto err_udp_mib; + return 0; + + err_udp_mib: +- snmp6_mib_free((void **)icmpv6_statistics); ++ snmp6_mib_free((void **)ve_icmpv6_statistics); + err_icmp_mib: +- snmp6_mib_free((void **)ipv6_statistics); ++ snmp6_mib_free((void **)ve_ipv6_statistics); + err_ip_mib: + return -ENOMEM; + +@@ -737,9 +750,9 @@ err_ip_mib: + + static void cleanup_ipv6_mibs(void) + { +- snmp6_mib_free((void **)ipv6_statistics); +- snmp6_mib_free((void **)icmpv6_statistics); +- snmp6_mib_free((void **)udp_stats_in6); ++ snmp6_mib_free((void **)ve_ipv6_statistics); ++ snmp6_mib_free((void **)ve_icmpv6_statistics); ++ snmp6_mib_free((void **)ve_udp_stats_in6); + } + + static int __init inet6_init(void) +diff -upr linux-2.6.16.orig/net/ipv6/anycast.c linux-2.6.16-026test015/net/ipv6/anycast.c +--- 
linux-2.6.16.orig/net/ipv6/anycast.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv6/anycast.c 2006-07-04 14:41:39.000000000 +0400 +@@ -83,7 +83,7 @@ int ipv6_sock_ac_join(struct sock *sk, i + struct net_device *dev = NULL; + struct inet6_dev *idev; + struct ipv6_ac_socklist *pac; +- int ishost = !ipv6_devconf.forwarding; ++ int ishost = !ve_ipv6_devconf.forwarding; + int err = 0; + + if (!capable(CAP_NET_ADMIN)) +@@ -455,6 +455,8 @@ static inline struct ifacaddr6 *ac6_get_ + state->dev; + state->dev = state->dev->next) { + struct inet6_dev *idev; ++ if (unlikely(!ve_accessible_strict(state->dev->owner_env, get_exec_env()))) ++ continue; + idev = in6_dev_get(state->dev); + if (!idev) + continue; +@@ -484,6 +486,8 @@ static struct ifacaddr6 *ac6_get_next(st + state->idev = NULL; + break; + } ++ if (unlikely(!ve_accessible_strict(state->dev->owner_env, get_exec_env()))) ++ continue; + state->idev = in6_dev_get(state->dev); + if (!state->idev) + continue; +@@ -579,7 +583,7 @@ static struct file_operations ac6_seq_fo + + int __init ac6_proc_init(void) + { +- if (!proc_net_fops_create("anycast6", S_IRUGO, &ac6_seq_fops)) ++ if (!proc_glob_fops_create("net/anycast6", S_IRUGO, &ac6_seq_fops)) + return -ENOMEM; + + return 0; +@@ -587,7 +591,7 @@ int __init ac6_proc_init(void) + + void ac6_proc_exit(void) + { +- proc_net_remove("anycast6"); ++ remove_proc_glob_entry("net/anycast6", NULL); + } + #endif + +diff -upr linux-2.6.16.orig/net/ipv6/exthdrs.c linux-2.6.16-026test015/net/ipv6/exthdrs.c +--- linux-2.6.16.orig/net/ipv6/exthdrs.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv6/exthdrs.c 2006-07-04 14:41:36.000000000 +0400 +@@ -489,6 +489,18 @@ int ipv6_parse_hopopts(struct sk_buff *s + { + struct inet6_skb_parm *opt = IP6CB(skb); + ++ /* ++ * skb->nh.raw is equal to skb->data, and ++ * skb->h.raw - skb->nh.raw is always equal to ++ * sizeof(struct ipv6hdr) by definition of ++ * hop-by-hop options. 
++ */ ++ if (!pskb_may_pull(skb, sizeof(struct ipv6hdr) + 8) || ++ !pskb_may_pull(skb, sizeof(struct ipv6hdr) + ((skb->h.raw[1] + 1) << 3))) { ++ kfree_skb(skb); ++ return -1; ++ } ++ + opt->hop = sizeof(struct ipv6hdr); + if (ip6_parse_tlv(tlvprochopopt_lst, skb)) { + skb->h.raw += (skb->h.raw[1]+1)<<3; +diff -upr linux-2.6.16.orig/net/ipv6/inet6_connection_sock.c linux-2.6.16-026test015/net/ipv6/inet6_connection_sock.c +--- linux-2.6.16.orig/net/ipv6/inet6_connection_sock.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv6/inet6_connection_sock.c 2006-07-04 14:41:39.000000000 +0400 +@@ -26,6 +26,8 @@ + #include <net/ip6_route.h> + #include <net/sock.h> + #include <net/inet6_connection_sock.h> ++#include <ub/ub_net.h> ++#include <ub/ub_orphan.h> + + int inet6_csk_bind_conflict(const struct sock *sk, + const struct inet_bind_bucket *tb) +@@ -36,6 +38,7 @@ int inet6_csk_bind_conflict(const struct + /* We must walk the whole port owner list in this case. -DaveM */ + sk_for_each_bound(sk2, node, &tb->owners) { + if (sk != sk2 && ++ !ve_accessible_strict(VE_OWNER_SK(sk), VE_OWNER_SK(sk2)) && + (!sk->sk_bound_dev_if || + !sk2->sk_bound_dev_if || + sk->sk_bound_dev_if == sk2->sk_bound_dev_if) && +@@ -173,6 +176,7 @@ int inet6_csk_xmit(struct sk_buff *skb, + + if (err) { + sk->sk_err_soft = -err; ++ kfree_skb(skb); + return err; + } + +@@ -181,12 +185,15 @@ int inet6_csk_xmit(struct sk_buff *skb, + + if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) { + sk->sk_route_caps = 0; ++ kfree_skb(skb); + return err; + } + + ip6_dst_store(sk, dst, NULL); + sk->sk_route_caps = dst->dev->features & + ~(NETIF_F_IP_CSUM | NETIF_F_TSO); ++ if (!sysctl_tcp_use_sg) ++ sk->sk_route_caps &= ~NETIF_F_SG; + } + + skb->dst = dst_clone(dst); +diff -upr linux-2.6.16.orig/net/ipv6/inet6_hashtables.c linux-2.6.16-026test015/net/ipv6/inet6_hashtables.c +--- linux-2.6.16.orig/net/ipv6/inet6_hashtables.c 2006-03-20 08:53:29.000000000 +0300 ++++ 
linux-2.6.16-026test015/net/ipv6/inet6_hashtables.c 2006-07-04 14:41:39.000000000 +0400 +@@ -31,9 +31,14 @@ struct sock *inet6_lookup_listener(struc + const struct hlist_node *node; + struct sock *result = NULL; + int score, hiscore = 0; ++ struct ve_struct *env; ++ ++ env = get_exec_env(); + + read_lock(&hashinfo->lhash_lock); +- sk_for_each(sk, node, &hashinfo->listening_hash[inet_lhashfn(hnum)]) { ++ sk_for_each(sk, node, &hashinfo->listening_hash[inet_lhashfn(hnum, VEID(env))]) { ++ if (!ve_accessible_strict(VE_OWNER_SK(sk), env)) ++ continue; + if (inet_sk(sk)->num == hnum && sk->sk_family == PF_INET6) { + const struct ipv6_pinfo *np = inet6_sk(sk); + +@@ -84,7 +89,8 @@ EXPORT_SYMBOL_GPL(inet6_lookup); + + static int __inet6_check_established(struct inet_timewait_death_row *death_row, + struct sock *sk, const __u16 lport, +- struct inet_timewait_sock **twp) ++ struct inet_timewait_sock **twp, ++ struct ve_struct *ve) + { + struct inet_hashinfo *hinfo = death_row->hashinfo; + struct inet_sock *inet = inet_sk(sk); +@@ -94,7 +100,7 @@ static int __inet6_check_established(str + const int dif = sk->sk_bound_dev_if; + const u32 ports = INET_COMBINED_PORTS(inet->dport, lport); + const unsigned int hash = inet6_ehashfn(daddr, inet->num, saddr, +- inet->dport); ++ inet->dport, VEID(ve)); + struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); + struct sock *sk2; + const struct hlist_node *node; +@@ -113,7 +119,8 @@ static int __inet6_check_established(str + sk2->sk_family == PF_INET6 && + ipv6_addr_equal(&tw6->tw_v6_daddr, saddr) && + ipv6_addr_equal(&tw6->tw_v6_rcv_saddr, daddr) && +- sk2->sk_bound_dev_if == sk->sk_bound_dev_if) { ++ sk2->sk_bound_dev_if == sk->sk_bound_dev_if && ++ ve_accessible_strict(tw->tw_owner_env, VEID(ve))) { + if (twsk_unique(sk, sk2, twp)) + goto unique; + else +@@ -124,7 +131,7 @@ static int __inet6_check_established(str + + /* And established part... 
*/ + sk_for_each(sk2, node, &head->chain) { +- if (INET6_MATCH(sk2, hash, saddr, daddr, ports, dif)) ++ if (INET6_MATCH(sk2, hash, saddr, daddr, ports, dif, ve)) + goto not_unique; + } + +@@ -173,7 +180,9 @@ int inet6_hash_connect(struct inet_timew + struct inet_bind_hashbucket *head; + struct inet_bind_bucket *tb; + int ret; ++ struct ve_struct *ve; + ++ ve = VE_OWNER_SK(sk); + if (snum == 0) { + const int low = sysctl_local_port_range[0]; + const int high = sysctl_local_port_range[1]; +@@ -187,7 +196,8 @@ int inet6_hash_connect(struct inet_timew + local_bh_disable(); + for (i = 1; i <= range; i++) { + port = low + (i + offset) % range; +- head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)]; ++ head = &hinfo->bhash[inet_bhashfn(port, ++ hinfo->bhash_size, VEID(ve))]; + spin_lock(&head->lock); + + /* Does not bother with rcv_saddr checks, +@@ -201,14 +211,14 @@ int inet6_hash_connect(struct inet_timew + goto next_port; + if (!__inet6_check_established(death_row, + sk, port, +- &tw)) ++ &tw, ve)) + goto ok; + goto next_port; + } + } + + tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, +- head, port); ++ head, port, ve); + if (!tb) { + spin_unlock(&head->lock); + break; +@@ -243,7 +253,7 @@ ok: + goto out; + } + +- head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)]; ++ head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size, VEID(ve))]; + tb = inet_csk(sk)->icsk_bind_hash; + spin_lock_bh(&head->lock); + +@@ -254,7 +264,7 @@ ok: + } else { + spin_unlock(&head->lock); + /* No definite answer... 
Walk to established hash table */ +- ret = __inet6_check_established(death_row, sk, snum, NULL); ++ ret = __inet6_check_established(death_row, sk, snum, NULL, ve); + out: + local_bh_enable(); + return ret; +diff -upr linux-2.6.16.orig/net/ipv6/ip6_fib.c linux-2.6.16-026test015/net/ipv6/ip6_fib.c +--- linux-2.6.16.orig/net/ipv6/ip6_fib.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv6/ip6_fib.c 2006-07-04 14:41:39.000000000 +0400 +@@ -1128,8 +1128,12 @@ static int fib6_age(struct rt6_info *rt, + + static DEFINE_SPINLOCK(fib6_gc_lock); + ++LIST_HEAD(fib6_table_list); ++ + void fib6_run_gc(unsigned long dummy) + { ++ struct fib6_table *tbl; ++ + if (dummy != ~0UL) { + spin_lock_bh(&fib6_gc_lock); + gc_args.timeout = dummy ? (int)dummy : ip6_rt_gc_interval; +@@ -1147,7 +1151,11 @@ void fib6_run_gc(unsigned long dummy) + + write_lock_bh(&rt6_lock); + ndisc_dst_gc(&gc_args.more); +- fib6_clean_tree(&ip6_routing_table, fib6_age, 0, NULL); ++ list_for_each_entry(tbl, &fib6_table_list, list) { ++ struct ve_struct *old_env = set_exec_env(tbl->owner_env); ++ fib6_clean_tree(&tbl->root, fib6_age, 0, NULL); ++ set_exec_env(old_env); ++ } + write_unlock_bh(&rt6_lock); + + if (gc_args.more) +@@ -1163,7 +1171,7 @@ void __init fib6_init(void) + { + fib6_node_kmem = kmem_cache_create("fib6_nodes", + sizeof(struct fib6_node), +- 0, SLAB_HWCACHE_ALIGN, ++ 0, SLAB_HWCACHE_ALIGN | SLAB_UBC, + NULL, NULL); + if (!fib6_node_kmem) + panic("cannot create fib6_nodes cache"); +diff -upr linux-2.6.16.orig/net/ipv6/ip6_flowlabel.c linux-2.6.16-026test015/net/ipv6/ip6_flowlabel.c +--- linux-2.6.16.orig/net/ipv6/ip6_flowlabel.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv6/ip6_flowlabel.c 2006-07-04 14:41:39.000000000 +0400 +@@ -417,6 +417,9 @@ int ipv6_flowlabel_opt(struct sock *sk, + struct ipv6_fl_socklist *sfl, **sflp; + struct ip6_flowlabel *fl; + ++ if (!ve_is_super(get_exec_env())) ++ return -EPERM; ++ + if (optlen < sizeof(freq)) + 
return -EINVAL; + +diff -upr linux-2.6.16.orig/net/ipv6/ip6_output.c linux-2.6.16-026test015/net/ipv6/ip6_output.c +--- linux-2.6.16.orig/net/ipv6/ip6_output.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv6/ip6_output.c 2006-07-04 14:41:39.000000000 +0400 +@@ -319,7 +319,7 @@ int ip6_forward(struct sk_buff *skb) + struct ipv6hdr *hdr = skb->nh.ipv6h; + struct inet6_skb_parm *opt = IP6CB(skb); + +- if (ipv6_devconf.forwarding == 0) ++ if (ve_ipv6_devconf.forwarding == 0) + goto error; + + if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) { +@@ -407,6 +407,20 @@ int ip6_forward(struct sk_buff *skb) + return -EMSGSIZE; + } + ++ /* ++ * We try to optimize forwarding of VE packets: ++ * do not decrement TTL (and so save skb_cow) ++ * during forwarding of outgoing pkts from VE. ++ * For incoming pkts we still do ttl decr, ++ * since such skb is not cloned and does not require ++ * actual cow. So, there is at least one place ++ * in pkts path with mandatory ttl decr, that is ++ * sufficient to prevent routing loops. 
++ */ ++ hdr = skb->nh.ipv6h; ++ if (skb->dev->features & NETIF_F_VENET) /* src is VENET device */ ++ goto no_ttl_decr; ++ + if (skb_cow(skb, dst->dev->hard_header_len)) { + IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + goto drop; +@@ -418,6 +432,7 @@ int ip6_forward(struct sk_buff *skb) + + hdr->hop_limit--; + ++no_ttl_decr: + IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS); + return NF_HOOK(PF_INET6,NF_IP6_FORWARD, skb, skb->dev, dst->dev, ip6_forward_finish); + +diff -upr linux-2.6.16.orig/net/ipv6/mcast.c linux-2.6.16-026test015/net/ipv6/mcast.c +--- linux-2.6.16.orig/net/ipv6/mcast.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv6/mcast.c 2006-07-04 14:41:39.000000000 +0400 +@@ -156,7 +156,7 @@ static int ip6_mc_leave_src(struct sock + #define IGMP6_UNSOLICITED_IVAL (10*HZ) + #define MLD_QRV_DEFAULT 2 + +-#define MLD_V1_SEEN(idev) (ipv6_devconf.force_mld_version == 1 || \ ++#define MLD_V1_SEEN(idev) (ve_ipv6_devconf.force_mld_version == 1 || \ + (idev)->cnf.force_mld_version == 1 || \ + ((idev)->mc_v1_seen && \ + time_before(jiffies, (idev)->mc_v1_seen))) +@@ -248,6 +248,7 @@ int ipv6_sock_mc_join(struct sock *sk, i + + return 0; + } ++EXPORT_SYMBOL_GPL(ipv6_sock_mc_join); + + /* + * socket leave on multicast group +@@ -2166,15 +2167,18 @@ static void igmp6_leave_group(struct ifm + static void mld_gq_timer_expire(unsigned long data) + { + struct inet6_dev *idev = (struct inet6_dev *)data; ++ struct ve_struct *old_env = set_exec_env(idev->dev->owner_env); + + idev->mc_gq_running = 0; + mld_send_report(idev, NULL); + __in6_dev_put(idev); ++ set_exec_env(old_env); + } + + static void mld_ifc_timer_expire(unsigned long data) + { + struct inet6_dev *idev = (struct inet6_dev *)data; ++ struct ve_struct *old_env = set_exec_env(idev->dev->owner_env); + + mld_send_cr(idev); + if (idev->mc_ifc_count) { +@@ -2183,6 +2187,7 @@ static void mld_ifc_timer_expire(unsigne + mld_ifc_start_timer(idev, idev->mc_maxdelay); + } + __in6_dev_put(idev); ++ 
set_exec_env(old_env); + } + + static void mld_ifc_event(struct inet6_dev *idev) +@@ -2197,6 +2202,7 @@ static void mld_ifc_event(struct inet6_d + static void igmp6_timer_handler(unsigned long data) + { + struct ifmcaddr6 *ma = (struct ifmcaddr6 *) data; ++ struct ve_struct *old_env = set_exec_env(ma->idev->dev->owner_env); + + if (MLD_V1_SEEN(ma->idev)) + igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT); +@@ -2208,6 +2214,7 @@ static void igmp6_timer_handler(unsigned + ma->mca_flags &= ~MAF_TIMER_RUNNING; + spin_unlock(&ma->mca_lock); + ma_put(ma); ++ set_exec_env(old_env); + } + + /* Device going down */ +@@ -2331,6 +2338,8 @@ static inline struct ifmcaddr6 *igmp6_mc + state->dev; + state->dev = state->dev->next) { + struct inet6_dev *idev; ++ if (unlikely(!ve_accessible_strict(state->dev->owner_env, get_exec_env()))) ++ continue; + idev = in6_dev_get(state->dev); + if (!idev) + continue; +@@ -2361,6 +2370,8 @@ static struct ifmcaddr6 *igmp6_mc_get_ne + state->idev = NULL; + break; + } ++ if (unlikely(!ve_accessible_strict(state->dev->owner_env, get_exec_env()))) ++ continue; + state->idev = in6_dev_get(state->dev); + if (!state->idev) + continue; +@@ -2476,6 +2487,8 @@ static inline struct ip6_sf_list *igmp6_ + state->dev; + state->dev = state->dev->next) { + struct inet6_dev *idev; ++ if (unlikely(!ve_accessible_strict(state->dev->owner_env, get_exec_env()))) ++ continue; + idev = in6_dev_get(state->dev); + if (unlikely(idev == NULL)) + continue; +@@ -2515,6 +2528,8 @@ static struct ip6_sf_list *igmp6_mcf_get + state->idev = NULL; + goto out; + } ++ if (unlikely(!ve_accessible_strict(state->dev->owner_env, get_exec_env()))) ++ continue; + state->idev = in6_dev_get(state->dev); + if (!state->idev) + continue; +@@ -2657,8 +2672,8 @@ int __init igmp6_init(struct net_proto_f + np->hop_limit = 1; + + #ifdef CONFIG_PROC_FS +- proc_net_fops_create("igmp6", S_IRUGO, &igmp6_mc_seq_fops); +- proc_net_fops_create("mcfilter6", S_IRUGO, &igmp6_mcf_seq_fops); ++ 
proc_glob_fops_create("net/igmp6", S_IRUGO, &igmp6_mc_seq_fops); ++ proc_glob_fops_create("net/mcfilter6", S_IRUGO, &igmp6_mcf_seq_fops); + #endif + + return 0; +@@ -2670,7 +2685,7 @@ void igmp6_cleanup(void) + igmp6_socket = NULL; /* for safety */ + + #ifdef CONFIG_PROC_FS +- proc_net_remove("mcfilter6"); +- proc_net_remove("igmp6"); ++ remove_proc_glob_entry("net/mcfilter6", NULL); ++ remove_proc_glob_entry("net/igmp6", NULL); + #endif + } +diff -upr linux-2.6.16.orig/net/ipv6/ndisc.c linux-2.6.16-026test015/net/ipv6/ndisc.c +--- linux-2.6.16.orig/net/ipv6/ndisc.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv6/ndisc.c 2006-07-04 14:41:39.000000000 +0400 +@@ -124,7 +124,7 @@ static struct neigh_ops ndisc_direct_ops + .queue_xmit = dev_queue_xmit, + }; + +-struct neigh_table nd_tbl = { ++struct neigh_table global_nd_tbl = { + .family = AF_INET6, + .entry_size = sizeof(struct neighbour) + sizeof(struct in6_addr), + .key_len = sizeof(struct in6_addr), +@@ -135,7 +135,7 @@ struct neigh_table nd_tbl = { + .proxy_redo = pndisc_redo, + .id = "ndisc_cache", + .parms = { +- .tbl = &nd_tbl, ++ .tbl = &global_nd_tbl, + .base_reachable_time = 30 * HZ, + .retrans_time = 1 * HZ, + .gc_staletime = 60 * HZ, +@@ -1660,7 +1660,9 @@ int __init ndisc_init(struct net_proto_f + * Initialize the neighbour table + */ + +- neigh_table_init(&nd_tbl); ++ get_ve0()->ve_nd_tbl = &global_nd_tbl; ++ if (neigh_table_init(&nd_tbl)) ++ panic("cannot initialize IPv6 NDISC tables\n"); + + #ifdef CONFIG_SYSCTL + neigh_sysctl_register(NULL, &nd_tbl.parms, NET_IPV6, NET_IPV6_NEIGH, +@@ -1682,3 +1684,52 @@ void ndisc_cleanup(void) + sock_release(ndisc_socket); + ndisc_socket = NULL; /* For safety. 
*/ + } ++ ++int ve_ndisc_init(struct ve_struct *ve) ++{ ++ struct ve_struct *old_env; ++ int err; ++ ++ ve->ve_nd_tbl = kmalloc(sizeof(struct neigh_table), GFP_KERNEL); ++ if (ve->ve_nd_tbl == NULL) { ++ err = -ENOMEM; ++ goto out; ++ } ++ ++ *(ve->ve_nd_tbl) = global_nd_tbl; ++ ve->ve_nd_tbl->parms.tbl = ve->ve_nd_tbl; ++ old_env = set_exec_env(ve); ++ err = neigh_table_init(ve->ve_nd_tbl); ++ if (err) ++ goto out_free; ++#ifdef CONFIG_SYSCTL ++ neigh_sysctl_register(NULL, &nd_tbl.parms, NET_IPV6, NET_IPV6_NEIGH, ++ "ipv6", ++ &ndisc_ifinfo_sysctl_change, ++ &ndisc_ifinfo_sysctl_strategy); ++#endif ++ set_exec_env(old_env); ++ err = 0; ++ ++out: ++ return err; ++ ++out_free: ++ kfree(ve->ve_nd_tbl); ++ ve->ve_nd_tbl = NULL; ++ goto out; ++} ++EXPORT_SYMBOL(ve_ndisc_init); ++ ++void ve_ndisc_fini(struct ve_struct *ve) ++{ ++ if (ve->ve_nd_tbl) { ++#ifdef CONFIG_SYSCTL ++ neigh_sysctl_unregister(&ve->ve_nd_tbl->parms); ++#endif ++ neigh_table_clear(ve->ve_nd_tbl); ++ kfree(ve->ve_nd_tbl); ++ ve->ve_nd_tbl = NULL; ++ } ++} ++EXPORT_SYMBOL(ve_ndisc_fini); +diff -upr linux-2.6.16.orig/net/ipv6/netfilter/ip6_queue.c linux-2.6.16-026test015/net/ipv6/netfilter/ip6_queue.c +--- linux-2.6.16.orig/net/ipv6/netfilter/ip6_queue.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv6/netfilter/ip6_queue.c 2006-07-04 14:41:39.000000000 +0400 +@@ -540,8 +540,11 @@ ipq_rcv_sk(struct sock *sk, int len) + down(&ipqnl_sem); + + for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) { ++ struct ve_struct *env; + skb = skb_dequeue(&sk->sk_receive_queue); ++ env = set_exec_env(VE_OWNER_SKB(skb)); + ipq_rcv_skb(skb); ++ (void)set_exec_env(env); + kfree_skb(skb); + } + +diff -upr linux-2.6.16.orig/net/ipv6/netfilter/ip6_tables.c linux-2.6.16-026test015/net/ipv6/netfilter/ip6_tables.c +--- linux-2.6.16.orig/net/ipv6/netfilter/ip6_tables.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv6/netfilter/ip6_tables.c 2006-07-04 
14:41:39.000000000 +0400 +@@ -32,9 +32,11 @@ + #include <asm/semaphore.h> + #include <linux/proc_fs.h> + #include <linux/cpumask.h> ++#include <ub/ub_mem.h> + + #include <linux/netfilter_ipv6/ip6_tables.h> + #include <linux/netfilter/x_tables.h> ++#include <linux/nfcalls.h> + + MODULE_LICENSE("GPL"); + MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); +@@ -79,6 +81,14 @@ do { \ + #define inline + #endif + ++#ifdef CONFIG_VE_IPTABLES ++/* include ve.h and define get_exec_env */ ++#include <linux/sched.h> ++#define ve_ip6t_standard_target (get_exec_env()->_ip6t_standard_target) ++#else ++#define ve_ip6t_standard_target &ip6t_standard_target ++#endif ++ + /* + We keep a set of rules for each CPU, so we can avoid write-locking + them in the softirq when updating the counters and therefore +@@ -632,7 +642,7 @@ check_entry(struct ip6t_entry *e, const + } + t->u.kernel.target = target; + +- if (t->u.kernel.target == &ip6t_standard_target) { ++ if (t->u.kernel.target == ve_ip6t_standard_target) { + if (!standard_check(t, size)) { + ret = -EINVAL; + goto cleanup_matches; +@@ -1120,7 +1130,7 @@ do_add_counters(void __user *user, unsig + + write_lock_bh(&t->lock); + private = t->private; +- if (private->number != paddc->num_counters) { ++ if (private->number != tmp.num_counters) { + ret = -EINVAL; + goto unlock_up_free; + } +@@ -1148,7 +1158,7 @@ do_ip6t_set_ctl(struct sock *sk, int cmd + { + int ret; + +- if (!capable(CAP_NET_ADMIN)) ++ if (!capable(CAP_VE_NET_ADMIN)) + return -EPERM; + + switch (cmd) { +@@ -1173,7 +1183,7 @@ do_ip6t_get_ctl(struct sock *sk, int cmd + { + int ret; + +- if (!capable(CAP_NET_ADMIN)) ++ if (!capable(CAP_VE_NET_ADMIN)) + return -EPERM; + + switch (cmd) { +@@ -1271,7 +1281,7 @@ do_ip6t_get_ctl(struct sock *sk, int cmd + return ret; + } + +-int ip6t_register_table(struct xt_table *table, ++struct ip6t_table *ip6t_register_table(struct xt_table *table, + const struct ip6t_replace *repl) + { + int ret; +@@ -1282,7 +1292,7 @@ int 
ip6t_register_table(struct xt_table + + newinfo = xt_alloc_table_info(repl->size); + if (!newinfo) +- return -ENOMEM; ++ return ERR_PTR(-ENOMEM); + + /* choose the copy on our node/cpu */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; +@@ -1295,15 +1305,13 @@ int ip6t_register_table(struct xt_table + repl->underflow); + if (ret != 0) { + xt_free_table_info(newinfo); +- return ret; ++ return ERR_PTR(ret); + } + +- if (xt_register_table(table, &bootstrap, newinfo) != 0) { ++ table = virt_xt_register_table(table, &bootstrap, newinfo); ++ if (IS_ERR(table)) + xt_free_table_info(newinfo); +- return ret; +- } +- +- return 0; ++ return table; + } + + void ip6t_unregister_table(struct xt_table *table) +@@ -1311,7 +1319,7 @@ void ip6t_unregister_table(struct xt_tab + struct xt_table_info *private; + void *loc_cpu_entry; + +- private = xt_unregister_table(table); ++ private = virt_xt_unregister_table(table); + + /* Decrease module usage counts and free resources */ + loc_cpu_entry = private->entries[raw_smp_processor_id()]; +@@ -1319,6 +1327,29 @@ void ip6t_unregister_table(struct xt_tab + xt_free_table_info(private); + } + ++void ip6t_flush_table(struct xt_table *table) ++{ ++ struct xt_table *t; ++ void *loc_cpu_entry; ++ ++ if (table == NULL) ++ return; ++ ++ t = xt_find_table_lock(AF_INET6, table->name); ++ if (t && !IS_ERR(t)) { ++ struct xt_table_info *private; ++ private = t->private; ++ loc_cpu_entry = private->entries[raw_smp_processor_id()]; ++ IP6T_ENTRY_ITERATE(loc_cpu_entry, private->size, ++ cleanup_entry, NULL); ++ if (private->number > private->initial_entries) ++ module_put(t->me); ++ private->size = 0; ++ xt_table_unlock(t); ++ module_put(t->me); ++ } ++} ++ + /* Returns 1 if the type and code is matched by the range, 0 otherwise */ + static inline int + icmp6_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code, +@@ -1405,36 +1436,93 @@ static struct ip6t_match icmp6_matchstru + .checkentry = &icmp6_checkentry, + }; + 
+-static int __init init(void) ++static int init_ip6tables(void) + { + int ret; + +- xt_proto_init(AF_INET6); ++ if (ve_ip6t_standard_target != NULL) ++ return -EEXIST; + +- /* Noone else will be downing sem now, so we won't sleep */ +- xt_register_target(AF_INET6, &ip6t_standard_target); +- xt_register_target(AF_INET6, &ip6t_error_target); +- xt_register_match(AF_INET6, &icmp6_matchstruct); ++ ret = xt_register_target(AF_INET6, &ip6t_standard_target); ++ if (ret) ++ goto out; ++#ifdef CONFIG_VE_IPTABLES ++ ve_ip6t_standard_target = xt_find_target(AF_INET6, IP6T_STANDARD_TARGET, 0); ++ if (IS_ERR(ve_ip6t_standard_target)) ++ goto out_standard; ++#endif ++ ret = xt_register_target(AF_INET6, &ip6t_error_target); ++ if (ret) ++ goto out_error; ++ ret = xt_register_match(AF_INET6, &icmp6_matchstruct); ++ if (ret) ++ goto out_icmp; ++ ret = xt_proto_init(AF_INET6); ++ if (ret) ++ goto out_proc; ++ return 0; ++ ++out_proc: ++ xt_unregister_match(AF_INET6, &icmp6_matchstruct); ++out_icmp: ++ xt_unregister_target(AF_INET6, &ip6t_error_target); ++out_error: ++#ifdef CONFIG_VE_IPTABLES ++ ve_ip6t_standard_target = NULL; ++out_standard: ++#endif ++ xt_unregister_target(AF_INET6, &ip6t_standard_target); ++out: ++ return ret; ++} ++ ++static void fini_ip6tables(void) ++{ ++ xt_proto_fini(AF_INET6); ++ xt_unregister_match(AF_INET6, &icmp6_matchstruct); ++ xt_unregister_target(AF_INET6, &ip6t_error_target); ++#ifdef CONFIG_VE_IPTABLES ++ ve_ip6t_standard_target = NULL; ++#endif ++ xt_unregister_target(AF_INET6, &ip6t_standard_target); ++} ++ ++static int __init init(void) ++{ ++ int ret; ++ ++ ret = init_ip6tables(); ++ if (ret) ++ goto out; + + /* Register setsockopt */ + ret = nf_register_sockopt(&ip6t_sockopts); + if (ret < 0) { + duprintf("Unable to register sockopts.\n"); +- xt_proto_fini(AF_INET6); +- return ret; ++ goto out_sockopts; + } + ++ KSYMRESOLVE(init_ip6tables); ++ KSYMRESOLVE(fini_ip6tables); ++ KSYMRESOLVE(ip6t_flush_table); ++ KSYMMODRESOLVE(ip6_tables); + 
printk("ip6_tables: (C) 2000-2006 Netfilter Core Team\n"); + return 0; ++ ++out_sockopts: ++ fini_ip6tables(); ++out: ++ return ret; + } + + static void __exit fini(void) + { ++ KSYMMODUNRESOLVE(ip6_tables); ++ KSYMUNRESOLVE(init_ip6tables); ++ KSYMUNRESOLVE(fini_ip6tables); ++ KSYMUNRESOLVE(ip6t_flush_table); + nf_unregister_sockopt(&ip6t_sockopts); +- xt_unregister_match(AF_INET6, &icmp6_matchstruct); +- xt_unregister_target(AF_INET6, &ip6t_error_target); +- xt_unregister_target(AF_INET6, &ip6t_standard_target); +- xt_proto_fini(AF_INET6); ++ fini_ip6tables(); + } + + /* +@@ -1516,6 +1604,7 @@ EXPORT_SYMBOL(ip6t_do_table); + EXPORT_SYMBOL(ip6t_ext_hdr); + EXPORT_SYMBOL(ipv6_find_hdr); + EXPORT_SYMBOL(ip6_masked_addrcmp); ++EXPORT_SYMBOL(ip6t_flush_table); + +-module_init(init); ++subsys_initcall(init); + module_exit(fini); +diff -upr linux-2.6.16.orig/net/ipv6/netfilter/ip6t_LOG.c linux-2.6.16-026test015/net/ipv6/netfilter/ip6t_LOG.c +--- linux-2.6.16.orig/net/ipv6/netfilter/ip6t_LOG.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv6/netfilter/ip6t_LOG.c 2006-07-04 14:41:39.000000000 +0400 +@@ -20,6 +20,7 @@ + #include <net/udp.h> + #include <net/tcp.h> + #include <net/ipv6.h> ++#include <linux/nfcalls.h> + #include <linux/netfilter.h> + #include <linux/netfilter_ipv6/ip6_tables.h> + +@@ -488,10 +489,23 @@ static struct nf_logger ip6t_logger = { + .me = THIS_MODULE, + }; + ++int init_ip6table_LOG(void) ++{ ++ return ip6t_register_target(&ip6t_log_reg); ++} ++ ++void fini_ip6table_LOG(void) ++{ ++ ip6t_unregister_target(&ip6t_log_reg); ++} ++ + static int __init init(void) + { +- if (ip6t_register_target(&ip6t_log_reg)) +- return -EINVAL; ++ int err; ++ ++ err = init_ip6table_LOG(); ++ if (err < 0) ++ return err; + if (nf_log_register(PF_INET6, &ip6t_logger) < 0) { + printk(KERN_WARNING "ip6t_LOG: not logging via system console " + "since somebody else already registered for PF_INET6\n"); +@@ -499,13 +513,19 @@ static int __init init(void) 
+ * ip6tables userspace would abort */ + } + ++ KSYMRESOLVE(init_ip6table_LOG); ++ KSYMRESOLVE(fini_ip6table_LOG); ++ KSYMMODRESOLVE(ip6t_LOG); + return 0; + } + + static void __exit fini(void) + { ++ KSYMMODUNRESOLVE(ip6t_LOG); ++ KSYMUNRESOLVE(init_ip6table_LOG); ++ KSYMUNRESOLVE(fini_ip6table_LOG); + nf_log_unregister_logger(&ip6t_logger); +- ip6t_unregister_target(&ip6t_log_reg); ++ fini_ip6table_LOG(); + } + + module_init(init); +diff -upr linux-2.6.16.orig/net/ipv6/netfilter/ip6t_REJECT.c linux-2.6.16-026test015/net/ipv6/netfilter/ip6t_REJECT.c +--- linux-2.6.16.orig/net/ipv6/netfilter/ip6t_REJECT.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv6/netfilter/ip6t_REJECT.c 2006-07-04 14:41:39.000000000 +0400 +@@ -26,6 +26,7 @@ + #include <net/ip6_checksum.h> + #include <net/ip6_fib.h> + #include <net/ip6_route.h> ++#include <linux/nfcalls.h> + #include <net/flow.h> + #include <linux/netfilter_ipv6/ip6_tables.h> + #include <linux/netfilter_ipv6/ip6t_REJECT.h> +@@ -268,17 +269,39 @@ static struct ip6t_target ip6t_reject_re + .me = THIS_MODULE + }; + +-static int __init init(void) ++int init_ip6table_REJECT(void) + { + if (ip6t_register_target(&ip6t_reject_reg)) + return -EINVAL; + return 0; + } + +-static void __exit fini(void) ++void fini_ip6table_REJECT(void) + { + ip6t_unregister_target(&ip6t_reject_reg); + } + ++static int __init init(void) ++{ ++ int err; ++ ++ err = init_ip6table_REJECT(); ++ if (err < 0) ++ return err; ++ ++ KSYMRESOLVE(init_ip6table_REJECT); ++ KSYMRESOLVE(fini_ip6table_REJECT); ++ KSYMMODRESOLVE(ip6t_REJECT); ++ return 0; ++} ++ ++static void __exit fini(void) ++{ ++ KSYMMODUNRESOLVE(ip6t_REJECT); ++ KSYMUNRESOLVE(init_ip6table_REJECT); ++ KSYMUNRESOLVE(fini_ip6table_REJECT); ++ fini_ip6table_REJECT(); ++} ++ + module_init(init); + module_exit(fini); +diff -upr linux-2.6.16.orig/net/ipv6/netfilter/ip6t_multiport.c linux-2.6.16-026test015/net/ipv6/netfilter/ip6t_multiport.c +--- 
linux-2.6.16.orig/net/ipv6/netfilter/ip6t_multiport.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv6/netfilter/ip6t_multiport.c 2006-07-04 14:41:39.000000000 +0400 +@@ -14,6 +14,7 @@ + #include <linux/udp.h> + #include <linux/skbuff.h> + #include <linux/in.h> ++#include <linux/nfcalls.h> + + #include <linux/netfilter_ipv6/ip6t_multiport.h> + #include <linux/netfilter_ipv6/ip6_tables.h> +@@ -112,15 +113,37 @@ static struct ip6t_match multiport_match + .me = THIS_MODULE, + }; + +-static int __init init(void) ++int init_ip6table_multiport(void) + { + return ip6t_register_match(&multiport_match); + } + +-static void __exit fini(void) ++void fini_ip6table_multiport(void) + { + ip6t_unregister_match(&multiport_match); + } + ++static int __init init(void) ++{ ++ int err; ++ ++ err = init_ip6table_multiport(); ++ if (err < 0) ++ return err; ++ ++ KSYMRESOLVE(init_ip6table_multiport); ++ KSYMRESOLVE(fini_ip6table_multiport); ++ KSYMMODRESOLVE(ip6t_multiport); ++ return 0; ++} ++ ++static void __exit fini(void) ++{ ++ KSYMMODUNRESOLVE(ip6t_multiport); ++ KSYMUNRESOLVE(init_ip6table_multiport); ++ KSYMUNRESOLVE(fini_ip6table_multiport); ++ fini_ip6table_multiport(); ++} ++ + module_init(init); + module_exit(fini); +diff -upr linux-2.6.16.orig/net/ipv6/netfilter/ip6table_filter.c linux-2.6.16-026test015/net/ipv6/netfilter/ip6table_filter.c +--- linux-2.6.16.orig/net/ipv6/netfilter/ip6table_filter.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv6/netfilter/ip6table_filter.c 2006-07-04 14:41:39.000000000 +0400 +@@ -11,12 +11,20 @@ + + #include <linux/module.h> + #include <linux/moduleparam.h> ++#include <linux/nfcalls.h> + #include <linux/netfilter_ipv6/ip6_tables.h> + + MODULE_LICENSE("GPL"); + MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); + MODULE_DESCRIPTION("ip6tables filter table"); + ++#ifdef CONFIG_VE_IPTABLES ++#include <linux/sched.h> ++#define ve_packet_filter (get_exec_env()->_ve_ip6t_filter_pf) 
++#else ++#define ve_packet_filter &packet_filter ++#endif ++ + #define FILTER_VALID_HOOKS ((1 << NF_IP6_LOCAL_IN) | (1 << NF_IP6_FORWARD) | (1 << NF_IP6_LOCAL_OUT)) + + /* Standard entry. */ +@@ -43,7 +51,7 @@ static struct + struct ip6t_replace repl; + struct ip6t_standard entries[3]; + struct ip6t_error term; +-} initial_table __initdata ++} initial_table + = { { "filter", FILTER_VALID_HOOKS, 4, + sizeof(struct ip6t_standard) * 3 + sizeof(struct ip6t_error), + { [NF_IP6_LOCAL_IN] = 0, +@@ -108,7 +116,7 @@ ip6t_hook(unsigned int hook, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) + { +- return ip6t_do_table(pskb, hook, in, out, &packet_filter, NULL); ++ return ip6t_do_table(pskb, hook, in, out, ve_packet_filter, NULL); + } + + static unsigned int +@@ -128,7 +136,7 @@ ip6t_local_out_hook(unsigned int hook, + } + #endif + +- return ip6t_do_table(pskb, hook, in, out, &packet_filter, NULL); ++ return ip6t_do_table(pskb, hook, in, out, ve_packet_filter, NULL); + } + + static struct nf_hook_ops ip6t_ops[] = { +@@ -159,56 +167,89 @@ static struct nf_hook_ops ip6t_ops[] = { + static int forward = NF_ACCEPT; + module_param(forward, bool, 0000); + +-static int __init init(void) ++int init_ip6table_filter(void) + { + int ret; +- +- if (forward < 0 || forward > NF_MAX_VERDICT) { +- printk("iptables forward must be 0 or 1\n"); +- return -EINVAL; +- } +- +- /* Entry 1 is the FORWARD hook */ +- initial_table.entries[1].target.verdict = -forward - 1; ++ struct ip6t_table *tmp_filter; + + /* Register table */ +- ret = ip6t_register_table(&packet_filter, &initial_table.repl); +- if (ret < 0) +- return ret; ++ tmp_filter = ip6t_register_table(&packet_filter, ++ &initial_table.repl); ++ if (IS_ERR(tmp_filter)) ++ return PTR_ERR(tmp_filter); ++#ifdef CONFIG_VE_IPTABLES ++ ve_packet_filter = tmp_filter; ++#endif + + /* Register hooks */ +- ret = nf_register_hook(&ip6t_ops[0]); ++ ret = virt_nf_register_hook(&ip6t_ops[0]); + if (ret < 0) + goto cleanup_table; + +- 
ret = nf_register_hook(&ip6t_ops[1]); ++ ret = virt_nf_register_hook(&ip6t_ops[1]); + if (ret < 0) + goto cleanup_hook0; + +- ret = nf_register_hook(&ip6t_ops[2]); ++ ret = virt_nf_register_hook(&ip6t_ops[2]); + if (ret < 0) + goto cleanup_hook1; + + return ret; + + cleanup_hook1: +- nf_unregister_hook(&ip6t_ops[1]); ++ virt_nf_unregister_hook(&ip6t_ops[1]); + cleanup_hook0: +- nf_unregister_hook(&ip6t_ops[0]); ++ virt_nf_unregister_hook(&ip6t_ops[0]); + cleanup_table: +- ip6t_unregister_table(&packet_filter); ++ ip6t_unregister_table(ve_packet_filter); ++#ifdef CONFIG_VE_IPTABLES ++ ve_packet_filter = NULL; ++#endif + + return ret; + } + +-static void __exit fini(void) ++void fini_ip6table_filter(void) + { + unsigned int i; + + for (i = 0; i < sizeof(ip6t_ops)/sizeof(struct nf_hook_ops); i++) +- nf_unregister_hook(&ip6t_ops[i]); ++ virt_nf_unregister_hook(&ip6t_ops[i]); + +- ip6t_unregister_table(&packet_filter); ++ ip6t_unregister_table(ve_packet_filter); ++#ifdef CONFIG_VE_IPTABLES ++ ve_packet_filter = NULL; ++#endif ++} ++ ++static int __init init(void) ++{ ++ int err; ++ ++ if (forward < 0 || forward > NF_MAX_VERDICT) { ++ printk("iptables forward must be 0 or 1\n"); ++ return -EINVAL; ++ } ++ ++ /* Entry 1 is the FORWARD hook */ ++ initial_table.entries[1].target.verdict = -forward - 1; ++ ++ err = init_ip6table_filter(); ++ if (err < 0) ++ return err; ++ ++ KSYMRESOLVE(init_ip6table_filter); ++ KSYMRESOLVE(fini_ip6table_filter); ++ KSYMMODRESOLVE(ip6table_filter); ++ return 0; ++} ++ ++static void __exit fini(void) ++{ ++ KSYMMODUNRESOLVE(ip6table_filter); ++ KSYMUNRESOLVE(init_ip6table_filter); ++ KSYMUNRESOLVE(fini_ip6table_filter); ++ fini_ip6table_filter(); + } + + module_init(init); +diff -upr linux-2.6.16.orig/net/ipv6/netfilter/ip6table_mangle.c linux-2.6.16-026test015/net/ipv6/netfilter/ip6table_mangle.c +--- linux-2.6.16.orig/net/ipv6/netfilter/ip6table_mangle.c 2006-03-20 08:53:29.000000000 +0300 ++++ 
linux-2.6.16-026test015/net/ipv6/netfilter/ip6table_mangle.c 2006-07-04 14:41:39.000000000 +0400 +@@ -12,6 +12,7 @@ + */ + #include <linux/module.h> + #include <linux/netfilter_ipv6/ip6_tables.h> ++#include <linux/nfcalls.h> + + MODULE_LICENSE("GPL"); + MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); +@@ -53,7 +54,7 @@ static struct + struct ip6t_replace repl; + struct ip6t_standard entries[5]; + struct ip6t_error term; +-} initial_table __initdata ++} initial_table + = { { "mangle", MANGLE_VALID_HOOKS, 6, + sizeof(struct ip6t_standard) * 5 + sizeof(struct ip6t_error), + { [NF_IP6_PRE_ROUTING] = 0, +@@ -130,6 +131,13 @@ static struct ip6t_table packet_mangler + .af = AF_INET6, + }; + ++#ifdef CONFIG_VE_IPTABLES ++#include <linux/sched.h> ++#define ve_packet_mangler (get_exec_env()->_ip6t_mangle_table) ++#else ++#define ve_packet_mangler &packet_mangler ++#endif ++ + /* The work comes in here from netfilter.c. */ + static unsigned int + ip6t_route_hook(unsigned int hook, +@@ -138,7 +146,7 @@ ip6t_route_hook(unsigned int hook, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) + { +- return ip6t_do_table(pskb, hook, in, out, &packet_mangler, NULL); ++ return ip6t_do_table(pskb, hook, in, out, ve_packet_mangler, NULL); + } + + static unsigned int +@@ -174,7 +182,7 @@ ip6t_local_hook(unsigned int hook, + /* flowlabel and prio (includes version, which shouldn't change either */ + flowlabel = *((u_int32_t *) (*pskb)->nh.ipv6h); + +- ret = ip6t_do_table(pskb, hook, in, out, &packet_mangler, NULL); ++ ret = ip6t_do_table(pskb, hook, in, out, ve_packet_mangler, NULL); + + if (ret != NF_DROP && ret != NF_STOLEN + && (memcmp(&(*pskb)->nh.ipv6h->saddr, &saddr, sizeof(saddr)) +@@ -228,60 +236,93 @@ static struct nf_hook_ops ip6t_ops[] = { + }, + }; + +-static int __init init(void) ++int init_ip6table_mangle(void) + { + int ret; ++ struct ip6t_table *tmp_mangler; + + /* Register table */ +- ret = ip6t_register_table(&packet_mangler, 
&initial_table.repl); +- if (ret < 0) +- return ret; ++ tmp_mangler = ip6t_register_table(&packet_mangler, ++ &initial_table.repl); ++ if (IS_ERR(tmp_mangler)) ++ return PTR_ERR(tmp_mangler); ++#ifdef CONFIG_VE_IPTABLES ++ ve_packet_mangler = tmp_mangler; ++#endif + + /* Register hooks */ +- ret = nf_register_hook(&ip6t_ops[0]); ++ ret = virt_nf_register_hook(&ip6t_ops[0]); + if (ret < 0) + goto cleanup_table; + +- ret = nf_register_hook(&ip6t_ops[1]); ++ ret = virt_nf_register_hook(&ip6t_ops[1]); + if (ret < 0) + goto cleanup_hook0; + +- ret = nf_register_hook(&ip6t_ops[2]); ++ ret = virt_nf_register_hook(&ip6t_ops[2]); + if (ret < 0) + goto cleanup_hook1; + +- ret = nf_register_hook(&ip6t_ops[3]); ++ ret = virt_nf_register_hook(&ip6t_ops[3]); + if (ret < 0) + goto cleanup_hook2; + +- ret = nf_register_hook(&ip6t_ops[4]); ++ ret = virt_nf_register_hook(&ip6t_ops[4]); + if (ret < 0) + goto cleanup_hook3; + + return ret; + + cleanup_hook3: +- nf_unregister_hook(&ip6t_ops[3]); ++ virt_nf_unregister_hook(&ip6t_ops[3]); + cleanup_hook2: +- nf_unregister_hook(&ip6t_ops[2]); ++ virt_nf_unregister_hook(&ip6t_ops[2]); + cleanup_hook1: +- nf_unregister_hook(&ip6t_ops[1]); ++ virt_nf_unregister_hook(&ip6t_ops[1]); + cleanup_hook0: +- nf_unregister_hook(&ip6t_ops[0]); ++ virt_nf_unregister_hook(&ip6t_ops[0]); + cleanup_table: +- ip6t_unregister_table(&packet_mangler); ++ ip6t_unregister_table(ve_packet_mangler); ++#ifdef CONFIG_VE_IPTABLES ++ ve_packet_mangler = NULL; ++#endif + + return ret; + } + +-static void __exit fini(void) ++void fini_ip6table_mangle(void) + { + unsigned int i; + + for (i = 0; i < sizeof(ip6t_ops)/sizeof(struct nf_hook_ops); i++) +- nf_unregister_hook(&ip6t_ops[i]); ++ virt_nf_unregister_hook(&ip6t_ops[i]); ++ ++ ip6t_unregister_table(ve_packet_mangler); ++#ifdef CONFIG_VE_IPTABLES ++ ve_packet_mangler = NULL; ++#endif ++} ++ ++static int __init init(void) ++{ ++ int err; + +- ip6t_unregister_table(&packet_mangler); ++ err = init_ip6table_mangle(); ++ 
if (err < 0) ++ return err; ++ ++ KSYMRESOLVE(init_ip6table_mangle); ++ KSYMRESOLVE(fini_ip6table_mangle); ++ KSYMMODRESOLVE(ip6table_mangle); ++ return 0; ++} ++ ++static void __exit fini(void) ++{ ++ KSYMMODUNRESOLVE(ip6table_mangle); ++ KSYMUNRESOLVE(init_ip6table_mangle); ++ KSYMUNRESOLVE(fini_ip6table_mangle); ++ fini_ip6table_mangle(); + } + + module_init(init); +diff -upr linux-2.6.16.orig/net/ipv6/netfilter/ip6table_raw.c linux-2.6.16-026test015/net/ipv6/netfilter/ip6table_raw.c +--- linux-2.6.16.orig/net/ipv6/netfilter/ip6table_raw.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv6/netfilter/ip6table_raw.c 2006-07-04 14:41:39.000000000 +0400 +@@ -145,11 +145,12 @@ static struct nf_hook_ops ip6t_ops[] = { + static int __init init(void) + { + int ret; ++ struct ip6t_table *tmp; + + /* Register table */ +- ret = ip6t_register_table(&packet_raw, &initial_table.repl); +- if (ret < 0) +- return ret; ++ tmp = ip6t_register_table(&packet_raw, &initial_table.repl); ++ if (IS_ERR(tmp)) ++ return PTR_ERR(tmp); + + /* Register hooks */ + ret = nf_register_hook(&ip6t_ops[0]); +diff -upr linux-2.6.16.orig/net/ipv6/proc.c linux-2.6.16-026test015/net/ipv6/proc.c +--- linux-2.6.16.orig/net/ipv6/proc.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv6/proc.c 2006-07-04 14:41:39.000000000 +0400 +@@ -25,13 +25,18 @@ + #include <linux/proc_fs.h> + #include <linux/seq_file.h> + #include <linux/stddef.h> ++#include <linux/ve.h> + #include <net/sock.h> + #include <net/tcp.h> + #include <net/transp_v6.h> + #include <net/ipv6.h> + + #ifdef CONFIG_PROC_FS ++#ifdef CONFIG_VE ++#define proc_net_devsnmp6 (get_exec_env()->_proc_net_devsnmp6) ++#else + static struct proc_dir_entry *proc_net_devsnmp6; ++#endif + + static int fold_prot_inuse(struct proto *proto) + { +@@ -164,9 +169,9 @@ static int snmp6_seq_show(struct seq_fil + seq_printf(seq, "%-32s\t%u\n", "ifIndex", idev->dev->ifindex); + snmp6_seq_show_item(seq, (void 
**)idev->stats.icmpv6, snmp6_icmp6_list); + } else { +- snmp6_seq_show_item(seq, (void **)ipv6_statistics, snmp6_ipstats_list); +- snmp6_seq_show_item(seq, (void **)icmpv6_statistics, snmp6_icmp6_list); +- snmp6_seq_show_item(seq, (void **)udp_stats_in6, snmp6_udp6_list); ++ snmp6_seq_show_item(seq, (void **)ve_ipv6_statistics, snmp6_ipstats_list); ++ snmp6_seq_show_item(seq, (void **)ve_icmpv6_statistics, snmp6_icmp6_list); ++ snmp6_seq_show_item(seq, (void **)ve_udp_stats_in6, snmp6_udp6_list); + } + return 0; + } +@@ -229,15 +234,27 @@ int snmp6_unregister_dev(struct inet6_de + return 0; + } + ++int ve_snmp_proc_init(void) ++{ ++ proc_net_devsnmp6 = proc_mkdir("dev_snmp6", proc_net); ++ return proc_net_devsnmp6 == NULL ? -ENOMEM : 0; ++} ++EXPORT_SYMBOL(ve_snmp_proc_init); ++ ++void ve_snmp_proc_fini(void) ++{ ++ proc_net_remove("dev_snmp6"); ++} ++EXPORT_SYMBOL(ve_snmp_proc_fini); ++ + int __init ipv6_misc_proc_init(void) + { + int rc = 0; + +- if (!proc_net_fops_create("snmp6", S_IRUGO, &snmp6_seq_fops)) ++ if (!proc_glob_fops_create("net/snmp6", S_IRUGO, &snmp6_seq_fops)) + goto proc_snmp6_fail; + +- proc_net_devsnmp6 = proc_mkdir("dev_snmp6", proc_net); +- if (!proc_net_devsnmp6) ++ if (ve_snmp_proc_init()) + goto proc_dev_snmp6_fail; + + if (!proc_net_fops_create("sockstat6", S_IRUGO, &sockstat6_seq_fops)) +@@ -246,9 +263,9 @@ out: + return rc; + + proc_sockstat6_fail: +- proc_net_remove("dev_snmp6"); ++ ve_snmp_proc_fini(); + proc_dev_snmp6_fail: +- proc_net_remove("snmp6"); ++ remove_proc_glob_entry("net/snmp6", NULL); + proc_snmp6_fail: + rc = -ENOMEM; + goto out; +diff -upr linux-2.6.16.orig/net/ipv6/raw.c linux-2.6.16-026test015/net/ipv6/raw.c +--- linux-2.6.16.orig/net/ipv6/raw.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv6/raw.c 2006-07-04 14:41:39.000000000 +0400 +@@ -99,6 +99,9 @@ struct sock *__raw_v6_lookup(struct sock + if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) + continue; + ++ if 
(!ve_accessible_strict(VE_OWNER_SK(sk), get_exec_env())) ++ continue; ++ + if (!ipv6_addr_any(&np->rcv_saddr)) { + if (ipv6_addr_equal(&np->rcv_saddr, loc_addr)) + goto found; +@@ -1046,8 +1049,14 @@ static struct sock *raw6_get_next(struct + do { + sk = sk_next(sk); + try_again: +- ; +- } while (sk && sk->sk_family != PF_INET6); ++ if (!sk) ++ break; ++ if (sk->sk_family != PF_INET6) ++ continue; ++ if (ve_accessible(VE_OWNER_SK(sk), ++ get_exec_env())) ++ break; ++ } while (1); + + if (!sk && ++state->bucket < RAWV6_HTABLE_SIZE) { + sk = sk_head(&raw_v6_htable[state->bucket]); +@@ -1166,13 +1175,13 @@ static struct file_operations raw6_seq_f + + int __init raw6_proc_init(void) + { +- if (!proc_net_fops_create("raw6", S_IRUGO, &raw6_seq_fops)) ++ if (!proc_glob_fops_create("net/raw6", S_IRUGO, &raw6_seq_fops)) + return -ENOMEM; + return 0; + } + + void raw6_proc_exit(void) + { +- proc_net_remove("raw6"); ++ remove_proc_glob_entry("net/raw6", NULL); + } + #endif /* CONFIG_PROC_FS */ +diff -upr linux-2.6.16.orig/net/ipv6/reassembly.c linux-2.6.16-026test015/net/ipv6/reassembly.c +--- linux-2.6.16.orig/net/ipv6/reassembly.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv6/reassembly.c 2006-07-04 14:41:39.000000000 +0400 +@@ -43,6 +43,7 @@ + #include <linux/icmpv6.h> + #include <linux/random.h> + #include <linux/jhash.h> ++#include <linux/ve_owner.h> + + #include <net/sock.h> + #include <net/snmp.h> +@@ -53,6 +54,7 @@ + #include <net/rawv6.h> + #include <net/ndisc.h> + #include <net/addrconf.h> ++#include <linux/ve_owner.h> + + int sysctl_ip6frag_high_thresh = 256*1024; + int sysctl_ip6frag_low_thresh = 192*1024; +@@ -95,8 +97,12 @@ struct frag_queue + #define FIRST_IN 2 + #define LAST_IN 1 + __u16 nhoffset; ++ struct ve_struct *owner_env; + }; + ++DCL_VE_OWNER_PROTO(IP6Q, struct frag_queue, owner_env) ++DCL_VE_OWNER(IP6Q, struct frag_queue, owner_env) ++ + /* Hash table. 
*/ + + #define IP6Q_HASHSZ 64 +@@ -288,6 +294,9 @@ static void ip6_evictor(void) + static void ip6_frag_expire(unsigned long data) + { + struct frag_queue *fq = (struct frag_queue *) data; ++ struct ve_struct *envid; ++ ++ envid = set_exec_env(VE_OWNER_IP6Q(fq)); + + spin_lock(&fq->lock); + +@@ -318,6 +327,8 @@ static void ip6_frag_expire(unsigned lon + out: + spin_unlock(&fq->lock); + fq_put(fq, NULL); ++ ++ (void)set_exec_env(envid); + } + + /* Creation primitives. */ +@@ -336,7 +347,8 @@ static struct frag_queue *ip6_frag_inter + hlist_for_each_entry(fq, n, &ip6_frag_hash[hash], list) { + if (fq->id == fq_in->id && + ipv6_addr_equal(&fq_in->saddr, &fq->saddr) && +- ipv6_addr_equal(&fq_in->daddr, &fq->daddr)) { ++ ipv6_addr_equal(&fq_in->daddr, &fq->daddr) && ++ fq->owner_env == get_exec_env()) { + atomic_inc(&fq->refcnt); + write_unlock(&ip6_frag_lock); + fq_in->last_in |= COMPLETE; +@@ -380,6 +392,8 @@ ip6_frag_create(unsigned int hash, u32 i + spin_lock_init(&fq->lock); + atomic_set(&fq->refcnt, 1); + ++ SET_VE_OWNER_IP6Q(fq, get_exec_env()); ++ + return ip6_frag_intern(hash, fq); + + oom: +@@ -398,7 +412,8 @@ fq_find(u32 id, struct in6_addr *src, st + hlist_for_each_entry(fq, n, &ip6_frag_hash[hash], list) { + if (fq->id == id && + ipv6_addr_equal(src, &fq->saddr) && +- ipv6_addr_equal(dst, &fq->daddr)) { ++ ipv6_addr_equal(dst, &fq->daddr) && ++ fq->owner_env == get_exec_env()) { + atomic_inc(&fq->refcnt); + read_unlock(&ip6_frag_lock); + return fq; +@@ -727,6 +742,9 @@ static int ipv6_frag_rcv(struct sk_buff + fq->meat == fq->len) + ret = ip6_frag_reasm(fq, skbp, dev); + ++ if (ret > 0) ++ SET_VE_OWNER_SKB(*skbp, VE_OWNER_SKB(skb)); ++ + spin_unlock(&fq->lock); + fq_put(fq, NULL); + return ret; +@@ -737,6 +755,50 @@ static int ipv6_frag_rcv(struct sk_buff + return -1; + } + ++#ifdef CONFIG_VE ++/* XXX */ ++void ip6_frag_cleanup(struct ve_struct *envid) ++{ ++ int i, progress; ++ ++ local_bh_disable(); ++ do { ++ progress = 0; ++ for (i = 0; i < IP6Q_HASHSZ; 
i++) { ++ struct frag_queue *fq; ++ struct hlist_node *p, *n; ++ ++ if (hlist_empty(&ip6_frag_hash[i])) ++ continue; ++inner_restart: ++ read_lock(&ip6_frag_lock); ++ hlist_for_each_entry_safe(fq, p, n, ++ &ip6_frag_hash[i], list) { ++ if (!ve_accessible_strict( ++ VE_OWNER_IP6Q(fq), ++ envid)) ++ continue; ++ atomic_inc(&fq->refcnt); ++ read_unlock(&ip6_frag_lock); ++ ++ spin_lock(&fq->lock); ++ if (!(fq->last_in&COMPLETE)) ++ fq_kill(fq); ++ spin_unlock(&fq->lock); ++ ++ fq_put(fq, NULL); ++ progress = 1; ++ goto inner_restart; ++ } ++ read_unlock(&ip6_frag_lock); ++ } ++ } while(progress); ++ local_bh_enable(); ++} ++EXPORT_SYMBOL(ip6_frag_cleanup); ++#endif ++ ++ + static struct inet6_protocol frag_protocol = + { + .handler = ipv6_frag_rcv, +diff -upr linux-2.6.16.orig/net/ipv6/route.c linux-2.6.16-026test015/net/ipv6/route.c +--- linux-2.6.16.orig/net/ipv6/route.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv6/route.c 2006-07-04 14:41:39.000000000 +0400 +@@ -52,7 +52,6 @@ + #include <net/addrconf.h> + #include <net/tcp.h> + #include <linux/rtnetlink.h> +-#include <net/dst.h> + #include <net/xfrm.h> + + #include <asm/uaccess.h> +@@ -113,7 +112,6 @@ struct rt6_info ip6_null_entry = { + .dst = { + .__refcnt = ATOMIC_INIT(1), + .__use = 1, +- .dev = &loopback_dev, + .obsolete = -1, + .error = -ENETUNREACH, + .metrics = { [RTAX_HOPLIMIT - 1] = 255, }, +@@ -128,11 +126,19 @@ struct rt6_info ip6_null_entry = { + .rt6i_ref = ATOMIC_INIT(1), + }; + +-struct fib6_node ip6_routing_table = { +- .leaf = &ip6_null_entry, +- .fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO, ++struct fib6_table global_fib6_table = { ++ .root = { ++ .leaf = &ip6_null_entry, ++ .fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO, ++ } + }; + ++#ifdef CONFIG_VE ++#define ip6_routing_table (get_exec_env()->_fib6_table->root) ++#else ++#define ip6_routing_table (global_ip6_routing_table.root) ++#endif ++ + /* Protects all the ip6 fib */ + + DEFINE_RWLOCK(rt6_lock); +@@ -778,7 
+784,7 @@ static int ipv6_get_mtu(struct net_devic + + int ipv6_get_hoplimit(struct net_device *dev) + { +- int hoplimit = ipv6_devconf.hop_limit; ++ int hoplimit = ve_ipv6_devconf.hop_limit; + struct inet6_dev *idev; + + idev = in6_dev_get(dev); +@@ -1421,10 +1427,12 @@ struct rt6_info *addrconf_dst_alloc(stru + rt->rt6i_flags |= RTF_ANYCAST; + else + rt->rt6i_flags |= RTF_LOCAL; +- rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway); +- if (rt->rt6i_nexthop == NULL) { ++ rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, rt->rt6i_dev); ++ if (IS_ERR(rt->rt6i_nexthop)) { ++ void *err = rt->rt6i_nexthop; ++ rt->rt6i_nexthop = NULL; + dst_free((struct dst_entry *) rt); +- return ERR_PTR(-ENOMEM); ++ return err; + } + + ipv6_addr_copy(&rt->rt6i_dst.addr, addr); +@@ -1640,8 +1648,12 @@ static int rt6_fill_node(struct sk_buff + goto rtattr_failure; + if (rt->u.dst.neighbour) + RTA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key); +- if (rt->u.dst.dev) +- RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->rt6i_dev->ifindex); ++ if (rt->u.dst.dev) { ++ struct net_device *odev = rt->rt6i_dev; ++ if (rt == &ip6_null_entry) ++ odev = &loopback_dev; ++ RTA_PUT(skb, RTA_OIF, sizeof(int), &odev->ifindex); ++ } + RTA_PUT(skb, RTA_PRIORITY, 4, &rt->rt6i_metric); + ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse); + if (rt->rt6i_expires) +@@ -2110,23 +2122,31 @@ void __init ip6_route_init(void) + if (!ip6_dst_ops.kmem_cachep) + panic("cannot create ip6_dst_cache"); + ++#ifdef CONFIG_VE ++ global_fib6_table.owner_env = get_ve0(); ++ get_ve0()->_fib6_table = &global_fib6_table; ++#endif ++ list_add(&global_fib6_table.list, &fib6_table_list); + fib6_init(); + #ifdef CONFIG_PROC_FS +- p = proc_net_create("ipv6_route", 0, rt6_proc_info); +- if (p) ++ p = create_proc_glob_entry("net/ipv6_route", 0, NULL); ++ if (p) { + p->owner = THIS_MODULE; ++ p->get_info = rt6_proc_info; ++ } + + proc_net_fops_create("rt6_stats", S_IRUGO, 
&rt6_stats_seq_fops); + #endif + #ifdef CONFIG_XFRM + xfrm6_init(); + #endif ++ ip6_null_entry.u.dst.dev = &loopback_dev; + } + + void ip6_route_cleanup(void) + { + #ifdef CONFIG_PROC_FS +- proc_net_remove("ipv6_route"); ++ remove_proc_glob_entry("net/ipv6_route", NULL); + proc_net_remove("rt6_stats"); + #endif + #ifdef CONFIG_XFRM +@@ -2136,3 +2156,35 @@ void ip6_route_cleanup(void) + fib6_gc_cleanup(); + kmem_cache_destroy(ip6_dst_ops.kmem_cachep); + } ++ ++int init_ve_route6(struct ve_struct *ve) ++{ ++ struct ve_struct *old_env = set_exec_env(ve); ++ ve->_fib6_table = kzalloc(sizeof(struct fib6_table), GFP_KERNEL_UBC); ++ if (ve->_fib6_table) { ++ ve->_fib6_table->owner_env = ve; ++ ve->_fib6_table->root.leaf = &ip6_null_entry; ++ ve->_fib6_table->root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; ++ write_lock_bh(&rt6_lock); ++ list_add(&ve->_fib6_table->list, &fib6_table_list); ++ write_unlock_bh(&rt6_lock); ++ } ++ set_exec_env(old_env); ++ return ve->_fib6_table ? 0 : -ENOMEM; ++} ++EXPORT_SYMBOL(init_ve_route6); ++ ++void fini_ve_route6(struct ve_struct *ve) ++{ ++ struct ve_struct *old_env = set_exec_env(ve); ++ ++ if (ve->_fib6_table) { ++ rt6_ifdown(NULL); ++ write_lock_bh(&rt6_lock); ++ list_del(&ve->_fib6_table->list); ++ write_unlock_bh(&rt6_lock); ++ kfree(ve->_fib6_table); ++ } ++ set_exec_env(old_env); ++} ++EXPORT_SYMBOL(fini_ve_route6); +diff -upr linux-2.6.16.orig/net/ipv6/tcp_ipv6.c linux-2.6.16-026test015/net/ipv6/tcp_ipv6.c +--- linux-2.6.16.orig/net/ipv6/tcp_ipv6.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv6/tcp_ipv6.c 2006-07-04 14:41:39.000000000 +0400 +@@ -62,6 +62,8 @@ + #include <net/dsfield.h> + #include <net/timewait_sock.h> + ++#include <ub/ub_tcp.h> ++ + #include <asm/uaccess.h> + + #include <linux/proc_fs.h> +@@ -77,7 +79,7 @@ static void tcp_v6_send_check(struct soc + + static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); + +-static struct inet_connection_sock_af_ops ipv6_mapped; 
++struct inet_connection_sock_af_ops ipv6_mapped; + static struct inet_connection_sock_af_ops ipv6_specific; + + static int tcp_v6_get_port(struct sock *sk, unsigned short snum) +@@ -273,6 +275,8 @@ static int tcp_v6_connect(struct sock *s + ip6_dst_store(sk, dst, NULL); + sk->sk_route_caps = dst->dev->features & + ~(NETIF_F_IP_CSUM | NETIF_F_TSO); ++ if (!sysctl_tcp_use_sg) ++ sk->sk_route_caps &= ~NETIF_F_SG; + + icsk->icsk_ext_hdr_len = 0; + if (np->opt) +@@ -933,6 +937,8 @@ static struct sock * tcp_v6_syn_recv_soc + ip6_dst_store(newsk, dst, NULL); + newsk->sk_route_caps = dst->dev->features & + ~(NETIF_F_IP_CSUM | NETIF_F_TSO); ++ if (!sysctl_tcp_use_sg) ++ newsk->sk_route_caps &= ~NETIF_F_SG; + + newtcp6sk = (struct tcp6_sock *)newsk; + inet_sk(newsk)->pinet6 = &newtcp6sk->inet6; +@@ -1040,6 +1046,8 @@ static int tcp_v6_do_rcv(struct sock *sk + struct ipv6_pinfo *np = inet6_sk(sk); + struct tcp_sock *tp; + struct sk_buff *opt_skb = NULL; ++ struct user_beancounter *ub; ++ + + /* Imagine: socket is IPv6. IPv4 packet arrives, + goes to IPv4 receive handler and backlogged. 
+@@ -1052,6 +1060,8 @@ static int tcp_v6_do_rcv(struct sock *sk + if (skb->protocol == htons(ETH_P_IP)) + return tcp_v4_do_rcv(sk, skb); + ++ ub = set_exec_ub(sock_bc(sk)->ub); ++ + if (sk_filter(sk, skb, 0)) + goto discard; + +@@ -1083,7 +1093,7 @@ static int tcp_v6_do_rcv(struct sock *sk + TCP_CHECK_TIMER(sk); + if (opt_skb) + goto ipv6_pktoptions; +- return 0; ++ goto restore_context; + } + + if (skb->len < (skb->h.th->doff<<2) || tcp_checksum_complete(skb)) +@@ -1104,7 +1114,7 @@ static int tcp_v6_do_rcv(struct sock *sk + goto reset; + if (opt_skb) + __kfree_skb(opt_skb); +- return 0; ++ goto restore_context; + } + } + +@@ -1114,6 +1124,9 @@ static int tcp_v6_do_rcv(struct sock *sk + TCP_CHECK_TIMER(sk); + if (opt_skb) + goto ipv6_pktoptions; ++ ++restore_context: ++ (void)set_exec_ub(ub); + return 0; + + reset: +@@ -1122,7 +1135,7 @@ discard: + if (opt_skb) + __kfree_skb(opt_skb); + kfree_skb(skb); +- return 0; ++ goto restore_context; + csum_err: + TCP_INC_STATS_BH(TCP_MIB_INERRS); + goto discard; +@@ -1154,7 +1167,7 @@ ipv6_pktoptions: + + if (opt_skb) + kfree_skb(opt_skb); +- return 0; ++ goto restore_context; + } + + static int tcp_v6_rcv(struct sk_buff **pskb) +@@ -1315,7 +1328,7 @@ static struct inet_connection_sock_af_op + * TCP over IPv4 via INET6 API + */ + +-static struct inet_connection_sock_af_ops ipv6_mapped = { ++struct inet_connection_sock_af_ops ipv6_mapped = { + .queue_xmit = ip_queue_xmit, + .send_check = tcp_v4_send_check, + .rebuild_header = inet_sk_rebuild_header, +@@ -1329,6 +1342,7 @@ static struct inet_connection_sock_af_op + .addr2sockaddr = inet6_csk_addr2sockaddr, + .sockaddr_len = sizeof(struct sockaddr_in6) + }; ++EXPORT_SYMBOL_GPL(ipv6_mapped); + + + +@@ -1535,7 +1549,7 @@ out: + static struct file_operations tcp6_seq_fops; + static struct tcp_seq_afinfo tcp6_seq_afinfo = { + .owner = THIS_MODULE, +- .name = "tcp6", ++ .name = "net/tcp6", + .family = AF_INET6, + .seq_show = tcp6_seq_show, + .seq_fops = &tcp6_seq_fops, +diff -upr 
linux-2.6.16.orig/net/ipv6/udp.c linux-2.6.16-026test015/net/ipv6/udp.c +--- linux-2.6.16.orig/net/ipv6/udp.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv6/udp.c 2006-07-04 14:41:39.000000000 +0400 +@@ -69,7 +69,9 @@ static int udp_v6_get_port(struct sock * + { + struct sock *sk2; + struct hlist_node *node; ++ struct ve_struct *env; + ++ env = VE_OWNER_SK(sk); + write_lock_bh(&udp_hash_lock); + if (snum == 0) { + int best_size_so_far, best, result, i; +@@ -83,7 +85,7 @@ static int udp_v6_get_port(struct sock * + int size; + struct hlist_head *list; + +- list = &udp_hash[result & (UDP_HTABLE_SIZE - 1)]; ++ list = &udp_hash[udp_hashfn(result, VEID(env))]; + if (hlist_empty(list)) { + if (result > sysctl_local_port_range[1]) + result = sysctl_local_port_range[0] + +@@ -105,7 +107,7 @@ static int udp_v6_get_port(struct sock * + result = sysctl_local_port_range[0] + + ((result - sysctl_local_port_range[0]) & + (UDP_HTABLE_SIZE - 1)); +- if (!udp_lport_inuse(result)) ++ if (!udp_lport_inuse(result, env)) + break; + } + if (i >= (1 << 16) / UDP_HTABLE_SIZE) +@@ -114,9 +116,10 @@ gotit: + udp_port_rover = snum = result; + } else { + sk_for_each(sk2, node, +- &udp_hash[snum & (UDP_HTABLE_SIZE - 1)]) { ++ &udp_hash[udp_hashfn(snum, VEID(env))]) { + if (inet_sk(sk2)->num == snum && + sk2 != sk && ++ ve_accessible_strict(VE_OWNER_SK(sk2), env) && + (!sk2->sk_bound_dev_if || + !sk->sk_bound_dev_if || + sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && +@@ -128,7 +131,7 @@ gotit: + + inet_sk(sk)->num = snum; + if (sk_unhashed(sk)) { +- sk_add_node(sk, &udp_hash[snum & (UDP_HTABLE_SIZE - 1)]); ++ sk_add_node(sk, &udp_hash[udp_hashfn(snum, VEID(env))]); + sock_prot_inc_use(sk->sk_prot); + } + write_unlock_bh(&udp_hash_lock); +@@ -161,12 +164,15 @@ static struct sock *udp_v6_lookup(struct + struct hlist_node *node; + unsigned short hnum = ntohs(dport); + int badness = -1; ++ struct ve_struct *env; + + read_lock(&udp_hash_lock); +- sk_for_each(sk, node, 
&udp_hash[hnum & (UDP_HTABLE_SIZE - 1)]) { ++ env = get_exec_env(); ++ sk_for_each(sk, node, &udp_hash[udp_hashfn(hnum, VEID(env))]) { + struct inet_sock *inet = inet_sk(sk); + +- if (inet->num == hnum && sk->sk_family == PF_INET6) { ++ if (inet->num == hnum && sk->sk_family == PF_INET6 && ++ ve_accessible_strict(VE_OWNER_SK(sk), env)) { + struct ipv6_pinfo *np = inet6_sk(sk); + int score = 0; + if (inet->dport) { +@@ -415,7 +421,8 @@ static void udpv6_mcast_deliver(struct u + int dif; + + read_lock(&udp_hash_lock); +- sk = sk_head(&udp_hash[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)]); ++ sk = sk_head(&udp_hash[udp_hashfn(ntohs(uh->dest), ++ VEID(VE_OWNER_SKB(skb)))]); + dif = skb->dev->ifindex; + sk = udp_v6_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif); + if (!sk) { +@@ -1018,7 +1025,7 @@ static int udp6_seq_show(struct seq_file + static struct file_operations udp6_seq_fops; + static struct udp_seq_afinfo udp6_seq_afinfo = { + .owner = THIS_MODULE, +- .name = "udp6", ++ .name = "net/udp6", + .family = AF_INET6, + .seq_show = udp6_seq_show, + .seq_fops = &udp6_seq_fops, +diff -upr linux-2.6.16.orig/net/ipv6/xfrm6_policy.c linux-2.6.16-026test015/net/ipv6/xfrm6_policy.c +--- linux-2.6.16.orig/net/ipv6/xfrm6_policy.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/ipv6/xfrm6_policy.c 2006-07-04 14:41:36.000000000 +0400 +@@ -191,16 +191,18 @@ error: + static inline void + _decode_session6(struct sk_buff *skb, struct flowi *fl) + { +- u16 offset = sizeof(struct ipv6hdr); ++ u16 offset = skb->h.raw - skb->nh.raw; + struct ipv6hdr *hdr = skb->nh.ipv6h; +- struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset); +- u8 nexthdr = skb->nh.ipv6h->nexthdr; ++ struct ipv6_opt_hdr *exthdr; ++ u8 nexthdr = skb->nh.raw[IP6CB(skb)->nhoff]; + + memset(fl, 0, sizeof(struct flowi)); + ipv6_addr_copy(&fl->fl6_dst, &hdr->daddr); + ipv6_addr_copy(&fl->fl6_src, &hdr->saddr); + + while (pskb_may_pull(skb, skb->nh.raw + offset + 1 - 
skb->data)) { ++ exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset); ++ + switch (nexthdr) { + case NEXTHDR_ROUTING: + case NEXTHDR_HOP: +diff -upr linux-2.6.16.orig/net/netfilter/core.c linux-2.6.16-026test015/net/netfilter/core.c +--- linux-2.6.16.orig/net/netfilter/core.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/netfilter/core.c 2006-07-04 14:41:39.000000000 +0400 +@@ -32,16 +32,24 @@ + * of skbuffs queued for userspace, and not deregister a hook unless + * this is zero, but that sucks. Now, we simply check when the + * packets come back: if the hook is gone, the packet is discarded. */ ++static DEFINE_SPINLOCK(nf_hook_lock); ++ + struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS]; + EXPORT_SYMBOL(nf_hooks); +-static DEFINE_SPINLOCK(nf_hook_lock); ++#ifdef CONFIG_VE_IPTABLES ++#define ve_nf_hooks \ ++ ((struct list_head (*)[NF_MAX_HOOKS])(get_exec_env()->_nf_hooks)) ++#else ++#define ve_nf_hooks nf_hooks ++#endif ++ + + int nf_register_hook(struct nf_hook_ops *reg) + { + struct list_head *i; + + spin_lock_bh(&nf_hook_lock); +- list_for_each(i, &nf_hooks[reg->pf][reg->hooknum]) { ++ list_for_each(i, &ve_nf_hooks[reg->pf][reg->hooknum]) { + if (reg->priority < ((struct nf_hook_ops *)i)->priority) + break; + } +@@ -53,6 +61,33 @@ int nf_register_hook(struct nf_hook_ops + } + EXPORT_SYMBOL(nf_register_hook); + ++int virt_nf_register_hook(struct nf_hook_ops *reg) ++{ ++ int ret = 0; ++ ++ if (!ve_is_super(get_exec_env())) { ++ struct nf_hook_ops *tmp; ++ ret = -ENOMEM; ++ tmp = kmalloc(sizeof(struct nf_hook_ops), GFP_KERNEL); ++ if (!tmp) ++ goto nomem; ++ memcpy(tmp, reg, sizeof(struct nf_hook_ops)); ++ reg = tmp; ++ } ++ ++ ret = nf_register_hook(reg); ++ if (ret) ++ goto out; ++ ++ return 0; ++out: ++ if (!ve_is_super(get_exec_env())) ++ kfree(reg); ++nomem: ++ return ret; ++} ++EXPORT_SYMBOL(virt_nf_register_hook); ++ + void nf_unregister_hook(struct nf_hook_ops *reg) + { + spin_lock_bh(&nf_hook_lock); +@@ -63,6 +98,29 @@ void 
nf_unregister_hook(struct nf_hook_o + } + EXPORT_SYMBOL(nf_unregister_hook); + ++int virt_nf_unregister_hook(struct nf_hook_ops *reg) ++{ ++ struct nf_hook_ops *i; ++ ++ spin_lock_bh(&nf_hook_lock); ++ list_for_each_entry(i, &ve_nf_hooks[reg->pf][reg->hooknum], list) { ++ if (reg->hook == i->hook) { ++ reg = i; ++ break; ++ } ++ } ++ spin_unlock_bh(&nf_hook_lock); ++ if (reg != i) ++ return -ENOENT; ++ ++ nf_unregister_hook(reg); ++ ++ if (!ve_is_super(get_exec_env())) ++ kfree(reg); ++ return 0; ++} ++EXPORT_SYMBOL(virt_nf_unregister_hook); ++ + unsigned int nf_iterate(struct list_head *head, + struct sk_buff **skb, + int hook, +@@ -120,9 +178,9 @@ int nf_hook_slow(int pf, unsigned int ho + /* We may already have this, but read-locks nest anyway */ + rcu_read_lock(); + +- elem = &nf_hooks[pf][hook]; ++ elem = &ve_nf_hooks[pf][hook]; + next_hook: +- verdict = nf_iterate(&nf_hooks[pf][hook], pskb, hook, indev, ++ verdict = nf_iterate(&ve_nf_hooks[pf][hook], pskb, hook, indev, + outdev, &elem, okfn, hook_thresh); + if (verdict == NF_ACCEPT || verdict == NF_STOP) { + ret = 1; +@@ -195,13 +253,54 @@ struct proc_dir_entry *proc_net_netfilte + EXPORT_SYMBOL(proc_net_netfilter); + #endif + +-void __init netfilter_init(void) ++void init_nf_hooks(struct list_head (*nh)[NF_MAX_HOOKS]) + { + int i, h; + for (i = 0; i < NPROTO; i++) { + for (h = 0; h < NF_MAX_HOOKS; h++) +- INIT_LIST_HEAD(&nf_hooks[i][h]); ++ INIT_LIST_HEAD(&ve_nf_hooks[i][h]); + } ++} ++ ++int init_netfilter(void) ++{ ++#ifdef CONFIG_VE_IPTABLES ++ struct ve_struct *envid; ++ ++ envid = get_exec_env(); ++ envid->_nf_hooks = kmalloc(sizeof(nf_hooks), GFP_KERNEL); ++ if (envid->_nf_hooks == NULL) ++ return -ENOMEM; ++ ++ /* FIXME: charge ubc */ ++ ++ init_nf_hooks(envid->_nf_hooks); ++ return 0; ++#else ++ init_nf_hooks(nf_hooks); ++ return 0; ++#endif ++} ++EXPORT_SYMBOL(init_netfilter); ++ ++#ifdef CONFIG_VE_IPTABLES ++void fini_netfilter(void) ++{ ++ struct ve_struct *envid; ++ ++ envid = get_exec_env(); ++ 
if (envid->_nf_hooks != NULL) ++ kfree(envid->_nf_hooks); ++ envid->_nf_hooks = NULL; ++ ++ /* FIXME: uncharge ubc */ ++} ++EXPORT_SYMBOL(fini_netfilter); ++#endif ++ ++void __init netfilter_init(void) ++{ ++ init_netfilter(); + + #ifdef CONFIG_PROC_FS + proc_net_netfilter = proc_mkdir("netfilter", proc_net); +@@ -214,3 +313,4 @@ void __init netfilter_init(void) + if (netfilter_log_init() < 0) + panic("cannot initialize nf_log"); + } ++ +diff -upr linux-2.6.16.orig/net/netfilter/nf_conntrack_netlink.c linux-2.6.16-026test015/net/netfilter/nf_conntrack_netlink.c +--- linux-2.6.16.orig/net/netfilter/nf_conntrack_netlink.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/netfilter/nf_conntrack_netlink.c 2006-07-04 14:41:36.000000000 +0400 +@@ -1641,7 +1641,7 @@ static void __exit ctnetlink_exit(void) + printk("ctnetlink: unregistering from nfnetlink.\n"); + + #ifdef CONFIG_NF_CONNTRACK_EVENTS +- nf_conntrack_unregister_notifier(&ctnl_notifier_exp); ++ nf_conntrack_expect_unregister_notifier(&ctnl_notifier_exp); + nf_conntrack_unregister_notifier(&ctnl_notifier); + #endif + +diff -upr linux-2.6.16.orig/net/netfilter/nf_conntrack_proto_sctp.c linux-2.6.16-026test015/net/netfilter/nf_conntrack_proto_sctp.c +--- linux-2.6.16.orig/net/netfilter/nf_conntrack_proto_sctp.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/netfilter/nf_conntrack_proto_sctp.c 2006-07-04 14:41:36.000000000 +0400 +@@ -240,12 +240,15 @@ static int do_basic_checks(struct nf_con + flag = 1; + } + +- /* Cookie Ack/Echo chunks not the first OR +- Init / Init Ack / Shutdown compl chunks not the only chunks */ +- if ((sch->type == SCTP_CID_COOKIE_ACK ++ /* ++ * Cookie Ack/Echo chunks not the first OR ++ * Init / Init Ack / Shutdown compl chunks not the only chunks ++ * OR zero-length. 
++ */ ++ if (((sch->type == SCTP_CID_COOKIE_ACK + || sch->type == SCTP_CID_COOKIE_ECHO + || flag) +- && count !=0 ) { ++ && count !=0) || !sch->length) { + DEBUGP("Basic checks failed\n"); + return 1; + } +@@ -256,7 +259,7 @@ static int do_basic_checks(struct nf_con + } + + DEBUGP("Basic checks passed\n"); +- return 0; ++ return count == 0; + } + + static int new_state(enum ip_conntrack_dir dir, +diff -upr linux-2.6.16.orig/net/netfilter/nf_queue.c linux-2.6.16-026test015/net/netfilter/nf_queue.c +--- linux-2.6.16.orig/net/netfilter/nf_queue.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/netfilter/nf_queue.c 2006-07-04 14:41:39.000000000 +0400 +@@ -209,12 +209,12 @@ void nf_reinject(struct sk_buff *skb, st + /* Drop reference to owner of hook which queued us. */ + module_put(info->elem->owner); + +- list_for_each_rcu(i, &nf_hooks[info->pf][info->hook]) { ++ list_for_each_rcu(i, &ve_nf_hooks[info->pf][info->hook]) { + if (i == elem) + break; + } + +- if (i == &nf_hooks[info->pf][info->hook]) { ++ if (i == &ve_nf_hooks[info->pf][info->hook]) { + /* The module which sent it to userspace is gone. 
*/ + NFDEBUG("%s: module disappeared, dropping packet.\n", + __FUNCTION__); +@@ -235,7 +235,7 @@ void nf_reinject(struct sk_buff *skb, st + + if (verdict == NF_ACCEPT) { + next_hook: +- verdict = nf_iterate(&nf_hooks[info->pf][info->hook], ++ verdict = nf_iterate(&ve_nf_hooks[info->pf][info->hook], + &skb, info->hook, + info->indev, info->outdev, &elem, + info->okfn, INT_MIN); +diff -upr linux-2.6.16.orig/net/netfilter/nf_sockopt.c linux-2.6.16-026test015/net/netfilter/nf_sockopt.c +--- linux-2.6.16.orig/net/netfilter/nf_sockopt.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/netfilter/nf_sockopt.c 2006-07-04 14:41:39.000000000 +0400 +@@ -80,6 +80,12 @@ static int nf_sockopt(struct sock *sk, i + struct nf_sockopt_ops *ops; + int ret; + ++#ifdef CONFIG_VE_IPTABLES ++ if (!get_exec_env()->_nf_hooks || ++ !get_exec_env()->_ipt_standard_target) ++ return -ENOPROTOOPT; ++#endif ++ + if (down_interruptible(&nf_sockopt_mutex) != 0) + return -EINTR; + +diff -upr linux-2.6.16.orig/net/netfilter/x_tables.c linux-2.6.16-026test015/net/netfilter/x_tables.c +--- linux-2.6.16.orig/net/netfilter/x_tables.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/netfilter/x_tables.c 2006-07-04 14:41:39.000000000 +0400 +@@ -24,6 +24,10 @@ + + #include <linux/netfilter/x_tables.h> + #include <linux/netfilter_arp.h> ++#include <linux/nfcalls.h> ++ ++#include <ub/beancounter.h> ++#include <ub/ub_mem.h> + + MODULE_LICENSE("GPL"); + MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); +@@ -38,7 +42,13 @@ struct xt_af { + struct list_head tables; + }; + ++#ifdef CONFIG_VE_IPTABLES ++/* include ve.h and define get_exec_env */ ++#include <linux/sched.h> ++#define xt (get_exec_env()->_xt) ++#else + static struct xt_af *xt; ++#endif + + #ifdef DEBUG_IP_FIREWALL_USER + #define duprintf(format, args...) 
printk(format , ## args) +@@ -52,17 +62,52 @@ enum { + MATCH, + }; + ++#ifdef CONFIG_USER_RESOURCE ++#define UB_NUMXTENT 23 ++static int charge_xtables(struct user_beancounter *ub, unsigned long size) ++{ ++ if (ub == NULL) ++ return 0; ++ return charge_beancounter(ub, UB_NUMXTENT, size, 1); ++} ++static void uncharge_xtables(struct user_beancounter *ub, unsigned long size) ++{ ++ if (ub == NULL) ++ return; ++ uncharge_beancounter(ub, UB_NUMXTENT, size); ++} ++#endif /* CONFIG_USER_RESOURCE */ ++ + /* Registration hooks for targets. */ + int + xt_register_target(int af, struct xt_target *target) + { + int ret; ++ struct module *mod = target->me; ++ ++ if (!ve_is_super(get_exec_env())) { ++ struct xt_target *tmp; ++ __module_get(mod); ++ ret = -ENOMEM; ++ tmp = ub_kmalloc(sizeof(struct xt_target), GFP_KERNEL); ++ if (!tmp) ++ goto nomem; ++ memcpy(tmp, target, sizeof(struct xt_target)); ++ target = tmp; ++ } + + ret = down_interruptible(&xt[af].mutex); + if (ret != 0) +- return ret; ++ goto out; + list_add(&target->list, &xt[af].target); + up(&xt[af].mutex); ++ return 0; ++out: ++ if (!ve_is_super(get_exec_env())) { ++ kfree(target); ++nomem: ++ module_put(mod); ++ } + return ret; + } + EXPORT_SYMBOL(xt_register_target); +@@ -71,8 +116,21 @@ void + xt_unregister_target(int af, struct xt_target *target) + { + down(&xt[af].mutex); ++ if (!ve_is_super(get_exec_env())) { ++ target = list_named_find(&xt[af].target, target->name); ++ if (!target) { ++ up(&xt[af].mutex); ++ return; ++ } ++ } ++ + LIST_DELETE(&xt[af].target, target); + up(&xt[af].mutex); ++ ++ if (!ve_is_super(get_exec_env())) { ++ module_put(target->me); ++ kfree(target); ++ } + } + EXPORT_SYMBOL(xt_unregister_target); + +@@ -80,14 +138,33 @@ int + xt_register_match(int af, struct xt_match *match) + { + int ret; ++ struct module *mod = match->me; ++ ++ if (!ve_is_super(get_exec_env())) { ++ struct xt_match *tmp; ++ __module_get(mod); ++ ret = -ENOMEM; ++ tmp = ub_kmalloc(sizeof(struct xt_match), 
GFP_KERNEL); ++ if (!tmp) ++ goto nomem; ++ memcpy(tmp, match, sizeof(struct xt_match)); ++ match = tmp; ++ } + + ret = down_interruptible(&xt[af].mutex); + if (ret != 0) +- return ret; ++ goto out; + + list_add(&match->list, &xt[af].match); + up(&xt[af].mutex); + ++ return 0; ++out: ++ if (!ve_is_super(get_exec_env())) { ++ kfree(match); ++nomem: ++ module_put(mod); ++ } + return ret; + } + EXPORT_SYMBOL(xt_register_match); +@@ -96,8 +173,21 @@ void + xt_unregister_match(int af, struct xt_match *match) + { + down(&xt[af].mutex); ++ if (!ve_is_super(get_exec_env())) { ++ match = list_named_find(&xt[af].match, match->name); ++ if (!match) { ++ up(&xt[af].mutex); ++ return; ++ } ++ } ++ + LIST_DELETE(&xt[af].match, match); + up(&xt[af].mutex); ++ ++ if (!ve_is_super(get_exec_env())) { ++ module_put(match->me); ++ kfree(match); ++ } + } + EXPORT_SYMBOL(xt_unregister_match); + +@@ -246,7 +336,7 @@ struct xt_table_info *xt_alloc_table_inf + if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > num_physpages) + return NULL; + +- newinfo = kzalloc(sizeof(struct xt_table_info), GFP_KERNEL); ++ newinfo = kzalloc(sizeof(struct xt_table_info), GFP_KERNEL_UBC); + if (!newinfo) + return NULL; + +@@ -255,10 +345,10 @@ struct xt_table_info *xt_alloc_table_inf + for_each_cpu(cpu) { + if (size <= PAGE_SIZE) + newinfo->entries[cpu] = kmalloc_node(size, +- GFP_KERNEL, ++ GFP_KERNEL_UBC, + cpu_to_node(cpu)); + else +- newinfo->entries[cpu] = vmalloc_node(size, ++ newinfo->entries[cpu] = ub_vmalloc_node(size, + cpu_to_node(cpu)); + + if (newinfo->entries[cpu] == NULL) { +@@ -315,6 +405,9 @@ xt_replace_table(struct xt_table *table, + int *error) + { + struct xt_table_info *oldinfo, *private; ++#ifdef CONFIG_USER_RESOURCE ++ struct user_beancounter *old_ub, *new_ub; ++#endif + + /* Do the substitution. 
*/ + write_lock_bh(&table->lock); +@@ -328,6 +421,21 @@ xt_replace_table(struct xt_table *table, + return NULL; + } + oldinfo = private; ++ ++#ifdef CONFIG_USER_RESOURCE ++ new_ub = mem_ub(newinfo); ++ if (charge_xtables(new_ub, newinfo->number)) { ++ oldinfo = NULL; ++ write_unlock_bh(&table->lock); ++ *error = -ENOMEM; ++ return NULL; ++ } ++ if (num_counters) { ++ old_ub = mem_ub(oldinfo); ++ uncharge_xtables(old_ub, oldinfo->number); ++ } ++#endif ++ + table->private = newinfo; + newinfo->initial_entries = oldinfo->initial_entries; + write_unlock_bh(&table->lock); +@@ -355,6 +463,7 @@ int xt_register_table(struct xt_table *t + + /* Simplifies replace_table code. */ + table->private = bootstrap; ++ rwlock_init(&table->lock); + if (!xt_replace_table(table, 0, newinfo, &ret)) + goto unlock; + +@@ -364,7 +473,6 @@ int xt_register_table(struct xt_table *t + /* save number of initial entries */ + private->initial_entries = private->number; + +- rwlock_init(&table->lock); + list_prepend(&xt[table->af].tables, table); + + ret = 0; +@@ -374,6 +482,39 @@ int xt_register_table(struct xt_table *t + } + EXPORT_SYMBOL_GPL(xt_register_table); + ++struct xt_table * virt_xt_register_table(struct xt_table *table, ++ struct xt_table_info *bootstrap, ++ struct xt_table_info *newinfo) ++{ ++ int ret; ++ struct module *mod = table->me; ++ ++ if (!ve_is_super(get_exec_env())) { ++ struct xt_table *tmp; ++ __module_get(mod); ++ ret = -ENOMEM; ++ tmp = ub_kmalloc(sizeof(struct xt_table), GFP_KERNEL); ++ if (!tmp) ++ goto nomem; ++ memcpy(tmp, table, sizeof(struct xt_table)); ++ table = tmp; ++ } ++ ++ ret = xt_register_table(table, bootstrap, newinfo); ++ if (ret) ++ goto out; ++ ++ return table; ++out: ++ if (!ve_is_super(get_exec_env())) { ++ kfree(table); ++nomem: ++ module_put(mod); ++ } ++ return ERR_PTR(ret); ++} ++EXPORT_SYMBOL_GPL(virt_xt_register_table); ++ + void *xt_unregister_table(struct xt_table *table) + { + struct xt_table_info *private; +@@ -383,10 +524,27 @@ void 
*xt_unregister_table(struct xt_tabl + LIST_DELETE(&xt[table->af].tables, table); + up(&xt[table->af].mutex); + ++#ifdef CONFIG_USER_RESOURCE ++ uncharge_xtables(mem_ub(private), private->number); ++#endif ++ + return private; + } + EXPORT_SYMBOL_GPL(xt_unregister_table); + ++void *virt_xt_unregister_table(struct xt_table *table) ++{ ++ void *ret; ++ ++ ret = xt_unregister_table(table); ++ if (!ve_is_super(get_exec_env())) { ++ module_put(table->me); ++ kfree(table); ++ } ++ return ret; ++} ++EXPORT_SYMBOL_GPL(virt_xt_unregister_table); ++ + #ifdef CONFIG_PROC_FS + static char *xt_proto_prefix[NPROTO] = { + [AF_INET] = "ip", +@@ -597,10 +755,13 @@ void xt_proto_fini(int af) + EXPORT_SYMBOL_GPL(xt_proto_fini); + + +-static int __init xt_init(void) ++int init_xtables(void) + { + int i; + ++ if (xt) ++ return -EEXIST; ++ + xt = kmalloc(sizeof(struct xt_af) * NPROTO, GFP_KERNEL); + if (!xt) + return -ENOMEM; +@@ -614,11 +775,34 @@ static int __init xt_init(void) + return 0; + } + +-static void __exit xt_fini(void) ++void fini_xtables(void) + { + kfree(xt); ++ xt = NULL; ++} ++ ++static int __init xt_init(void) ++{ ++ int err; ++ ++ err = init_xtables(); ++ if (err) ++ return err; ++ ++ KSYMRESOLVE(init_xtables); ++ KSYMRESOLVE(fini_xtables); ++ KSYMMODRESOLVE(x_tables); ++ return 0; ++} ++ ++static void __exit xt_fini(void) ++{ ++ KSYMMODUNRESOLVE(x_tables); ++ KSYMUNRESOLVE(init_xtables); ++ KSYMUNRESOLVE(fini_xtables); ++ fini_xtables(); + } + +-module_init(xt_init); ++subsys_initcall(xt_init); + module_exit(xt_fini); + +diff -upr linux-2.6.16.orig/net/netfilter/xt_conntrack.c linux-2.6.16-026test015/net/netfilter/xt_conntrack.c +--- linux-2.6.16.orig/net/netfilter/xt_conntrack.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/netfilter/xt_conntrack.c 2006-07-04 14:41:39.000000000 +0400 +@@ -20,6 +20,8 @@ + + #include <linux/netfilter/x_tables.h> + #include <linux/netfilter/xt_conntrack.h> ++#include <linux/netfilter_ipv4/ip_tables.h> ++#include 
<linux/nfcalls.h> + + MODULE_LICENSE("GPL"); + MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); +@@ -213,25 +215,145 @@ static int check(const char *tablename, + return 1; + } + ++#ifdef CONFIG_COMPAT ++static int compat_to_user(void *match, void **dstptr, ++ int *size, int off) ++{ ++ struct ipt_entry_match *pm; ++ struct xt_conntrack_info *pinfo; ++ struct compat_xt_conntrack_info info; ++ u_int16_t msize; ++ ++ pm = (struct ipt_entry_match *)match; ++ msize = pm->u.user.match_size; ++ if (__copy_to_user(*dstptr, pm, sizeof(struct ipt_entry_match))) ++ return -EFAULT; ++ pinfo = (struct xt_conntrack_info *)pm->data; ++ memset(&info, 0, sizeof(struct compat_xt_conntrack_info)); ++ info.statemask = pinfo->statemask; ++ info.statusmask = pinfo->statusmask; ++ memcpy(info.tuple, pinfo->tuple, IP_CT_DIR_MAX * ++ sizeof(struct ip_conntrack_tuple)); ++ memcpy(info.sipmsk, pinfo->sipmsk, ++ IP_CT_DIR_MAX * sizeof(struct in_addr)); ++ memcpy(info.dipmsk, pinfo->dipmsk, ++ IP_CT_DIR_MAX * sizeof(struct in_addr)); ++ info.expires_min = pinfo->expires_min; ++ info.expires_max = pinfo->expires_max; ++ info.flags = pinfo->flags; ++ info.invflags = pinfo->invflags; ++ if (__copy_to_user(*dstptr + sizeof(struct ipt_entry_match), ++ &info, sizeof(struct compat_xt_conntrack_info))) ++ return -EFAULT; ++ msize -= off; ++ if (put_user(msize, (u_int16_t *)*dstptr)) ++ return -EFAULT; ++ *size -= off; ++ *dstptr += msize; ++ return 0; ++} ++ ++static int compat_from_user(void *match, void **dstptr, ++ int *size, int off) ++{ ++ struct compat_ipt_entry_match *pm; ++ struct ipt_entry_match *dstpm; ++ struct compat_xt_conntrack_info *pinfo; ++ struct xt_conntrack_info info; ++ u_int16_t msize; ++ ++ pm = (struct compat_ipt_entry_match *)match; ++ dstpm = (struct ipt_entry_match *)*dstptr; ++ msize = pm->u.user.match_size; ++ memcpy(*dstptr, pm, sizeof(struct compat_ipt_entry_match)); ++ pinfo = (struct compat_xt_conntrack_info *)pm->data; ++ memset(&info, 0, sizeof(struct 
xt_conntrack_info)); ++ info.statemask = pinfo->statemask; ++ info.statusmask = pinfo->statusmask; ++ memcpy(info.tuple, pinfo->tuple, IP_CT_DIR_MAX * ++ sizeof(struct ip_conntrack_tuple)); ++ memcpy(info.sipmsk, pinfo->sipmsk, ++ IP_CT_DIR_MAX * sizeof(struct in_addr)); ++ memcpy(info.dipmsk, pinfo->dipmsk, ++ IP_CT_DIR_MAX * sizeof(struct in_addr)); ++ info.expires_min = pinfo->expires_min; ++ info.expires_max = pinfo->expires_max; ++ info.flags = pinfo->flags; ++ info.invflags = pinfo->invflags; ++ memcpy(*dstptr + sizeof(struct compat_ipt_entry_match), ++ &info, sizeof(struct xt_conntrack_info)); ++ msize += off; ++ dstpm->u.user.match_size = msize; ++ *size += off; ++ *dstptr += msize; ++ return 0; ++} ++ ++static int compat(void *match, void **dstptr, int *size, int convert) ++{ ++ int ret, off; ++ ++ off = XT_ALIGN(sizeof(struct xt_conntrack_info)) - ++ COMPAT_XT_ALIGN(sizeof(struct compat_xt_conntrack_info)); ++ switch (convert) { ++ case COMPAT_TO_USER: ++ ret = compat_to_user(match, dstptr, size, off); ++ break; ++ case COMPAT_FROM_USER: ++ ret = compat_from_user(match, dstptr, size, off); ++ break; ++ case COMPAT_CALC_SIZE: ++ *size += off; ++ ret = 0; ++ break; ++ default: ++ ret = -ENOPROTOOPT; ++ break; ++ } ++ return ret; ++} ++#endif ++ + static struct xt_match conntrack_match = { + .name = "conntrack", + .match = &match, + .checkentry = &check, ++#ifdef CONFIG_COMPAT ++ .compat = &compat, ++#endif + .me = THIS_MODULE, + }; + ++int init_xt_conntrack_match(void) ++{ ++ return xt_register_match(AF_INET, &conntrack_match); ++} ++ ++void fini_xt_conntrack_match(void) ++{ ++ xt_unregister_match(AF_INET, &conntrack_match); ++} ++ + static int __init init(void) + { + int ret; + need_conntrack(); +- ret = xt_register_match(AF_INET, &conntrack_match); +- ++ ret = init_xt_conntrack_match(); ++ if (ret < 0) ++ return ret; ++ ++ KSYMRESOLVE(init_xt_conntrack_match); ++ KSYMRESOLVE(fini_xt_conntrack_match); ++ KSYMMODRESOLVE(xt_conntrack); + return ret; + } + + 
static void __exit fini(void) + { +- xt_unregister_match(AF_INET, &conntrack_match); ++ KSYMMODUNRESOLVE(xt_conntrack); ++ KSYMUNRESOLVE(init_xt_conntrack_match); ++ KSYMUNRESOLVE(fini_xt_conntrack_match); ++ fini_xt_conntrack_match(); + } + + module_init(init); +diff -upr linux-2.6.16.orig/net/netfilter/xt_helper.c linux-2.6.16-026test015/net/netfilter/xt_helper.c +--- linux-2.6.16.orig/net/netfilter/xt_helper.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/netfilter/xt_helper.c 2006-07-04 14:41:39.000000000 +0400 +@@ -24,6 +24,8 @@ + #endif + #include <linux/netfilter/x_tables.h> + #include <linux/netfilter/xt_helper.h> ++#include <linux/netfilter_ipv4/ip_tables.h> ++#include <linux/nfcalls.h> + + MODULE_LICENSE("GPL"); + MODULE_AUTHOR("Martin Josefsson <gandalf@netfilter.org>"); +@@ -148,23 +150,107 @@ static int check(const char *tablename, + return 1; + } + ++#ifdef CONFIG_COMPAT ++static int compat_to_user(void *match, void **dstptr, ++ int *size, int off) ++{ ++ struct ipt_entry_match *pm; ++ struct xt_helper_info *pinfo; ++ struct compat_xt_helper_info info; ++ u_int16_t msize; ++ ++ pm = (struct ipt_entry_match *)match; ++ msize = pm->u.user.match_size; ++ if (__copy_to_user(*dstptr, pm, sizeof(struct ipt_entry_match))) ++ return -EFAULT; ++ pinfo = (struct xt_helper_info *)pm->data; ++ memset(&info, 0, sizeof(struct compat_xt_helper_info)); ++ info.invert = pinfo->invert; ++ memcpy(info.name, pinfo->name, 30); ++ if (__copy_to_user(*dstptr + sizeof(struct ipt_entry_match), ++ &info, sizeof(struct compat_xt_helper_info))) ++ return -EFAULT; ++ msize -= off; ++ if (put_user(msize, (u_int16_t *)*dstptr)) ++ return -EFAULT; ++ *size -= off; ++ *dstptr += msize; ++ return 0; ++} ++ ++static int compat_from_user(void *match, void **dstptr, ++ int *size, int off) ++{ ++ struct compat_ipt_entry_match *pm; ++ struct ipt_entry_match *dstpm; ++ struct compat_xt_helper_info *pinfo; ++ struct xt_helper_info info; ++ u_int16_t msize; ++ ++ pm = 
(struct compat_ipt_entry_match *)match; ++ dstpm = (struct ipt_entry_match *)*dstptr; ++ msize = pm->u.user.match_size; ++ memcpy(*dstptr, pm, sizeof(struct compat_ipt_entry_match)); ++ pinfo = (struct compat_xt_helper_info *)pm->data; ++ memset(&info, 0, sizeof(struct xt_helper_info)); ++ info.invert = pinfo->invert; ++ memcpy(info.name, pinfo->name, 30); ++ memcpy(*dstptr + sizeof(struct compat_ipt_entry_match), ++ &info, sizeof(struct xt_helper_info)); ++ msize += off; ++ dstpm->u.user.match_size = msize; ++ *size += off; ++ *dstptr += msize; ++ return 0; ++} ++ ++static int compat(void *match, void **dstptr, int *size, int convert) ++{ ++ int ret, off; ++ ++ off = XT_ALIGN(sizeof(struct xt_helper_info)) - ++ COMPAT_XT_ALIGN(sizeof(struct compat_xt_helper_info)); ++ switch (convert) { ++ case COMPAT_TO_USER: ++ ret = compat_to_user(match, dstptr, size, off); ++ break; ++ case COMPAT_FROM_USER: ++ ret = compat_from_user(match, dstptr, size, off); ++ break; ++ case COMPAT_CALC_SIZE: ++ *size += off; ++ ret = 0; ++ break; ++ default: ++ ret = -ENOPROTOOPT; ++ break; ++ } ++ return ret; ++} ++#endif ++ + static struct xt_match helper_match = { + .name = "helper", + .match = &match, + .checkentry = &check, ++#ifdef CONFIG_COMPAT ++ .compat = &compat, ++#endif + .me = THIS_MODULE, + }; + static struct xt_match helper6_match = { + .name = "helper", + .match = &match, + .checkentry = &check, ++#ifdef CONFIG_COMPAT ++ .compat = &compat, ++#endif + .me = THIS_MODULE, + }; + +-static int __init init(void) ++int init_xt_helper(void) + { + int ret; +- need_conntrack(); + + ret = xt_register_match(AF_INET, &helper_match); + if (ret < 0) +@@ -177,12 +263,35 @@ static int __init init(void) + return ret; + } + +-static void __exit fini(void) ++void fini_xt_helper(void) + { + xt_unregister_match(AF_INET, &helper_match); + xt_unregister_match(AF_INET6, &helper6_match); + } + ++static int __init init(void) ++{ ++ int err; ++ ++ need_conntrack(); ++ err = init_xt_helper(); ++ if 
(err < 0) ++ return err; ++ ++ KSYMRESOLVE(init_xt_helper); ++ KSYMRESOLVE(fini_xt_helper); ++ KSYMMODRESOLVE(xt_helper); ++ return 0; ++} ++ ++static void __exit fini(void) ++{ ++ KSYMMODUNRESOLVE(xt_helper); ++ KSYMUNRESOLVE(init_xt_helper); ++ KSYMUNRESOLVE(fini_xt_helper); ++ fini_xt_helper(); ++} ++ + module_init(init); + module_exit(fini); + +diff -upr linux-2.6.16.orig/net/netfilter/xt_length.c linux-2.6.16-026test015/net/netfilter/xt_length.c +--- linux-2.6.16.orig/net/netfilter/xt_length.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/netfilter/xt_length.c 2006-07-04 14:41:39.000000000 +0400 +@@ -13,6 +13,7 @@ + + #include <linux/netfilter/xt_length.h> + #include <linux/netfilter/x_tables.h> ++#include <linux/nfcalls.h> + + MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); + MODULE_DESCRIPTION("IP tables packet length matching module"); +@@ -63,20 +64,38 @@ checkentry(const char *tablename, + return 1; + } + ++#ifdef CONFIG_COMPAT ++static int compat(void *match, ++ void **dstptr, int *size, int convert) ++{ ++ int off; ++ ++ off = XT_ALIGN(sizeof(struct xt_length_info)) - ++ COMPAT_XT_ALIGN(sizeof(struct xt_length_info)); ++ return ipt_match_align_compat(match, dstptr, size, off, convert); ++} ++#endif ++ + static struct xt_match length_match = { + .name = "length", + .match = &match, + .checkentry = &checkentry, ++#ifdef CONFIG_COMPAT ++ .compat = &compat, ++#endif + .me = THIS_MODULE, + }; + static struct xt_match length6_match = { + .name = "length", + .match = &match6, + .checkentry = &checkentry, ++#ifdef CONFIG_COMPAT ++ .compat = &compat, ++#endif + .me = THIS_MODULE, + }; + +-static int __init init(void) ++int init_xt_length(void) + { + int ret; + ret = xt_register_match(AF_INET, &length_match); +@@ -89,11 +108,33 @@ static int __init init(void) + return ret; + } + +-static void __exit fini(void) ++void fini_xt_length(void) + { + xt_unregister_match(AF_INET, &length_match); + xt_unregister_match(AF_INET6, 
&length6_match); + } + ++static int __init init(void) ++{ ++ int err; ++ ++ err = init_xt_length(); ++ if (err < 0) ++ return err; ++ ++ KSYMRESOLVE(init_xt_length); ++ KSYMRESOLVE(fini_xt_length); ++ KSYMMODRESOLVE(xt_length); ++ return 0; ++} ++ ++static void __exit fini(void) ++{ ++ KSYMMODUNRESOLVE(xt_length); ++ KSYMUNRESOLVE(init_xt_length); ++ KSYMUNRESOLVE(fini_xt_length); ++ fini_xt_length(); ++} ++ + module_init(init); + module_exit(fini); +diff -upr linux-2.6.16.orig/net/netfilter/xt_limit.c linux-2.6.16-026test015/net/netfilter/xt_limit.c +--- linux-2.6.16.orig/net/netfilter/xt_limit.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/netfilter/xt_limit.c 2006-07-04 14:41:39.000000000 +0400 +@@ -17,9 +17,11 @@ + #include <linux/skbuff.h> + #include <linux/spinlock.h> + #include <linux/interrupt.h> ++#include <linux/nfcalls.h> + + #include <linux/netfilter/x_tables.h> + #include <linux/netfilter/xt_limit.h> ++#include <linux/netfilter_ipv4/ip_tables.h> + + MODULE_LICENSE("GPL"); + MODULE_AUTHOR("Herve Eychenne <rv@wallfire.org>"); +@@ -27,6 +29,13 @@ MODULE_DESCRIPTION("iptables rate limit + MODULE_ALIAS("ipt_limit"); + MODULE_ALIAS("ip6t_limit"); + ++#ifdef CONFIG_VE_IPTABLES ++#include <linux/sched.h> ++#define ve_ipt_limit_reg (*(get_exec_env()->_ipt_limit_reg)) ++#else ++#define ve_ipt_limit_reg ipt_limit_reg ++#endif ++ + /* The algorithm used is the Simple Token Bucket Filter (TBF) + * see net/sched/sch_tbf.c in the linux source tree + */ +@@ -137,20 +146,108 @@ ipt_limit_checkentry(const char *tablena + return 1; + } + ++#ifdef CONFIG_COMPAT ++static int ipt_limit_compat_to_user(void *match, void **dstptr, ++ int *size, int off) ++{ ++ struct ipt_entry_match *pm; ++ struct xt_rateinfo *pinfo; ++ struct compat_xt_rateinfo rinfo; ++ u_int16_t msize; ++ ++ pm = (struct ipt_entry_match *)match; ++ msize = pm->u.user.match_size; ++ if (__copy_to_user(*dstptr, pm, sizeof(struct ipt_entry_match))) ++ return -EFAULT; ++ pinfo = (struct 
xt_rateinfo *)pm->data; ++ memset(&rinfo, 0, sizeof(struct compat_xt_rateinfo)); ++ rinfo.avg = pinfo->avg; ++ rinfo.burst = pinfo->burst; ++ if (__copy_to_user(*dstptr + sizeof(struct ipt_entry_match), ++ &rinfo, sizeof(struct compat_xt_rateinfo))) ++ return -EFAULT; ++ msize -= off; ++ if (put_user(msize, (u_int16_t *)*dstptr)) ++ return -EFAULT; ++ *size -= off; ++ *dstptr += msize; ++ return 0; ++} ++ ++static int ipt_limit_compat_from_user(void *match, void **dstptr, ++ int *size, int off) ++{ ++ struct compat_ipt_entry_match *pm; ++ struct ipt_entry_match *dstpm; ++ struct compat_xt_rateinfo *pinfo; ++ struct xt_rateinfo rinfo; ++ u_int16_t msize; ++ ++ pm = (struct compat_ipt_entry_match *)match; ++ dstpm = (struct ipt_entry_match *)*dstptr; ++ msize = pm->u.user.match_size; ++ memcpy(*dstptr, pm, sizeof(struct compat_ipt_entry_match)); ++ pinfo = (struct compat_xt_rateinfo *)pm->data; ++ memset(&rinfo, 0, sizeof(struct xt_rateinfo)); ++ rinfo.avg = pinfo->avg; ++ rinfo.burst = pinfo->burst; ++ memcpy(*dstptr + sizeof(struct compat_ipt_entry_match), ++ &rinfo, sizeof(struct xt_rateinfo)); ++ msize += off; ++ dstpm->u.user.match_size = msize; ++ *size += off; ++ *dstptr += msize; ++ return 0; ++} ++ ++static int ipt_limit_compat(void *match, void **dstptr, ++ int *size, int convert) ++{ ++ int ret, off; ++ ++ off = XT_ALIGN(sizeof(struct xt_rateinfo)) - ++ COMPAT_XT_ALIGN(sizeof(struct compat_xt_rateinfo)); ++ switch (convert) { ++ case COMPAT_TO_USER: ++ ret = ipt_limit_compat_to_user(match, ++ dstptr, size, off); ++ break; ++ case COMPAT_FROM_USER: ++ ret = ipt_limit_compat_from_user(match, ++ dstptr, size, off); ++ break; ++ case COMPAT_CALC_SIZE: ++ *size += off; ++ ret = 0; ++ break; ++ default: ++ ret = -ENOPROTOOPT; ++ break; ++ } ++ return ret; ++} ++#endif ++ + static struct xt_match ipt_limit_reg = { + .name = "limit", + .match = ipt_limit_match, + .checkentry = ipt_limit_checkentry, ++#ifdef CONFIG_COMPAT ++ .compat = ipt_limit_compat, ++#endif + 
.me = THIS_MODULE, + }; + static struct xt_match limit6_reg = { + .name = "limit", + .match = ipt_limit_match, + .checkentry = ipt_limit_checkentry, ++#ifdef CONFIG_COMPAT ++ .compat = ipt_limit_compat, ++#endif + .me = THIS_MODULE, + }; + +-static int __init init(void) ++int init_xt_limit(void) + { + int ret; + +@@ -165,11 +262,33 @@ static int __init init(void) + return ret; + } + +-static void __exit fini(void) ++void fini_xt_limit(void) + { + xt_unregister_match(AF_INET, &ipt_limit_reg); + xt_unregister_match(AF_INET6, &limit6_reg); + } + ++static int __init init(void) ++{ ++ int err; ++ ++ err = init_xt_limit(); ++ if (err < 0) ++ return err; ++ ++ KSYMRESOLVE(init_xt_limit); ++ KSYMRESOLVE(fini_xt_limit); ++ KSYMMODRESOLVE(xt_limit); ++ return 0; ++} ++ ++static void __exit fini(void) ++{ ++ KSYMMODUNRESOLVE(xt_limit); ++ KSYMUNRESOLVE(init_xt_limit); ++ KSYMUNRESOLVE(fini_xt_limit); ++ fini_xt_limit(); ++} ++ + module_init(init); + module_exit(fini); +diff -upr linux-2.6.16.orig/net/netfilter/xt_sctp.c linux-2.6.16-026test015/net/netfilter/xt_sctp.c +--- linux-2.6.16.orig/net/netfilter/xt_sctp.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/netfilter/xt_sctp.c 2006-07-04 14:41:36.000000000 +0400 +@@ -62,7 +62,7 @@ match_packet(const struct sk_buff *skb, + + do { + sch = skb_header_pointer(skb, offset, sizeof(_sch), &_sch); +- if (sch == NULL) { ++ if (sch == NULL || sch->length == 0) { + duprintf("Dropping invalid SCTP packet.\n"); + *hotdrop = 1; + return 0; +diff -upr linux-2.6.16.orig/net/netfilter/xt_state.c linux-2.6.16-026test015/net/netfilter/xt_state.c +--- linux-2.6.16.orig/net/netfilter/xt_state.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/netfilter/xt_state.c 2006-07-04 14:41:39.000000000 +0400 +@@ -10,9 +10,11 @@ + + #include <linux/module.h> + #include <linux/skbuff.h> ++#include <linux/nfcalls.h> + #include <net/netfilter/nf_conntrack_compat.h> + #include <linux/netfilter/x_tables.h> + #include 
<linux/netfilter/xt_state.h> ++#include <linux/netfilter_ipv4/ip_tables.h> + + MODULE_LICENSE("GPL"); + MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>"); +@@ -55,10 +57,90 @@ static int check(const char *tablename, + return 1; + } + ++#ifdef CONFIG_COMPAT ++static int compat_to_user(void *match, void **dstptr, ++ int *size, int off) ++{ ++ struct ipt_entry_match *pm; ++ struct xt_state_info *pinfo; ++ struct compat_xt_state_info info; ++ u_int16_t msize; ++ ++ pm = (struct ipt_entry_match *)match; ++ msize = pm->u.user.match_size; ++ if (__copy_to_user(*dstptr, pm, sizeof(struct ipt_entry_match))) ++ return -EFAULT; ++ pinfo = (struct xt_state_info *)pm->data; ++ memset(&info, 0, sizeof(struct compat_xt_state_info)); ++ info.statemask = pinfo->statemask; ++ if (__copy_to_user(*dstptr + sizeof(struct ipt_entry_match), ++ &info, sizeof(struct compat_xt_state_info))) ++ return -EFAULT; ++ msize -= off; ++ if (put_user(msize, (u_int16_t *)*dstptr)) ++ return -EFAULT; ++ *size -= off; ++ *dstptr += msize; ++ return 0; ++} ++ ++static int compat_from_user(void *match, void **dstptr, ++ int *size, int off) ++{ ++ struct compat_ipt_entry_match *pm; ++ struct ipt_entry_match *dstpm; ++ struct compat_xt_state_info *pinfo; ++ struct xt_state_info info; ++ u_int16_t msize; ++ ++ pm = (struct compat_ipt_entry_match *)match; ++ dstpm = (struct ipt_entry_match *)*dstptr; ++ msize = pm->u.user.match_size; ++ memcpy(*dstptr, pm, sizeof(struct compat_ipt_entry_match)); ++ pinfo = (struct compat_xt_state_info *)pm->data; ++ memset(&info, 0, sizeof(struct xt_state_info)); ++ info.statemask = pinfo->statemask; ++ memcpy(*dstptr + sizeof(struct compat_ipt_entry_match), ++ &info, sizeof(struct xt_state_info)); ++ msize += off; ++ dstpm->u.user.match_size = msize; ++ *size += off; ++ *dstptr += msize; ++ return 0; ++} ++ ++static int compat(void *match, void **dstptr, int *size, int convert) ++{ ++ int ret, off; ++ ++ off = XT_ALIGN(sizeof(struct xt_state_info)) - ++ 
COMPAT_XT_ALIGN(sizeof(struct compat_xt_state_info)); ++ switch (convert) { ++ case COMPAT_TO_USER: ++ ret = compat_to_user(match, dstptr, size, off); ++ break; ++ case COMPAT_FROM_USER: ++ ret = compat_from_user(match, dstptr, size, off); ++ break; ++ case COMPAT_CALC_SIZE: ++ *size += off; ++ ret = 0; ++ break; ++ default: ++ ret = -ENOPROTOOPT; ++ break; ++ } ++ return ret; ++} ++#endif ++ + static struct xt_match state_match = { + .name = "state", + .match = &match, + .checkentry = &check, ++#ifdef CONFIG_COMPAT ++ .compat = &compat, ++#endif + .me = THIS_MODULE, + }; + +@@ -66,15 +148,16 @@ static struct xt_match state6_match = { + .name = "state", + .match = &match, + .checkentry = &check, ++#ifdef CONFIG_COMPAT ++ .compat = &compat, ++#endif + .me = THIS_MODULE, + }; + +-static int __init init(void) ++int init_xt_state(void) + { + int ret; + +- need_conntrack(); +- + ret = xt_register_match(AF_INET, &state_match); + if (ret < 0) + return ret; +@@ -86,11 +169,34 @@ static int __init init(void) + return ret; + } + +-static void __exit fini(void) ++void fini_xt_state(void) + { + xt_unregister_match(AF_INET, &state_match); + xt_unregister_match(AF_INET6, &state6_match); + } + ++static int __init init(void) ++{ ++ int err; ++ ++ need_conntrack(); ++ err = init_xt_state(); ++ if (err < 0) ++ return err; ++ ++ KSYMRESOLVE(init_xt_state); ++ KSYMRESOLVE(fini_xt_state); ++ KSYMMODRESOLVE(xt_state); ++ return 0; ++} ++ ++static void __exit fini(void) ++{ ++ KSYMMODUNRESOLVE(xt_state); ++ KSYMUNRESOLVE(init_xt_state); ++ KSYMUNRESOLVE(fini_xt_state); ++ fini_xt_state(); ++} ++ + module_init(init); + module_exit(fini); +diff -upr linux-2.6.16.orig/net/netfilter/xt_tcpmss.c linux-2.6.16-026test015/net/netfilter/xt_tcpmss.c +--- linux-2.6.16.orig/net/netfilter/xt_tcpmss.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/netfilter/xt_tcpmss.c 2006-07-04 14:41:39.000000000 +0400 +@@ -11,6 +11,7 @@ + #include <linux/module.h> + #include <linux/skbuff.h> + 
#include <net/tcp.h> ++#include <linux/nfcalls.h> + + #include <linux/netfilter/xt_tcpmss.h> + #include <linux/netfilter/x_tables.h> +@@ -133,10 +134,25 @@ checkentry6(const char *tablename, + return 1; + } + ++#ifdef CONFIG_COMPAT ++static int compat(void *match, ++ void **dstptr, int *size, int convert) ++{ ++ int off; ++ ++ off = XT_ALIGN(sizeof(struct xt_tcpmss_match_info)) - ++ COMPAT_XT_ALIGN(sizeof(struct xt_tcpmss_match_info)); ++ return ipt_match_align_compat(match, dstptr, size, off, convert); ++} ++#endif ++ + static struct xt_match tcpmss_match = { + .name = "tcpmss", + .match = &match, + .checkentry = &checkentry, ++#ifdef CONFIG_COMPAT ++ .compat = &compat, ++#endif + .me = THIS_MODULE, + }; + +@@ -144,11 +160,14 @@ static struct xt_match tcpmss6_match = { + .name = "tcpmss", + .match = &match, + .checkentry = &checkentry6, ++#ifdef CONFIG_COMPAT ++ .compat = &compat, ++#endif + .me = THIS_MODULE, + }; + + +-static int __init init(void) ++int init_xt_tcpmss(void) + { + int ret; + ret = xt_register_match(AF_INET, &tcpmss_match); +@@ -162,11 +181,33 @@ static int __init init(void) + return ret; + } + +-static void __exit fini(void) ++void fini_xt_tcpmss(void) + { + xt_unregister_match(AF_INET6, &tcpmss6_match); + xt_unregister_match(AF_INET, &tcpmss_match); + } + ++static int __init init(void) ++{ ++ int err; ++ ++ err = init_xt_tcpmss(); ++ if (err < 0) ++ return err; ++ ++ KSYMRESOLVE(init_xt_tcpmss); ++ KSYMRESOLVE(fini_xt_tcpmss); ++ KSYMMODRESOLVE(xt_tcpmss); ++ return 0; ++} ++ ++static void __exit fini(void) ++{ ++ KSYMMODUNRESOLVE(xt_tcpmss); ++ KSYMUNRESOLVE(init_xt_tcpmss); ++ KSYMUNRESOLVE(fini_xt_tcpmss); ++ fini_xt_tcpmss(); ++} ++ + module_init(init); + module_exit(fini); +diff -upr linux-2.6.16.orig/net/netfilter/xt_tcpudp.c linux-2.6.16-026test015/net/netfilter/xt_tcpudp.c +--- linux-2.6.16.orig/net/netfilter/xt_tcpudp.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/netfilter/xt_tcpudp.c 2006-07-04 
14:41:39.000000000 +0400 +@@ -5,6 +5,7 @@ + #include <net/ipv6.h> + #include <net/tcp.h> + #include <net/udp.h> ++#include <linux/nfcalls.h> + #include <linux/netfilter/x_tables.h> + #include <linux/netfilter/xt_tcpudp.h> + #include <linux/netfilter_ipv4/ip_tables.h> +@@ -266,10 +267,35 @@ udp6_checkentry(const char *tablename, + return 1; + } + ++#ifdef CONFIG_COMPAT ++static int tcp_compat(void *match, ++ void **dstptr, int *size, int convert) ++{ ++ int off; ++ ++ off = XT_ALIGN(sizeof(struct xt_tcp)) - ++ COMPAT_XT_ALIGN(sizeof(struct xt_tcp)); ++ return ipt_match_align_compat(match, dstptr, size, off, convert); ++} ++ ++static int udp_compat(void *match, ++ void **dstptr, int *size, int convert) ++{ ++ int off; ++ ++ off = XT_ALIGN(sizeof(struct xt_udp)) - ++ COMPAT_XT_ALIGN(sizeof(struct xt_udp)); ++ return ipt_match_align_compat(match, dstptr, size, off, convert); ++} ++#endif ++ + static struct xt_match tcp_matchstruct = { + .name = "tcp", + .match = &tcp_match, + .checkentry = &tcp_checkentry, ++#ifdef CONFIG_COMPAT ++ .compat = &tcp_compat, ++#endif + .me = THIS_MODULE, + }; + static struct xt_match tcp6_matchstruct = { +@@ -283,6 +309,9 @@ static struct xt_match udp_matchstruct = + .name = "udp", + .match = &udp_match, + .checkentry = &udp_checkentry, ++#ifdef CONFIG_COMPAT ++ .compat = &udp_compat, ++#endif + .me = THIS_MODULE, + }; + static struct xt_match udp6_matchstruct = { +@@ -292,7 +321,7 @@ static struct xt_match udp6_matchstruct + .me = THIS_MODULE, + }; + +-static int __init init(void) ++int init_xt_tcpudp(void) + { + int ret; + ret = xt_register_match(AF_INET, &tcp_matchstruct); +@@ -322,7 +351,7 @@ out_unreg_tcp: + return ret; + } + +-static void __exit fini(void) ++void fini_xt_tcpudp(void) + { + xt_unregister_match(AF_INET6, &udp6_matchstruct); + xt_unregister_match(AF_INET, &udp_matchstruct); +@@ -330,5 +359,27 @@ static void __exit fini(void) + xt_unregister_match(AF_INET, &tcp_matchstruct); + } + ++static int __init init(void) ++{ ++ 
int err; ++ ++ err = init_xt_tcpudp(); ++ if (err < 0) ++ return err; ++ ++ KSYMRESOLVE(init_xt_tcpudp); ++ KSYMRESOLVE(fini_xt_tcpudp); ++ KSYMMODRESOLVE(xt_tcpudp); ++ return 0; ++} ++ ++static void __exit fini(void) ++{ ++ KSYMMODUNRESOLVE(xt_tcpudp); ++ KSYMUNRESOLVE(init_xt_tcpudp); ++ KSYMUNRESOLVE(fini_xt_tcpudp); ++ fini_xt_tcpudp(); ++} ++ + module_init(init); + module_exit(fini); +diff -upr linux-2.6.16.orig/net/netlink/af_netlink.c linux-2.6.16-026test015/net/netlink/af_netlink.c +--- linux-2.6.16.orig/net/netlink/af_netlink.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/netlink/af_netlink.c 2006-07-04 14:41:39.000000000 +0400 +@@ -60,27 +60,14 @@ + #include <net/sock.h> + #include <net/scm.h> + #include <net/netlink.h> ++#include <net/netlink_sock.h> ++ ++#include <ub/beancounter.h> ++#include <ub/ub_net.h> + + #define Nprintk(a...) + #define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) + +-struct netlink_sock { +- /* struct sock has to be the first member of netlink_sock */ +- struct sock sk; +- u32 pid; +- u32 dst_pid; +- u32 dst_group; +- u32 flags; +- u32 subscriptions; +- u32 ngroups; +- unsigned long *groups; +- unsigned long state; +- wait_queue_head_t wait; +- struct netlink_callback *cb; +- spinlock_t cb_lock; +- void (*data_ready)(struct sock *sk, int bytes); +- struct module *module; +-}; + + #define NETLINK_KERNEL_SOCKET 0x1 + #define NETLINK_RECV_PKTINFO 0x2 +@@ -209,7 +196,10 @@ static __inline__ struct sock *netlink_l + read_lock(&nl_table_lock); + head = nl_pid_hashfn(hash, pid); + sk_for_each(sk, node, head) { +- if (nlk_sk(sk)->pid == pid) { ++ /* VEs should find sockets, created by kernel */ ++ if ((nlk_sk(sk)->pid == pid) && ++ (!pid || ve_accessible_strict(VE_OWNER_SK(sk), ++ get_exec_env()))){ + sock_hold(sk); + goto found; + } +@@ -309,7 +299,9 @@ static int netlink_insert(struct sock *s + head = nl_pid_hashfn(hash, pid); + len = 0; + sk_for_each(osk, node, head) { +- if (nlk_sk(osk)->pid == pid) ++ 
if ((nlk_sk(sk)->pid == pid) && ++ ve_accessible_strict(VE_OWNER_SK(sk), ++ get_exec_env())) + break; + len++; + } +@@ -362,6 +354,8 @@ static int __netlink_create(struct socke + sk = sk_alloc(PF_NETLINK, GFP_KERNEL, &netlink_proto, 1); + if (!sk) + return -ENOMEM; ++ if (ub_other_sock_charge(sk)) ++ goto out_free; + + sock_init_data(sock, sk); + +@@ -372,6 +366,10 @@ static int __netlink_create(struct socke + sk->sk_destruct = netlink_sock_destruct; + sk->sk_protocol = protocol; + return 0; ++ ++out_free: ++ sk_free(sk); ++ return -ENOMEM; + } + + static int netlink_create(struct socket *sock, int protocol) +@@ -425,6 +423,7 @@ static int netlink_release(struct socket + return 0; + + netlink_remove(sk); ++ sock_orphan(sk); + nlk = nlk_sk(sk); + + spin_lock(&nlk->cb_lock); +@@ -439,7 +438,6 @@ static int netlink_release(struct socket + /* OK. Socket is unlinked, and, therefore, + no new packets will arrive */ + +- sock_orphan(sk); + sock->sk = NULL; + wake_up_interruptible_all(&nlk->wait); + +@@ -477,7 +475,7 @@ static int netlink_autobind(struct socke + struct hlist_head *head; + struct sock *osk; + struct hlist_node *node; +- s32 pid = current->tgid; ++ s32 pid = virt_pid(current); + int err; + static s32 rover = -4097; + +@@ -486,7 +484,9 @@ retry: + netlink_table_grab(); + head = nl_pid_hashfn(hash, pid); + sk_for_each(osk, node, head) { +- if (nlk_sk(osk)->pid == pid) { ++ if ((nlk_sk(osk)->pid == pid) && ++ ve_accessible_strict(VE_OWNER_SK(osk), ++ get_exec_env())) { + /* Bind collision, search negative pid values. 
*/ + pid = rover--; + if (rover > -4097) +@@ -511,7 +511,7 @@ retry: + static inline int netlink_capable(struct socket *sock, unsigned int flag) + { + return (nl_table[sock->sk->sk_protocol].nl_nonroot & flag) || +- capable(CAP_NET_ADMIN); ++ capable(CAP_VE_NET_ADMIN); + } + + static void +@@ -845,6 +845,9 @@ static inline int do_one_broadcast(struc + !test_bit(p->group - 1, nlk->groups)) + goto out; + ++ if (!ve_accessible_strict(get_exec_env(), VE_OWNER_SK(sk))) ++ goto out; ++ + if (p->failure) { + netlink_overrun(sk); + goto out; +@@ -942,6 +945,9 @@ static inline int do_one_set_err(struct + !test_bit(p->group - 1, nlk->groups)) + goto out; + ++ if (!ve_accessible_strict(get_exec_env(), VE_OWNER_SK(sk))) ++ goto out; ++ + sk->sk_err = p->code; + sk->sk_error_report(sk); + out: +@@ -1076,12 +1082,17 @@ static int netlink_sendmsg(struct kiocb + struct sock_iocb *siocb = kiocb_to_siocb(kiocb); + struct sock *sk = sock->sk; + struct netlink_sock *nlk = nlk_sk(sk); +- struct sockaddr_nl *addr=msg->msg_name; ++ struct sockaddr_nl *addr = msg->msg_name; + u32 dst_pid; +- u32 dst_group; + struct sk_buff *skb; + int err; + struct scm_cookie scm; ++ struct sock *dstsk; ++ long timeo; ++ int no_ubc, no_buf; ++ unsigned long chargesize; ++ ++ DECLARE_WAITQUEUE(wait, current); + + if (msg->msg_flags&MSG_OOB) + return -EOPNOTSUPP; +@@ -1092,17 +1103,16 @@ static int netlink_sendmsg(struct kiocb + if (err < 0) + return err; + ++ /* Broadcasts from user to kernel are disabled. 
This is OK ++ * according to ANK */ + if (msg->msg_namelen) { + if (addr->nl_family != AF_NETLINK) + return -EINVAL; + dst_pid = addr->nl_pid; +- dst_group = ffs(addr->nl_groups); +- if (dst_group && !netlink_capable(sock, NL_NONROOT_SEND)) ++ if (addr->nl_groups && !netlink_capable(sock, NL_NONROOT_SEND)) + return -EPERM; +- } else { ++ } else + dst_pid = nlk->dst_pid; +- dst_group = nlk->dst_group; +- } + + if (!nlk->pid) { + err = netlink_autobind(sock); +@@ -1115,12 +1125,12 @@ static int netlink_sendmsg(struct kiocb + goto out; + err = -ENOBUFS; + skb = alloc_skb(len, GFP_KERNEL); +- if (skb==NULL) ++ if (skb == NULL) + goto out; + + NETLINK_CB(skb).pid = nlk->pid; + NETLINK_CB(skb).dst_pid = dst_pid; +- NETLINK_CB(skb).dst_group = dst_group; ++ NETLINK_CB(skb).dst_group = 0; + NETLINK_CB(skb).loginuid = audit_get_loginuid(current->audit_context); + memcpy(NETLINK_CREDS(skb), &siocb->scm->creds, sizeof(struct ucred)); + +@@ -1131,25 +1141,88 @@ static int netlink_sendmsg(struct kiocb + */ + + err = -EFAULT; +- if (memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len)) { +- kfree_skb(skb); +- goto out; +- } ++ if (memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len)) ++ goto out_free; + + err = security_netlink_send(sk, skb); +- if (err) { +- kfree_skb(skb); +- goto out; ++ if (err) ++ goto out_free; ++ ++ timeo = sock_sndtimeo(sk, msg->msg_flags&MSG_DONTWAIT); ++retry: ++ dstsk = netlink_getsockbypid(sk, dst_pid); ++ if (IS_ERR(dstsk)) { ++ err = PTR_ERR(dstsk); ++ goto out_free; + } + +- if (dst_group) { +- atomic_inc(&skb->users); +- netlink_broadcast(sk, skb, dst_pid, dst_group, GFP_KERNEL); ++ nlk = nlk_sk(dstsk); ++#ifdef NL_EMULATE_DEV ++ if (nlk->handler) { ++ skb_orphan(skb); ++ err = nlk->handler(protocol, skb); ++ goto out_put; ++ } ++#endif ++ ++ /* BTW, it could be done once, before the retry loop */ ++ chargesize = skb_charge_fullsize(skb); ++ no_ubc = ub_sock_getwres_other(sk, chargesize); ++ no_buf = atomic_read(&dstsk->sk_rmem_alloc) > 
dstsk->sk_rcvbuf || ++ test_bit(0, &nlk->state); ++ if (no_ubc || no_buf) { ++ wait_queue_head_t *sleep; ++ ++ if (!no_ubc) ++ ub_sock_retwres_other(sk, chargesize, ++ SOCK_MIN_UBCSPACE_CH); ++ err = -EAGAIN; ++ if (timeo == 0) { ++ kfree_skb(skb); ++ goto out_put; ++ } ++ ++ /* wake up comes to different queues */ ++ sleep = no_ubc ? sk->sk_sleep : &nlk->wait; ++ __set_current_state(TASK_INTERRUPTIBLE); ++ add_wait_queue(sleep, &wait); ++ ++ /* this if can't be moved upper because ub_sock_snd_queue_add() ++ * may change task state to TASK_RUNNING */ ++ if (no_ubc) ++ ub_sock_sndqueueadd_other(sk, chargesize); ++ ++ if ((atomic_read(&dstsk->sk_rmem_alloc) > dstsk->sk_rcvbuf || ++ test_bit(0, &nlk->state) || no_ubc) && ++ !sock_flag(dstsk, SOCK_DEAD)) ++ timeo = schedule_timeout(timeo); ++ ++ __set_current_state(TASK_RUNNING); ++ remove_wait_queue(sleep, &wait); ++ if (no_ubc) ++ ub_sock_sndqueuedel(sk); ++ sock_put(dstsk); ++ ++ if (!signal_pending(current)) ++ goto retry; ++ err = sock_intr_errno(timeo); ++ goto out_free; + } +- err = netlink_unicast(sk, skb, dst_pid, msg->msg_flags&MSG_DONTWAIT); + ++ skb_orphan(skb); ++ skb_set_owner_r(skb, dstsk); ++ ub_skb_set_charge(skb, sk, chargesize, UB_OTHERSOCKBUF); ++ skb_queue_tail(&dstsk->sk_receive_queue, skb); ++ dstsk->sk_data_ready(dstsk, len); ++ err = len; ++out_put: ++ sock_put(dstsk); + out: + return err; ++ ++out_free: ++ kfree_skb(skb); ++ return err; + } + + static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock, +@@ -1303,6 +1376,10 @@ static int netlink_dump(struct sock *sk) + skb = sock_rmalloc(sk, NLMSG_GOODSIZE, 0, GFP_KERNEL); + if (!skb) + return -ENOBUFS; ++ if (ub_nlrcvbuf_charge(skb, sk) < 0) { ++ kfree_skb(skb); ++ return -EACCES; ++ } + + spin_lock(&nlk->cb_lock); + +@@ -1365,9 +1442,9 @@ int netlink_dump_start(struct sock *ssk, + return -ECONNREFUSED; + } + nlk = nlk_sk(sk); +- /* A dump is in progress... */ ++ /* A dump or destruction is in progress... 
*/ + spin_lock(&nlk->cb_lock); +- if (nlk->cb) { ++ if (nlk->cb || sock_flag(sk, SOCK_DEAD)) { + spin_unlock(&nlk->cb_lock); + netlink_destroy_callback(cb); + sock_put(sk); +@@ -1471,8 +1548,15 @@ void netlink_run_queue(struct sock *sk, + *qlen = skb_queue_len(&sk->sk_receive_queue); + + for (; *qlen; (*qlen)--) { ++ int ret; ++ struct ve_struct *old_env; + skb = skb_dequeue(&sk->sk_receive_queue); +- if (netlink_rcv_skb(skb, cb)) { ++ ++ old_env = set_exec_env(VE_OWNER_SKB(skb)); ++ ret = netlink_rcv_skb(skb, cb); ++ (void)set_exec_env(old_env); ++ ++ if (ret) { + if (skb->len) + skb_queue_head(&sk->sk_receive_queue, skb); + else { +@@ -1740,6 +1824,7 @@ enomem: + + sock_register(&netlink_family_ops); + #ifdef CONFIG_PROC_FS ++ /* FIXME: virtualize before give access from VEs */ + proc_net_fops_create("netlink", 0, &netlink_seq_fops); + #endif + /* The netlink device handler may be needed early. */ +diff -upr linux-2.6.16.orig/net/packet/af_packet.c linux-2.6.16-026test015/net/packet/af_packet.c +--- linux-2.6.16.orig/net/packet/af_packet.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/packet/af_packet.c 2006-07-04 14:41:38.000000000 +0400 +@@ -79,6 +79,8 @@ + #include <linux/module.h> + #include <linux/init.h> + ++#include <ub/ub_net.h> ++ + #ifdef CONFIG_INET + #include <net/inet_common.h> + #endif +@@ -280,7 +282,8 @@ static int packet_rcv_spkt(struct sk_buf + * so that this procedure is noop. 
+ */ + +- if (skb->pkt_type == PACKET_LOOPBACK) ++ if (skb->pkt_type == PACKET_LOOPBACK || ++ !ve_accessible(VE_OWNER_SKB(skb), VE_OWNER_SK(sk))) + goto out; + + if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) +@@ -472,6 +475,9 @@ static int packet_rcv(struct sk_buff *sk + sk = pt->af_packet_priv; + po = pkt_sk(sk); + ++ if (!ve_accessible(VE_OWNER_SKB(skb), VE_OWNER_SK(sk))) ++ goto drop; ++ + skb->dev = dev; + + if (dev->hard_header) { +@@ -531,6 +537,9 @@ static int packet_rcv(struct sk_buff *sk + if (pskb_trim(skb, snaplen)) + goto drop_n_acct; + ++ if (ub_sockrcvbuf_charge(sk, skb)) ++ goto drop_n_acct; ++ + skb_set_owner_r(skb, sk); + skb->dev = NULL; + dst_release(skb->dst); +@@ -581,6 +590,9 @@ static int tpacket_rcv(struct sk_buff *s + sk = pt->af_packet_priv; + po = pkt_sk(sk); + ++ if (!ve_accessible(VE_OWNER_SKB(skb), VE_OWNER_SK(sk))) ++ goto drop; ++ + if (dev->hard_header) { + if (sk->sk_type != SOCK_DGRAM) + skb_push(skb, skb->data - skb->mac.raw); +@@ -630,6 +642,12 @@ static int tpacket_rcv(struct sk_buff *s + if (snaplen > skb->len-skb->data_len) + snaplen = skb->len-skb->data_len; + ++ if (copy_skb && ++ ub_sockrcvbuf_charge(sk, copy_skb)) { ++ spin_lock(&sk->sk_receive_queue.lock); ++ goto ring_is_full; ++ } ++ + spin_lock(&sk->sk_receive_queue.lock); + h = (struct tpacket_hdr *)packet_lookup_frame(po, po->head); + +@@ -1010,6 +1028,8 @@ static int packet_create(struct socket * + sk = sk_alloc(PF_PACKET, GFP_KERNEL, &packet_proto, 1); + if (sk == NULL) + goto out; ++ if (ub_other_sock_charge(sk)) ++ goto out_free; + + sock->ops = &packet_ops; + #ifdef CONFIG_SOCK_PACKET +@@ -1048,6 +1068,9 @@ static int packet_create(struct socket * + sk_add_node(sk, &packet_sklist); + write_unlock_bh(&packet_sklist_lock); + return(0); ++ ++out_free: ++ sk_free(sk); + out: + return err; + } +@@ -1430,11 +1453,16 @@ static int packet_notifier(struct notifi + struct sock *sk; + struct hlist_node *node; + struct net_device *dev = (struct net_device*)data; ++ 
struct ve_struct *ve; + ++ ve = get_exec_env(); + read_lock(&packet_sklist_lock); + sk_for_each(sk, node, &packet_sklist) { + struct packet_sock *po = pkt_sk(sk); + ++ if (!ve_accessible_strict(VE_OWNER_SK(sk), ve)) ++ continue; ++ + switch (msg) { + case NETDEV_UNREGISTER: + #ifdef CONFIG_PACKET_MULTICAST +@@ -1845,6 +1873,8 @@ static inline struct sock *packet_seq_id + struct hlist_node *node; + + sk_for_each(s, node, &packet_sklist) { ++ if (!ve_accessible(VE_OWNER_SK(s), get_exec_env())) ++ continue; + if (!off--) + return s; + } +@@ -1860,9 +1890,13 @@ static void *packet_seq_start(struct seq + static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos) + { + ++*pos; +- return (v == SEQ_START_TOKEN) +- ? sk_head(&packet_sklist) +- : sk_next((struct sock*)v) ; ++ do { ++ v = (v == SEQ_START_TOKEN) ++ ? sk_head(&packet_sklist) ++ : sk_next((struct sock*)v); ++ } while (v != NULL && ++ !ve_accessible(VE_OWNER_SK((struct sock*)v), get_exec_env())); ++ return v; + } + + static void packet_seq_stop(struct seq_file *seq, void *v) +@@ -1918,7 +1952,7 @@ static struct file_operations packet_seq + + static void __exit packet_exit(void) + { +- proc_net_remove("packet"); ++ remove_proc_glob_entry("net/packet", NULL); + unregister_netdevice_notifier(&packet_netdev_notifier); + sock_unregister(PF_PACKET); + proto_unregister(&packet_proto); +@@ -1933,7 +1967,7 @@ static int __init packet_init(void) + + sock_register(&packet_family_ops); + register_netdevice_notifier(&packet_netdev_notifier); +- proc_net_fops_create("packet", 0, &packet_seq_fops); ++ proc_glob_fops_create("net/packet", 0, &packet_seq_fops); + out: + return rc; + } +diff -upr linux-2.6.16.orig/net/sched/sch_cbq.c linux-2.6.16-026test015/net/sched/sch_cbq.c +--- linux-2.6.16.orig/net/sched/sch_cbq.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/sched/sch_cbq.c 2006-07-04 14:41:37.000000000 +0400 +@@ -932,8 +932,8 @@ cbq_dequeue_prio(struct Qdisc *sch, int + + if (cl->deficit 
<= 0) { + q->active[prio] = cl; +- cl = cl->next_alive; + cl->deficit += cl->quantum; ++ cl = cl->next_alive; + } + return skb; + +@@ -1109,17 +1109,19 @@ static void cbq_normalize_quanta(struct + + for (h=0; h<16; h++) { + for (cl = q->classes[h]; cl; cl = cl->next) { ++ long mtu; + /* BUGGGG... Beware! This expression suffer of + arithmetic overflows! + */ + if (cl->priority == prio) { +- cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/ +- q->quanta[prio]; +- } +- if (cl->quantum <= 0 || cl->quantum>32*cl->qdisc->dev->mtu) { +- printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->classid, cl->quantum); +- cl->quantum = cl->qdisc->dev->mtu/2 + 1; ++ cl->quantum = (cl->weight * cl->allot) / ++ (q->quanta[prio] / q->nclasses[prio]); + } ++ mtu = cl->qdisc->dev->mtu; ++ if (cl->quantum <= mtu/2) ++ cl->quantum = mtu/2 + 1; ++ else if (cl->quantum > 32*mtu) ++ cl->quantum = 32*mtu; + } + } + } +diff -upr linux-2.6.16.orig/net/sched/sch_generic.c linux-2.6.16-026test015/net/sched/sch_generic.c +--- linux-2.6.16.orig/net/sched/sch_generic.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/sched/sch_generic.c 2006-07-04 14:41:38.000000000 +0400 +@@ -97,6 +97,7 @@ int qdisc_restart(struct net_device *dev + + /* Dequeue packet */ + if ((skb = q->dequeue(q)) != NULL) { ++ struct ve_struct *envid; + unsigned nolock = (dev->features & NETIF_F_LLTX); + /* + * When the driver has LLTX set it does its own locking +@@ -107,6 +108,7 @@ int qdisc_restart(struct net_device *dev + * of lock congestion it should return -1 and the packet + * will be requeued. 
+ */ ++ envid = set_exec_env(VE_OWNER_SKB(skb)); + if (!nolock) { + if (!spin_trylock(&dev->xmit_lock)) { + collision: +@@ -121,6 +123,7 @@ int qdisc_restart(struct net_device *dev + kfree_skb(skb); + if (net_ratelimit()) + printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name); ++ (void)set_exec_env(envid); + return -1; + } + __get_cpu_var(netdev_rx_stat).cpu_collision++; +@@ -146,6 +149,7 @@ int qdisc_restart(struct net_device *dev + spin_unlock(&dev->xmit_lock); + } + spin_lock(&dev->queue_lock); ++ (void)set_exec_env(envid); + return -1; + } + if (ret == NETDEV_TX_LOCKED && nolock) { +@@ -177,6 +181,7 @@ int qdisc_restart(struct net_device *dev + requeue: + q->ops->requeue(skb, q); + netif_schedule(dev); ++ (void)set_exec_env(envid); + return 1; + } + BUG_ON((int) q->q.qlen < 0); +@@ -625,3 +630,4 @@ EXPORT_SYMBOL(qdisc_reset); + EXPORT_SYMBOL(qdisc_restart); + EXPORT_SYMBOL(qdisc_lock_tree); + EXPORT_SYMBOL(qdisc_unlock_tree); ++EXPORT_SYMBOL(dev_shutdown); +diff -upr linux-2.6.16.orig/net/sched/sch_teql.c linux-2.6.16-026test015/net/sched/sch_teql.c +--- linux-2.6.16.orig/net/sched/sch_teql.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/sched/sch_teql.c 2006-07-04 14:41:38.000000000 +0400 +@@ -189,6 +189,9 @@ static int teql_qdisc_init(struct Qdisc + struct teql_master *m = (struct teql_master*)sch->ops; + struct teql_sched_data *q = qdisc_priv(sch); + ++ if (!capable(CAP_NET_ADMIN)) ++ return -EPERM; ++ + if (dev->hard_header_len > m->dev->hard_header_len) + return -EINVAL; + +diff -upr linux-2.6.16.orig/net/sctp/inqueue.c linux-2.6.16-026test015/net/sctp/inqueue.c +--- linux-2.6.16.orig/net/sctp/inqueue.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/sctp/inqueue.c 2006-07-04 14:41:36.000000000 +0400 +@@ -149,6 +149,7 @@ struct sctp_chunk *sctp_inq_pop(struct s + /* This is the first chunk in the packet. 
*/ + chunk->singleton = 1; + ch = (sctp_chunkhdr_t *) chunk->skb->data; ++ chunk->data_accepted = 0; + } + + chunk->chunk_hdr = ch; +diff -upr linux-2.6.16.orig/net/sctp/sm_statefuns.c linux-2.6.16-026test015/net/sctp/sm_statefuns.c +--- linux-2.6.16.orig/net/sctp/sm_statefuns.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/sctp/sm_statefuns.c 2006-07-04 14:41:36.000000000 +0400 +@@ -636,8 +636,9 @@ sctp_disposition_t sctp_sf_do_5_1D_ce(co + */ + chunk->subh.cookie_hdr = + (struct sctp_signed_cookie *)chunk->skb->data; +- skb_pull(chunk->skb, +- ntohs(chunk->chunk_hdr->length) - sizeof(sctp_chunkhdr_t)); ++ if (!pskb_pull(chunk->skb, ntohs(chunk->chunk_hdr->length) - ++ sizeof(sctp_chunkhdr_t))) ++ goto nomem; + + /* 5.1 D) Upon reception of the COOKIE ECHO chunk, Endpoint + * "Z" will reply with a COOKIE ACK chunk after building a TCB +@@ -965,7 +966,8 @@ sctp_disposition_t sctp_sf_beat_8_3(cons + */ + chunk->subh.hb_hdr = (sctp_heartbeathdr_t *) chunk->skb->data; + paylen = ntohs(chunk->chunk_hdr->length) - sizeof(sctp_chunkhdr_t); +- skb_pull(chunk->skb, paylen); ++ if (!pskb_pull(chunk->skb, paylen)) ++ goto nomem; + + reply = sctp_make_heartbeat_ack(asoc, chunk, + chunk->subh.hb_hdr, paylen); +@@ -1028,6 +1030,12 @@ sctp_disposition_t sctp_sf_backbeat_8_3( + commands); + + hbinfo = (sctp_sender_hb_info_t *) chunk->skb->data; ++ /* Make sure that the length of the parameter is what we expect */ ++ if (ntohs(hbinfo->param_hdr.length) != ++ sizeof(sctp_sender_hb_info_t)) { ++ return SCTP_DISPOSITION_DISCARD; ++ } ++ + from_addr = hbinfo->daddr; + link = sctp_assoc_lookup_paddr(asoc, &from_addr); + +@@ -1860,8 +1868,9 @@ sctp_disposition_t sctp_sf_do_5_2_4_dupc + * are in good shape. 
+ */ + chunk->subh.cookie_hdr = (struct sctp_signed_cookie *)chunk->skb->data; +- skb_pull(chunk->skb, ntohs(chunk->chunk_hdr->length) - +- sizeof(sctp_chunkhdr_t)); ++ if (!pskb_pull(chunk->skb, ntohs(chunk->chunk_hdr->length) - ++ sizeof(sctp_chunkhdr_t))) ++ goto nomem; + + /* In RFC 2960 5.2.4 3, if both Verification Tags in the State Cookie + * of a duplicate COOKIE ECHO match the Verification Tags of the +@@ -5151,7 +5160,9 @@ static int sctp_eat_data(const struct sc + int tmp; + __u32 tsn; + int account_value; ++ struct sctp_tsnmap *map = (struct sctp_tsnmap *)&asoc->peer.tsn_map; + struct sock *sk = asoc->base.sk; ++ int rcvbuf_over = 0; + + data_hdr = chunk->subh.data_hdr = (sctp_datahdr_t *)chunk->skb->data; + skb_pull(chunk->skb, sizeof(sctp_datahdr_t)); +@@ -5162,10 +5173,16 @@ static int sctp_eat_data(const struct sc + /* ASSERT: Now skb->data is really the user data. */ + + /* +- * if we are established, and we have used up our receive +- * buffer memory, drop the frame ++ * If we are established, and we have used up our receive buffer ++ * memory, think about droping the frame. ++ * Note that we have an opportunity to improve performance here. ++ * If we accept one chunk from an skbuff, we have to keep all the ++ * memory of that skbuff around until the chunk is read into user ++ * space. Therefore, once we accept 1 chunk we may as well accept all ++ * remaining chunks in the skbuff. The data_accepted flag helps us do ++ * that. 
+ */ +- if (asoc->state == SCTP_STATE_ESTABLISHED) { ++ if ((asoc->state == SCTP_STATE_ESTABLISHED) && (!chunk->data_accepted)) { + /* + * If the receive buffer policy is 1, then each + * association can allocate up to sk_rcvbuf bytes +@@ -5176,9 +5193,25 @@ static int sctp_eat_data(const struct sc + account_value = atomic_read(&asoc->rmem_alloc); + else + account_value = atomic_read(&sk->sk_rmem_alloc); +- +- if (account_value > sk->sk_rcvbuf) +- return SCTP_IERROR_IGNORE_TSN; ++ if (account_value > sk->sk_rcvbuf) { ++ /* ++ * We need to make forward progress, even when we are ++ * under memory pressure, so we always allow the ++ * next tsn after the ctsn ack point to be accepted. ++ * This lets us avoid deadlocks in which we have to ++ * drop frames that would otherwise let us drain the ++ * receive queue. ++ */ ++ if ((sctp_tsnmap_get_ctsn(map) + 1) != tsn) ++ return SCTP_IERROR_IGNORE_TSN; ++ ++ /* ++ * We're going to accept the frame but we should renege ++ * to make space for it. This will send us down that ++ * path later in this function. ++ */ ++ rcvbuf_over = 1; ++ } + } + + /* Process ECN based congestion. +@@ -5226,6 +5259,7 @@ static int sctp_eat_data(const struct sc + datalen -= sizeof(sctp_data_chunk_t); + + deliver = SCTP_CMD_CHUNK_ULP; ++ chunk->data_accepted = 1; + + /* Think about partial delivery. */ + if ((datalen >= asoc->rwnd) && (!asoc->ulpq.pd_mode)) { +@@ -5242,7 +5276,8 @@ static int sctp_eat_data(const struct sc + * large spill over. + */ + if (!asoc->rwnd || asoc->rwnd_over || +- (datalen > asoc->rwnd + asoc->frag_point)) { ++ (datalen > asoc->rwnd + asoc->frag_point) || ++ rcvbuf_over) { + + /* If this is the next TSN, consider reneging to make + * room. Note: Playing nice with a confused sender. A +@@ -5250,8 +5285,8 @@ static int sctp_eat_data(const struct sc + * space and in the future we may want to detect and + * do more drastic reneging. 
+ */ +- if (sctp_tsnmap_has_gap(&asoc->peer.tsn_map) && +- (sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map) + 1) == tsn) { ++ if (sctp_tsnmap_has_gap(map) && ++ (sctp_tsnmap_get_ctsn(map) + 1) == tsn) { + SCTP_DEBUG_PRINTK("Reneging for tsn:%u\n", tsn); + deliver = SCTP_CMD_RENEGE; + } else { +diff -upr linux-2.6.16.orig/net/sctp/sm_statetable.c linux-2.6.16-026test015/net/sctp/sm_statetable.c +--- linux-2.6.16.orig/net/sctp/sm_statetable.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/sctp/sm_statetable.c 2006-07-04 14:41:36.000000000 +0400 +@@ -366,9 +366,9 @@ const sctp_sm_table_entry_t *sctp_sm_loo + /* SCTP_STATE_EMPTY */ \ + {.fn = sctp_sf_ootb, .name = "sctp_sf_ootb"}, \ + /* SCTP_STATE_CLOSED */ \ +- {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \ ++ {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_COOKIE_WAIT */ \ +- {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \ ++ {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + {.fn = sctp_sf_do_ecne, .name = "sctp_sf_do_ecne"}, \ + /* SCTP_STATE_ESTABLISHED */ \ +@@ -380,7 +380,7 @@ const sctp_sm_table_entry_t *sctp_sm_loo + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + {.fn = sctp_sf_do_ecne, .name = "sctp_sf_do_ecne"}, \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ +- {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \ ++ {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + } /* TYPE_SCTP_ECN_ECNE */ + + #define TYPE_SCTP_ECN_CWR { \ +@@ -401,7 +401,7 @@ const sctp_sm_table_entry_t *sctp_sm_loo + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ +- {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \ ++ {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + } /* TYPE_SCTP_ECN_CWR */ + + #define TYPE_SCTP_SHUTDOWN_COMPLETE { \ +@@ -647,7 +647,7 @@ chunk_event_table_unknown[SCTP_STATE_NUM + /* SCTP_STATE_EMPTY */ \ + {.fn = 
sctp_sf_bug, .name = "sctp_sf_bug"}, \ + /* SCTP_STATE_CLOSED */ \ +- {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \ ++ {.fn = sctp_sf_error_closed, .name = "sctp_sf_error_closed"}, \ + /* SCTP_STATE_COOKIE_WAIT */ \ + {.fn = sctp_sf_do_prm_requestheartbeat, \ + .name = "sctp_sf_do_prm_requestheartbeat"}, \ +diff -upr linux-2.6.16.orig/net/sctp/ulpqueue.c linux-2.6.16-026test015/net/sctp/ulpqueue.c +--- linux-2.6.16.orig/net/sctp/ulpqueue.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/sctp/ulpqueue.c 2006-07-04 14:41:36.000000000 +0400 +@@ -279,6 +279,7 @@ static inline void sctp_ulpq_store_reasm + static struct sctp_ulpevent *sctp_make_reassembled_event(struct sk_buff_head *queue, struct sk_buff *f_frag, struct sk_buff *l_frag) + { + struct sk_buff *pos; ++ struct sk_buff *new = NULL; + struct sctp_ulpevent *event; + struct sk_buff *pnext, *last; + struct sk_buff *list = skb_shinfo(f_frag)->frag_list; +@@ -297,11 +298,33 @@ static struct sctp_ulpevent *sctp_make_r + */ + if (last) + last->next = pos; +- else +- skb_shinfo(f_frag)->frag_list = pos; ++ else { ++ if (skb_cloned(f_frag)) { ++ /* This is a cloned skb, we can't just modify ++ * the frag_list. We need a new skb to do that. ++ * Instead of calling skb_unshare(), we'll do it ++ * ourselves since we need to delay the free. ++ */ ++ new = skb_copy(f_frag, GFP_ATOMIC); ++ if (!new) ++ return NULL; /* try again later */ ++ ++ new->sk = f_frag->sk; ++ ++ skb_shinfo(new)->frag_list = pos; ++ } else ++ skb_shinfo(f_frag)->frag_list = pos; ++ } + + /* Remove the first fragment from the reassembly queue. 
*/ + __skb_unlink(f_frag, queue); ++ ++ /* if we did unshare, then free the old skb and re-assign */ ++ if (new) { ++ kfree_skb(f_frag); ++ f_frag = new; ++ } ++ + while (pos) { + + pnext = pos->next; +diff -upr linux-2.6.16.orig/net/socket.c linux-2.6.16-026test015/net/socket.c +--- linux-2.6.16.orig/net/socket.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/socket.c 2006-07-04 14:41:39.000000000 +0400 +@@ -84,6 +84,7 @@ + #include <linux/compat.h> + #include <linux/kmod.h> + #include <linux/audit.h> ++#include <linux/in.h> + + #ifdef CONFIG_NET_RADIO + #include <linux/wireless.h> /* Note : will define WIRELESS_EXT */ +@@ -1075,6 +1076,49 @@ int sock_wake_async(struct socket *sock, + return 0; + } + ++int vz_security_proto_check(int family, int type, int protocol) ++{ ++#ifdef CONFIG_VE ++ if (ve_is_super(get_exec_env())) ++ return 0; ++ ++ switch (family) { ++ case PF_UNSPEC: ++ case PF_PACKET: ++ case PF_NETLINK: ++ case PF_UNIX: ++ break; ++ case PF_INET: ++ switch (protocol) { ++ case IPPROTO_IP: ++ case IPPROTO_ICMP: ++ case IPPROTO_TCP: ++ case IPPROTO_UDP: ++ case IPPROTO_RAW: ++ break; ++ default: ++ return -EAFNOSUPPORT; ++ } ++ break; ++ case PF_INET6: ++ switch (protocol) { ++ case IPPROTO_IP: ++ case IPPROTO_ICMPV6: ++ case IPPROTO_TCP: ++ case IPPROTO_UDP: ++ case IPPROTO_RAW: ++ break; ++ default: ++ return -EAFNOSUPPORT; ++ } ++ break; ++ default: ++ return -EAFNOSUPPORT; ++ } ++#endif ++ return 0; ++} ++ + static int __sock_create(int family, int type, int protocol, struct socket **res, int kern) + { + int err; +@@ -1102,6 +1146,11 @@ static int __sock_create(int family, int + family = PF_PACKET; + } + ++ /* VZ compatibility layer */ ++ err = vz_security_proto_check(family, type, protocol); ++ if (err < 0) ++ return err; ++ + err = security_socket_create(family, type, protocol, kern); + if (err) + return err; +diff -upr linux-2.6.16.orig/net/sunrpc/clnt.c linux-2.6.16-026test015/net/sunrpc/clnt.c +--- 
linux-2.6.16.orig/net/sunrpc/clnt.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/sunrpc/clnt.c 2006-07-04 14:41:38.000000000 +0400 +@@ -168,10 +168,10 @@ rpc_new_client(struct rpc_xprt *xprt, ch + } + + /* save the nodename */ +- clnt->cl_nodelen = strlen(system_utsname.nodename); ++ clnt->cl_nodelen = strlen(ve_utsname.nodename); + if (clnt->cl_nodelen > UNX_MAXNODENAME) + clnt->cl_nodelen = UNX_MAXNODENAME; +- memcpy(clnt->cl_nodename, system_utsname.nodename, clnt->cl_nodelen); ++ memcpy(clnt->cl_nodename, ve_utsname.nodename, clnt->cl_nodelen); + return clnt; + + out_no_auth: +diff -upr linux-2.6.16.orig/net/sunrpc/sched.c linux-2.6.16-026test015/net/sunrpc/sched.c +--- linux-2.6.16.orig/net/sunrpc/sched.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/sunrpc/sched.c 2006-07-04 14:41:38.000000000 +0400 +@@ -605,7 +605,9 @@ EXPORT_SYMBOL(rpc_exit_task); + static int __rpc_execute(struct rpc_task *task) + { + int status = 0; ++ struct ve_struct *env; + ++ env = set_exec_env(get_ve0()); + dprintk("RPC: %4d rpc_execute flgs %x\n", + task->tk_pid, task->tk_flags); + +@@ -693,6 +695,7 @@ static int __rpc_execute(struct rpc_task + rpc_mark_complete_task(task); + /* Release all resources associated with the task */ + rpc_release_task(task); ++ (void)set_exec_env(env); + return status; + } + +diff -upr linux-2.6.16.orig/net/sunrpc/svcsock.c linux-2.6.16-026test015/net/sunrpc/svcsock.c +--- linux-2.6.16.orig/net/sunrpc/svcsock.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/sunrpc/svcsock.c 2006-07-04 14:41:38.000000000 +0400 +@@ -361,6 +361,9 @@ svc_sendto(struct svc_rqst *rqstp, struc + size_t base = xdr->page_base; + unsigned int pglen = xdr->page_len; + unsigned int flags = MSG_MORE; ++ struct ve_struct *old_env; ++ ++ old_env = set_exec_env(get_ve0()); + + slen = xdr->len; + +@@ -425,6 +428,8 @@ out: + rqstp->rq_sock, xdr->head[0].iov_base, xdr->head[0].iov_len, xdr->len, len, + 
rqstp->rq_addr.sin_addr.s_addr); + ++ (void)set_exec_env(old_env); ++ + return len; + } + +@@ -437,9 +442,12 @@ svc_recv_available(struct svc_sock *svsk + mm_segment_t oldfs; + struct socket *sock = svsk->sk_sock; + int avail, err; ++ struct ve_struct *old_env; + + oldfs = get_fs(); set_fs(KERNEL_DS); ++ old_env = set_exec_env(get_ve0()); + err = sock->ops->ioctl(sock, TIOCINQ, (unsigned long) &avail); ++ (void)set_exec_env(old_env); + set_fs(oldfs); + + return (err >= 0)? avail : err; +@@ -454,6 +462,7 @@ svc_recvfrom(struct svc_rqst *rqstp, str + struct msghdr msg; + struct socket *sock; + int len, alen; ++ struct ve_struct *old_env; + + rqstp->rq_addrlen = sizeof(rqstp->rq_addr); + sock = rqstp->rq_sock->sk_sock; +@@ -465,7 +474,9 @@ svc_recvfrom(struct svc_rqst *rqstp, str + + msg.msg_flags = MSG_DONTWAIT; + ++ old_env = set_exec_env(get_ve0()); + len = kernel_recvmsg(sock, &msg, iov, nr, buflen, MSG_DONTWAIT); ++ (void)set_exec_env(get_ve0()); + + /* sock_recvmsg doesn't fill in the name/namelen, so we must.. 
+ * possibly we should cache this in the svc_sock structure +@@ -761,17 +772,19 @@ svc_tcp_accept(struct svc_sock *svsk) + const struct proto_ops *ops; + struct svc_sock *newsvsk; + int err, slen; ++ struct ve_struct *old_env; + + dprintk("svc: tcp_accept %p sock %p\n", svsk, sock); + if (!sock) + return; + ++ old_env = set_exec_env(get_ve0()); + err = sock_create_lite(PF_INET, SOCK_STREAM, IPPROTO_TCP, &newsock); + if (err) { + if (err == -ENOMEM) + printk(KERN_WARNING "%s: no more sockets!\n", + serv->sv_name); +- return; ++ goto restore; + } + + dprintk("svc: tcp_accept %p allocated\n", newsock); +@@ -865,6 +878,8 @@ svc_tcp_accept(struct svc_sock *svsk) + + } + ++ (void)set_exec_env(old_env); ++ + if (serv->sv_stats) + serv->sv_stats->nettcpconn++; + +@@ -872,6 +887,8 @@ svc_tcp_accept(struct svc_sock *svsk) + + failed: + sock_release(newsock); ++restore: ++ (void)set_exec_env(old_env); + return; + } + +@@ -1388,6 +1405,7 @@ svc_create_socket(struct svc_serv *serv, + struct socket *sock; + int error; + int type; ++ struct ve_struct *old_env; + + dprintk("svc: svc_create_socket(%s, %d, %u.%u.%u.%u:%d)\n", + serv->sv_program->pg_name, protocol, +@@ -1401,8 +1419,10 @@ svc_create_socket(struct svc_serv *serv, + } + type = (protocol == IPPROTO_UDP)? 
SOCK_DGRAM : SOCK_STREAM; + ++ old_env = set_exec_env(get_ve0()); ++ + if ((error = sock_create_kern(PF_INET, type, protocol, &sock)) < 0) +- return error; ++ goto restore; + + if (sin != NULL) { + if (type == SOCK_STREAM) +@@ -1418,12 +1438,16 @@ svc_create_socket(struct svc_serv *serv, + goto bummer; + } + +- if ((svsk = svc_setup_socket(serv, sock, &error, 1)) != NULL) ++ if ((svsk = svc_setup_socket(serv, sock, &error, 1)) != NULL) { ++ (void)set_exec_env(old_env); + return 0; ++ } + + bummer: + dprintk("svc: svc_create_socket error = %d\n", -error); + sock_release(sock); ++restore: ++ (void)set_exec_env(old_env); + return error; + } + +diff -upr linux-2.6.16.orig/net/unix/af_unix.c linux-2.6.16-026test015/net/unix/af_unix.c +--- linux-2.6.16.orig/net/unix/af_unix.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/unix/af_unix.c 2006-07-04 14:41:38.000000000 +0400 +@@ -118,6 +118,9 @@ + #include <net/checksum.h> + #include <linux/security.h> + ++#include <ub/ub_net.h> ++#include <ub/beancounter.h> ++ + int sysctl_unix_max_dgram_qlen = 10; + + struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1]; +@@ -235,6 +238,8 @@ static struct sock *__unix_find_socket_b + sk_for_each(s, node, &unix_socket_table[hash ^ type]) { + struct unix_sock *u = unix_sk(s); + ++ if (!ve_accessible(VE_OWNER_SK(s), get_exec_env())) ++ continue; + if (u->addr->len == len && + !memcmp(u->addr->name, sunname, len)) + goto found; +@@ -439,7 +444,7 @@ static int unix_listen(struct socket *so + sk->sk_max_ack_backlog = backlog; + sk->sk_state = TCP_LISTEN; + /* set credentials so connect can copy them */ +- sk->sk_peercred.pid = current->tgid; ++ sk->sk_peercred.pid = virt_tgid(current); + sk->sk_peercred.uid = current->euid; + sk->sk_peercred.gid = current->egid; + err = 0; +@@ -553,6 +558,8 @@ static struct sock * unix_create1(struct + sk = sk_alloc(PF_UNIX, GFP_KERNEL, &unix_proto, 1); + if (!sk) + goto out; ++ if (ub_other_sock_charge(sk)) ++ goto out_sk_free; + + 
atomic_inc(&unix_nr_socks); + +@@ -571,6 +578,9 @@ static struct sock * unix_create1(struct + unix_insert_socket(unix_sockets_unbound, sk); + out: + return sk; ++out_sk_free: ++ sk_free(sk); ++ return NULL; + } + + static int unix_create(struct socket *sock, int protocol) +@@ -676,7 +686,7 @@ static struct sock *unix_find_other(stru + err = path_lookup(sunname->sun_path, LOOKUP_FOLLOW, &nd); + if (err) + goto fail; +- err = vfs_permission(&nd, MAY_WRITE); ++ err = vfs_permission(&nd, MAY_WRITE, NULL); + if (err) + goto put_fail; + +@@ -932,6 +942,7 @@ static int unix_stream_connect(struct so + int st; + int err; + long timeo; ++ unsigned long chargesize; + + err = unix_mkname(sunaddr, addr_len, &hash); + if (err < 0) +@@ -960,6 +971,10 @@ static int unix_stream_connect(struct so + skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL); + if (skb == NULL) + goto out; ++ chargesize = skb_charge_fullsize(skb); ++ if (ub_sock_getwres_other(newsk, chargesize) < 0) ++ goto out; ++ ub_skb_set_charge(skb, newsk, chargesize, UB_OTHERSOCKBUF); + + restart: + /* Find listening sock. 
*/ +@@ -1043,7 +1058,7 @@ restart: + unix_peer(newsk) = sk; + newsk->sk_state = TCP_ESTABLISHED; + newsk->sk_type = sk->sk_type; +- newsk->sk_peercred.pid = current->tgid; ++ newsk->sk_peercred.pid = virt_tgid(current); + newsk->sk_peercred.uid = current->euid; + newsk->sk_peercred.gid = current->egid; + newu = unix_sk(newsk); +@@ -1107,7 +1122,7 @@ static int unix_socketpair(struct socket + sock_hold(skb); + unix_peer(ska)=skb; + unix_peer(skb)=ska; +- ska->sk_peercred.pid = skb->sk_peercred.pid = current->tgid; ++ ska->sk_peercred.pid = skb->sk_peercred.pid = virt_tgid(current); + ska->sk_peercred.uid = skb->sk_peercred.uid = current->euid; + ska->sk_peercred.gid = skb->sk_peercred.gid = current->egid; + +@@ -1433,6 +1448,16 @@ static int unix_stream_sendmsg(struct ki + + size=len-sent; + ++ if (msg->msg_flags & MSG_DONTWAIT) ++ ub_sock_makewres_other(sk, skb_charge_size(size)); ++ if (sock_bc(sk) != NULL && ++ sock_bc(sk)->poll_reserv >= ++ SOCK_MIN_UBCSPACE && ++ skb_charge_size(size) > ++ sock_bc(sk)->poll_reserv) ++ size = skb_charge_datalen(sock_bc(sk)->poll_reserv); ++ ++ + /* Keep two messages in the pipe so it schedules better */ + if (size > sk->sk_sndbuf / 2 - 64) + size = sk->sk_sndbuf / 2 - 64; +@@ -1444,7 +1469,8 @@ static int unix_stream_sendmsg(struct ki + * Grab a buffer + */ + +- skb=sock_alloc_send_skb(sk,size,msg->msg_flags&MSG_DONTWAIT, &err); ++ skb = sock_alloc_send_skb2(sk, size, SOCK_MIN_UBCSPACE, ++ msg->msg_flags&MSG_DONTWAIT, &err); + + if (skb==NULL) + goto out_err; +@@ -1869,6 +1895,7 @@ static unsigned int unix_poll(struct fil + { + struct sock *sk = sock->sk; + unsigned int mask; ++ int no_ub_res; + + poll_wait(file, sk->sk_sleep, wait); + mask = 0; +@@ -1879,6 +1906,10 @@ static unsigned int unix_poll(struct fil + if (sk->sk_shutdown == SHUTDOWN_MASK) + mask |= POLLHUP; + ++ no_ub_res = ub_sock_makewres_other(sk, SOCK_MIN_UBCSPACE_CH); ++ if (no_ub_res) ++ ub_sock_sndqueueadd_other(sk, SOCK_MIN_UBCSPACE_CH); ++ + /* readable? 
*/ + if (!skb_queue_empty(&sk->sk_receive_queue) || + (sk->sk_shutdown & RCV_SHUTDOWN)) +@@ -1892,7 +1923,7 @@ static unsigned int unix_poll(struct fil + * we set writable also when the other side has shut down the + * connection. This prevents stuck sockets. + */ +- if (unix_writable(sk)) ++ if (!no_ub_res && unix_writable(sk)) + mask |= POLLOUT | POLLWRNORM | POLLWRBAND; + + return mask; +@@ -2044,7 +2075,7 @@ static int __init af_unix_init(void) + + sock_register(&unix_family_ops); + #ifdef CONFIG_PROC_FS +- proc_net_fops_create("unix", 0, &unix_seq_fops); ++ proc_glob_fops_create("net/unix", 0, &unix_seq_fops); + #endif + unix_sysctl_register(); + out: +@@ -2055,7 +2086,7 @@ static void __exit af_unix_exit(void) + { + sock_unregister(PF_UNIX); + unix_sysctl_unregister(); +- proc_net_remove("unix"); ++ remove_proc_glob_entry("net/unix", NULL); + proto_unregister(&unix_proto); + } + +diff -upr linux-2.6.16.orig/net/unix/garbage.c linux-2.6.16-026test015/net/unix/garbage.c +--- linux-2.6.16.orig/net/unix/garbage.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/net/unix/garbage.c 2006-07-04 14:41:39.000000000 +0400 +@@ -76,6 +76,7 @@ + #include <linux/netdevice.h> + #include <linux/file.h> + #include <linux/proc_fs.h> ++#include <linux/module.h> + + #include <net/sock.h> + #include <net/af_unix.h> +@@ -135,7 +136,7 @@ void unix_notinflight(struct file *fp) + atomic_dec(&unix_tot_inflight); + } + } +- ++EXPORT_SYMBOL_GPL(unix_notinflight); + + /* + * Garbage Collector Support Functions +diff -upr linux-2.6.16.orig/security/commoncap.c linux-2.6.16-026test015/security/commoncap.c +--- linux-2.6.16.orig/security/commoncap.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/security/commoncap.c 2006-07-04 14:41:38.000000000 +0400 +@@ -35,7 +35,7 @@ EXPORT_SYMBOL(cap_netlink_send); + + int cap_netlink_recv(struct sk_buff *skb) + { +- if (!cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN)) ++ if (!cap_raised(NETLINK_CB(skb).eff_cap, 
CAP_VE_NET_ADMIN)) + return -EPERM; + return 0; + } +@@ -197,7 +197,7 @@ int cap_inode_setxattr(struct dentry *de + { + if (!strncmp(name, XATTR_SECURITY_PREFIX, + sizeof(XATTR_SECURITY_PREFIX) - 1) && +- !capable(CAP_SYS_ADMIN)) ++ !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_ADMIN)) + return -EPERM; + return 0; + } +@@ -206,7 +206,7 @@ int cap_inode_removexattr(struct dentry + { + if (!strncmp(name, XATTR_SECURITY_PREFIX, + sizeof(XATTR_SECURITY_PREFIX) - 1) && +- !capable(CAP_SYS_ADMIN)) ++ !capable(CAP_SYS_ADMIN) && !capable(CAP_VE_ADMIN)) + return -EPERM; + return 0; + } +@@ -312,7 +312,7 @@ void cap_task_reparent_to_init (struct t + + int cap_syslog (int type) + { +- if ((type != 3 && type != 10) && !capable(CAP_SYS_ADMIN)) ++ if ((type != 3 && type != 10) && !capable(CAP_VE_SYS_ADMIN)) + return -EPERM; + return 0; + } +diff -upr linux-2.6.16.orig/security/keys/key.c linux-2.6.16-026test015/security/keys/key.c +--- linux-2.6.16.orig/security/keys/key.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/security/keys/key.c 2006-07-04 14:41:36.000000000 +0400 +@@ -785,6 +785,10 @@ key_ref_t key_create_or_update(key_ref_t + + key_check(keyring); + ++ key_ref = ERR_PTR(-ENOTDIR); ++ if (keyring->type != &key_type_keyring) ++ goto error_2; ++ + down_write(&keyring->sem); + + /* if we're going to allocate a new key, we're going to have +diff -upr linux-2.6.16.orig/security/keys/keyring.c linux-2.6.16-026test015/security/keys/keyring.c +--- linux-2.6.16.orig/security/keys/keyring.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/security/keys/keyring.c 2006-07-04 14:41:36.000000000 +0400 +@@ -437,6 +437,7 @@ EXPORT_SYMBOL(keyring_search); + /* + * search the given keyring only (no recursion) + * - keyring must be locked by caller ++ * - caller must guarantee that the keyring is a keyring + */ + key_ref_t __keyring_search_one(key_ref_t keyring_ref, + const struct key_type *ktype, +diff -upr linux-2.6.16.orig/security/selinux/hooks.c 
linux-2.6.16-026test015/security/selinux/hooks.c +--- linux-2.6.16.orig/security/selinux/hooks.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/security/selinux/hooks.c 2006-07-04 14:41:38.000000000 +0400 +@@ -4167,12 +4167,12 @@ static int selinux_setprocattr(struct ta + struct task_struct *g, *t; + struct mm_struct *mm = p->mm; + read_lock(&tasklist_lock); +- do_each_thread(g, t) ++ do_each_thread_ve(g, t) + if (t->mm == mm && t != p) { + read_unlock(&tasklist_lock); + return -EPERM; + } +- while_each_thread(g, t); ++ while_each_thread_ve(g, t); + read_unlock(&tasklist_lock); + } + +diff -upr linux-2.6.16.orig/security/selinux/ss/mls.c linux-2.6.16-026test015/security/selinux/ss/mls.c +--- linux-2.6.16.orig/security/selinux/ss/mls.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/security/selinux/ss/mls.c 2006-07-04 14:41:36.000000000 +0400 +@@ -264,7 +264,7 @@ int mls_context_to_sid(char oldc, + + if (!selinux_mls_enabled) { + if (def_sid != SECSID_NULL && oldc) +- *scontext += strlen(*scontext); ++ *scontext += strlen(*scontext)+1; + return 0; + } + +diff -upr linux-2.6.16.orig/security/selinux/ss/services.c linux-2.6.16-026test015/security/selinux/ss/services.c +--- linux-2.6.16.orig/security/selinux/ss/services.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/security/selinux/ss/services.c 2006-07-04 14:41:36.000000000 +0400 +@@ -592,6 +592,10 @@ int security_sid_to_context(u32 sid, cha + + *scontext_len = strlen(initial_sid_to_string[sid]) + 1; + scontextp = kmalloc(*scontext_len,GFP_ATOMIC); ++ if (!scontextp) { ++ rc = -ENOMEM; ++ goto out; ++ } + strcpy(scontextp, initial_sid_to_string[sid]); + *scontext = scontextp; + goto out; +diff -upr linux-2.6.16.orig/sound/isa/opti9xx/opti92x-ad1848.c linux-2.6.16-026test015/sound/isa/opti9xx/opti92x-ad1848.c +--- linux-2.6.16.orig/sound/isa/opti9xx/opti92x-ad1848.c 2006-03-20 08:53:29.000000000 +0300 ++++ 
linux-2.6.16-026test015/sound/isa/opti9xx/opti92x-ad1848.c 2006-07-04 14:41:36.000000000 +0400 +@@ -2088,9 +2088,11 @@ static int __init alsa_card_opti9xx_init + int error; + struct platform_device *device; + ++#ifdef CONFIG_PNP + pnp_register_card_driver(&opti9xx_pnpc_driver); + if (snd_opti9xx_pnp_is_probed) + return 0; ++#endif + if (! is_isapnp_selected()) { + error = platform_driver_register(&snd_opti9xx_driver); + if (error < 0) +@@ -2102,7 +2104,9 @@ static int __init alsa_card_opti9xx_init + } + platform_driver_unregister(&snd_opti9xx_driver); + } ++#ifdef CONFIG_PNP + pnp_unregister_card_driver(&opti9xx_pnpc_driver); ++#endif + #ifdef MODULE + printk(KERN_ERR "no OPTi " CHIP_NAME " soundcard found\n"); + #endif +@@ -2115,7 +2119,9 @@ static void __exit alsa_card_opti9xx_exi + platform_device_unregister(snd_opti9xx_platform_device); + platform_driver_unregister(&snd_opti9xx_driver); + } ++#ifdef CONFIG_PNP + pnp_unregister_card_driver(&opti9xx_pnpc_driver); ++#endif + } + + module_init(alsa_card_opti9xx_init) +diff -upr linux-2.6.16.orig/sound/oss/dmasound/tas_common.c linux-2.6.16-026test015/sound/oss/dmasound/tas_common.c +--- linux-2.6.16.orig/sound/oss/dmasound/tas_common.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/sound/oss/dmasound/tas_common.c 2006-07-04 14:41:36.000000000 +0400 +@@ -195,8 +195,8 @@ tas_init(int driver_id, const char *driv + + printk(KERN_INFO "tas driver [%s])\n", driver_name); + +-#ifndef CONFIG_I2C_KEYWEST +- request_module("i2c-keywest"); ++#ifndef CONFIG_I2C_POWERMAC ++ request_module("i2c-powermac"); + #endif + tas_node = find_devices("deq"); + if (tas_node == NULL) +diff -upr linux-2.6.16.orig/sound/pci/hda/patch_realtek.c linux-2.6.16-026test015/sound/pci/hda/patch_realtek.c +--- linux-2.6.16.orig/sound/pci/hda/patch_realtek.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/sound/pci/hda/patch_realtek.c 2006-07-04 14:41:36.000000000 +0400 +@@ -2948,6 +2948,8 @@ static struct 
hda_board_config alc260_cf + { .modelname = "basic", .config = ALC260_BASIC }, + { .pci_subvendor = 0x104d, .pci_subdevice = 0x81bb, + .config = ALC260_BASIC }, /* Sony VAIO */ ++ { .pci_subvendor = 0x152d, .pci_subdevice = 0x0729, ++ .config = ALC260_BASIC }, /* CTL Travel Master U553W */ + { .modelname = "hp", .config = ALC260_HP }, + { .pci_subvendor = 0x103c, .pci_subdevice = 0x3010, .config = ALC260_HP }, + { .pci_subvendor = 0x103c, .pci_subdevice = 0x3011, .config = ALC260_HP }, +diff -upr linux-2.6.16.orig/sound/ppc/daca.c linux-2.6.16-026test015/sound/ppc/daca.c +--- linux-2.6.16.orig/sound/ppc/daca.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/sound/ppc/daca.c 2006-07-04 14:41:36.000000000 +0400 +@@ -256,7 +256,7 @@ int __init snd_pmac_daca_init(struct snd + + #ifdef CONFIG_KMOD + if (current->fs->root) +- request_module("i2c-keywest"); ++ request_module("i2c-powermac"); + #endif /* CONFIG_KMOD */ + + mix = kmalloc(sizeof(*mix), GFP_KERNEL); +diff -upr linux-2.6.16.orig/sound/ppc/tumbler.c linux-2.6.16-026test015/sound/ppc/tumbler.c +--- linux-2.6.16.orig/sound/ppc/tumbler.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16-026test015/sound/ppc/tumbler.c 2006-07-04 14:41:36.000000000 +0400 +@@ -1314,7 +1314,7 @@ int __init snd_pmac_tumbler_init(struct + + #ifdef CONFIG_KMOD + if (current->fs->root) +- request_module("i2c-keywest"); ++ request_module("i2c-powermac"); + #endif /* CONFIG_KMOD */ + + mix = kmalloc(sizeof(*mix), GFP_KERNEL); diff --git a/openvz-sources/026.015-r2/0101_diff-merge-2.6.16.24-20060714.patch b/openvz-sources/026.015-r2/0101_diff-merge-2.6.16.24-20060714.patch new file mode 100644 index 0000000..0f43c67 --- /dev/null +++ b/openvz-sources/026.015-r2/0101_diff-merge-2.6.16.24-20060714.patch @@ -0,0 +1,19 @@ +From: OpenVZ team <devel@openvz.org> +Date: Fri, 14 Jul 2006 10:23:43 +0000 (+0400) +Subject: Merged 2.6.16.24 from /linux/kernel/git/stable/linux-2.6.16.y +X-Git-Url: 
http://10.0.101.105/cgi-bin/gitweb.cgi?p=kernel;a=commitdiff;h=9a23ec204b88ab5e678dc3e33fe03d7531167e66 + +Merged 2.6.16.24 from /linux/kernel/git/stable/linux-2.6.16.y +--- + +--- a/kernel/sys.c ++++ b/kernel/sys.c +@@ -1954,7 +1954,7 @@ asmlinkage long sys_prctl(int option, un + error = current->mm->dumpable; + break; + case PR_SET_DUMPABLE: +- if (arg2 < 0 || arg2 > 2) { ++ if (arg2 < 0 || arg2 > 1) { + error = -EINVAL; + break; + } diff --git a/openvz-sources/026.015-r2/0102_procfs-dumpable-race.patch b/openvz-sources/026.015-r2/0102_procfs-dumpable-race.patch new file mode 100644 index 0000000..a02bf91 --- /dev/null +++ b/openvz-sources/026.015-r2/0102_procfs-dumpable-race.patch @@ -0,0 +1,20 @@ +Index: linux-2.6.16-gentoo-r12/fs/proc/base.c +=================================================================== +--- linux-2.6.16-gentoo-r12.orig/fs/proc/base.c ++++ linux-2.6.16-gentoo-r12/fs/proc/base.c +@@ -1367,6 +1367,7 @@ static int pid_revalidate(struct dentry + inode->i_uid = 0; + inode->i_gid = 0; + } ++ inode->i_mode &= ~(S_ISUID | S_ISGID); + security_task_to_inode(task, inode); + return 1; + } +@@ -1394,6 +1395,7 @@ static int tid_fd_revalidate(struct dent + inode->i_uid = 0; + inode->i_gid = 0; + } ++ inode->i_mode &= ~(S_ISUID | S_ISGID); + security_task_to_inode(task, inode); + return 1; + } |