From 02b964da3e6fbab007089403c9b2d9277e43a614 Mon Sep 17 00:00:00 2001 From: Michael Marineau Date: Fri, 22 Feb 2008 23:43:19 +0000 Subject: Update 2.6.21 patches to the latest from fedora 8, sync up with other things from our 2.6.20 patch set svn path=/patches/; revision=71 --- trunk/2.6.21/01006_linux-2.6.21.7.patch | 1110 + trunk/2.6.21/20950_linux-2.6.21.6-xen-3.1.0.patch | 96051 ------------------- trunk/2.6.21/20950_linux-2.6.21.7-xen-3.1.0.patch | 96051 +++++++++++++++++++ ...nux-2.6-xen-x86_64-silence-up-apic-errors.patch | 13 - .../20963_linux-2.6-xen-sleazy-fpu-i386.patch | 96 + .../20964_linux-2.6-xen-sleazy-fpu-x86_64.patch | 93 + .../20965_linux-2.6-xen-privcmd-use-nopfn.patch | 35 + ...20966_linux-2.6-xen-fix-irq-warn-mismerge.patch | 21 + trunk/2.6.21/26000_linux-2.6-cve-2008-0600.patch | 37 + .../30037_amd64-zero-extend-32bit-ptrace-xen.patch | 50 + ...i386-fix-xen_l1_entry_update-for-highptes.patch | 26 + trunk/2.6.21/50008_reenable-tls-warning.patch | 23 + trunk/2.6.21/50009_gentooify-tls-warning.patch | 16 + .../50010_remove-pte_offset_map-redefinition.patch | 36 + 14 files changed, 97594 insertions(+), 96064 deletions(-) create mode 100644 trunk/2.6.21/01006_linux-2.6.21.7.patch delete mode 100644 trunk/2.6.21/20950_linux-2.6.21.6-xen-3.1.0.patch create mode 100644 trunk/2.6.21/20950_linux-2.6.21.7-xen-3.1.0.patch delete mode 100644 trunk/2.6.21/20952_linux-2.6-xen-x86_64-silence-up-apic-errors.patch create mode 100644 trunk/2.6.21/20963_linux-2.6-xen-sleazy-fpu-i386.patch create mode 100644 trunk/2.6.21/20964_linux-2.6-xen-sleazy-fpu-x86_64.patch create mode 100644 trunk/2.6.21/20965_linux-2.6-xen-privcmd-use-nopfn.patch create mode 100644 trunk/2.6.21/20966_linux-2.6-xen-fix-irq-warn-mismerge.patch create mode 100644 trunk/2.6.21/26000_linux-2.6-cve-2008-0600.patch create mode 100644 trunk/2.6.21/30037_amd64-zero-extend-32bit-ptrace-xen.patch create mode 100644 trunk/2.6.21/40001_i386-fix-xen_l1_entry_update-for-highptes.patch create mode 100644 trunk/2.6.21/50008_reenable-tls-warning.patch create mode 100644 trunk/2.6.21/50009_gentooify-tls-warning.patch create mode 100644 trunk/2.6.21/50010_remove-pte_offset_map-redefinition.patch diff --git a/trunk/2.6.21/01006_linux-2.6.21.7.patch b/trunk/2.6.21/01006_linux-2.6.21.7.patch new file mode 100644 index 0000000..74c4984 --- /dev/null +++ b/trunk/2.6.21/01006_linux-2.6.21.7.patch @@ -0,0 +1,1110 @@ +diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S +index 18bddcb..cb1f16c 100644 +--- a/arch/i386/kernel/entry.S ++++ b/arch/i386/kernel/entry.S +@@ -371,10 +371,6 @@ ENTRY(system_call) + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + GET_THREAD_INFO(%ebp) +- testl $TF_MASK,PT_EFLAGS(%esp) +- jz no_singlestep +- orl $_TIF_SINGLESTEP,TI_flags(%ebp) +-no_singlestep: + # system call tracing in operation / emulation + /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ + testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) +@@ -389,6 +385,10 @@ syscall_exit: + # setting need_resched or sigpending + # between sampling and the iret + TRACE_IRQS_OFF ++ testl $TF_MASK,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit ++ jz no_singlestep ++ orl $_TIF_SINGLESTEP,TI_flags(%ebp) ++no_singlestep: + movl TI_flags(%ebp), %ecx + testw $_TIF_ALLWORK_MASK, %cx # current->work + jne syscall_exit_work +diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c +index f72e8e8..a84304e 100644 +--- a/arch/powerpc/kernel/signal_64.c ++++ 
b/arch/powerpc/kernel/signal_64.c +@@ -177,6 +177,13 @@ static long restore_sigcontext(struct pt_regs *regs, sigset_t *set, int sig, + */ + discard_lazy_cpu_state(); + ++ /* ++ * Force reload of FP/VEC. ++ * This has to be done before copying stuff into current->thread.fpr/vr ++ * for the reasons explained in the previous comment. ++ */ ++ regs->msr &= ~(MSR_FP | MSR_FE0 | MSR_FE1 | MSR_VEC); ++ + err |= __copy_from_user(&current->thread.fpr, &sc->fp_regs, FP_REGS_SIZE); + + #ifdef CONFIG_ALTIVEC +@@ -198,9 +205,6 @@ static long restore_sigcontext(struct pt_regs *regs, sigset_t *set, int sig, + current->thread.vrsave = 0; + #endif /* CONFIG_ALTIVEC */ + +- /* Force reload of FP/VEC */ +- regs->msr &= ~(MSR_FP | MSR_FE0 | MSR_FE1 | MSR_VEC); +- + return err; + } + +diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c +index 6fd126a..df35d6d 100644 +--- a/arch/x86_64/mm/init.c ++++ b/arch/x86_64/mm/init.c +@@ -72,6 +72,8 @@ void show_mem(void) + + for_each_online_pgdat(pgdat) { + for (i = 0; i < pgdat->node_spanned_pages; ++i) { ++ if (!pfn_valid(pgdat->node_start_pfn + i)) ++ continue; + page = pfn_to_page(pgdat->node_start_pfn + i); + total++; + if (PageReserved(page)) +diff --git a/drivers/ide/pci/hpt366.c b/drivers/ide/pci/hpt366.c +index cf9d344..14de1e8 100644 +--- a/drivers/ide/pci/hpt366.c ++++ b/drivers/ide/pci/hpt366.c +@@ -1,5 +1,5 @@ + /* +- * linux/drivers/ide/pci/hpt366.c Version 1.03 May 4, 2007 ++ * linux/drivers/ide/pci/hpt366.c Version 1.04 Jun 4, 2007 + * + * Copyright (C) 1999-2003 Andre Hedrick + * Portions Copyright (C) 2001 Sun Microsystems, Inc. +@@ -106,7 +106,8 @@ + * switch to calculating PCI clock frequency based on the chip's base DPLL + * frequency + * - switch to using the DPLL clock and enable UltraATA/133 mode by default on +- * anything newer than HPT370/A ++ * anything newer than HPT370/A (except HPT374 that is not capable of this ++ * mode according to the manual) + * - fold PCI clock detection and DPLL setup code into init_chipset_hpt366(), + * also fixing the interchanged 25/40 MHz PCI clock cases for HPT36x chips; + * unify HPT36x/37x timing setup code and the speedproc handlers by joining +@@ -365,7 +366,6 @@ static u32 sixty_six_base_hpt37x[] = { + }; + + #define HPT366_DEBUG_DRIVE_INFO 0 +-#define HPT374_ALLOW_ATA133_6 1 + #define HPT371_ALLOW_ATA133_6 1 + #define HPT302_ALLOW_ATA133_6 1 + #define HPT372_ALLOW_ATA133_6 1 +@@ -450,7 +450,7 @@ static struct hpt_info hpt370a __devinitdata = { + + static struct hpt_info hpt374 __devinitdata = { + .chip_type = HPT374, +- .max_mode = HPT374_ALLOW_ATA133_6 ? 
4 : 3, ++ .max_mode = 3, + .dpll_clk = 48, + .settings = hpt37x_settings + }; +diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c +index 4c2471e..b9ff4e3 100644 +--- a/drivers/md/dm-crypt.c ++++ b/drivers/md/dm-crypt.c +@@ -33,7 +33,6 @@ + struct crypt_io { + struct dm_target *target; + struct bio *base_bio; +- struct bio *first_clone; + struct work_struct work; + atomic_t pending; + int error; +@@ -107,6 +106,8 @@ struct crypt_config { + + static struct kmem_cache *_crypt_io_pool; + ++static void clone_init(struct crypt_io *, struct bio *); ++ + /* + * Different IV generation algorithms: + * +@@ -378,25 +379,20 @@ static int crypt_convert(struct crypt_config *cc, + * This should never violate the device limitations + * May return a smaller bio when running out of pages + */ +-static struct bio * +-crypt_alloc_buffer(struct crypt_config *cc, unsigned int size, +- struct bio *base_bio, unsigned int *bio_vec_idx) ++static struct bio *crypt_alloc_buffer(struct crypt_io *io, unsigned int size, ++ unsigned int *bio_vec_idx) + { ++ struct crypt_config *cc = io->target->private; + struct bio *clone; + unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + gfp_t gfp_mask = GFP_NOIO | __GFP_HIGHMEM; + unsigned int i; + +- if (base_bio) { +- clone = bio_alloc_bioset(GFP_NOIO, base_bio->bi_max_vecs, cc->bs); +- __bio_clone(clone, base_bio); +- } else +- clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs); +- ++ clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs); + if (!clone) + return NULL; + +- clone->bi_destructor = dm_crypt_bio_destructor; ++ clone_init(io, clone); + + /* if the last bio was not complete, continue where that one ended */ + clone->bi_idx = *bio_vec_idx; +@@ -495,9 +491,6 @@ static void dec_pending(struct crypt_io *io, int error) + if (!atomic_dec_and_test(&io->pending)) + return; + +- if (io->first_clone) +- bio_put(io->first_clone); +- + bio_endio(io->base_bio, io->base_bio->bi_size, io->error); + + mempool_free(io, cc->io_pool); +@@ -562,6 +555,7 @@ static void clone_init(struct crypt_io *io, struct bio *clone) + clone->bi_end_io = crypt_endio; + clone->bi_bdev = cc->dev->bdev; + clone->bi_rw = io->base_bio->bi_rw; ++ clone->bi_destructor = dm_crypt_bio_destructor; + } + + static void process_read(struct crypt_io *io) +@@ -585,7 +579,6 @@ static void process_read(struct crypt_io *io) + } + + clone_init(io, clone); +- clone->bi_destructor = dm_crypt_bio_destructor; + clone->bi_idx = 0; + clone->bi_vcnt = bio_segments(base_bio); + clone->bi_size = base_bio->bi_size; +@@ -615,8 +608,7 @@ static void process_write(struct crypt_io *io) + * so repeat the whole process until all the data can be handled. 
+ */ + while (remaining) { +- clone = crypt_alloc_buffer(cc, base_bio->bi_size, +- io->first_clone, &bvec_idx); ++ clone = crypt_alloc_buffer(io, base_bio->bi_size, &bvec_idx); + if (unlikely(!clone)) { + dec_pending(io, -ENOMEM); + return; +@@ -631,31 +623,23 @@ static void process_write(struct crypt_io *io) + return; + } + +- clone_init(io, clone); + clone->bi_sector = cc->start + sector; +- +- if (!io->first_clone) { +- /* +- * hold a reference to the first clone, because it +- * holds the bio_vec array and that can't be freed +- * before all other clones are released +- */ +- bio_get(clone); +- io->first_clone = clone; +- } +- + remaining -= clone->bi_size; + sector += bio_sectors(clone); + +- /* prevent bio_put of first_clone */ ++ /* Grab another reference to the io struct ++ * before we kick off the request */ + if (remaining) + atomic_inc(&io->pending); + + generic_make_request(clone); + ++ /* Do not reference clone after this - it ++ * may be gone already. */ ++ + /* out of memory -> run queues */ + if (remaining) +- congestion_wait(bio_data_dir(clone), HZ/100); ++ congestion_wait(WRITE, HZ/100); + } + } + +@@ -954,10 +938,12 @@ static int crypt_map(struct dm_target *ti, struct bio *bio, + struct crypt_config *cc = ti->private; + struct crypt_io *io; + ++ if (bio_barrier(bio)) ++ return -EOPNOTSUPP; ++ + io = mempool_alloc(cc->io_pool, GFP_NOIO); + io->target = ti; + io->base_bio = bio; +- io->first_clone = NULL; + io->error = io->post_process = 0; + atomic_set(&io->pending, 0); + kcryptd_queue_io(io); +diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c +index 3a95cc5..46677d7 100644 +--- a/drivers/md/raid1.c ++++ b/drivers/md/raid1.c +@@ -1240,17 +1240,24 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) + } + r1_bio->read_disk = primary; + for (i=0; i<mddev->raid_disks; i++) +- if (r1_bio->bios[i]->bi_end_io == end_sync_read && +- test_bit(BIO_UPTODATE, &r1_bio->bios[i]->bi_flags)) { ++ if (r1_bio->bios[i]->bi_end_io == end_sync_read) { + int j; + int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9); + struct bio *pbio = r1_bio->bios[primary]; + struct bio *sbio = r1_bio->bios[i]; +- for (j = vcnt; j-- ; ) +- if (memcmp(page_address(pbio->bi_io_vec[j].bv_page), +- page_address(sbio->bi_io_vec[j].bv_page), +- PAGE_SIZE)) +- break; ++ ++ if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) { ++ for (j = vcnt; j-- ; ) { ++ struct page *p, *s; ++ p = pbio->bi_io_vec[j].bv_page; ++ s = sbio->bi_io_vec[j].bv_page; ++ if (memcmp(page_address(p), ++ page_address(s), ++ PAGE_SIZE)) ++ break; ++ } ++ } else ++ j = 0; + if (j >= 0) + mddev->resync_mismatches += r1_bio->sectors; + if (j < 0 || test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) { +diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c +index 82249a6..9eb66c1 100644 +--- a/drivers/md/raid10.c ++++ b/drivers/md/raid10.c +@@ -1867,6 +1867,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i + int d = r10_bio->devs[i].devnum; + bio = r10_bio->devs[i].bio; + bio->bi_end_io = NULL; ++ clear_bit(BIO_UPTODATE, &bio->bi_flags); + if (conf->mirrors[d].rdev == NULL || + test_bit(Faulty, &conf->mirrors[d].rdev->flags)) + continue; +@@ -2037,6 +2038,11 @@ static int run(mddev_t *mddev) + /* 'size' is now the number of chunks in the array */ + /* calculate "used chunks per device" in 'stride' */ + stride = size * conf->copies; ++ ++ /* We need to round up when dividing by raid_disks to ++ * get the stride size. 
++ */ ++ stride += conf->raid_disks - 1; + sector_div(stride, conf->raid_disks); + mddev->size = stride << (conf->chunk_shift-1); + +diff --git a/drivers/media/video/bt8xx/bttv-driver.c b/drivers/media/video/bt8xx/bttv-driver.c +index 5720b77..d4bef35 100644 +--- a/drivers/media/video/bt8xx/bttv-driver.c ++++ b/drivers/media/video/bt8xx/bttv-driver.c +@@ -1313,7 +1313,7 @@ set_tvnorm(struct bttv *btv, unsigned int norm) + + /* Call with btv->lock down. */ + static void +-set_input(struct bttv *btv, unsigned int input) ++set_input(struct bttv *btv, unsigned int input, unsigned int norm) + { + unsigned long flags; + +@@ -1332,7 +1332,7 @@ set_input(struct bttv *btv, unsigned int input) + } + audio_input(btv,(input == bttv_tvcards[btv->c.type].tuner ? + TVAUDIO_INPUT_TUNER : TVAUDIO_INPUT_EXTERN)); +- set_tvnorm(btv,btv->tvnorm); ++ set_tvnorm(btv, norm); + i2c_vidiocschan(btv); + } + +@@ -1423,7 +1423,7 @@ static void bttv_reinit_bt848(struct bttv *btv) + + init_bt848(btv); + btv->pll.pll_current = -1; +- set_input(btv,btv->input); ++ set_input(btv, btv->input, btv->tvnorm); + } + + static int get_control(struct bttv *btv, struct v4l2_control *c) +@@ -1993,8 +1993,7 @@ static int bttv_common_ioctls(struct bttv *btv, unsigned int cmd, void *arg) + return 0; + } + +- btv->tvnorm = v->norm; +- set_input(btv,v->channel); ++ set_input(btv, v->channel, v->norm); + mutex_unlock(&btv->lock); + return 0; + } +@@ -2130,7 +2129,7 @@ static int bttv_common_ioctls(struct bttv *btv, unsigned int cmd, void *arg) + if (*i > bttv_tvcards[btv->c.type].video_inputs) + return -EINVAL; + mutex_lock(&btv->lock); +- set_input(btv,*i); ++ set_input(btv, *i, btv->tvnorm); + mutex_unlock(&btv->lock); + return 0; + } +@@ -4762,7 +4761,7 @@ static int __devinit bttv_probe(struct pci_dev *dev, + bt848_hue(btv,32768); + bt848_sat(btv,32768); + audio_mute(btv, 1); +- set_input(btv,0); ++ set_input(btv, 0, btv->tvnorm); + bttv_crop_reset(&btv->crop[0], btv->tvnorm); + btv->crop[1] = btv->crop[0]; /* current = default */ + disclaim_vbi_lines(btv); +diff --git a/drivers/media/video/cx88/cx88-blackbird.c b/drivers/media/video/cx88/cx88-blackbird.c +index b0466b8..a80b1cb 100644 +--- a/drivers/media/video/cx88/cx88-blackbird.c ++++ b/drivers/media/video/cx88/cx88-blackbird.c +@@ -1034,6 +1034,8 @@ static int vidioc_g_tuner (struct file *file, void *priv, + + if (unlikely(UNSET == core->tuner_type)) + return -EINVAL; ++ if (0 != t->index) ++ return -EINVAL; + + strcpy(t->name, "Television"); + t->type = V4L2_TUNER_ANALOG_TV; +diff --git a/drivers/media/video/saa7134/saa7134-tvaudio.c b/drivers/media/video/saa7134/saa7134-tvaudio.c +index dd759d6..36b3fa3 100644 +--- a/drivers/media/video/saa7134/saa7134-tvaudio.c ++++ b/drivers/media/video/saa7134/saa7134-tvaudio.c +@@ -1006,7 +1006,7 @@ int saa7134_tvaudio_init2(struct saa7134_dev *dev) + int saa7134_tvaudio_fini(struct saa7134_dev *dev) + { + /* shutdown tvaudio thread */ +- if (dev->thread.pid >= 0) { ++ if (dev->thread.pid > 0) { + dev->thread.shutdown = 1; + wake_up_interruptible(&dev->thread.wq); + wait_for_completion(&dev->thread.exit); +diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c +index 5006c67..1137291 100644 +--- a/drivers/net/bnx2.c ++++ b/drivers/net/bnx2.c +@@ -54,8 +54,8 @@ + + #define DRV_MODULE_NAME "bnx2" + #define PFX DRV_MODULE_NAME ": " +-#define DRV_MODULE_VERSION "1.5.8.1" +-#define DRV_MODULE_RELDATE "May 7, 2007" ++#define DRV_MODULE_VERSION "1.5.8.2" ++#define DRV_MODULE_RELDATE "June 5, 2007" + + #define RUN_AT(x) (jiffies + (x)) + +@@ -1550,6 
+1550,7 @@ bnx2_init_context(struct bnx2 *bp) + vcid = 96; + while (vcid) { + u32 vcid_addr, pcid_addr, offset; ++ int i; + + vcid--; + +@@ -1570,16 +1571,20 @@ bnx2_init_context(struct bnx2 *bp) + pcid_addr = vcid_addr; + } + +- REG_WR(bp, BNX2_CTX_VIRT_ADDR, 0x00); +- REG_WR(bp, BNX2_CTX_PAGE_TBL, pcid_addr); ++ for (i = 0; i < (CTX_SIZE / PHY_CTX_SIZE); i++) { ++ vcid_addr += (i << PHY_CTX_SHIFT); ++ pcid_addr += (i << PHY_CTX_SHIFT); + +- /* Zero out the context. */ +- for (offset = 0; offset < PHY_CTX_SIZE; offset += 4) { +- CTX_WR(bp, 0x00, offset, 0); +- } ++ REG_WR(bp, BNX2_CTX_VIRT_ADDR, 0x00); ++ REG_WR(bp, BNX2_CTX_PAGE_TBL, pcid_addr); + +- REG_WR(bp, BNX2_CTX_VIRT_ADDR, vcid_addr); +- REG_WR(bp, BNX2_CTX_PAGE_TBL, pcid_addr); ++ /* Zero out the context. */ ++ for (offset = 0; offset < PHY_CTX_SIZE; offset += 4) ++ CTX_WR(bp, 0x00, offset, 0); ++ ++ REG_WR(bp, BNX2_CTX_VIRT_ADDR, vcid_addr); ++ REG_WR(bp, BNX2_CTX_PAGE_TBL, pcid_addr); ++ } + } + } + +diff --git a/drivers/net/sky2.c b/drivers/net/sky2.c +index b6b444b..e525a5b 100644 +--- a/drivers/net/sky2.c ++++ b/drivers/net/sky2.c +@@ -95,7 +95,7 @@ static int disable_msi = 0; + module_param(disable_msi, int, 0); + MODULE_PARM_DESC(disable_msi, "Disable Message Signaled Interrupt (MSI)"); + +-static int idle_timeout = 0; ++static int idle_timeout = 100; + module_param(idle_timeout, int, 0); + MODULE_PARM_DESC(idle_timeout, "Watchdog timer for lost interrupts (ms)"); + +@@ -2433,6 +2433,13 @@ static int sky2_poll(struct net_device *dev0, int *budget) + + work_done = sky2_status_intr(hw, work_limit); + if (work_done < work_limit) { ++ /* Bug/Errata workaround? ++ * Need to kick the TX irq moderation timer. ++ */ ++ if (sky2_read8(hw, STAT_TX_TIMER_CTRL) == TIM_START) { ++ sky2_write8(hw, STAT_TX_TIMER_CTRL, TIM_STOP); ++ sky2_write8(hw, STAT_TX_TIMER_CTRL, TIM_START); ++ } + netif_rx_complete(dev0); + + sky2_read32(hw, B0_Y2_SP_LISR); +diff --git a/drivers/serial/mpsc.c b/drivers/serial/mpsc.c +index 3d2fcc5..64ed5ef 100644 +--- a/drivers/serial/mpsc.c ++++ b/drivers/serial/mpsc.c +@@ -502,7 +502,8 @@ mpsc_sdma_intr_ack(struct mpsc_port_info *pi) + + if (pi->mirror_regs) + pi->shared_regs->SDMA_INTR_CAUSE_m = 0; +- writel(0, pi->shared_regs->sdma_intr_base + SDMA_INTR_CAUSE); ++ writeb(0x00, pi->shared_regs->sdma_intr_base + SDMA_INTR_CAUSE + ++ pi->port.line); + return; + } + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 49fe299..8cf1d7f 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -1138,6 +1138,7 @@ static inline void put_task_struct(struct task_struct *t) + /* Not implemented yet, only for 486*/ + #define PF_STARTING 0x00000002 /* being created */ + #define PF_EXITING 0x00000004 /* getting shut down */ ++#define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ + #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ + #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ + #define PF_DUMPCORE 0x00000200 /* dumped core */ +diff --git a/ipc/shm.c b/ipc/shm.c +index 4fefbad..8d2672d 100644 +--- a/ipc/shm.c ++++ b/ipc/shm.c +@@ -254,8 +254,10 @@ struct mempolicy *shm_get_policy(struct vm_area_struct *vma, unsigned long addr) + + if (sfd->vm_ops->get_policy) + pol = sfd->vm_ops->get_policy(vma, addr); +- else ++ else if (vma->vm_policy) + pol = vma->vm_policy; ++ else ++ pol = current->mempolicy; + return pol; + } + #endif +diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c +index 3749193..2b8311b 100644 +--- a/kernel/auditfilter.c ++++ 
b/kernel/auditfilter.c +@@ -905,7 +905,7 @@ static void audit_update_watch(struct audit_parent *parent, + + /* If the update involves invalidating rules, do the inode-based + * filtering now, so we don't omit records. */ +- if (invalidating && ++ if (invalidating && current->audit_context && + audit_filter_inodes(current, current->audit_context) == AUDIT_RECORD_CONTEXT) + audit_set_auditable(current->audit_context); + +diff --git a/kernel/exit.c b/kernel/exit.c +index b55ed4c..7debf34 100644 +--- a/kernel/exit.c ++++ b/kernel/exit.c +@@ -884,13 +884,29 @@ fastcall NORET_TYPE void do_exit(long code) + if (unlikely(tsk->flags & PF_EXITING)) { + printk(KERN_ALERT + "Fixing recursive fault but reboot is needed!\n"); ++ /* ++ * We can do this unlocked here. The futex code uses ++ * this flag just to verify whether the pi state ++ * cleanup has been done or not. In the worst case it ++ * loops once more. We pretend that the cleanup was ++ * done as there is no way to return. Either the ++ * OWNER_DIED bit is set by now or we push the blocked ++ * task into the wait for ever nirwana as well. ++ */ ++ tsk->flags |= PF_EXITPIDONE; + if (tsk->io_context) + exit_io_context(); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule(); + } + ++ /* ++ * tsk->flags are checked in the futex code to protect against ++ * an exiting task cleaning up the robust pi futexes. ++ */ ++ spin_lock_irq(&tsk->pi_lock); + tsk->flags |= PF_EXITING; ++ spin_unlock_irq(&tsk->pi_lock); + + if (unlikely(in_atomic())) + printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", +@@ -957,6 +973,12 @@ fastcall NORET_TYPE void do_exit(long code) + * Make sure we are holding no locks: + */ + debug_check_no_locks_held(tsk); ++ /* ++ * We can do this unlocked here. The futex code uses this flag ++ * just to verify whether the pi state cleanup has been done ++ * or not. In the worst case it loops once more. 
++ */ ++ tsk->flags |= PF_EXITPIDONE; + + if (tsk->io_context) + exit_io_context(); +diff --git a/kernel/futex.c b/kernel/futex.c +index 5a270b5..4809436 100644 +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -390,18 +390,12 @@ static struct task_struct * futex_find_get_task(pid_t pid) + + rcu_read_lock(); + p = find_task_by_pid(pid); +- if (!p) +- goto out_unlock; +- if ((current->euid != p->euid) && (current->euid != p->uid)) { +- p = NULL; +- goto out_unlock; +- } +- if (p->exit_state != 0) { +- p = NULL; +- goto out_unlock; +- } +- get_task_struct(p); +-out_unlock: ++ ++ if (!p || ((current->euid != p->euid) && (current->euid != p->uid))) ++ p = ERR_PTR(-ESRCH); ++ else ++ get_task_struct(p); ++ + rcu_read_unlock(); + + return p; +@@ -467,7 +461,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) + struct futex_q *this, *next; + struct list_head *head; + struct task_struct *p; +- pid_t pid; ++ pid_t pid = uval & FUTEX_TID_MASK; + + head = &hb->chain; + +@@ -485,6 +479,8 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) + return -EINVAL; + + WARN_ON(!atomic_read(&pi_state->refcount)); ++ WARN_ON(pid && pi_state->owner && ++ pi_state->owner->pid != pid); + + atomic_inc(&pi_state->refcount); + me->pi_state = pi_state; +@@ -495,15 +491,33 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) + + /* + * We are the first waiter - try to look up the real owner and attach +- * the new pi_state to it, but bail out when the owner died bit is set +- * and TID = 0: ++ * the new pi_state to it, but bail out when TID = 0 + */ +- pid = uval & FUTEX_TID_MASK; +- if (!pid && (uval & FUTEX_OWNER_DIED)) ++ if (!pid) + return -ESRCH; + p = futex_find_get_task(pid); +- if (!p) +- return -ESRCH; ++ if (IS_ERR(p)) ++ return PTR_ERR(p); ++ ++ /* ++ * We need to look at the task state flags to figure out, ++ * whether the task is exiting. To protect against the do_exit ++ * change of the task flags, we do this protected by ++ * p->pi_lock: ++ */ ++ spin_lock_irq(&p->pi_lock); ++ if (unlikely(p->flags & PF_EXITING)) { ++ /* ++ * The task is on the way out. When PF_EXITPIDONE is ++ * set, we know that the task has finished the ++ * cleanup: ++ */ ++ int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN; ++ ++ spin_unlock_irq(&p->pi_lock); ++ put_task_struct(p); ++ return ret; ++ } + + pi_state = alloc_pi_state(); + +@@ -516,7 +530,6 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) + /* Store the key for possible exit cleanups: */ + pi_state->key = me->key; + +- spin_lock_irq(&p->pi_lock); + WARN_ON(!list_empty(&pi_state->list)); + list_add(&pi_state->list, &p->pi_state_list); + pi_state->owner = p; +@@ -583,15 +596,22 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) + * preserve the owner died bit.) 
+ */ + if (!(uval & FUTEX_OWNER_DIED)) { ++ int ret = 0; ++ + newval = FUTEX_WAITERS | new_owner->pid; + + pagefault_disable(); + curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); + pagefault_enable(); ++ + if (curval == -EFAULT) +- return -EFAULT; ++ ret = -EFAULT; + if (curval != uval) +- return -EINVAL; ++ ret = -EINVAL; ++ if (ret) { ++ spin_unlock(&pi_state->pi_mutex.wait_lock); ++ return ret; ++ } + } + + spin_lock_irq(&pi_state->owner->pi_lock); +@@ -1149,6 +1169,7 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, + if (unlikely(ret != 0)) + goto out_release_sem; + ++ retry_unlocked: + hb = queue_lock(&q, -1, NULL); + + retry_locked: +@@ -1200,34 +1221,58 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, + ret = lookup_pi_state(uval, hb, &q); + + if (unlikely(ret)) { +- /* +- * There were no waiters and the owner task lookup +- * failed. When the OWNER_DIED bit is set, then we +- * know that this is a robust futex and we actually +- * take the lock. This is safe as we are protected by +- * the hash bucket lock. We also set the waiters bit +- * unconditionally here, to simplify glibc handling of +- * multiple tasks racing to acquire the lock and +- * cleanup the problems which were left by the dead +- * owner. +- */ +- if (curval & FUTEX_OWNER_DIED) { +- uval = newval; +- newval = current->pid | +- FUTEX_OWNER_DIED | FUTEX_WAITERS; ++ switch (ret) { + +- pagefault_disable(); +- curval = futex_atomic_cmpxchg_inatomic(uaddr, +- uval, newval); +- pagefault_enable(); ++ case -EAGAIN: ++ /* ++ * Task is exiting and we just wait for the ++ * exit to complete. ++ */ ++ queue_unlock(&q, hb); ++ up_read(&curr->mm->mmap_sem); ++ cond_resched(); ++ goto retry; + +- if (unlikely(curval == -EFAULT)) ++ case -ESRCH: ++ /* ++ * No owner found for this futex. Check if the ++ * OWNER_DIED bit is set to figure out whether ++ * this is a robust futex or not. ++ */ ++ if (get_futex_value_locked(&curval, uaddr)) + goto uaddr_faulted; +- if (unlikely(curval != uval)) +- goto retry_locked; +- ret = 0; ++ ++ /* ++ * There were no waiters and the owner task lookup ++ * failed. When the OWNER_DIED bit is set, then we ++ * know that this is a robust futex and we actually ++ * take the lock. This is safe as we are protected by ++ * the hash bucket lock. We also set the waiters bit ++ * unconditionally here, to simplify glibc handling of ++ * multiple tasks racing to acquire the lock and ++ * cleanup the problems which were left by the dead ++ * owner. ++ */ ++ if (curval & FUTEX_OWNER_DIED) { ++ uval = newval; ++ newval = current->pid | ++ FUTEX_OWNER_DIED | FUTEX_WAITERS; ++ ++ pagefault_disable(); ++ curval = futex_atomic_cmpxchg_inatomic(uaddr, ++ uval, ++ newval); ++ pagefault_enable(); ++ ++ if (unlikely(curval == -EFAULT)) ++ goto uaddr_faulted; ++ if (unlikely(curval != uval)) ++ goto retry_locked; ++ ret = 0; ++ } ++ default: ++ goto out_unlock_release_sem; + } +- goto out_unlock_release_sem; + } + + /* +@@ -1279,39 +1324,52 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, + list_add(&q.pi_state->list, &current->pi_state_list); + spin_unlock_irq(&current->pi_lock); + +- /* Unqueue and drop the lock */ +- unqueue_me_pi(&q, hb); +- up_read(&curr->mm->mmap_sem); + /* + * We own it, so we have to replace the pending owner +- * TID. This must be atomic as we have preserve the ++ * TID. This must be atomic as we have to preserve the + * owner died bit here. 
+ */ +- ret = get_user(uval, uaddr); ++ ret = get_futex_value_locked(&uval, uaddr); + while (!ret) { + newval = (uval & FUTEX_OWNER_DIED) | newtid; ++ ++ pagefault_disable(); + curval = futex_atomic_cmpxchg_inatomic(uaddr, + uval, newval); ++ pagefault_enable(); ++ + if (curval == -EFAULT) + ret = -EFAULT; + if (curval == uval) + break; + uval = curval; + } +- } else { ++ } else if (ret) { + /* + * Catch the rare case, where the lock was released + * when we were on the way back before we locked + * the hash bucket. + */ +- if (ret && q.pi_state->owner == curr) { +- if (rt_mutex_trylock(&q.pi_state->pi_mutex)) +- ret = 0; ++ if (q.pi_state->owner == curr && ++ rt_mutex_trylock(&q.pi_state->pi_mutex)) { ++ ret = 0; ++ } else { ++ /* ++ * Paranoia check. If we did not take the lock ++ * in the trylock above, then we should not be ++ * the owner of the rtmutex, neither the real ++ * nor the pending one: ++ */ ++ if (rt_mutex_owner(&q.pi_state->pi_mutex) == curr) ++ printk(KERN_ERR "futex_lock_pi: ret = %d " ++ "pi-mutex: %p pi-state %p\n", ret, ++ q.pi_state->pi_mutex.owner, ++ q.pi_state->owner); + } +- /* Unqueue and drop the lock */ +- unqueue_me_pi(&q, hb); +- up_read(&curr->mm->mmap_sem); + } ++ /* Unqueue and drop the lock */ ++ unqueue_me_pi(&q, hb); ++ up_read(&curr->mm->mmap_sem); + + if (!detect && ret == -EDEADLK && 0) + force_sig(SIGKILL, current); +@@ -1331,16 +1389,18 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, + * non-atomically. Therefore, if get_user below is not + * enough, we need to handle the fault ourselves, while + * still holding the mmap_sem. ++ * ++ * ... and hb->lock. :-) --ANK + */ ++ queue_unlock(&q, hb); ++ + if (attempt++) { +- if (futex_handle_fault((unsigned long)uaddr, attempt)) { +- ret = -EFAULT; +- goto out_unlock_release_sem; +- } +- goto retry_locked; ++ ret = futex_handle_fault((unsigned long)uaddr, attempt); ++ if (ret) ++ goto out_release_sem; ++ goto retry_unlocked; + } + +- queue_unlock(&q, hb); + up_read(&curr->mm->mmap_sem); + + ret = get_user(uval, uaddr); +@@ -1382,9 +1442,9 @@ retry: + goto out; + + hb = hash_futex(&key); ++retry_unlocked: + spin_lock(&hb->lock); + +-retry_locked: + /* + * To avoid races, try to do the TID -> 0 atomic transition + * again. If it succeeds then we can return without waking +@@ -1446,16 +1506,17 @@ pi_faulted: + * non-atomically. Therefore, if get_user below is not + * enough, we need to handle the fault ourselves, while + * still holding the mmap_sem. ++ * ++ * ... and hb->lock. :-) --ANK + */ ++ spin_unlock(&hb->lock); ++ + if (attempt++) { +- if (futex_handle_fault((unsigned long)uaddr, attempt)) { +- ret = -EFAULT; +- goto out_unlock; +- } +- goto retry_locked; ++ ret = futex_handle_fault((unsigned long)uaddr, attempt); ++ if (ret) ++ goto out; ++ goto retry_unlocked; + } +- +- spin_unlock(&hb->lock); + up_read(&current->mm->mmap_sem); + + ret = get_user(uval, uaddr); +diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c +index 44318ca..9577ac8 100644 +--- a/kernel/posix-timers.c ++++ b/kernel/posix-timers.c +@@ -354,9 +354,40 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) + * it should be restarted. + */ + if (timr->it.real.interval.tv64 != 0) { ++ ktime_t now = hrtimer_cb_get_time(timer); ++ ++ /* ++ * FIXME: What we really want, is to stop this ++ * timer completely and restart it in case the ++ * SIG_IGN is removed. This is a non trivial ++ * change which involves sighand locking ++ * (sigh !), which we don't want to do late in ++ * the release cycle. 
++ * ++ * For now we just let timers with an interval ++ * less than a jiffie expire every jiffie to ++ * avoid softirq starvation in case of SIG_IGN ++ * and a very small interval, which would put ++ * the timer right back on the softirq pending ++ * list. By moving now ahead of time we trick ++ * hrtimer_forward() to expire the timer ++ * later, while we still maintain the overrun ++ * accuracy, but have some inconsistency in ++ * the timer_gettime() case. This is at least ++ * better than a starved softirq. A more ++ * complex fix which solves also another related ++ * inconsistency is already in the pipeline. ++ */ ++#ifdef CONFIG_HIGH_RES_TIMERS ++ { ++ ktime_t kj = ktime_set(0, NSEC_PER_SEC / HZ); ++ ++ if (timr->it.real.interval.tv64 < kj.tv64) ++ now = ktime_add(now, kj); ++ } ++#endif + timr->it_overrun += +- hrtimer_forward(timer, +- hrtimer_cb_get_time(timer), ++ hrtimer_forward(timer, now, + timr->it.real.interval); + ret = HRTIMER_RESTART; + ++timr->it_requeue_pending; +diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c +index 180978c..17d28ce 100644 +--- a/kernel/rtmutex.c ++++ b/kernel/rtmutex.c +@@ -212,6 +212,19 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, + if (!waiter || !waiter->task) + goto out_unlock_pi; + ++ /* ++ * Check the orig_waiter state. After we dropped the locks, ++ * the previous owner of the lock might have released the lock ++ * and made us the pending owner: ++ */ ++ if (orig_waiter && !orig_waiter->task) ++ goto out_unlock_pi; ++ ++ /* ++ * Drop out, when the task has no waiters. Note, ++ * top_waiter can be NULL, when we are in the deboosting ++ * mode! ++ */ + if (top_waiter && (!task_has_pi_waiters(task) || + top_waiter != task_top_pi_waiter(task))) + goto out_unlock_pi; +@@ -659,9 +672,16 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, + * all over without going into schedule to try + * to get the lock now: + */ +- if (unlikely(!waiter.task)) ++ if (unlikely(!waiter.task)) { ++ /* ++ * Reset the return value. We might ++ * have returned with -EDEADLK and the ++ * owner released the lock while we ++ * were walking the pi chain. 
++ */ ++ ret = 0; + continue; +- ++ } + if (unlikely(ret)) + break; + } +diff --git a/kernel/sched.c b/kernel/sched.c +index a3993b9..f745a44 100644 +--- a/kernel/sched.c ++++ b/kernel/sched.c +@@ -2831,17 +2831,21 @@ static void idle_balance(int this_cpu, struct rq *this_rq) + unsigned long next_balance = jiffies + 60 * HZ; + + for_each_domain(this_cpu, sd) { +- if (sd->flags & SD_BALANCE_NEWIDLE) { ++ unsigned long interval; ++ ++ if (!(sd->flags & SD_LOAD_BALANCE)) ++ continue; ++ ++ if (sd->flags & SD_BALANCE_NEWIDLE) + /* If we've pulled tasks over stop searching: */ + pulled_task = load_balance_newidle(this_cpu, +- this_rq, sd); +- if (time_after(next_balance, +- sd->last_balance + sd->balance_interval)) +- next_balance = sd->last_balance +- + sd->balance_interval; +- if (pulled_task) +- break; +- } ++ this_rq, sd); ++ ++ interval = msecs_to_jiffies(sd->balance_interval); ++ if (time_after(next_balance, sd->last_balance + interval)) ++ next_balance = sd->last_balance + interval; ++ if (pulled_task) ++ break; + } + if (!pulled_task) + /* +diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c +index cb25649..c6b6f35 100644 +--- a/kernel/time/ntp.c ++++ b/kernel/time/ntp.c +@@ -120,7 +120,6 @@ void second_overflow(void) + */ + time_interpolator_update(-NSEC_PER_SEC); + time_state = TIME_OOP; +- clock_was_set(); + printk(KERN_NOTICE "Clock: inserting leap second " + "23:59:60 UTC\n"); + } +@@ -135,7 +134,6 @@ void second_overflow(void) + */ + time_interpolator_update(NSEC_PER_SEC); + time_state = TIME_WAIT; +- clock_was_set(); + printk(KERN_NOTICE "Clock: deleting leap second " + "23:59:59 UTC\n"); + } +diff --git a/mm/rmap.c b/mm/rmap.c +index b82146e..6e35d11 100644 +--- a/mm/rmap.c ++++ b/mm/rmap.c +@@ -53,24 +53,6 @@ + + struct kmem_cache *anon_vma_cachep; + +-static inline void validate_anon_vma(struct vm_area_struct *find_vma) +-{ +-#ifdef CONFIG_DEBUG_VM +- struct anon_vma *anon_vma = find_vma->anon_vma; +- struct vm_area_struct *vma; +- unsigned int mapcount = 0; +- int found = 0; +- +- list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { +- mapcount++; +- BUG_ON(mapcount > 100000); +- if (vma == find_vma) +- found = 1; +- } +- BUG_ON(!found); +-#endif +-} +- + /* This must be called under the mmap_sem. 
*/ + int anon_vma_prepare(struct vm_area_struct *vma) + { +@@ -121,10 +103,8 @@ void __anon_vma_link(struct vm_area_struct *vma) + { + struct anon_vma *anon_vma = vma->anon_vma; + +- if (anon_vma) { ++ if (anon_vma) + list_add_tail(&vma->anon_vma_node, &anon_vma->head); +- validate_anon_vma(vma); +- } + } + + void anon_vma_link(struct vm_area_struct *vma) +@@ -134,7 +114,6 @@ void anon_vma_link(struct vm_area_struct *vma) + if (anon_vma) { + spin_lock(&anon_vma->lock); + list_add_tail(&vma->anon_vma_node, &anon_vma->head); +- validate_anon_vma(vma); + spin_unlock(&anon_vma->lock); + } + } +@@ -148,7 +127,6 @@ void anon_vma_unlink(struct vm_area_struct *vma) + return; + + spin_lock(&anon_vma->lock); +- validate_anon_vma(vma); + list_del(&vma->anon_vma_node); + + /* We must garbage collect the anon_vma if it's empty */ diff --git a/trunk/2.6.21/20950_linux-2.6.21.6-xen-3.1.0.patch b/trunk/2.6.21/20950_linux-2.6.21.6-xen-3.1.0.patch deleted file mode 100644 index 9a92224..0000000 --- a/trunk/2.6.21/20950_linux-2.6.21.6-xen-3.1.0.patch +++ /dev/null @@ -1,96051 +0,0 @@ -diff -r 4edbf98e9507 -r 3990a07432f0 arch/i386/Kconfig ---- a/arch/i386/Kconfig Fri Jul 20 11:42:41 2007 -0300 -+++ b/arch/i386/Kconfig Fri Jul 20 11:56:41 2007 -0300 -@@ -16,6 +16,7 @@ config X86_32 - - config GENERIC_TIME - bool -+ depends on !X86_XEN - default y - - config CLOCKSOURCE_WATCHDOG -@@ -126,6 +127,15 @@ config X86_PC - bool "PC-compatible" - help - Choose this option if your computer is a standard PC or compatible. -+ -+config X86_XEN -+ bool "Xen-compatible" -+ select X86_UP_APIC if !SMP && XEN_PRIVILEGED_GUEST -+ select X86_UP_IOAPIC if !SMP && XEN_PRIVILEGED_GUEST -+ select SWIOTLB -+ help -+ Choose this option if you plan to run this kernel on top of the -+ Xen Hypervisor. - - config X86_ELAN - bool "AMD Elan" -@@ -257,6 +267,7 @@ source "arch/i386/Kconfig.cpu" - - config HPET_TIMER - bool "HPET Timer Support" -+ depends on !X86_XEN - help - This enables the use of the HPET for the kernel's internal timer. - HPET is the next generation timer replacing legacy 8254s. -@@ -307,7 +318,7 @@ source "kernel/Kconfig.preempt" - - config X86_UP_APIC - bool "Local APIC support on uniprocessors" -- depends on !SMP && !(X86_VISWS || X86_VOYAGER || X86_GENERICARCH) -+ depends on !SMP && !(X86_VISWS || X86_VOYAGER || X86_GENERICARCH || XEN_UNPRIVILEGED_GUEST) - help - A local APIC (Advanced Programmable Interrupt Controller) is an - integrated interrupt controller in the CPU. If you have a single-CPU -@@ -332,12 +343,12 @@ config X86_UP_IOAPIC - - config X86_LOCAL_APIC - bool -- depends on X86_UP_APIC || ((X86_VISWS || SMP) && !X86_VOYAGER) || X86_GENERICARCH -+ depends on X86_UP_APIC || ((X86_VISWS || SMP) && !(X86_VOYAGER || XEN_UNPRIVILEGED_GUEST)) || X86_GENERICARCH - default y - - config X86_IO_APIC - bool -- depends on X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER)) || X86_GENERICARCH -+ depends on X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER || XEN_UNPRIVILEGED_GUEST)) || X86_GENERICARCH - default y - - config X86_VISWS_APIC -@@ -347,7 +358,7 @@ config X86_VISWS_APIC - - config X86_MCE - bool "Machine Check Exception" -- depends on !X86_VOYAGER -+ depends on !(X86_VOYAGER || X86_XEN) - ---help--- - Machine Check Exception support allows the processor to notify the - kernel if it detects a problem (e.g. overheating, component failure). 
-@@ -446,6 +457,7 @@ config X86_REBOOTFIXUPS - - config MICROCODE - tristate "/dev/cpu/microcode - Intel IA32 CPU microcode support" -+ depends on !XEN_UNPRIVILEGED_GUEST - select FW_LOADER - ---help--- - If you say Y here and also to "/dev file system support" in the -@@ -469,6 +481,7 @@ config MICROCODE_OLD_INTERFACE - - config X86_MSR - tristate "/dev/cpu/*/msr - Model-specific register support" -+ depends on !X86_XEN - help - This device gives privileged processes access to the x86 - Model-Specific Registers (MSRs). It is a character device with -@@ -483,6 +496,10 @@ config X86_CPUID - be executed on a specific processor. It is a character device - with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to - /dev/cpu/31/cpuid. -+ -+config SWIOTLB -+ bool -+ default n - - source "drivers/firmware/Kconfig" - -@@ -666,6 +683,7 @@ config HIGHPTE - - config MATH_EMULATION - bool "Math emulation" -+ depends on !X86_XEN - ---help--- - Linux can emulate a math coprocessor (used for floating point - operations) if you don't have one. 486DX and Pentium processors have -@@ -691,6 +709,8 @@ config MATH_EMULATION - - config MTRR - bool "MTRR (Memory Type Range Register) support" -+ depends on !XEN_UNPRIVILEGED_GUEST -+ default y if X86_XEN - ---help--- - On Intel P6 family processors (Pentium Pro, Pentium II and later) - the Memory Type Range Registers (MTRRs) may be used to control -@@ -725,7 +745,7 @@ config MTRR - - config EFI - bool "Boot from EFI support" -- depends on ACPI -+ depends on ACPI && !X86_XEN - default n - ---help--- - This enables the kernel to boot on EFI platforms using -@@ -743,7 +763,7 @@ config EFI - - config IRQBALANCE - bool "Enable kernel irq balancing" -- depends on SMP && X86_IO_APIC -+ depends on SMP && X86_IO_APIC && !X86_XEN - default y - help - The default yes will allow the kernel to do irq load balancing. -@@ -777,6 +797,7 @@ source kernel/Kconfig.hz - - config KEXEC - bool "kexec system call" -+ depends on !XEN_UNPRIVILEGED_GUEST - help - kexec is a system call that implements the ability to shutdown your - current kernel, and to start another kernel. It is like a reboot -@@ -893,6 +914,8 @@ config COMPAT_VDSO - config COMPAT_VDSO - bool "Compat VDSO support" - default y -+ depends on !PARAVIRT -+ depends on !X86_XEN - help - Map the VDSO to the predictable old-style address too. - ---help--- -@@ -909,18 +932,20 @@ config ARCH_ENABLE_MEMORY_HOTPLUG - depends on HIGHMEM - - menu "Power management options (ACPI, APM)" -- depends on !X86_VOYAGER -- -+ depends on !(X86_VOYAGER || XEN_UNPRIVILEGED_GUEST) -+ -+if !X86_XEN - source kernel/power/Kconfig -+endif - - source "drivers/acpi/Kconfig" - - menu "APM (Advanced Power Management) BIOS Support" --depends on PM && !X86_VISWS -+depends on PM && !(X86_VISWS || X86_XEN) - - config APM - tristate "APM (Advanced Power Management) BIOS support" -- depends on PM -+ depends on PM && PM_LEGACY - ---help--- - APM is a BIOS specification for saving power using several different - techniques. This is mostly useful for battery powered laptops with -@@ -1105,6 +1130,7 @@ choice - - config PCI_GOBIOS - bool "BIOS" -+ depends on !X86_XEN - - config PCI_GOMMCONFIG - bool "MMConfig" -@@ -1112,6 +1138,13 @@ config PCI_GODIRECT - config PCI_GODIRECT - bool "Direct" - -+config PCI_GOXEN_FE -+ bool "Xen PCI Frontend" -+ depends on X86_XEN -+ help -+ The PCI device frontend driver allows the kernel to import arbitrary -+ PCI devices from a PCI backend to support PCI driver domains. 
-+ - config PCI_GOANY - bool "Any" - -@@ -1119,7 +1152,7 @@ endchoice - - config PCI_BIOS - bool -- depends on !X86_VISWS && PCI && (PCI_GOBIOS || PCI_GOANY) -+ depends on !(X86_VISWS || X86_XEN) && PCI && (PCI_GOBIOS || PCI_GOANY) - default y - - config PCI_DIRECT -@@ -1131,6 +1164,18 @@ config PCI_MMCONFIG - bool - depends on PCI && ACPI && (PCI_GOMMCONFIG || PCI_GOANY) - default y -+ -+config XEN_PCIDEV_FRONTEND -+ bool -+ depends on PCI && X86_XEN && (PCI_GOXEN_FE || PCI_GOANY) -+ default y -+ -+config XEN_PCIDEV_FE_DEBUG -+ bool "Xen PCI Frontend Debugging" -+ depends on XEN_PCIDEV_FRONTEND -+ default n -+ help -+ Enables some debug statements within the PCI Frontend. - - source "drivers/pci/pcie/Kconfig" - -@@ -1142,7 +1187,7 @@ config ISA_DMA_API - - config ISA - bool "ISA support" -- depends on !(X86_VOYAGER || X86_VISWS) -+ depends on !(X86_VOYAGER || X86_VISWS || X86_XEN) - help - Find out whether you have ISA slots on your motherboard. ISA is the - name of a bus system, i.e. the way the CPU talks to the other stuff -@@ -1169,7 +1214,7 @@ source "drivers/eisa/Kconfig" - source "drivers/eisa/Kconfig" - - config MCA -- bool "MCA support" if !(X86_VISWS || X86_VOYAGER) -+ bool "MCA support" if !(X86_VISWS || X86_VOYAGER || X86_XEN) - default y if X86_VOYAGER - help - MicroChannel Architecture is found in some IBM PS/2 machines and -@@ -1245,6 +1290,8 @@ source "security/Kconfig" - - source "crypto/Kconfig" - -+source "drivers/xen/Kconfig" -+ - source "lib/Kconfig" - - # -@@ -1270,7 +1317,7 @@ config X86_SMP - - config X86_HT - bool -- depends on SMP && !(X86_VISWS || X86_VOYAGER) -+ depends on SMP && !(X86_VISWS || X86_VOYAGER || X86_XEN) - default y - - config X86_BIOS_REBOOT -@@ -1283,6 +1330,16 @@ config X86_TRAMPOLINE - depends on X86_SMP || (X86_VOYAGER && SMP) - default y - -+config X86_NO_TSS -+ bool -+ depends on X86_XEN -+ default y -+ -+config X86_NO_IDT -+ bool -+ depends on X86_XEN -+ default y -+ - config KTIME_SCALAR - bool - default y -diff -r 4edbf98e9507 -r 3990a07432f0 arch/i386/Kconfig.cpu ---- a/arch/i386/Kconfig.cpu Fri Jul 20 11:42:41 2007 -0300 -+++ b/arch/i386/Kconfig.cpu Fri Jul 20 11:56:41 2007 -0300 -@@ -262,7 +262,7 @@ config X86_PPRO_FENCE - - config X86_F00F_BUG - bool -- depends on M586MMX || M586TSC || M586 || M486 || M386 -+ depends on (M586MMX || M586TSC || M586 || M486 || M386) && !X86_NO_IDT - default y - - config X86_WP_WORKS_OK -@@ -322,5 +322,5 @@ config X86_OOSTORE - - config X86_TSC - bool -- depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ -- default y -+ depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ && !X86_XEN -+ default y -diff -r 4edbf98e9507 -r 3990a07432f0 arch/i386/Kconfig.debug ---- a/arch/i386/Kconfig.debug Fri Jul 20 11:42:41 2007 -0300 -+++ b/arch/i386/Kconfig.debug Fri Jul 20 11:56:41 2007 -0300 -@@ -79,6 +79,7 @@ config DOUBLEFAULT - config DOUBLEFAULT - default y - bool "Enable doublefault exception handler" if EMBEDDED -+ depends on !X86_NO_TSS - help - This option allows trapping of rare doublefault exceptions that - would otherwise cause a system to silently reboot. 
Disabling this -diff -r 4edbf98e9507 -r 3990a07432f0 arch/i386/Makefile ---- a/arch/i386/Makefile Fri Jul 20 11:42:41 2007 -0300 -+++ b/arch/i386/Makefile Fri Jul 20 11:56:41 2007 -0300 -@@ -60,6 +60,11 @@ AFLAGS += $(call as-instr,.cfi_startproc - - CFLAGS += $(cflags-y) - -+cppflags-$(CONFIG_XEN) += \ -+ -D__XEN_INTERFACE_VERSION__=$(CONFIG_XEN_INTERFACE_VERSION) -+ -+CPPFLAGS += $(cppflags-y) -+ - # Default subarch .c files - mcore-y := mach-default - -@@ -82,6 +87,10 @@ mcore-$(CONFIG_X86_BIGSMP) := mach-defau - #Summit subarch support - mflags-$(CONFIG_X86_SUMMIT) := -Iinclude/asm-i386/mach-summit - mcore-$(CONFIG_X86_SUMMIT) := mach-default -+ -+# Xen subarch support -+mflags-$(CONFIG_X86_XEN) := -Iinclude/asm-i386/mach-xen -+mcore-$(CONFIG_X86_XEN) := mach-xen - - # generic subarchitecture - mflags-$(CONFIG_X86_GENERICARCH) := -Iinclude/asm-i386/mach-generic -@@ -117,6 +126,19 @@ PHONY += zImage bzImage compressed zlilo - PHONY += zImage bzImage compressed zlilo bzlilo \ - zdisk bzdisk fdimage fdimage144 fdimage288 isoimage install - -+ifdef CONFIG_XEN -+CPPFLAGS := -Iinclude$(if $(KBUILD_SRC),2)/asm/mach-xen $(CPPFLAGS) -+head-y := arch/i386/kernel/head-xen.o arch/i386/kernel/init_task.o -+boot := arch/i386/boot-xen -+.PHONY: vmlinuz -+all: vmlinuz -+ -+vmlinuz: vmlinux -+ $(Q)$(MAKE) $(build)=$(boot) $@ -+ -+install: -+ $(Q)$(MAKE) $(build)=$(boot) XENGUEST=$(XENGUEST) $@ -+else - all: bzImage - - # KBUILD_IMAGE specify target image being built -@@ -139,6 +161,7 @@ fdimage fdimage144 fdimage288 isoimage: - - install: - $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) install -+endif - - archclean: - $(Q)$(MAKE) $(clean)=arch/i386/boot -@@ -157,3 +180,4 @@ CLEAN_FILES += arch/$(ARCH)/boot/fdimage - CLEAN_FILES += arch/$(ARCH)/boot/fdimage \ - arch/$(ARCH)/boot/image.iso \ - arch/$(ARCH)/boot/mtools.conf -+CLEAN_FILES += vmlinuz vmlinux-stripped -diff -r 4edbf98e9507 -r 3990a07432f0 arch/i386/boot-xen/Makefile ---- /dev/null Thu Jan 01 00:00:00 1970 +0000 -+++ b/arch/i386/boot-xen/Makefile Fri Jul 20 11:56:41 2007 -0300 -@@ -0,0 +1,21 @@ -+ -+OBJCOPYFLAGS := -g --strip-unneeded -+ -+vmlinuz: vmlinux-stripped FORCE -+ $(call if_changed,gzip) -+ -+vmlinux-stripped: vmlinux FORCE -+ $(call if_changed,objcopy) -+ -+INSTALL_ROOT := $(patsubst %/boot,%,$(INSTALL_PATH)) -+ -+XINSTALL_NAME ?= $(KERNELRELEASE) -+install: -+ mkdir -p $(INSTALL_ROOT)/boot -+ ln -f -s vmlinuz-$(XINSTALL_NAME)$(INSTALL_SUFFIX) $(INSTALL_ROOT)/boot/vmlinuz-$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(XENGUEST)$(INSTALL_SUFFIX) -+ rm -f $(INSTALL_ROOT)/boot/vmlinuz-$(XINSTALL_NAME)$(INSTALL_SUFFIX) -+ install -m0644 vmlinuz $(INSTALL_ROOT)/boot/vmlinuz-$(XINSTALL_NAME)$(INSTALL_SUFFIX) -+ install -m0644 vmlinux $(INSTALL_ROOT)/boot/vmlinux-syms-$(XINSTALL_NAME)$(INSTALL_SUFFIX) -+ install -m0664 .config $(INSTALL_ROOT)/boot/config-$(XINSTALL_NAME)$(INSTALL_SUFFIX) -+ install -m0664 System.map $(INSTALL_ROOT)/boot/System.map-$(XINSTALL_NAME)$(INSTALL_SUFFIX) -+ ln -f -s vmlinuz-$(XINSTALL_NAME)$(INSTALL_SUFFIX) $(INSTALL_ROOT)/boot/vmlinuz-$(VERSION).$(PATCHLEVEL)$(XENGUEST)$(INSTALL_SUFFIX) -diff -r 4edbf98e9507 -r 3990a07432f0 arch/i386/kernel/Makefile ---- a/arch/i386/kernel/Makefile Fri Jul 20 11:42:41 2007 -0300 -+++ b/arch/i386/kernel/Makefile Fri Jul 20 11:56:41 2007 -0300 -@@ -47,6 +47,12 @@ EXTRA_AFLAGS := -traditional - - obj-$(CONFIG_SCx200) += scx200.o - -+ifdef CONFIG_XEN -+vsyscall_note := vsyscall-note-xen.o -+else -+vsyscall_note := vsyscall-note.o -+endif -+ - # vsyscall.o contains the vsyscall DSO 
images as __initdata. - # We must build both images before we can assemble it. - # Note: kbuild does not track this dependency due to usage of .incbin -@@ -68,7 +74,7 @@ SYSCFLAGS_vsyscall-int80.so = $(vsyscall - - $(obj)/vsyscall-int80.so $(obj)/vsyscall-sysenter.so: \ - $(obj)/vsyscall-%.so: $(src)/vsyscall.lds \ -- $(obj)/vsyscall-%.o $(obj)/vsyscall-note.o FORCE -+ $(obj)/vsyscall-%.o $(obj)/$(vsyscall_note) FORCE - $(call if_changed,syscall) - - # We also create a special relocatable object that should mirror the symbol -@@ -80,9 +86,21 @@ extra-y += vsyscall-syms.o - - SYSCFLAGS_vsyscall-syms.o = -r - $(obj)/vsyscall-syms.o: $(src)/vsyscall.lds \ -- $(obj)/vsyscall-sysenter.o $(obj)/vsyscall-note.o FORCE -+ $(obj)/vsyscall-sysenter.o $(obj)/$(vsyscall_note) FORCE - $(call if_changed,syscall) - - k8-y += ../../x86_64/kernel/k8.o - stacktrace-y += ../../x86_64/kernel/stacktrace.o - -+ifdef CONFIG_XEN -+include $(srctree)/scripts/Makefile.xen -+ -+obj-y += fixup.o -+microcode-$(subst m,y,$(CONFIG_MICROCODE)) := microcode-xen.o -+n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o -+ -+obj-y := $(call filterxen, $(obj-y), $(n-obj-xen)) -+obj-y := $(call cherrypickxen, $(obj-y)) -+extra-y := $(call cherrypickxen, $(extra-y)) -+%/head-xen.o %/head-xen.s: EXTRA_AFLAGS := -+endif -diff -r 4edbf98e9507 -r 3990a07432f0 arch/i386/kernel/acpi/Makefile ---- a/arch/i386/kernel/acpi/Makefile Fri Jul 20 11:42:41 2007 -0300 -+++ b/arch/i386/kernel/acpi/Makefile Fri Jul 20 11:56:41 2007 -0300 -@@ -7,4 +7,3 @@ ifneq ($(CONFIG_ACPI_PROCESSOR),) - ifneq ($(CONFIG_ACPI_PROCESSOR),) - obj-y += cstate.o processor.o - endif -- -diff -r 4edbf98e9507 -r 3990a07432f0 arch/i386/kernel/acpi/boot.c ---- a/arch/i386/kernel/acpi/boot.c Fri Jul 20 11:42:41 2007 -0300 -+++ b/arch/i386/kernel/acpi/boot.c Fri Jul 20 11:56:41 2007 -0300 -@@ -103,7 +103,7 @@ static u64 acpi_lapic_addr __initdata = - */ - enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC; - --#ifdef CONFIG_X86_64 -+#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN) - - /* rely on all ACPI tables being in the direct mapping */ - char *__acpi_map_table(unsigned long phys_addr, unsigned long size) -@@ -136,8 +136,10 @@ char *__acpi_map_table(unsigned long phy - unsigned long base, offset, mapped_size; - int idx; - -+#ifndef CONFIG_XEN - if (phys + size < 8 * 1024 * 1024) - return __va(phys); -+#endif - - offset = phys & (PAGE_SIZE - 1); - mapped_size = PAGE_SIZE - offset; -@@ -592,7 +594,13 @@ acpi_scan_rsdp(unsigned long start, unsi - * RSDP signature. - */ - for (offset = 0; offset < length; offset += 16) { -+#ifdef CONFIG_XEN -+ unsigned long vstart = (unsigned long)isa_bus_to_virt(start); -+ -+ if (strncmp((char *)(vstart + offset), "RSD PTR ", sig_len)) -+#else - if (strncmp((char *)(phys_to_virt(start) + offset), "RSD PTR ", sig_len)) -+#endif - continue; - return (start + offset); - } -@@ -668,7 +676,7 @@ static int __init acpi_parse_fadt(struct - static int __init acpi_parse_fadt(struct acpi_table_header *table) - { - --#ifdef CONFIG_X86_PM_TIMER -+#if defined(CONFIG_X86_PM_TIMER) && !defined(CONFIG_XEN) - /* detect the location of the ACPI PM Timer */ - if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID) { - /* FADT rev. 
2 */ -diff -r 4edbf98e9507 -r 3990a07432f0 arch/i386/kernel/apic-xen.c ---- /dev/null Thu Jan 01 00:00:00 1970 +0000 -+++ b/arch/i386/kernel/apic-xen.c Fri Jul 20 11:56:41 2007 -0300 -@@ -0,0 +1,1560 @@ -+/* -+ * Local APIC handling, local APIC timers -+ * -+ * (c) 1999, 2000 Ingo Molnar -+ * -+ * Fixes -+ * Maciej W. Rozycki : Bits for genuine 82489DX APICs; -+ * thanks to Eric Gilmore -+ * and Rolf G. Tews -+ * for testing these extensively. -+ * Maciej W. Rozycki : Various updates and fixes. -+ * Mikael Pettersson : Power Management for UP-APIC. -+ * Pavel Machek and -+ * Mikael Pettersson : PM converted to driver model. -+ */ -+ -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+#include -+ -+#include "io_ports.h" -+ -+/* -+ * Sanity check -+ */ -+#if (SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F -+# error SPURIOUS_APIC_VECTOR definition error -+#endif -+ -+/* -+ * Knob to control our willingness to enable the local APIC. -+ * -+ * -1=force-disable, +1=force-enable -+ */ -+static int enable_local_apic __initdata = 0; -+ -+#ifndef CONFIG_XEN -+/* Local APIC timer verification ok */ -+static int local_apic_timer_verify_ok; -+#endif -+ -+/* Disable local APIC timer from the kernel commandline or via dmi quirk */ -+static int local_apic_timer_disabled; -+/* Local APIC timer works in C2 */ -+int local_apic_timer_c2_ok; -+EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); -+ -+/* -+ * Debug level, exported for io_apic.c -+ */ -+int apic_verbosity; -+ -+#ifndef CONFIG_XEN -+static unsigned int calibration_result; -+ -+static int lapic_next_event(unsigned long delta, -+ struct clock_event_device *evt); -+static void lapic_timer_setup(enum clock_event_mode mode, -+ struct clock_event_device *evt); -+static void lapic_timer_broadcast(cpumask_t mask); -+static void apic_pm_activate(void); -+ -+/* -+ * The local apic timer can be used for any function which is CPU local. 
-+ */ -+static struct clock_event_device lapic_clockevent = { -+ .name = "lapic", -+ .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT -+ | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY, -+ .shift = 32, -+ .set_mode = lapic_timer_setup, -+ .set_next_event = lapic_next_event, -+ .broadcast = lapic_timer_broadcast, -+ .rating = 100, -+ .irq = -1, -+}; -+static DEFINE_PER_CPU(struct clock_event_device, lapic_events); -+ -+/* Local APIC was disabled by the BIOS and enabled by the kernel */ -+static int enabled_via_apicbase; -+#endif -+ -+/* -+ * Get the LAPIC version -+ */ -+static inline int lapic_get_version(void) -+{ -+ return GET_APIC_VERSION(apic_read(APIC_LVR)); -+} -+ -+/* -+ * Check, if the APIC is integrated or a seperate chip -+ */ -+static inline int lapic_is_integrated(void) -+{ -+ return APIC_INTEGRATED(lapic_get_version()); -+} -+ -+/* -+ * Check, whether this is a modern or a first generation APIC -+ */ -+static int modern_apic(void) -+{ -+#ifndef CONFIG_XEN -+ /* AMD systems use old APIC versions, so check the CPU */ -+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && -+ boot_cpu_data.x86 >= 0xf) -+ return 1; -+ return lapic_get_version() >= 0x14; -+#else -+ return 1; -+#endif -+} -+ -+#ifndef CONFIG_XEN -+/** -+ * enable_NMI_through_LVT0 - enable NMI through local vector table 0 -+ */ -+void enable_NMI_through_LVT0 (void * dummy) -+{ -+ unsigned int v = APIC_DM_NMI; -+ -+ /* Level triggered for 82489DX */ -+ if (!lapic_is_integrated()) -+ v |= APIC_LVT_LEVEL_TRIGGER; -+ apic_write_around(APIC_LVT0, v); -+} -+#endif /* !CONFIG_XEN */ -+ -+/** -+ * get_physical_broadcast - Get number of physical broadcast IDs -+ */ -+int get_physical_broadcast(void) -+{ -+ return modern_apic() ? 0xff : 0xf; -+} -+ -+#ifndef CONFIG_XEN -+/** -+ * lapic_get_maxlvt - get the maximum number of local vector table entries -+ */ -+int lapic_get_maxlvt(void) -+{ -+ unsigned int v = apic_read(APIC_LVR); -+ -+ /* 82489DXs do not report # of LVT entries. */ -+ return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2; -+} -+ -+/* -+ * Local APIC timer -+ */ -+ -+/* Clock divisor is set to 16 */ -+#define APIC_DIVISOR 16 -+ -+/* -+ * This function sets up the local APIC timer, with a timeout of -+ * 'clocks' APIC bus clock. During calibration we actually call -+ * this function twice on the boot CPU, once with a bogus timeout -+ * value, second time for real. The other (noncalibrating) CPUs -+ * call this function only once, with the real, calibrated value. -+ * -+ * We do reads before writes even if unnecessary, to get around the -+ * P5 APIC double write bug. 
-+ */
-+static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
-+{
-+ unsigned int lvtt_value, tmp_value;
-+
-+ lvtt_value = LOCAL_TIMER_VECTOR;
-+ if (!oneshot)
-+ lvtt_value |= APIC_LVT_TIMER_PERIODIC;
-+ if (!lapic_is_integrated())
-+ lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV);
-+
-+ if (!irqen)
-+ lvtt_value |= APIC_LVT_MASKED;
-+
-+ apic_write_around(APIC_LVTT, lvtt_value);
-+
-+ /*
-+ * Divide PICLK by 16
-+ */
-+ tmp_value = apic_read(APIC_TDCR);
-+ apic_write_around(APIC_TDCR, (tmp_value
-+ & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
-+ | APIC_TDR_DIV_16);
-+
-+ if (!oneshot)
-+ apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR);
-+}
-+
-+/*
-+ * Program the next event, relative to now
-+ */
-+static int lapic_next_event(unsigned long delta,
-+ struct clock_event_device *evt)
-+{
-+ apic_write_around(APIC_TMICT, delta);
-+ return 0;
-+}
-+
-+/*
-+ * Setup the lapic timer in periodic or oneshot mode
-+ */
-+static void lapic_timer_setup(enum clock_event_mode mode,
-+ struct clock_event_device *evt)
-+{
-+ unsigned long flags;
-+ unsigned int v;
-+
-+ /* Lapic used for broadcast ? */
-+ if (!local_apic_timer_verify_ok)
-+ return;
-+
-+ local_irq_save(flags);
-+
-+ switch (mode) {
-+ case CLOCK_EVT_MODE_PERIODIC:
-+ case CLOCK_EVT_MODE_ONESHOT:
-+ __setup_APIC_LVTT(calibration_result,
-+ mode != CLOCK_EVT_MODE_PERIODIC, 1);
-+ break;
-+ case CLOCK_EVT_MODE_UNUSED:
-+ case CLOCK_EVT_MODE_SHUTDOWN:
-+ v = apic_read(APIC_LVTT);
-+ v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
-+ apic_write_around(APIC_LVTT, v);
-+ break;
-+ }
-+
-+ local_irq_restore(flags);
-+}
-+
-+/*
-+ * Local APIC timer broadcast function
-+ */
-+static void lapic_timer_broadcast(cpumask_t mask)
-+{
-+#ifdef CONFIG_SMP
-+ send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
-+#endif
-+}
-+
-+/*
-+ * Setup the local APIC timer for this CPU. Copy the initialized values
-+ * of the boot CPU and register the clock event in the framework.
-+ */
-+static void __devinit setup_APIC_timer(void)
-+{
-+ struct clock_event_device *levt = &__get_cpu_var(lapic_events);
-+
-+ memcpy(levt, &lapic_clockevent, sizeof(*levt));
-+ levt->cpumask = cpumask_of_cpu(smp_processor_id());
-+
-+ clockevents_register_device(levt);
-+}
-+
-+/*
-+ * In this function we calibrate the APIC bus clocks to the external timer.
-+ *
-+ * We want to do the calibration only once since we want to have local timer
-+ * irqs synchronous. CPUs connected by the same APIC bus have the very same bus
-+ * frequency.
-+ *
-+ * This was previously done by reading the PIT/HPET and waiting for a wrap
-+ * around to find out that a tick has elapsed. I have a box where the PIT
-+ * readout is broken, so it never gets out of the wait loop again. This was
-+ * also reported by others.
-+ *
-+ * Monitoring the jiffies value is inaccurate and the clockevents
-+ * infrastructure allows us to do a simple substitution of the interrupt
-+ * handler.
-+ *
-+ * The calibration routine also uses the pm_timer when possible, as the PIT
-+ * happens to run way too slow (factor 2.3 on my VAIO CoreDuo, which goes
-+ * back to normal later in the boot process).
-+ */
-+
-+#define LAPIC_CAL_LOOPS (HZ/10)
-+
-+static __initdata volatile int lapic_cal_loops = -1;
-+static __initdata long lapic_cal_t1, lapic_cal_t2;
-+static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2;
-+static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2;
-+static __initdata unsigned long lapic_cal_j1, lapic_cal_j2;
-+
-+/*
-+ * Temporary interrupt handler.
-+ */ -+static void __init lapic_cal_handler(struct clock_event_device *dev) -+{ -+ unsigned long long tsc = 0; -+ long tapic = apic_read(APIC_TMCCT); -+ unsigned long pm = acpi_pm_read_early(); -+ -+ if (cpu_has_tsc) -+ rdtscll(tsc); -+ -+ switch (lapic_cal_loops++) { -+ case 0: -+ lapic_cal_t1 = tapic; -+ lapic_cal_tsc1 = tsc; -+ lapic_cal_pm1 = pm; -+ lapic_cal_j1 = jiffies; -+ break; -+ -+ case LAPIC_CAL_LOOPS: -+ lapic_cal_t2 = tapic; -+ lapic_cal_tsc2 = tsc; -+ if (pm < lapic_cal_pm1) -+ pm += ACPI_PM_OVRRUN; -+ lapic_cal_pm2 = pm; -+ lapic_cal_j2 = jiffies; -+ break; -+ } -+} -+ -+/* -+ * Setup the boot APIC -+ * -+ * Calibrate and verify the result. -+ */ -+void __init setup_boot_APIC_clock(void) -+{ -+ struct clock_event_device *levt = &__get_cpu_var(lapic_events); -+ const long pm_100ms = PMTMR_TICKS_PER_SEC/10; -+ const long pm_thresh = pm_100ms/100; -+ void (*real_handler)(struct clock_event_device *dev); -+ unsigned long deltaj; -+ long delta, deltapm; -+ int pm_referenced = 0; -+ -+ if (boot_cpu_has(X86_FEATURE_LAPIC_TIMER_BROKEN)) -+ local_apic_timer_disabled = 1; -+ -+ /* -+ * The local apic timer can be disabled via the kernel -+ * commandline or from the test above. Register the lapic -+ * timer as a dummy clock event source on SMP systems, so the -+ * broadcast mechanism is used. On UP systems simply ignore it. -+ */ -+ if (local_apic_timer_disabled) { -+ /* No broadcast on UP ! */ -+ if (num_possible_cpus() > 1) -+ setup_APIC_timer(); -+ return; -+ } -+ -+ apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" -+ "calibrating APIC timer ...\n"); -+ -+ local_irq_disable(); -+ -+ /* Replace the global interrupt handler */ -+ real_handler = global_clock_event->event_handler; -+ global_clock_event->event_handler = lapic_cal_handler; -+ -+ /* -+ * Setup the APIC counter to 1e9. There is no way the lapic -+ * can underflow in the 100ms detection time frame -+ */ -+ __setup_APIC_LVTT(1000000000, 0, 0); -+ -+ /* Let the interrupts run */ -+ local_irq_enable(); -+ -+ while (lapic_cal_loops <= LAPIC_CAL_LOOPS) -+ cpu_relax(); -+ -+ local_irq_disable(); -+ -+ /* Restore the real event handler */ -+ global_clock_event->event_handler = real_handler; -+ -+ /* Build delta t1-t2 as apic timer counts down */ -+ delta = lapic_cal_t1 - lapic_cal_t2; -+ apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta); -+ -+ /* Check, if the PM timer is available */ -+ deltapm = lapic_cal_pm2 - lapic_cal_pm1; -+ apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm); -+ -+ if (deltapm) { -+ unsigned long mult; -+ u64 res; -+ -+ mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22); -+ -+ if (deltapm > (pm_100ms - pm_thresh) && -+ deltapm < (pm_100ms + pm_thresh)) { -+ apic_printk(APIC_VERBOSE, "... 
PM timer result ok\n"); -+ } else { -+ res = (((u64) deltapm) * mult) >> 22; -+ do_div(res, 1000000); -+ printk(KERN_WARNING "APIC calibration not consistent " -+ "with PM Timer: %ldms instead of 100ms\n", -+ (long)res); -+ /* Correct the lapic counter value */ -+ res = (((u64) delta ) * pm_100ms); -+ do_div(res, deltapm); -+ printk(KERN_INFO "APIC delta adjusted to PM-Timer: " -+ "%lu (%ld)\n", (unsigned long) res, delta); -+ delta = (long) res; -+ } -+ pm_referenced = 1; -+ } -+ -+ /* Calculate the scaled math multiplication factor */ -+ lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, 32); -+ lapic_clockevent.max_delta_ns = -+ clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); -+ lapic_clockevent.min_delta_ns = -+ clockevent_delta2ns(0xF, &lapic_clockevent); -+ -+ calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; -+ -+ apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta); -+ apic_printk(APIC_VERBOSE, "..... mult: %ld\n", lapic_clockevent.mult); -+ apic_printk(APIC_VERBOSE, "..... calibration result: %u\n", -+ calibration_result); -+ -+ if (cpu_has_tsc) { -+ delta = (long)(lapic_cal_tsc2 - lapic_cal_tsc1); -+ apic_printk(APIC_VERBOSE, "..... CPU clock speed is " -+ "%ld.%04ld MHz.\n", -+ (delta / LAPIC_CAL_LOOPS) / (1000000 / HZ), -+ (delta / LAPIC_CAL_LOOPS) % (1000000 / HZ)); -+ } -+ -+ apic_printk(APIC_VERBOSE, "..... host bus clock speed is " -+ "%u.%04u MHz.\n", -+ calibration_result / (1000000 / HZ), -+ calibration_result % (1000000 / HZ)); -+ -+ local_apic_timer_verify_ok = 1; -+ -+ /* We trust the pm timer based calibration */ -+ if (!pm_referenced) { -+ apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); -+ -+ /* -+ * Setup the apic timer manually -+ */ -+ levt->event_handler = lapic_cal_handler; -+ lapic_timer_setup(CLOCK_EVT_MODE_PERIODIC, levt); -+ lapic_cal_loops = -1; -+ -+ /* Let the interrupts run */ -+ local_irq_enable(); -+ -+ while(lapic_cal_loops <= LAPIC_CAL_LOOPS) -+ cpu_relax(); -+ -+ local_irq_disable(); -+ -+ /* Stop the lapic timer */ -+ lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt); -+ -+ local_irq_enable(); -+ -+ /* Jiffies delta */ -+ deltaj = lapic_cal_j2 - lapic_cal_j1; -+ apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj); -+ -+ /* Check, if the jiffies result is consistent */ -+ if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2) -+ apic_printk(APIC_VERBOSE, "... jiffies result ok\n"); -+ else -+ local_apic_timer_verify_ok = 0; -+ } else -+ local_irq_enable(); -+ -+ if (!local_apic_timer_verify_ok) { -+ printk(KERN_WARNING -+ "APIC timer disabled due to verification failure.\n"); -+ /* No broadcast on UP ! */ -+ if (num_possible_cpus() == 1) -+ return; -+ } else { -+ /* -+ * If nmi_watchdog is set to IO_APIC, we need the -+ * PIT/HPET going. Otherwise register lapic as a dummy -+ * device. 
-+ */ -+ if (nmi_watchdog != NMI_IO_APIC) -+ lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; -+ } -+ -+ /* Setup the lapic or request the broadcast */ -+ setup_APIC_timer(); -+} -+ -+void __devinit setup_secondary_APIC_clock(void) -+{ -+ setup_APIC_timer(); -+} -+ -+/* -+ * The guts of the apic timer interrupt -+ */ -+static void local_apic_timer_interrupt(void) -+{ -+ int cpu = smp_processor_id(); -+ struct clock_event_device *evt = &per_cpu(lapic_events, cpu); -+ -+ /* -+ * Normally we should not be here till LAPIC has been initialized but -+ * in some cases like kdump, its possible that there is a pending LAPIC -+ * timer interrupt from previous kernel's context and is delivered in -+ * new kernel the moment interrupts are enabled. -+ * -+ * Interrupts are enabled early and LAPIC is setup much later, hence -+ * its possible that when we get here evt->event_handler is NULL. -+ * Check for event_handler being NULL and discard the interrupt as -+ * spurious. -+ */ -+ if (!evt->event_handler) { -+ printk(KERN_WARNING -+ "Spurious LAPIC timer interrupt on cpu %d\n", cpu); -+ /* Switch it off */ -+ lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt); -+ return; -+ } -+ -+ per_cpu(irq_stat, cpu).apic_timer_irqs++; -+ -+ evt->event_handler(evt); -+} -+ -+/* -+ * Local APIC timer interrupt. This is the most natural way for doing -+ * local interrupts, but local timer interrupts can be emulated by -+ * broadcast interrupts too. [in case the hw doesn't support APIC timers] -+ * -+ * [ if a single-CPU system runs an SMP kernel then we call the local -+ * interrupt as well. Thus we cannot inline the local irq ... ] -+ */ -+ -+void fastcall smp_apic_timer_interrupt(struct pt_regs *regs) -+{ -+ struct pt_regs *old_regs = set_irq_regs(regs); -+ -+ /* -+ * NOTE! We'd better ACK the irq immediately, -+ * because timer handling can be slow. -+ */ -+ ack_APIC_irq(); -+ /* -+ * update_process_times() expects us to have done irq_enter(). -+ * Besides, if we don't timer interrupts ignore the global -+ * interrupt lock, which is the WrongThing (tm) to do. -+ */ -+ irq_enter(); -+ local_apic_timer_interrupt(); -+ irq_exit(); -+ -+ set_irq_regs(old_regs); -+} -+#endif /* !CONFIG_XEN */ -+ -+int setup_profiling_timer(unsigned int multiplier) -+{ -+ return -EINVAL; -+} -+ -+#ifndef CONFIG_XEN -+/* -+ * Local APIC start and shutdown -+ */ -+ -+/** -+ * clear_local_APIC - shutdown the local APIC -+ * -+ * This is called, when a CPU is disabled and before rebooting, so the state of -+ * the local APIC has no dangling leftovers. Also used to cleanout any BIOS -+ * leftovers during boot. -+ */ -+void clear_local_APIC(void) -+{ -+ int maxlvt = lapic_get_maxlvt(); -+ unsigned long v; -+ -+ /* -+ * Masking an LVT entry can trigger a local APIC error -+ * if the vector is zero. Mask LVTERR first to prevent this. -+ */ -+ if (maxlvt >= 3) { -+ v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ -+ apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED); -+ } -+ /* -+ * Careful: we have to set masks only first to deassert -+ * any level-triggered sources. 
-+ */ -+ v = apic_read(APIC_LVTT); -+ apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED); -+ v = apic_read(APIC_LVT0); -+ apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); -+ v = apic_read(APIC_LVT1); -+ apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED); -+ if (maxlvt >= 4) { -+ v = apic_read(APIC_LVTPC); -+ apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED); -+ } -+ -+ /* lets not touch this if we didn't frob it */ -+#ifdef CONFIG_X86_MCE_P4THERMAL -+ if (maxlvt >= 5) { -+ v = apic_read(APIC_LVTTHMR); -+ apic_write_around(APIC_LVTTHMR, v | APIC_LVT_MASKED); -+ } -+#endif -+ /* -+ * Clean APIC state for other OSs: -+ */ -+ apic_write_around(APIC_LVTT, APIC_LVT_MASKED); -+ apic_write_around(APIC_LVT0, APIC_LVT_MASKED); -+ apic_write_around(APIC_LVT1, APIC_LVT_MASKED); -+ if (maxlvt >= 3) -+ apic_write_around(APIC_LVTERR, APIC_LVT_MASKED); -+ if (maxlvt >= 4) -+ apic_write_around(APIC_LVTPC, APIC_LVT_MASKED); -+ -+#ifdef CONFIG_X86_MCE_P4THERMAL -+ if (maxlvt >= 5) -+ apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED); -+#endif -+ /* Integrated APIC (!82489DX) ? */ -+ if (lapic_is_integrated()) { -+ if (maxlvt > 3) -+ /* Clear ESR due to Pentium errata 3AP and 11AP */ -+ apic_write(APIC_ESR, 0); -+ apic_read(APIC_ESR); -+ } -+} -+ -+/** -+ * disable_local_APIC - clear and disable the local APIC -+ */ -+void disable_local_APIC(void) -+{ -+ unsigned long value; -+ -+ clear_local_APIC(); -+ -+ /* -+ * Disable APIC (implies clearing of registers -+ * for 82489DX!). -+ */ -+ value = apic_read(APIC_SPIV); -+ value &= ~APIC_SPIV_APIC_ENABLED; -+ apic_write_around(APIC_SPIV, value); -+ -+ /* -+ * When LAPIC was disabled by the BIOS and enabled by the kernel, -+ * restore the disabled state. -+ */ -+ if (enabled_via_apicbase) { -+ unsigned int l, h; -+ -+ rdmsr(MSR_IA32_APICBASE, l, h); -+ l &= ~MSR_IA32_APICBASE_ENABLE; -+ wrmsr(MSR_IA32_APICBASE, l, h); -+ } -+} -+ -+/* -+ * If Linux enabled the LAPIC against the BIOS default disable it down before -+ * re-entering the BIOS on shutdown. Otherwise the BIOS may get confused and -+ * not power-off. Additionally clear all LVT entries before disable_local_APIC -+ * for the case where Linux didn't enable the LAPIC. -+ */ -+void lapic_shutdown(void) -+{ -+ unsigned long flags; -+ -+ if (!cpu_has_apic) -+ return; -+ -+ local_irq_save(flags); -+ clear_local_APIC(); -+ -+ if (enabled_via_apicbase) -+ disable_local_APIC(); -+ -+ local_irq_restore(flags); -+} -+ -+/* -+ * This is to verify that we're looking at a real local APIC. -+ * Check these against your board if the CPUs aren't getting -+ * started for no apparent reason. -+ */ -+int __init verify_local_APIC(void) -+{ -+ unsigned int reg0, reg1; -+ -+ /* -+ * The version register is read-only in a real APIC. -+ */ -+ reg0 = apic_read(APIC_LVR); -+ apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0); -+ apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK); -+ reg1 = apic_read(APIC_LVR); -+ apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1); -+ -+ /* -+ * The two version reads above should print the same -+ * numbers. If the second one is different, then we -+ * poke at a non-APIC. -+ */ -+ if (reg1 != reg0) -+ return 0; -+ -+ /* -+ * Check if the version looks reasonably. -+ */ -+ reg1 = GET_APIC_VERSION(reg0); -+ if (reg1 == 0x00 || reg1 == 0xff) -+ return 0; -+ reg1 = lapic_get_maxlvt(); -+ if (reg1 < 0x02 || reg1 == 0xff) -+ return 0; -+ -+ /* -+ * The ID register is read/write in a real APIC. 
-+ */ -+ reg0 = apic_read(APIC_ID); -+ apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); -+ -+ /* -+ * The next two are just to see if we have sane values. -+ * They're only really relevant if we're in Virtual Wire -+ * compatibility mode, but most boxes are anymore. -+ */ -+ reg0 = apic_read(APIC_LVT0); -+ apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0); -+ reg1 = apic_read(APIC_LVT1); -+ apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1); -+ -+ return 1; -+} -+ -+/** -+ * sync_Arb_IDs - synchronize APIC bus arbitration IDs -+ */ -+void __init sync_Arb_IDs(void) -+{ -+ /* -+ * Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 And not -+ * needed on AMD. -+ */ -+ if (modern_apic()) -+ return; -+ /* -+ * Wait for idle. -+ */ -+ apic_wait_icr_idle(); -+ -+ apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); -+ apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG -+ | APIC_DM_INIT); -+} -+ -+/* -+ * An initial setup of the virtual wire mode. -+ */ -+void __init init_bsp_APIC(void) -+{ -+ unsigned long value; -+ -+ /* -+ * Don't do the setup now if we have a SMP BIOS as the -+ * through-I/O-APIC virtual wire mode might be active. -+ */ -+ if (smp_found_config || !cpu_has_apic) -+ return; -+ -+ /* -+ * Do not trust the local APIC being empty at bootup. -+ */ -+ clear_local_APIC(); -+ -+ /* -+ * Enable APIC. -+ */ -+ value = apic_read(APIC_SPIV); -+ value &= ~APIC_VECTOR_MASK; -+ value |= APIC_SPIV_APIC_ENABLED; -+ -+ /* This bit is reserved on P4/Xeon and should be cleared */ -+ if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && -+ (boot_cpu_data.x86 == 15)) -+ value &= ~APIC_SPIV_FOCUS_DISABLED; -+ else -+ value |= APIC_SPIV_FOCUS_DISABLED; -+ value |= SPURIOUS_APIC_VECTOR; -+ apic_write_around(APIC_SPIV, value); -+ -+ /* -+ * Set up the virtual wire mode. -+ */ -+ apic_write_around(APIC_LVT0, APIC_DM_EXTINT); -+ value = APIC_DM_NMI; -+ if (!lapic_is_integrated()) /* 82489DX */ -+ value |= APIC_LVT_LEVEL_TRIGGER; -+ apic_write_around(APIC_LVT1, value); -+} -+ -+/** -+ * setup_local_APIC - setup the local APIC -+ */ -+void __devinit setup_local_APIC(void) -+{ -+ unsigned long oldvalue, value, maxlvt, integrated; -+ int i, j; -+ -+ /* Pound the ESR really hard over the head with a big hammer - mbligh */ -+ if (esr_disable) { -+ apic_write(APIC_ESR, 0); -+ apic_write(APIC_ESR, 0); -+ apic_write(APIC_ESR, 0); -+ apic_write(APIC_ESR, 0); -+ } -+ -+ integrated = lapic_is_integrated(); -+ -+ /* -+ * Double-check whether this APIC is really registered. -+ */ -+ if (!apic_id_registered()) -+ BUG(); -+ -+ /* -+ * Intel recommends to set DFR, LDR and TPR before enabling -+ * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel -+ * document number 292116). So here it goes... -+ */ -+ init_apic_ldr(); -+ -+ /* -+ * Set Task Priority to 'accept all'. We never change this -+ * later on. -+ */ -+ value = apic_read(APIC_TASKPRI); -+ value &= ~APIC_TPRI_MASK; -+ apic_write_around(APIC_TASKPRI, value); -+ -+ /* -+ * After a crash, we no longer service the interrupts and a pending -+ * interrupt from previous kernel might still have ISR bit set. -+ * -+ * Most probably by now CPU has serviced that pending interrupt and -+ * it might not have done the ack_APIC_irq() because it thought, -+ * interrupt came from i8259 as ExtInt. LAPIC did not get EOI so it -+ * does not clear the ISR bit and cpu thinks it has already serivced -+ * the interrupt. Hence a vector might get locked. It was noticed -+ * for timer irq (vector 0x31). Issue an extra EOI to clear ISR. 
-+ */ -+ for (i = APIC_ISR_NR - 1; i >= 0; i--) { -+ value = apic_read(APIC_ISR + i*0x10); -+ for (j = 31; j >= 0; j--) { -+ if (value & (1< 3) /* Due to the Pentium erratum 3AP. */ -+ apic_write(APIC_ESR, 0); -+ oldvalue = apic_read(APIC_ESR); -+ -+ /* enables sending errors */ -+ value = ERROR_APIC_VECTOR; -+ apic_write_around(APIC_LVTERR, value); -+ /* -+ * spec says clear errors after enabling vector. -+ */ -+ if (maxlvt > 3) -+ apic_write(APIC_ESR, 0); -+ value = apic_read(APIC_ESR); -+ if (value != oldvalue) -+ apic_printk(APIC_VERBOSE, "ESR value before enabling " -+ "vector: 0x%08lx after: 0x%08lx\n", -+ oldvalue, value); -+ } else { -+ if (esr_disable) -+ /* -+ * Something untraceble is creating bad interrupts on -+ * secondary quads ... for the moment, just leave the -+ * ESR disabled - we can't do anything useful with the -+ * errors anyway - mbligh -+ */ -+ printk(KERN_INFO "Leaving ESR disabled.\n"); -+ else -+ printk(KERN_INFO "No ESR for 82489DX.\n"); -+ } -+ -+ /* Disable the local apic timer */ -+ value = apic_read(APIC_LVTT); -+ value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); -+ apic_write_around(APIC_LVTT, value); -+ -+ setup_apic_nmi_watchdog(NULL); -+ apic_pm_activate(); -+} -+ -+/* -+ * Detect and initialize APIC -+ */ -+static int __init detect_init_APIC (void) -+{ -+ u32 h, l, features; -+ -+ /* Disabled by kernel option? */ -+ if (enable_local_apic < 0) -+ return -1; -+ -+ switch (boot_cpu_data.x86_vendor) { -+ case X86_VENDOR_AMD: -+ if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model > 1) || -+ (boot_cpu_data.x86 == 15)) -+ break; -+ goto no_apic; -+ case X86_VENDOR_INTEL: -+ if (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15 || -+ (boot_cpu_data.x86 == 5 && cpu_has_apic)) -+ break; -+ goto no_apic; -+ default: -+ goto no_apic; -+ } -+ -+ if (!cpu_has_apic) { -+ /* -+ * Over-ride BIOS and try to enable the local APIC only if -+ * "lapic" specified. -+ */ -+ if (enable_local_apic <= 0) { -+ printk(KERN_INFO "Local APIC disabled by BIOS -- " -+ "you can enable it with \"lapic\"\n"); -+ return -1; -+ } -+ /* -+ * Some BIOSes disable the local APIC in the APIC_BASE -+ * MSR. This can only be done in software for Intel P6 or later -+ * and AMD K7 (Model > 1) or later. 
-+ */ -+ rdmsr(MSR_IA32_APICBASE, l, h); -+ if (!(l & MSR_IA32_APICBASE_ENABLE)) { -+ printk(KERN_INFO -+ "Local APIC disabled by BIOS -- reenabling.\n"); -+ l &= ~MSR_IA32_APICBASE_BASE; -+ l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; -+ wrmsr(MSR_IA32_APICBASE, l, h); -+ enabled_via_apicbase = 1; -+ } -+ } -+ /* -+ * The APIC feature bit should now be enabled -+ * in `cpuid' -+ */ -+ features = cpuid_edx(1); -+ if (!(features & (1 << X86_FEATURE_APIC))) { -+ printk(KERN_WARNING "Could not enable APIC!\n"); -+ return -1; -+ } -+ set_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); -+ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; -+ -+ /* The BIOS may have set up the APIC at some other address */ -+ rdmsr(MSR_IA32_APICBASE, l, h); -+ if (l & MSR_IA32_APICBASE_ENABLE) -+ mp_lapic_addr = l & MSR_IA32_APICBASE_BASE; -+ -+ if (nmi_watchdog != NMI_NONE) -+ nmi_watchdog = NMI_LOCAL_APIC; -+ -+ printk(KERN_INFO "Found and enabled local APIC!\n"); -+ -+ apic_pm_activate(); -+ -+ return 0; -+ -+no_apic: -+ printk(KERN_INFO "No local APIC present or hardware disabled\n"); -+ return -1; -+} -+ -+/** -+ * init_apic_mappings - initialize APIC mappings -+ */ -+void __init init_apic_mappings(void) -+{ -+ unsigned long apic_phys; -+ -+ /* -+ * If no local APIC can be found then set up a fake all -+ * zeroes page to simulate the local APIC and another -+ * one for the IO-APIC. -+ */ -+ if (!smp_found_config && detect_init_APIC()) { -+ apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); -+ apic_phys = __pa(apic_phys); -+ } else -+ apic_phys = mp_lapic_addr; -+ -+ set_fixmap_nocache(FIX_APIC_BASE, apic_phys); -+ printk(KERN_DEBUG "mapped APIC to %08lx (%08lx)\n", APIC_BASE, -+ apic_phys); -+ -+ /* -+ * Fetch the APIC ID of the BSP in case we have a -+ * default configuration (or the MP table is broken). -+ */ -+ if (boot_cpu_physical_apicid == -1U) -+ boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID)); -+ -+#ifdef CONFIG_X86_IO_APIC -+ { -+ unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; -+ int i; -+ -+ for (i = 0; i < nr_ioapics; i++) { -+ if (smp_found_config) { -+ ioapic_phys = mp_ioapics[i].mpc_apicaddr; -+ if (!ioapic_phys) { -+ printk(KERN_ERR -+ "WARNING: bogus zero IO-APIC " -+ "address found in MPTABLE, " -+ "disabling IO/APIC support!\n"); -+ smp_found_config = 0; -+ skip_ioapic_setup = 1; -+ goto fake_ioapic_page; -+ } -+ } else { -+fake_ioapic_page: -+ ioapic_phys = (unsigned long) -+ alloc_bootmem_pages(PAGE_SIZE); -+ ioapic_phys = __pa(ioapic_phys); -+ } -+ set_fixmap_nocache(idx, ioapic_phys); -+ printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n", -+ __fix_to_virt(idx), ioapic_phys); -+ idx++; -+ } -+ } -+#endif -+} -+#endif /* !CONFIG_XEN */ -+/* -+ * This initializes the IO-APIC and APIC hardware if this is -+ * a UP kernel. -+ */ -+int __init APIC_init_uniprocessor (void) -+{ -+#ifndef CONFIG_XEN -+ if (enable_local_apic < 0) -+ clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); -+ -+ if (!smp_found_config && !cpu_has_apic) -+ return -1; -+ -+ /* -+ * Complain if the BIOS pretends there is one. -+ */ -+ if (!cpu_has_apic && -+ APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { -+ printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", -+ boot_cpu_physical_apicid); -+ clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); -+ return -1; -+ } -+ -+ verify_local_APIC(); -+ -+ connect_bsp_APIC(); -+ -+ /* -+ * Hack: In case of kdump, after a crash, kernel might be booting -+ * on a cpu with non-zero lapic id. 
But boot_cpu_physical_apicid -+ * might be zero if read from MP tables. Get it from LAPIC. -+ */ -+#ifdef CONFIG_CRASH_DUMP -+ boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID)); -+#endif -+ phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid); -+ -+ setup_local_APIC(); -+#endif /* !CONFIG_XEN */ -+ -+#ifdef CONFIG_X86_IO_APIC -+ if (smp_found_config) -+ if (!skip_ioapic_setup && nr_ioapics) -+ setup_IO_APIC(); -+#endif -+#ifndef CONFIG_XEN -+ setup_boot_clock(); -+#endif -+ -+ return 0; -+} -+ -+/* -+ * APIC command line parameters -+ */ -+static int __init parse_lapic(char *arg) -+{ -+ enable_local_apic = 1; -+ return 0; -+} -+early_param("lapic", parse_lapic); -+ -+static int __init parse_nolapic(char *arg) -+{ -+ enable_local_apic = -1; -+ clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); -+ return 0; -+} -+early_param("nolapic", parse_nolapic); -+ -+static int __init parse_disable_lapic_timer(char *arg) -+{ -+ local_apic_timer_disabled = 1; -+ return 0; -+} -+early_param("nolapic_timer", parse_disable_lapic_timer); -+ -+static int __init parse_lapic_timer_c2_ok(char *arg) -+{ -+ local_apic_timer_c2_ok = 1; -+ return 0; -+} -+early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); -+ -+static int __init apic_set_verbosity(char *str) -+{ -+ if (strcmp("debug", str) == 0) -+ apic_verbosity = APIC_DEBUG; -+ else if (strcmp("verbose", str) == 0) -+ apic_verbosity = APIC_VERBOSE; -+ return 1; -+} -+ -+__setup("apic=", apic_set_verbosity); -+ -+ -+/* -+ * Local APIC interrupts -+ */ -+#ifndef CONFIG_XEN -+/* -+ * This interrupt should _never_ happen with our APIC/SMP architecture -+ */ -+void smp_spurious_interrupt(struct pt_regs *regs) -+{ -+ unsigned long v; -+ -+ irq_enter(); -+ /* -+ * Check if this really is a spurious interrupt and ACK it -+ * if it is a vectored one. Just in case... -+ * Spurious interrupts should not be ACKed. -+ */ -+ v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1)); -+ if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) -+ ack_APIC_irq(); -+ -+ /* see sw-dev-man vol 3, chapter 7.4.13.5 */ -+ printk(KERN_INFO "spurious APIC interrupt on CPU#%d, " -+ "should never happen.\n", smp_processor_id()); -+ irq_exit(); -+} -+ -+/* -+ * This interrupt should never happen with our APIC/SMP architecture -+ */ -+void smp_error_interrupt(struct pt_regs *regs) -+{ -+ unsigned long v, v1; -+ -+ irq_enter(); -+ /* First tickle the hardware, only then report what went on. 
-- REW */ -+ v = apic_read(APIC_ESR); -+ apic_write(APIC_ESR, 0); -+ v1 = apic_read(APIC_ESR); -+ ack_APIC_irq(); -+ atomic_inc(&irq_err_count); -+ -+ /* Here is what the APIC error bits mean: -+ 0: Send CS error -+ 1: Receive CS error -+ 2: Send accept error -+ 3: Receive accept error -+ 4: Reserved -+ 5: Send illegal vector -+ 6: Received illegal vector -+ 7: Illegal register address -+ */ -+ printk (KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n", -+ smp_processor_id(), v , v1); -+ irq_exit(); -+} -+ -+/* -+ * Initialize APIC interrupts -+ */ -+void __init apic_intr_init(void) -+{ -+#ifdef CONFIG_SMP -+ smp_intr_init(); -+#endif -+ /* self generated IPI for local APIC timer */ -+ set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); -+ -+ /* IPI vectors for APIC spurious and error interrupts */ -+ set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); -+ set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); -+ -+ /* thermal monitor LVT interrupt */ -+#ifdef CONFIG_X86_MCE_P4THERMAL -+ set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); -+#endif -+} -+ -+/** -+ * connect_bsp_APIC - attach the APIC to the interrupt system -+ */ -+void __init connect_bsp_APIC(void) -+{ -+ if (pic_mode) { -+ /* -+ * Do not trust the local APIC being empty at bootup. -+ */ -+ clear_local_APIC(); -+ /* -+ * PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's -+ * local APIC to INT and NMI lines. -+ */ -+ apic_printk(APIC_VERBOSE, "leaving PIC mode, " -+ "enabling APIC mode.\n"); -+ outb(0x70, 0x22); -+ outb(0x01, 0x23); -+ } -+ enable_apic_mode(); -+} -+ -+/** -+ * disconnect_bsp_APIC - detach the APIC from the interrupt system -+ * @virt_wire_setup: indicates, whether virtual wire mode is selected -+ * -+ * Virtual wire mode is necessary to deliver legacy interrupts even when the -+ * APIC is disabled. -+ */ -+void disconnect_bsp_APIC(int virt_wire_setup) -+{ -+ if (pic_mode) { -+ /* -+ * Put the board back into PIC mode (has an effect only on -+ * certain older boards). Note that APIC interrupts, including -+ * IPIs, won't work beyond this point! The only exception are -+ * INIT IPIs. 
-+ */ -+ apic_printk(APIC_VERBOSE, "disabling APIC mode, " -+ "entering PIC mode.\n"); -+ outb(0x70, 0x22); -+ outb(0x00, 0x23); -+ } else { -+ /* Go back to Virtual Wire compatibility mode */ -+ unsigned long value; -+ -+ /* For the spurious interrupt use vector F, and enable it */ -+ value = apic_read(APIC_SPIV); -+ value &= ~APIC_VECTOR_MASK; -+ value |= APIC_SPIV_APIC_ENABLED; -+ value |= 0xf; -+ apic_write_around(APIC_SPIV, value); -+ -+ if (!virt_wire_setup) { -+ /* -+ * For LVT0 make it edge triggered, active high, -+ * external and enabled -+ */ -+ value = apic_read(APIC_LVT0); -+ value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | -+ APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | -+ APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); -+ value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; -+ value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); -+ apic_write_around(APIC_LVT0, value); -+ } else { -+ /* Disable LVT0 */ -+ apic_write_around(APIC_LVT0, APIC_LVT_MASKED); -+ } -+ -+ /* -+ * For LVT1 make it edge triggered, active high, nmi and -+ * enabled -+ */ -+ value = apic_read(APIC_LVT1); -+ value &= ~( -+ APIC_MODE_MASK | APIC_SEND_PENDING | -+ APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | -+ APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); -+ value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; -+ value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); -+ apic_write_around(APIC_LVT1, value); -+ } -+} -+ -+/* -+ * Power management -+ */ -+#ifdef CONFIG_PM -+ -+static struct { -+ int active; -+ /* r/w apic fields */ -+ unsigned int apic_id; -+ unsigned int apic_taskpri; -+ unsigned int apic_ldr; -+ unsigned int apic_dfr; -+ unsigned int apic_spiv; -+ unsigned int apic_lvtt; -+ unsigned int apic_lvtpc; -+ unsigned int apic_lvt0; -+ unsigned int apic_lvt1; -+ unsigned int apic_lvterr; -+ unsigned int apic_tmict; -+ unsigned int apic_tdcr; -+ unsigned int apic_thmr; -+} apic_pm_state; -+ -+static int lapic_suspend(struct sys_device *dev, pm_message_t state) -+{ -+ unsigned long flags; -+ int maxlvt; -+ -+ if (!apic_pm_state.active) -+ return 0; -+ -+ maxlvt = lapic_get_maxlvt(); -+ -+ apic_pm_state.apic_id = apic_read(APIC_ID); -+ apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); -+ apic_pm_state.apic_ldr = apic_read(APIC_LDR); -+ apic_pm_state.apic_dfr = apic_read(APIC_DFR); -+ apic_pm_state.apic_spiv = apic_read(APIC_SPIV); -+ apic_pm_state.apic_lvtt = apic_read(APIC_LVTT); -+ if (maxlvt >= 4) -+ apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); -+ apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0); -+ apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1); -+ apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); -+ apic_pm_state.apic_tmict = apic_read(APIC_TMICT); -+ apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); -+#ifdef CONFIG_X86_MCE_P4THERMAL -+ if (maxlvt >= 5) -+ apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); -+#endif -+ -+ local_irq_save(flags); -+ disable_local_APIC(); -+ local_irq_restore(flags); -+ return 0; -+} -+ -+static int lapic_resume(struct sys_device *dev) -+{ -+ unsigned int l, h; -+ unsigned long flags; -+ int maxlvt; -+ -+ if (!apic_pm_state.active) -+ return 0; -+ -+ maxlvt = lapic_get_maxlvt(); -+ -+ local_irq_save(flags); -+ -+ /* -+ * Make sure the APICBASE points to the right address -+ * -+ * FIXME! This will be wrong if we ever support suspend on -+ * SMP! We'll need to do this as part of the CPU restore! 
-+ */ -+ rdmsr(MSR_IA32_APICBASE, l, h); -+ l &= ~MSR_IA32_APICBASE_BASE; -+ l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; -+ wrmsr(MSR_IA32_APICBASE, l, h); -+ -+ apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); -+ apic_write(APIC_ID, apic_pm_state.apic_id); -+ apic_write(APIC_DFR, apic_pm_state.apic_dfr); -+ apic_write(APIC_LDR, apic_pm_state.apic_ldr); -+ apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri); -+ apic_write(APIC_SPIV, apic_pm_state.apic_spiv); -+ apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); -+ apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); -+#ifdef CONFIG_X86_MCE_P4THERMAL -+ if (maxlvt >= 5) -+ apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); -+#endif -+ if (maxlvt >= 4) -+ apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); -+ apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); -+ apic_write(APIC_TDCR, apic_pm_state.apic_tdcr); -+ apic_write(APIC_TMICT, apic_pm_state.apic_tmict); -+ apic_write(APIC_ESR, 0); -+ apic_read(APIC_ESR); -+ apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); -+ apic_write(APIC_ESR, 0); -+ apic_read(APIC_ESR); -+ local_irq_restore(flags); -+ return 0; -+} -+ -+/* -+ * This device has no shutdown method - fully functioning local APICs -+ * are needed on every CPU up until machine_halt/restart/poweroff. -+ */ -+ -+static struct sysdev_class lapic_sysclass = { -+ set_kset_name("lapic"), -+ .resume = lapic_resume, -+ .suspend = lapic_suspend, -+}; -+ -+static struct sys_device device_lapic = { -+ .id = 0, -+ .cls = &lapic_sysclass, -+}; -+ -+static void __devinit apic_pm_activate(void) -+{ -+ apic_pm_state.active = 1; -+} -+ -+static int __init init_lapic_sysfs(void) -+{ -+ int error; -+ -+ if (!cpu_has_apic) -+ return 0; -+ /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ -+ -+ error = sysdev_class_register(&lapic_sysclass); -+ if (!error) -+ error = sysdev_register(&device_lapic); -+ return error; -+} -+device_initcall(init_lapic_sysfs); -+ -+#else /* CONFIG_PM */ -+ -+static void apic_pm_activate(void) { } -+ -+#endif /* CONFIG_PM */ -+#endif /* !CONFIG_XEN */ -diff -r 4edbf98e9507 -r 3990a07432f0 arch/i386/kernel/asm-offsets.c ---- a/arch/i386/kernel/asm-offsets.c Fri Jul 20 11:42:41 2007 -0300 -+++ b/arch/i386/kernel/asm-offsets.c Fri Jul 20 11:56:41 2007 -0300 -@@ -89,9 +89,14 @@ void foo(void) - OFFSET(pbe_orig_address, pbe, orig_address); - OFFSET(pbe_next, pbe, next); - -+#ifndef CONFIG_X86_NO_TSS - /* Offset from the sysenter stack to tss.esp0 */ -- DEFINE(TSS_sysenter_esp0, offsetof(struct tss_struct, esp0) - -+ DEFINE(SYSENTER_stack_esp0, offsetof(struct tss_struct, esp0) - - sizeof(struct tss_struct)); -+#else -+ /* sysenter stack points directly to esp0 */ -+ DEFINE(SYSENTER_stack_esp0, 0); -+#endif - - DEFINE(PAGE_SIZE_asm, PAGE_SIZE); - DEFINE(VDSO_PRELINK, VDSO_PRELINK); -@@ -111,4 +116,10 @@ void foo(void) - OFFSET(PARAVIRT_iret, paravirt_ops, iret); - OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0); - #endif -+ -+ -+#ifdef CONFIG_XEN -+ BLANK(); -+ OFFSET(XEN_START_mfn_list, start_info, mfn_list); -+#endif - } -diff -r 4edbf98e9507 -r 3990a07432f0 arch/i386/kernel/cpu/common.c ---- a/arch/i386/kernel/cpu/common.c Fri Jul 20 11:42:41 2007 -0300 -+++ b/arch/i386/kernel/cpu/common.c Fri Jul 20 11:56:41 2007 -0300 -@@ -19,6 +19,10 @@ - #include - #endif - #include -+ -+#ifdef CONFIG_XEN -+#include -+#endif - - #include "cpu.h" - -@@ -601,6 +605,31 @@ void __init early_cpu_init(void) - #endif - } - -+/* We can't move load_gdt to asm/desc.h because it lacks make_lowmen_page_readonly() -+ definition, and 
as this is still the only user of load_gdt in xen. -+ ToDo: JQ -+ */ -+ -+#ifdef CONFIG_XEN -+#undef load_gdt -+static void __cpuinit load_gdt(struct Xgt_desc_struct *gdt_descr) -+{ -+ unsigned long frames[16]; -+ unsigned long va; -+ int f; -+ -+ for (va = gdt_descr->address, f = 0; -+ va < gdt_descr->address + gdt_descr->size; -+ va += PAGE_SIZE, f++) { -+ frames[f] = virt_to_mfn(va); -+ make_lowmem_page_readonly( -+ (void *)va, XENFEAT_writable_descriptor_tables); -+ } -+ if (HYPERVISOR_set_gdt(frames, gdt_descr->size / 8)) -+ BUG(); -+} -+#endif /* CONFIG_XEN */ -+ - /* Make sure %gs is initialized properly in idle threads */ - struct pt_regs * __devinit idle_regs(struct pt_regs *regs) - { -@@ -633,6 +662,10 @@ static __cpuinit int alloc_gdt(int cpu) - - memset(gdt, 0, PAGE_SIZE); - memset(pda, 0, sizeof(*pda)); -+#ifdef CONFIG_XEN -+ memcpy(gdt, cpu_gdt_table, GDT_SIZE); -+ cpu_gdt_descr->size = GDT_SIZE; -+#endif - } else { - /* GDT and PDA might already have been allocated if - this is a CPU hotplug re-insertion. */ -@@ -690,14 +723,18 @@ __cpuinit int init_gdt(int cpu, struct t - - BUG_ON(gdt == NULL || pda == NULL); - -+#ifndef CONFIG_XEN - /* - * Initialize the per-CPU GDT with the boot GDT, - * and set up the GDT descriptor: - */ - memcpy(gdt, cpu_gdt_table, GDT_SIZE); - cpu_gdt_descr->size = GDT_SIZE - 1; -- -- pack_descriptor((u32 *)&gdt[GDT_ENTRY_PDA].a, -+#endif -+ -+ -+ if (cpu == 0) -+ pack_descriptor((u32 *)&gdt[GDT_ENTRY_PDA].a, - (u32 *)&gdt[GDT_ENTRY_PDA].b, - (unsigned long)pda, sizeof(*pda) - 1, - 0x80 | DESCTYPE_S | 0x2, 0); /* present read-write data segment */ -@@ -724,7 +761,9 @@ void __cpuinit cpu_set_gdt(int cpu) - /* Common CPU init for both boot and secondary CPUs */ - static void __cpuinit _cpu_init(int cpu, struct task_struct *curr) - { -+#ifndef CONFIG_X86_NO_TSS - struct tss_struct * t = &per_cpu(init_tss, cpu); -+#endif - struct thread_struct *thread = &curr->thread; - - if (cpu_test_and_set(cpu, cpu_initialized)) { -@@ -743,7 +782,9 @@ static void __cpuinit _cpu_init(int cpu, - set_in_cr4(X86_CR4_TSD); - } - -+#ifndef CONFIG_X86_NO_IDT - load_idt(&idt_descr); -+#endif - - /* - * Set up and load the per-CPU TSS and LDT -@@ -755,8 +796,10 @@ static void __cpuinit _cpu_init(int cpu, - enter_lazy_tlb(&init_mm, curr); - - load_esp0(t, thread); -+#ifndef CONFIG_X86_NO_TSS - set_tss_desc(cpu,t); - load_TR_desc(); -+#endif - load_LDT(&init_mm.context); - - #ifdef CONFIG_DOUBLEFAULT -diff -r 4edbf98e9507 -r 3990a07432f0 arch/i386/kernel/cpu/mtrr/Makefile ---- a/arch/i386/kernel/cpu/mtrr/Makefile Fri Jul 20 11:42:41 2007 -0300 -+++ b/arch/i386/kernel/cpu/mtrr/Makefile Fri Jul 20 11:56:41 2007 -0300 -@@ -1,3 +1,10 @@ obj-y := main.o if.o generic.o state.o - obj-y := main.o if.o generic.o state.o - obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o - -+ifdef CONFIG_XEN -+include $(srctree)/scripts/Makefile.xen -+n-obj-xen := generic.o state.o amd.o cyrix.o centaur.o -+ -+obj-y := $(call filterxen, $(obj-y), $(n-obj-xen)) -+obj-y := $(call cherrypickxen, $(obj-y)) -+endif -diff -r 4edbf98e9507 -r 3990a07432f0 arch/i386/kernel/cpu/mtrr/main-xen.c ---- /dev/null Thu Jan 01 00:00:00 1970 +0000 -+++ b/arch/i386/kernel/cpu/mtrr/main-xen.c Fri Jul 20 11:56:41 2007 -0300 -@@ -0,0 +1,198 @@ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include "mtrr.h" -+ -+static DEFINE_MUTEX(mtrr_mutex); -+ -+static void generic_get_mtrr(unsigned int reg, unsigned long *base, -+ unsigned long *size, mtrr_type * type) -+{ -+ struct 
xen_platform_op op;
-+
-+ op.cmd = XENPF_read_memtype;
-+ op.u.read_memtype.reg = reg;
-+ (void)HYPERVISOR_platform_op(&op);
-+
-+ *size = op.u.read_memtype.nr_mfns;
-+ *base = op.u.read_memtype.mfn;
-+ *type = op.u.read_memtype.type;
-+}
-+
-+struct mtrr_ops generic_mtrr_ops = {
-+ .use_intel_if = 1,
-+ .get = generic_get_mtrr,
-+};
-+
-+struct mtrr_ops *mtrr_if = &generic_mtrr_ops;
-+unsigned int num_var_ranges;
-+unsigned int *usage_table;
-+
-+/* This function returns the number of variable MTRRs */
-+static void __init set_num_var_ranges(void)
-+{
-+ struct xen_platform_op op;
-+
-+ for (num_var_ranges = 0; ; num_var_ranges++) {
-+ op.cmd = XENPF_read_memtype;
-+ op.u.read_memtype.reg = num_var_ranges;
-+ if (HYPERVISOR_platform_op(&op) != 0)
-+ break;
-+ }
-+}
-+
-+static void __init init_table(void)
-+{
-+ int i, max;
-+
-+ max = num_var_ranges;
-+ if ((usage_table = kmalloc(max * sizeof *usage_table, GFP_KERNEL))
-+ == NULL) {
-+ printk(KERN_ERR "mtrr: could not allocate\n");
-+ return;
-+ }
-+ for (i = 0; i < max; i++)
-+ usage_table[i] = 0;
-+}
-+
-+int mtrr_add_page(unsigned long base, unsigned long size,
-+ unsigned int type, char increment)
-+{
-+ int error;
-+ struct xen_platform_op op;
-+
-+ mutex_lock(&mtrr_mutex);
-+
-+ op.cmd = XENPF_add_memtype;
-+ op.u.add_memtype.mfn = base;
-+ op.u.add_memtype.nr_mfns = size;
-+ op.u.add_memtype.type = type;
-+ error = HYPERVISOR_platform_op(&op);
-+ if (error) {
-+ mutex_unlock(&mtrr_mutex);
-+ BUG_ON(error > 0);
-+ return error;
-+ }
-+
-+ if (increment)
-+ ++usage_table[op.u.add_memtype.reg];
-+
-+ mutex_unlock(&mtrr_mutex);
-+
-+ return op.u.add_memtype.reg;
-+}
-+
-+static int mtrr_check(unsigned long base, unsigned long size)
-+{
-+ if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) {
-+ printk(KERN_WARNING
-+ "mtrr: size and base must be multiples of 4 kiB\n");
-+ printk(KERN_DEBUG
-+ "mtrr: size: 0x%lx base: 0x%lx\n", size, base);
-+ dump_stack();
-+ return -1;
-+ }
-+ return 0;
-+}
-+
-+int
-+mtrr_add(unsigned long base, unsigned long size, unsigned int type,
-+ char increment)
-+{
-+ if (mtrr_check(base, size))
-+ return -EINVAL;
-+ return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type,
-+ increment);
-+}
-+
-+int mtrr_del_page(int reg, unsigned long base, unsigned long size)
-+{
-+ unsigned i;
-+ mtrr_type ltype;
-+ unsigned long lbase, lsize;
-+ int error = -EINVAL;
-+ struct xen_platform_op op;
-+
-+ mutex_lock(&mtrr_mutex);
-+
-+ if (reg < 0) {
-+ /* Search for existing MTRR */
-+ for (i = 0; i < num_var_ranges; ++i) {
-+ mtrr_if->get(i, &lbase, &lsize, &ltype);
-+ if (lbase == base && lsize == size) {
-+ reg = i;
-+ break;
-+ }
-+ }
-+ if (reg < 0) {
-+ printk(KERN_DEBUG "mtrr: no MTRR for %lx000,%lx000 found\n", base,
-+ size);
-+ goto out;
-+ }
-+ }
-+ if (usage_table[reg] < 1) {
-+ printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg);
-+ goto out;
-+ }
-+ if (--usage_table[reg] < 1) {
-+ op.cmd = XENPF_del_memtype;
-+ op.u.del_memtype.handle = 0;
-+ op.u.del_memtype.reg = reg;
-+ error = HYPERVISOR_platform_op(&op);
-+ if (error) {
-+ BUG_ON(error > 0);
-+ goto out;
-+ }
-+ }
-+ error = reg;
-+ out:
-+ mutex_unlock(&mtrr_mutex);
-+ return error;
-+}
-+
-+int
-+mtrr_del(int reg, unsigned long base, unsigned long size)
-+{
-+ if (mtrr_check(base, size))
-+ return -EINVAL;
-+ return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT);
-+}
-+
-+EXPORT_SYMBOL(mtrr_add);
-+EXPORT_SYMBOL(mtrr_del);
-+
-+void __init mtrr_bp_init(void)
-+{
-+}
-+
-+void mtrr_ap_init(void)
-+{
-+}
-+
-+static int
__init mtrr_init(void)
-+{
-+ struct cpuinfo_x86 *c = &boot_cpu_data;
-+
-+ if (!is_initial_xendomain())
-+ return -ENODEV;
-+
-+ if ((!cpu_has(c, X86_FEATURE_MTRR)) &&
-+ (!cpu_has(c, X86_FEATURE_K6_MTRR)) &&
-+ (!cpu_has(c, X86_FEATURE_CYRIX_ARR)) &&
-+ (!cpu_has(c, X86_FEATURE_CENTAUR_MCR)))
-+ return -ENODEV;
-+
-+ set_num_var_ranges();
-+ init_table();
-+
-+ return 0;
-+}
-+
-+subsys_initcall(mtrr_init);
-diff -r 4edbf98e9507 -r 3990a07432f0 arch/i386/kernel/crash.c
---- a/arch/i386/kernel/crash.c Fri Jul 20 11:42:41 2007 -0300
-+++ b/arch/i386/kernel/crash.c Fri Jul 20 11:56:41 2007 -0300
-@@ -31,6 +31,7 @@
- /* This keeps a track of which one is crashing cpu. */
- static int crashing_cpu;
-
-+#ifndef CONFIG_XEN
- #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
- static atomic_t waiting_for_crash_ipi;
-
-@@ -112,6 +113,7 @@ static void nmi_shootdown_cpus(void)
- /* There are no cpus to shootdown */
- }
- #endif
-+#endif /* CONFIG_XEN */
-
- void machine_crash_shutdown(struct pt_regs *regs)
- {
-@@ -128,10 +130,12 @@ void machine_crash_shutdown(struct pt_re
-
- /* Make a note of crashing cpu. Will be used in NMI callback.*/
- crashing_cpu = safe_smp_processor_id();
-+#ifndef CONFIG_XEN
- nmi_shootdown_cpus();
- lapic_shutdown();
- #if defined(CONFIG_X86_IO_APIC)
- disable_IO_APIC();
- #endif
-+#endif /* CONFIG_XEN */
- crash_save_cpu(regs, safe_smp_processor_id());
- }
-diff -r 4edbf98e9507 -r 3990a07432f0 arch/i386/kernel/e820.c
---- a/arch/i386/kernel/e820.c Fri Jul 20 11:42:41 2007 -0300
-+++ b/arch/i386/kernel/e820.c Fri Jul 20 11:56:41 2007 -0300
-@@ -16,10 +16,24 @@
- #include
- #include
-
-+#ifdef CONFIG_XEN
-+#include
-+#include
-+#include
-+#include
-+#include
-+#endif
-+
- #ifdef CONFIG_EFI
- int efi_enabled = 0;
- EXPORT_SYMBOL(efi_enabled);
- #endif
-+
-+#ifdef CONFIG_XEN
-+struct e820map machine_e820;
-+#endif
-+static void __init
-+e820_setup_gap(struct e820entry *e820, int nr_map);
-
- struct e820map e820;
- struct change_member {
-@@ -182,6 +196,12 @@ static void __init probe_roms(void)
- unsigned char *rom;
- int i;
-
-+#ifdef CONFIG_XEN
-+ /* Nothing to do if not running in dom0.
*/ -+ if (!is_initial_xendomain()) -+ return; -+#endif -+ - /* video rom */ - upper = adapter_rom_resources[0].start; - for (start = video_rom_resource.start; start < upper; start += 2048) { -@@ -249,36 +269,54 @@ legacy_init_iomem_resources(struct resou - legacy_init_iomem_resources(struct resource *code_resource, struct resource *data_resource) - { - int i; -+ struct e820entry *map = e820.map; -+ int nr_map = e820.nr_map; -+#ifdef CONFIG_XEN_PRIVILEGED_GUEST -+ struct xen_memory_map memmap; -+ -+ map = machine_e820.map; -+ memmap.nr_entries = E820MAX; -+ -+ set_xen_guest_handle(memmap.buffer, map); -+ -+ if(HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap)) -+ BUG(); -+ machine_e820.nr_map = memmap.nr_entries; -+ nr_map = memmap.nr_entries; -+ e820_setup_gap(map, memmap.nr_entries); -+#endif - - probe_roms(); -- for (i = 0; i < e820.nr_map; i++) { -+ for (i = 0; i < nr_map; i++) { - struct resource *res; - #ifndef CONFIG_RESOURCES_64BIT -- if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL) -+ if (map[i].addr + map[i].size > 0x100000000ULL) - continue; - #endif - res = kzalloc(sizeof(struct resource), GFP_ATOMIC); -- switch (e820.map[i].type) { -+ switch (map[i].type) { - case E820_RAM: res->name = "System RAM"; break; - case E820_ACPI: res->name = "ACPI Tables"; break; - case E820_NVS: res->name = "ACPI Non-volatile Storage"; break; - default: res->name = "reserved"; - } -- res->start = e820.map[i].addr; -- res->end = res->start + e820.map[i].size - 1; -+ res->start = map[i].addr; -+ res->end = res->start + map[i].size - 1; - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; - if (request_resource(&iomem_resource, res)) { - kfree(res); - continue; - } -- if (e820.map[i].type == E820_RAM) { -+ if (map[i].type == E820_RAM) { - /* - * We don't know which RAM region contains kernel data, - * so we try it repeatedly and let the resource manager - * test it. - */ -+#ifndef CONFIG_XEN - request_resource(res, code_resource); - request_resource(res, data_resource); -+#endif - #ifdef CONFIG_KEXEC - request_resource(res, &crashk_res); - #endif -@@ -297,6 +335,11 @@ static int __init request_standard_resou - int i; - - printk("Setting up standard PCI resources\n"); -+#ifdef CONFIG_XEN -+ /* Nothing to do if not running in dom0. */ -+ if (!is_initial_xendomain()) -+ return 0; -+#endif - if (efi_enabled) - efi_initialize_iomem_resources(&code_resource, &data_resource); - else -@@ -516,10 +559,13 @@ int __init sanitize_e820_map(struct e820 - */ - int __init copy_e820_map(struct e820entry * biosmap, int nr_map) - { -+#ifndef CONFIG_XEN - /* Only one memory region (or negative)? Ignore it */ - if (nr_map < 2) - return -1; -- -+#else -+ BUG_ON(nr_map < 1); -+#endif - do { - unsigned long long start = biosmap->addr; - unsigned long long size = biosmap->size; -@@ -531,6 +577,7 @@ int __init copy_e820_map(struct e820entr - if (start > end) - return -1; - -+#ifndef CONFIG_XEN - /* - * Some BIOSes claim RAM in the 640k - 1M region. - * Not right. Fix it up. -@@ -551,6 +598,7 @@ int __init copy_e820_map(struct e820entr - size = end - start; - } - } -+#endif - add_memory_region(start, size, type); - } while (biosmap++,--nr_map); - return 0; -@@ -655,6 +703,15 @@ void __init register_bootmem_low_pages(u - */ - last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size); - -+#ifdef CONFIG_XEN -+ /* -+ * Truncate to the number of actual pages currently -+ * present. 
-+ */ -+ if (last_pfn > xen_start_info->nr_pages) -+ last_pfn = xen_start_info->nr_pages; -+#endif -+ - if (last_pfn > max_low_pfn) - last_pfn = max_low_pfn; - -@@ -670,7 +727,12 @@ void __init register_bootmem_low_pages(u - } - } - --void __init e820_register_memory(void) -+/* -+ * Locate a unused range of the physical address space below 4G which -+ * can be used for PCI mappings. -+ */ -+static void __init -+e820_setup_gap(struct e820entry *e820, int nr_map) - { - unsigned long gapstart, gapsize, round; - unsigned long long last; -@@ -683,10 +745,10 @@ void __init e820_register_memory(void) - last = 0x100000000ull; - gapstart = 0x10000000; - gapsize = 0x400000; -- i = e820.nr_map; -+ i = nr_map; - while (--i >= 0) { -- unsigned long long start = e820.map[i].addr; -- unsigned long long end = start + e820.map[i].size; -+ unsigned long long start = e820[i].addr; -+ unsigned long long end = start + e820[i].size; - - /* - * Since "last" is at most 4GB, we know we'll -@@ -716,6 +778,13 @@ void __init e820_register_memory(void) - - printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n", - pci_mem_start, gapstart, gapsize); -+} -+ -+void __init e820_register_memory(void) -+{ -+#ifndef CONFIG_XEN -+ e820_setup_gap(e820.map, e820.nr_map); -+#endif - } - - void __init print_memory_map(char *who) -@@ -786,7 +855,7 @@ static __init __always_inline void efi_l - - void __init limit_regions(unsigned long long size) - { -- unsigned long long current_addr; -+ unsigned long long current_addr = 0; - int i; - - print_memory_map("limit_regions start"); -@@ -815,6 +884,19 @@ void __init limit_regions(unsigned long - print_memory_map("limit_regions endfor"); - return; - } -+#ifdef CONFIG_XEN -+ if (i==e820.nr_map && current_addr < size) { -+ /* -+ * The e820 map finished before our requested size so -+ * extend the final entry to the requested address. -+ */ -+ --i; -+ if (e820.map[i].type == E820_RAM) -+ e820.map[i].size -= current_addr - size; -+ else -+ add_memory_region(current_addr, size - current_addr, E820_RAM); -+ } -+#endif - print_memory_map("limit_regions endfunc"); - } - -@@ -830,8 +912,15 @@ e820_all_mapped(unsigned long s, unsigne - u64 start = s; - u64 end = e; - int i; -+#ifndef CONFIG_XEN - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; -+#else -+ if (!is_initial_xendomain()) -+ return 0; -+ for (i = 0; i < machine_e820.nr_map; ++i) { -+ const struct e820entry *ei = &machine_e820.map[i]; -+#endif - if (type && ei->type != type) - continue; - /* is the region (part) in overlap with the current region ?*/ -diff -r 4edbf98e9507 -r 3990a07432f0 arch/i386/kernel/entry-xen.S ---- /dev/null Thu Jan 01 00:00:00 1970 +0000 -+++ b/arch/i386/kernel/entry-xen.S Fri Jul 20 11:56:41 2007 -0300 -@@ -0,0 +1,1302 @@ -+/* -+ * linux/arch/i386/entry.S -+ * -+ * Copyright (C) 1991, 1992 Linus Torvalds -+ */ -+ -+/* -+ * entry.S contains the system-call and fault low-level handling routines. -+ * This also contains the timer-interrupt handler, as well as all interrupts -+ * and faults that can result in a task-switch. -+ * -+ * NOTE: This code handles signal-recognition, which happens every time -+ * after a timer-interrupt and after each system call. -+ * -+ * I changed all the .align's to 4 (16 byte alignment), as that's faster -+ * on a 486. -+ * -+ * Stack layout in 'ret_from_system_call': -+ * ptrace needs to have all regs on the stack. 
-+ * if the order here is changed, it needs to be -+ * updated in fork.c:copy_process, signal.c:do_signal, -+ * ptrace.c and ptrace.h -+ * -+ * 0(%esp) - %ebx -+ * 4(%esp) - %ecx -+ * 8(%esp) - %edx -+ * C(%esp) - %esi -+ * 10(%esp) - %edi -+ * 14(%esp) - %ebp -+ * 18(%esp) - %eax -+ * 1C(%esp) - %ds -+ * 20(%esp) - %es -+ * 24(%esp) - %fs -+ * 28(%esp) - orig_eax -+ * 2C(%esp) - %eip -+ * 30(%esp) - %cs -+ * 34(%esp) - %eflags -+ * 38(%esp) - %oldesp -+ * 3C(%esp) - %oldss -+ * -+ * "current" is in register %ebx during any slow entries. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "irq_vectors.h" -+#include -+ -+/* -+ * We use macros for low-level operations which need to be overridden -+ * for paravirtualization. The following will never clobber any registers: -+ * INTERRUPT_RETURN (aka. "iret") -+ * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax") -+ * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit"). -+ * -+ * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must -+ * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY). -+ * Allowing a register to be clobbered can shrink the paravirt replacement -+ * enough to patch inline, increasing performance. -+ */ -+ -+#define nr_syscalls ((syscall_table_size)/4) -+ -+CF_MASK = 0x00000001 -+TF_MASK = 0x00000100 -+IF_MASK = 0x00000200 -+DF_MASK = 0x00000400 -+NT_MASK = 0x00004000 -+VM_MASK = 0x00020000 -+/* Pseudo-eflags. */ -+NMI_MASK = 0x80000000 -+ -+#ifdef CONFIG_XEN -+/* Offsets into shared_info_t. */ -+#define evtchn_upcall_pending /* 0 */ -+#define evtchn_upcall_mask 1 -+ -+#define sizeof_vcpu_shift 6 -+ -+#ifdef CONFIG_SMP -+#define GET_VCPU_INFO movl %fs:PDA_cpu,%esi ; \ -+ shl $sizeof_vcpu_shift,%esi ; \ -+ addl HYPERVISOR_shared_info,%esi -+#else -+#define GET_VCPU_INFO movl HYPERVISOR_shared_info,%esi -+#endif -+ -+#define __DISABLE_INTERRUPTS movb $1,evtchn_upcall_mask(%esi) -+#define __ENABLE_INTERRUPTS movb $0,evtchn_upcall_mask(%esi) -+#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(%esi) -+#endif -+ -+#ifdef CONFIG_PREEMPT -+#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF -+#else -+#define preempt_stop(clobbers) -+#define resume_kernel restore_nocheck -+#endif -+ -+.macro TRACE_IRQS_IRET -+#ifdef CONFIG_TRACE_IRQFLAGS -+ testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off? 
-+ jz 1f -+ TRACE_IRQS_ON -+1: -+#endif -+.endm -+ -+#ifdef CONFIG_VM86 -+#define resume_userspace_sig check_userspace -+#else -+#define resume_userspace_sig resume_userspace -+#endif -+ -+#define SAVE_ALL \ -+ cld; \ -+ pushl %fs; \ -+ CFI_ADJUST_CFA_OFFSET 4;\ -+ /*CFI_REL_OFFSET fs, 0;*/\ -+ pushl %es; \ -+ CFI_ADJUST_CFA_OFFSET 4;\ -+ /*CFI_REL_OFFSET es, 0;*/\ -+ pushl %ds; \ -+ CFI_ADJUST_CFA_OFFSET 4;\ -+ /*CFI_REL_OFFSET ds, 0;*/\ -+ pushl %eax; \ -+ CFI_ADJUST_CFA_OFFSET 4;\ -+ CFI_REL_OFFSET eax, 0;\ -+ pushl %ebp; \ -+ CFI_ADJUST_CFA_OFFSET 4;\ -+ CFI_REL_OFFSET ebp, 0;\ -+ pushl %edi; \ -+ CFI_ADJUST_CFA_OFFSET 4;\ -+ CFI_REL_OFFSET edi, 0;\ -+ pushl %esi; \ -+ CFI_ADJUST_CFA_OFFSET 4;\ -+ CFI_REL_OFFSET esi, 0;\ -+ pushl %edx; \ -+ CFI_ADJUST_CFA_OFFSET 4;\ -+ CFI_REL_OFFSET edx, 0;\ -+ pushl %ecx; \ -+ CFI_ADJUST_CFA_OFFSET 4;\ -+ CFI_REL_OFFSET ecx, 0;\ -+ pushl %ebx; \ -+ CFI_ADJUST_CFA_OFFSET 4;\ -+ CFI_REL_OFFSET ebx, 0;\ -+ movl $(__USER_DS), %edx; \ -+ movl %edx, %ds; \ -+ movl %edx, %es; \ -+ movl $(__KERNEL_PDA), %edx; \ -+ movl %edx, %fs -+ -+#define RESTORE_INT_REGS \ -+ popl %ebx; \ -+ CFI_ADJUST_CFA_OFFSET -4;\ -+ CFI_RESTORE ebx;\ -+ popl %ecx; \ -+ CFI_ADJUST_CFA_OFFSET -4;\ -+ CFI_RESTORE ecx;\ -+ popl %edx; \ -+ CFI_ADJUST_CFA_OFFSET -4;\ -+ CFI_RESTORE edx;\ -+ popl %esi; \ -+ CFI_ADJUST_CFA_OFFSET -4;\ -+ CFI_RESTORE esi;\ -+ popl %edi; \ -+ CFI_ADJUST_CFA_OFFSET -4;\ -+ CFI_RESTORE edi;\ -+ popl %ebp; \ -+ CFI_ADJUST_CFA_OFFSET -4;\ -+ CFI_RESTORE ebp;\ -+ popl %eax; \ -+ CFI_ADJUST_CFA_OFFSET -4;\ -+ CFI_RESTORE eax -+ -+#define RESTORE_REGS \ -+ RESTORE_INT_REGS; \ -+1: popl %ds; \ -+ CFI_ADJUST_CFA_OFFSET -4;\ -+ /*CFI_RESTORE ds;*/\ -+2: popl %es; \ -+ CFI_ADJUST_CFA_OFFSET -4;\ -+ /*CFI_RESTORE es;*/\ -+3: popl %fs; \ -+ CFI_ADJUST_CFA_OFFSET -4;\ -+ /*CFI_RESTORE fs;*/\ -+.pushsection .fixup,"ax"; \ -+4: movl $0,(%esp); \ -+ jmp 1b; \ -+5: movl $0,(%esp); \ -+ jmp 2b; \ -+6: movl $0,(%esp); \ -+ jmp 3b; \ -+.section __ex_table,"a";\ -+ .align 4; \ -+ .long 1b,4b; \ -+ .long 2b,5b; \ -+ .long 3b,6b; \ -+.popsection -+ -+#define RING0_INT_FRAME \ -+ CFI_STARTPROC simple;\ -+ CFI_SIGNAL_FRAME;\ -+ CFI_DEF_CFA esp, 3*4;\ -+ /*CFI_OFFSET cs, -2*4;*/\ -+ CFI_OFFSET eip, -3*4 -+ -+#define RING0_EC_FRAME \ -+ CFI_STARTPROC simple;\ -+ CFI_SIGNAL_FRAME;\ -+ CFI_DEF_CFA esp, 4*4;\ -+ /*CFI_OFFSET cs, -2*4;*/\ -+ CFI_OFFSET eip, -3*4 -+ -+#define RING0_PTREGS_FRAME \ -+ CFI_STARTPROC simple;\ -+ CFI_SIGNAL_FRAME;\ -+ CFI_DEF_CFA esp, PT_OLDESP-PT_EBX;\ -+ /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/\ -+ CFI_OFFSET eip, PT_EIP-PT_OLDESP;\ -+ /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/\ -+ /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/\ -+ CFI_OFFSET eax, PT_EAX-PT_OLDESP;\ -+ CFI_OFFSET ebp, PT_EBP-PT_OLDESP;\ -+ CFI_OFFSET edi, PT_EDI-PT_OLDESP;\ -+ CFI_OFFSET esi, PT_ESI-PT_OLDESP;\ -+ CFI_OFFSET edx, PT_EDX-PT_OLDESP;\ -+ CFI_OFFSET ecx, PT_ECX-PT_OLDESP;\ -+ CFI_OFFSET ebx, PT_EBX-PT_OLDESP -+ -+ENTRY(ret_from_fork) -+ CFI_STARTPROC -+ pushl %eax -+ CFI_ADJUST_CFA_OFFSET 4 -+ call schedule_tail -+ GET_THREAD_INFO(%ebp) -+ popl %eax -+ CFI_ADJUST_CFA_OFFSET -4 -+ pushl $0x0202 # Reset kernel eflags -+ CFI_ADJUST_CFA_OFFSET 4 -+ popfl -+ CFI_ADJUST_CFA_OFFSET -4 -+ jmp syscall_exit -+ CFI_ENDPROC -+END(ret_from_fork) -+ -+/* -+ * Return to user mode is not as complex as all this looks, -+ * but we want the default path for a system call return to -+ * go as quickly as possible which is why some of this is -+ * less clear than it otherwise should be. 
-+ */ -+ -+ # userspace resumption stub bypassing syscall exit tracing -+ ALIGN -+ RING0_PTREGS_FRAME -+ret_from_exception: -+ preempt_stop(CLBR_ANY) -+ret_from_intr: -+ GET_THREAD_INFO(%ebp) -+check_userspace: -+ movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS -+ movb PT_CS(%esp), %al -+ andl $(VM_MASK | SEGMENT_RPL_MASK), %eax -+ cmpl $USER_RPL, %eax -+ jb resume_kernel # not returning to v8086 or userspace -+ -+ENTRY(resume_userspace) -+ DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt -+ # setting need_resched or sigpending -+ # between sampling and the iret -+ movl TI_flags(%ebp), %ecx -+ andl $_TIF_WORK_MASK, %ecx # is there any work to be done on -+ # int/exception return? -+ jne work_pending -+ jmp restore_all -+END(ret_from_exception) -+ -+#ifdef CONFIG_PREEMPT -+ENTRY(resume_kernel) -+ DISABLE_INTERRUPTS(CLBR_ANY) -+ cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? -+ jnz restore_nocheck -+need_resched: -+ movl TI_flags(%ebp), %ecx # need_resched set ? -+ testb $_TIF_NEED_RESCHED, %cl -+ jz restore_all -+ testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off (exception path) ? -+ jz restore_all -+ call preempt_schedule_irq -+ jmp need_resched -+END(resume_kernel) -+#endif -+ CFI_ENDPROC -+ -+/* SYSENTER_RETURN points to after the "sysenter" instruction in -+ the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ -+ -+ # sysenter call handler stub -+ENTRY(sysenter_entry) -+ CFI_STARTPROC simple -+ CFI_SIGNAL_FRAME -+ CFI_DEF_CFA esp, 0 -+ CFI_REGISTER esp, ebp -+ movl SYSENTER_stack_esp0(%esp),%esp -+sysenter_past_esp: -+ /* -+ * No need to follow this irqs on/off section: the syscall -+ * disabled irqs and here we enable it straight after entry: -+ */ -+ ENABLE_INTERRUPTS(CLBR_NONE) -+ pushl $(__USER_DS) -+ CFI_ADJUST_CFA_OFFSET 4 -+ /*CFI_REL_OFFSET ss, 0*/ -+ pushl %ebp -+ CFI_ADJUST_CFA_OFFSET 4 -+ CFI_REL_OFFSET esp, 0 -+ pushfl -+ CFI_ADJUST_CFA_OFFSET 4 -+ pushl $(__USER_CS) -+ CFI_ADJUST_CFA_OFFSET 4 -+ /*CFI_REL_OFFSET cs, 0*/ -+#ifndef CONFIG_COMPAT_VDSO -+ /* -+ * Push current_thread_info()->sysenter_return to the stack. -+ * A tiny bit of offset fixup is necessary - 4*4 means the 4 words -+ * pushed above; +8 corresponds to copy_thread's esp0 setting. -+ */ -+ pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) -+#else -+ pushl $SYSENTER_RETURN -+#endif -+ CFI_ADJUST_CFA_OFFSET 4 -+ CFI_REL_OFFSET eip, 0 -+ -+/* -+ * Load the potential sixth argument from user stack. -+ * Careful about security. -+ */ -+ cmpl $__PAGE_OFFSET-3,%ebp -+ jae syscall_fault -+1: movl (%ebp),%ebp -+.section __ex_table,"a" -+ .align 4 -+ .long 1b,syscall_fault -+.previous -+ -+ pushl %eax -+ CFI_ADJUST_CFA_OFFSET 4 -+ SAVE_ALL -+ GET_THREAD_INFO(%ebp) -+ -+ /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ -+ testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) -+ jnz syscall_trace_entry -+ cmpl $(nr_syscalls), %eax -+ jae syscall_badsys -+ call *sys_call_table(,%eax,4) -+ movl %eax,PT_EAX(%esp) -+ DISABLE_INTERRUPTS(CLBR_ECX|CLBR_EDX) -+ TRACE_IRQS_OFF -+ movl TI_flags(%ebp), %ecx -+ testw $_TIF_ALLWORK_MASK, %cx -+ jne syscall_exit_work -+/* if something modifies registers it must also disable sysexit */ -+ movl PT_EIP(%esp), %edx -+ movl PT_OLDESP(%esp), %ecx -+ xorl %ebp,%ebp -+ TRACE_IRQS_ON -+1: mov PT_FS(%esp), %fs -+#ifdef CONFIG_XEN -+ __ENABLE_INTERRUPTS -+sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ -+ __TEST_PENDING -+ jnz 14f # process more events if necessary... 
-+ movl PT_ESI(%esp), %esi -+ sysexit -+14: __DISABLE_INTERRUPTS -+ TRACE_IRQS_OFF -+sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/ -+ push %esp -+ CFI_ADJUST_CFA_OFFSET 4 -+ call evtchn_do_upcall -+ add $4,%esp -+ CFI_ADJUST_CFA_OFFSET -4 -+ jmp ret_from_intr -+#else -+ ENABLE_INTERRUPTS_SYSEXIT -+#endif /* !CONFIG_XEN */ -+ CFI_ENDPROC -+.pushsection .fixup,"ax" -+2: movl $0,PT_FS(%esp) -+ jmp 1b -+.section __ex_table,"a" -+ .align 4 -+ .long 1b,2b -+.popsection -+ENDPROC(sysenter_entry) -+ -+ # system call handler stub -+ENTRY(system_call) -+ RING0_INT_FRAME # can't unwind into user space anyway -+ pushl %eax # save orig_eax -+ CFI_ADJUST_CFA_OFFSET 4 -+ SAVE_ALL -+ GET_THREAD_INFO(%ebp) -+ testl $TF_MASK,PT_EFLAGS(%esp) -+ jz no_singlestep -+ orl $_TIF_SINGLESTEP,TI_flags(%ebp) -+no_singlestep: -+ # system call tracing in operation / emulation -+ /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ -+ testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) -+ jnz syscall_trace_entry -+ cmpl $(nr_syscalls), %eax -+ jae syscall_badsys -+syscall_call: -+ call *sys_call_table(,%eax,4) -+ movl %eax,PT_EAX(%esp) # store the return value -+syscall_exit: -+ DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt -+ # setting need_resched or sigpending -+ # between sampling and the iret -+ TRACE_IRQS_OFF -+ movl TI_flags(%ebp), %ecx -+ testw $_TIF_ALLWORK_MASK, %cx # current->work -+ jne syscall_exit_work -+ -+restore_all: -+#ifndef CONFIG_XEN -+ movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS -+ # Warning: PT_OLDSS(%esp) contains the wrong/random values if we -+ # are returning to the kernel. -+ # See comments in process.c:copy_thread() for details. -+ movb PT_OLDSS(%esp), %ah -+ movb PT_CS(%esp), %al -+ andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax -+ cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax -+ CFI_REMEMBER_STATE -+ je ldt_ss # returning to user-space with LDT SS -+restore_nocheck: -+#else -+restore_nocheck: -+ movl PT_EFLAGS(%esp), %eax -+ testl $(VM_MASK|NMI_MASK), %eax -+ CFI_REMEMBER_STATE -+ jnz hypervisor_iret -+ shr $9, %eax # EAX[0] == IRET_EFLAGS.IF -+ GET_VCPU_INFO -+ andb evtchn_upcall_mask(%esi),%al -+ andb $1,%al # EAX[0] == IRET_EFLAGS.IF & event_mask -+ CFI_REMEMBER_STATE -+ jnz restore_all_enable_events # != 0 => enable event delivery -+ CFI_REMEMBER_STATE -+#endif -+ TRACE_IRQS_IRET -+restore_nocheck_notrace: -+ RESTORE_REGS -+ addl $4, %esp # skip orig_eax/error_code -+ CFI_ADJUST_CFA_OFFSET -4 -+1: INTERRUPT_RETURN -+.section .fixup,"ax" -+iret_exc: -+#ifndef CONFIG_XEN -+ TRACE_IRQS_ON -+ ENABLE_INTERRUPTS(CLBR_NONE) -+#endif -+ pushl $0 # no error code -+ pushl $do_iret_error -+ jmp error_code -+.previous -+.section __ex_table,"a" -+ .align 4 -+ .long 1b,iret_exc -+.previous -+ -+ CFI_RESTORE_STATE -+#ifndef CONFIG_XEN -+ldt_ss: -+ larl PT_OLDSS(%esp), %eax -+ jnz restore_nocheck -+ testl $0x00400000, %eax # returning to 32bit stack? -+ jnz restore_nocheck # allright, normal return -+ -+#ifdef CONFIG_PARAVIRT -+ /* -+ * The kernel can't run on a non-flat stack if paravirt mode -+ * is active. Rather than try to fixup the high bits of -+ * ESP, bypass this code entirely. This may break DOSemu -+ * and/or Wine support in a paravirt VM, although the option -+ * is still available to implement the setting of the high -+ * 16-bits in the INTERRUPT_RETURN paravirt-op. 
-+ */ -+ cmpl $0, paravirt_ops+PARAVIRT_enabled -+ jne restore_nocheck -+#endif -+ -+ /* If returning to userspace with 16bit stack, -+ * try to fix the higher word of ESP, as the CPU -+ * won't restore it. -+ * This is an "official" bug of all the x86-compatible -+ * CPUs, which we can try to work around to make -+ * dosemu and wine happy. */ -+ movl PT_OLDESP(%esp), %eax -+ movl %esp, %edx -+ call patch_espfix_desc -+ pushl $__ESPFIX_SS -+ CFI_ADJUST_CFA_OFFSET 4 -+ pushl %eax -+ CFI_ADJUST_CFA_OFFSET 4 -+ DISABLE_INTERRUPTS(CLBR_EAX) -+ TRACE_IRQS_OFF -+ lss (%esp), %esp -+ CFI_ADJUST_CFA_OFFSET -8 -+ jmp restore_nocheck -+#else -+ ALIGN -+restore_all_enable_events: -+ TRACE_IRQS_ON -+ __ENABLE_INTERRUPTS -+scrit: /**** START OF CRITICAL REGION ****/ -+ __TEST_PENDING -+ jnz 14f # process more events if necessary... -+ RESTORE_REGS -+ addl $4, %esp -+ CFI_ADJUST_CFA_OFFSET -4 -+1: iret -+.section __ex_table,"a" -+ .align 4 -+ .long 1b,iret_exc -+.previous -+14: __DISABLE_INTERRUPTS -+ TRACE_IRQS_OFF -+ jmp 11f -+ecrit: /**** END OF CRITICAL REGION ****/ -+ -+ CFI_RESTORE_STATE -+hypervisor_iret: -+ andl $~NMI_MASK, PT_EFLAGS(%esp) -+ RESTORE_REGS -+ addl $4, %esp -+ CFI_ADJUST_CFA_OFFSET -4 -+ jmp hypercall_page + (__HYPERVISOR_iret * 32) -+#endif -+ CFI_ENDPROC -+ENDPROC(system_call) -+ -+ # perform work that needs to be done immediately before resumption -+ ALIGN -+ RING0_PTREGS_FRAME # can't unwind into user space anyway -+work_pending: -+ testb $_TIF_NEED_RESCHED, %cl -+ jz work_notifysig -+work_resched: -+ call schedule -+ DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt -+ # setting need_resched or sigpending -+ # between sampling and the iret -+ TRACE_IRQS_OFF -+ movl TI_flags(%ebp), %ecx -+ andl $_TIF_WORK_MASK, %ecx # is there any work to be done other -+ # than syscall tracing? 
-+ jz restore_all -+ testb $_TIF_NEED_RESCHED, %cl -+ jnz work_resched -+ -+work_notifysig: # deal with pending signals and -+ # notify-resume requests -+#ifdef CONFIG_VM86 -+ testl $VM_MASK, PT_EFLAGS(%esp) -+ movl %esp, %eax -+ jne work_notifysig_v86 # returning to kernel-space or -+ # vm86-space -+ xorl %edx, %edx -+ call do_notify_resume -+ jmp resume_userspace_sig -+ -+ ALIGN -+work_notifysig_v86: -+ pushl %ecx # save ti_flags for do_notify_resume -+ CFI_ADJUST_CFA_OFFSET 4 -+ call save_v86_state # %eax contains pt_regs pointer -+ popl %ecx -+ CFI_ADJUST_CFA_OFFSET -4 -+ movl %eax, %esp -+#else -+ movl %esp, %eax -+#endif -+ xorl %edx, %edx -+ call do_notify_resume -+ jmp resume_userspace_sig -+END(work_pending) -+ -+ # perform syscall exit tracing -+ ALIGN -+syscall_trace_entry: -+ movl $-ENOSYS,PT_EAX(%esp) -+ movl %esp, %eax -+ xorl %edx,%edx -+ call do_syscall_trace -+ cmpl $0, %eax -+ jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU, -+ # so must skip actual syscall -+ movl PT_ORIG_EAX(%esp), %eax -+ cmpl $(nr_syscalls), %eax -+ jnae syscall_call -+ jmp syscall_exit -+END(syscall_trace_entry) -+ -+ # perform syscall exit tracing -+ ALIGN -+syscall_exit_work: -+ testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl -+ jz work_pending -+ TRACE_IRQS_ON -+ ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call -+ # schedule() instead -+ movl %esp, %eax -+ movl $1, %edx -+ call do_syscall_trace -+ jmp resume_userspace -+END(syscall_exit_work) -+ CFI_ENDPROC -+ -+ RING0_INT_FRAME # can't unwind into user space anyway -+syscall_fault: -+ pushl %eax # save orig_eax -+ CFI_ADJUST_CFA_OFFSET 4 -+ SAVE_ALL -+ GET_THREAD_INFO(%ebp) -+ movl $-EFAULT,PT_EAX(%esp) -+ jmp resume_userspace -+END(syscall_fault) -+ -+syscall_badsys: -+ movl $-ENOSYS,PT_EAX(%esp) -+ jmp resume_userspace -+END(syscall_badsys) -+ CFI_ENDPROC -+ -+#ifndef CONFIG_XEN -+#define FIXUP_ESPFIX_STACK \ -+ /* since we are on a wrong stack, we cant make it a C code :( */ \ -+ movl %fs:PDA_cpu, %ebx; \ -+ PER_CPU(cpu_gdt_descr, %ebx); \ -+ movl GDS_address(%ebx), %ebx; \ -+ GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \ -+ addl %esp, %eax; \ -+ pushl $__KERNEL_DS; \ -+ CFI_ADJUST_CFA_OFFSET 4; \ -+ pushl %eax; \ -+ CFI_ADJUST_CFA_OFFSET 4; \ -+ lss (%esp), %esp; \ -+ CFI_ADJUST_CFA_OFFSET -8; -+#define UNWIND_ESPFIX_STACK \ -+ movl %ss, %eax; \ -+ /* see if on espfix stack */ \ -+ cmpw $__ESPFIX_SS, %ax; \ -+ jne 27f; \ -+ movl $__KERNEL_DS, %eax; \ -+ movl %eax, %ds; \ -+ movl %eax, %es; \ -+ /* switch to normal stack */ \ -+ FIXUP_ESPFIX_STACK; \ -+27:; -+ -+/* -+ * Build the entry stubs and pointer table with -+ * some assembler magic. 
-+ */ -+.data -+ENTRY(interrupt) -+.text -+ -+ENTRY(irq_entries_start) -+ RING0_INT_FRAME -+vector=0 -+.rept NR_IRQS -+ ALIGN -+ .if vector -+ CFI_ADJUST_CFA_OFFSET -4 -+ .endif -+1: pushl $~(vector) -+ CFI_ADJUST_CFA_OFFSET 4 -+ jmp common_interrupt -+ .previous -+ .long 1b -+ .text -+vector=vector+1 -+.endr -+END(irq_entries_start) -+ -+.previous -+END(interrupt) -+.previous -+ -+/* -+ * the CPU automatically disables interrupts when executing an IRQ vector, -+ * so IRQ-flags tracing has to follow that: -+ */ -+ ALIGN -+common_interrupt: -+ SAVE_ALL -+ TRACE_IRQS_OFF -+ movl %esp,%eax -+ call do_IRQ -+ jmp ret_from_intr -+ENDPROC(common_interrupt) -+ CFI_ENDPROC -+ -+#define BUILD_INTERRUPT(name, nr) \ -+ENTRY(name) \ -+ RING0_INT_FRAME; \ -+ pushl $~(nr); \ -+ CFI_ADJUST_CFA_OFFSET 4; \ -+ SAVE_ALL; \ -+ TRACE_IRQS_OFF \ -+ movl %esp,%eax; \ -+ call smp_/**/name; \ -+ jmp ret_from_intr; \ -+ CFI_ENDPROC; \ -+ENDPROC(name) -+ -+/* The include is where all of the SMP etc. interrupts come from */ -+#include "entry_arch.h" -+#else -+#define UNWIND_ESPFIX_STACK -+#endif -+ -+/* This alternate entry is needed because we hijack the apic LVTT */ -+#if defined(CONFIG_VMI) && defined(CONFIG_X86_LOCAL_APIC) -+BUILD_INTERRUPT(apic_vmi_timer_interrupt,LOCAL_TIMER_VECTOR) -+#endif -+ -+KPROBE_ENTRY(page_fault) -+ RING0_EC_FRAME -+ pushl $do_page_fault -+ CFI_ADJUST_CFA_OFFSET 4 -+ ALIGN -+error_code: -+ /* the function address is in %fs's slot on the stack */ -+ pushl %es -+ CFI_ADJUST_CFA_OFFSET 4 -+ /*CFI_REL_OFFSET es, 0*/ -+ pushl %ds -+ CFI_ADJUST_CFA_OFFSET 4 -+ /*CFI_REL_OFFSET ds, 0*/ -+ pushl %eax -+ CFI_ADJUST_CFA_OFFSET 4 -+ CFI_REL_OFFSET eax, 0 -+ pushl %ebp -+ CFI_ADJUST_CFA_OFFSET 4 -+ CFI_REL_OFFSET ebp, 0 -+ pushl %edi -+ CFI_ADJUST_CFA_OFFSET 4 -+ CFI_REL_OFFSET edi, 0 -+ pushl %esi -+ CFI_ADJUST_CFA_OFFSET 4 -+ CFI_REL_OFFSET esi, 0 -+ pushl %edx -+ CFI_ADJUST_CFA_OFFSET 4 -+ CFI_REL_OFFSET edx, 0 -+ pushl %ecx -+ CFI_ADJUST_CFA_OFFSET 4 -+ CFI_REL_OFFSET ecx, 0 -+ pushl %ebx -+ CFI_ADJUST_CFA_OFFSET 4 -+ CFI_REL_OFFSET ebx, 0 -+ cld -+ pushl %fs -+ CFI_ADJUST_CFA_OFFSET 4 -+ /*CFI_REL_OFFSET fs, 0*/ -+ movl $(__KERNEL_PDA), %ecx -+ movl %ecx, %fs -+ UNWIND_ESPFIX_STACK -+ popl %ecx -+ CFI_ADJUST_CFA_OFFSET -4 -+ /*CFI_REGISTER es, ecx*/ -+ movl PT_FS(%esp), %edi # get the function address -+ movl PT_ORIG_EAX(%esp), %edx # get the error code -+ movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart -+ mov %ecx, PT_FS(%esp) -+ /*CFI_REL_OFFSET fs, ES*/ -+ movl $(__USER_DS), %ecx -+ movl %ecx, %ds -+ movl %ecx, %es -+ movl %esp,%eax # pt_regs pointer -+ call *%edi -+ jmp ret_from_exception -+ CFI_ENDPROC -+KPROBE_END(page_fault) -+ -+#ifdef CONFIG_XEN -+# A note on the "critical region" in our callback handler. -+# We want to avoid stacking callback handlers due to events occurring -+# during handling of the last event. To do this, we keep events disabled -+# until we've done all processing. HOWEVER, we must enable events before -+# popping the stack frame (can't be done atomically) and so it would still -+# be possible to get enough handler activations to overflow the stack. -+# Although unlikely, bugs of that kind are hard to track down, so we'd -+# like to avoid the possibility. -+# So, on entry to the handler we detect whether we interrupted an -+# existing activation in its critical region -- if so, we pop the current -+# activation and restart the handler using the previous one. -+# -+# The sysexit critical region is slightly different. 
sysexit -+# atomically removes the entire stack frame. If we interrupt in the -+# critical region we know that the entire frame is present and correct -+# so we can simply throw away the new one. -+ENTRY(hypervisor_callback) -+ RING0_INT_FRAME -+ pushl %eax -+ CFI_ADJUST_CFA_OFFSET 4 -+ SAVE_ALL -+ movl PT_EIP(%esp),%eax -+ cmpl $scrit,%eax -+ jb 11f -+ cmpl $ecrit,%eax -+ jb critical_region_fixup -+ cmpl $sysexit_scrit,%eax -+ jb 11f -+ cmpl $sysexit_ecrit,%eax -+ ja 11f -+ # interrupted in sysexit critical -+ addl $PT_OLDESP,%esp # Remove cs...ebx from stack frame. -+11: push %esp -+ CFI_ADJUST_CFA_OFFSET 4 -+ call evtchn_do_upcall -+ add $4,%esp -+ CFI_ADJUST_CFA_OFFSET -4 -+ jmp ret_from_intr -+ CFI_ENDPROC -+ -+# [How we do the fixup]. We want to merge the current stack frame with the -+# just-interrupted frame. How we do this depends on where in the critical -+# region the interrupted handler was executing, and so how many saved -+# registers are in each frame. We do this quickly using the lookup table -+# 'critical_fixup_table'. For each byte offset in the critical region, it -+# provides the number of bytes which have already been popped from the -+# interrupted stack frame. -+critical_region_fixup: -+ movzbl critical_fixup_table-scrit(%eax),%ecx # %eax contains num bytes popped -+ cmpb $0xff,%cl # 0xff => vcpu_info critical region -+ jne 15f -+ xorl %ecx,%ecx -+15: leal (%esp,%ecx),%esi # %esi points at end of src region -+ leal PT_OLDESP(%esp),%edi # %edi points at end of dst region -+ shrl $2,%ecx # convert words to bytes -+ je 17f # skip loop if nothing to copy -+16: subl $4,%esi # pre-decrementing copy loop -+ subl $4,%edi -+ movl (%esi),%eax -+ movl %eax,(%edi) -+ loop 16b -+17: movl %edi,%esp # final %edi is top of merged stack -+ jmp 11b -+ -+.section .rodata,"a" -+critical_fixup_table: -+ .byte 0xff,0xff,0xff # testb $0xff,(%esi) = __TEST_PENDING -+ .byte 0xff,0xff # jnz 14f -+ .byte 0x00 # pop %ebx -+ .byte 0x04 # pop %ecx -+ .byte 0x08 # pop %edx -+ .byte 0x0c # pop %esi -+ .byte 0x10 # pop %edi -+ .byte 0x14 # pop %ebp -+ .byte 0x18 # pop %eax -+ .byte 0x1c # pop %ds -+ .byte 0x20 # pop %es -+ .byte 0x24,0x24 # pop %fs -+ .byte 0x28,0x28,0x28 # add $4,%esp -+ .byte 0x2c # iret -+ .byte 0xff,0xff,0xff,0xff # movb $1,1(%esi) -+ .byte 0x00,0x00 # jmp 11b -+.previous -+ -+# Hypervisor uses this for application faults while it executes. -+# We get here for two reasons: -+# 1. Fault while reloading DS, ES, FS or GS -+# 2. Fault while executing IRET -+# Category 1 we fix up by reattempting the load, and zeroing the segment -+# register if the load fails. -+# Category 2 we fix up by jumping to do_iret_error. We cannot use the -+# normal Linux return path in this case because if we use the IRET hypercall -+# to pop the stack frame we end up in an infinite loop of failsafe callbacks. -+# We distinguish between categories by maintaining a status value in EAX. 
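-+#
-+# A minimal C sketch of the frame merge performed by critical_region_fixup
-+# above (illustrative only: merge_frames is a hypothetical name, 'bytes'
-+# stands for the critical_fixup_table value, and PT_OLDESP is the byte
-+# size of the register-save area):
-+#
-+#	static unsigned long *merge_frames(unsigned long *esp, unsigned bytes)
-+#	{
-+#		unsigned long *src = esp + bytes / 4;	  /* end of new frame */
-+#		unsigned long *dst = esp + PT_OLDESP / 4; /* end of old frame */
-+#		while (src > esp)	/* pre-decrementing copy, bytes/4 words */
-+#			*--dst = *--src;
-+#		return dst;		/* top of the merged stack */
-+#	}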
-+ENTRY(failsafe_callback) -+ RING0_INT_FRAME -+ pushl %eax -+ CFI_ADJUST_CFA_OFFSET 4 -+ movl $1,%eax -+1: mov 4(%esp),%ds -+2: mov 8(%esp),%es -+3: mov 12(%esp),%fs -+4: mov 16(%esp),%gs -+ testl %eax,%eax -+ popl %eax -+ CFI_ADJUST_CFA_OFFSET -4 -+ jz 5f -+ addl $16,%esp # EAX != 0 => Category 2 (Bad IRET) -+ jmp iret_exc -+5: addl $16,%esp # EAX == 0 => Category 1 (Bad segment) -+ pushl $0 -+ SAVE_ALL -+ jmp ret_from_exception -+.section .fixup,"ax"; \ -+6: xorl %eax,%eax; \ -+ movl %eax,4(%esp); \ -+ jmp 1b; \ -+7: xorl %eax,%eax; \ -+ movl %eax,8(%esp); \ -+ jmp 2b; \ -+8: xorl %eax,%eax; \ -+ movl %eax,12(%esp); \ -+ jmp 3b; \ -+9: xorl %eax,%eax; \ -+ movl %eax,16(%esp); \ -+ jmp 4b; \ -+.previous; \ -+.section __ex_table,"a"; \ -+ .align 4; \ -+ .long 1b,6b; \ -+ .long 2b,7b; \ -+ .long 3b,8b; \ -+ .long 4b,9b; \ -+.previous -+#endif -+ CFI_ENDPROC -+ -+ENTRY(coprocessor_error) -+ RING0_INT_FRAME -+ pushl $0 -+ CFI_ADJUST_CFA_OFFSET 4 -+ pushl $do_coprocessor_error -+ CFI_ADJUST_CFA_OFFSET 4 -+ jmp error_code -+ CFI_ENDPROC -+END(coprocessor_error) -+ -+ENTRY(simd_coprocessor_error) -+ RING0_INT_FRAME -+ pushl $0 -+ CFI_ADJUST_CFA_OFFSET 4 -+ pushl $do_simd_coprocessor_error -+ CFI_ADJUST_CFA_OFFSET 4 -+ jmp error_code -+ CFI_ENDPROC -+END(simd_coprocessor_error) -+ -+ENTRY(device_not_available) -+ RING0_INT_FRAME -+ pushl $-1 # mark this as an int -+ CFI_ADJUST_CFA_OFFSET 4 -+ SAVE_ALL -+#ifndef CONFIG_XEN -+ GET_CR0_INTO_EAX -+ testl $0x4, %eax # EM (math emulation bit) -+ je device_available_emulate -+ pushl $0 # temporary storage for ORIG_EIP -+ CFI_ADJUST_CFA_OFFSET 4 -+ call math_emulate -+ addl $4, %esp -+ CFI_ADJUST_CFA_OFFSET -4 -+ jmp ret_from_exception -+device_available_emulate: -+#endif -+ preempt_stop(CLBR_ANY) -+ call math_state_restore -+ jmp ret_from_exception -+ CFI_ENDPROC -+END(device_not_available) -+ -+#ifndef CONFIG_XEN -+/* -+ * Debug traps and NMI can happen at the one SYSENTER instruction -+ * that sets up the real kernel stack. Check here, since we can't -+ * allow the wrong stack to be used. -+ * -+ * "SYSENTER_stack_esp0+12" is because the NMI/debug handler will have -+ * already pushed 3 words if it hits on the sysenter instruction: -+ * eflags, cs and eip. -+ * -+ * We just load the right stack, and push the three (known) values -+ * by hand onto the new stack - while updating the return eip past -+ * the instruction that would have done it for sysenter. -+ */ -+#define FIX_STACK(offset, ok, label) \ -+ cmpw $__KERNEL_CS,4(%esp); \ -+ jne ok; \ -+label: \ -+ movl SYSENTER_stack_esp0+offset(%esp),%esp; \ -+ CFI_DEF_CFA esp, 0; \ -+ CFI_UNDEFINED eip; \ -+ pushfl; \ -+ CFI_ADJUST_CFA_OFFSET 4; \ -+ pushl $__KERNEL_CS; \ -+ CFI_ADJUST_CFA_OFFSET 4; \ -+ pushl $sysenter_past_esp; \ -+ CFI_ADJUST_CFA_OFFSET 4; \ -+ CFI_REL_OFFSET eip, 0 -+#endif /* CONFIG_XEN */ -+ -+KPROBE_ENTRY(debug) -+ RING0_INT_FRAME -+#ifndef CONFIG_XEN -+ cmpl $sysenter_entry,(%esp) -+ jne debug_stack_correct -+ FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn) -+debug_stack_correct: -+#endif /* !CONFIG_XEN */ -+ pushl $-1 # mark this as an int -+ CFI_ADJUST_CFA_OFFSET 4 -+ SAVE_ALL -+ xorl %edx,%edx # error code 0 -+ movl %esp,%eax # pt_regs pointer -+ call do_debug -+ jmp ret_from_exception -+ CFI_ENDPROC -+KPROBE_END(debug) -+ .previous .text -+ -+#ifndef CONFIG_XEN -+/* -+ * NMI is doubly nasty. It can happen _while_ we're handling -+ * a debug fault, and the debug fault hasn't yet been able to -+ * clear up the stack. 
So we first check whether we got an -+ * NMI on the sysenter entry path, but after that we need to -+ * check whether we got an NMI on the debug path where the debug -+ * fault happened on the sysenter path. -+ */ -+KPROBE_ENTRY(nmi) -+ RING0_INT_FRAME -+ pushl %eax -+ CFI_ADJUST_CFA_OFFSET 4 -+ movl %ss, %eax -+ cmpw $__ESPFIX_SS, %ax -+ popl %eax -+ CFI_ADJUST_CFA_OFFSET -4 -+ je nmi_espfix_stack -+ cmpl $sysenter_entry,(%esp) -+ je nmi_stack_fixup -+ pushl %eax -+ CFI_ADJUST_CFA_OFFSET 4 -+ movl %esp,%eax -+ /* Do not access memory above the end of our stack page, -+ * it might not exist. -+ */ -+ andl $(THREAD_SIZE-1),%eax -+ cmpl $(THREAD_SIZE-20),%eax -+ popl %eax -+ CFI_ADJUST_CFA_OFFSET -4 -+ jae nmi_stack_correct -+ cmpl $sysenter_entry,12(%esp) -+ je nmi_debug_stack_check -+nmi_stack_correct: -+ /* We have a RING0_INT_FRAME here */ -+ pushl %eax -+ CFI_ADJUST_CFA_OFFSET 4 -+ SAVE_ALL -+ xorl %edx,%edx # zero error code -+ movl %esp,%eax # pt_regs pointer -+ call do_nmi -+ jmp restore_nocheck_notrace -+ CFI_ENDPROC -+ -+nmi_stack_fixup: -+ RING0_INT_FRAME -+ FIX_STACK(12,nmi_stack_correct, 1) -+ jmp nmi_stack_correct -+ -+nmi_debug_stack_check: -+ /* We have a RING0_INT_FRAME here */ -+ cmpw $__KERNEL_CS,16(%esp) -+ jne nmi_stack_correct -+ cmpl $debug,(%esp) -+ jb nmi_stack_correct -+ cmpl $debug_esp_fix_insn,(%esp) -+ ja nmi_stack_correct -+ FIX_STACK(24,nmi_stack_correct, 1) -+ jmp nmi_stack_correct -+ -+nmi_espfix_stack: -+ /* We have a RING0_INT_FRAME here. -+ * -+ * create the pointer to lss back -+ */ -+ pushl %ss -+ CFI_ADJUST_CFA_OFFSET 4 -+ pushl %esp -+ CFI_ADJUST_CFA_OFFSET 4 -+ addw $4, (%esp) -+ /* copy the iret frame of 12 bytes */ -+ .rept 3 -+ pushl 16(%esp) -+ CFI_ADJUST_CFA_OFFSET 4 -+ .endr -+ pushl %eax -+ CFI_ADJUST_CFA_OFFSET 4 -+ SAVE_ALL -+ FIXUP_ESPFIX_STACK # %eax == %esp -+ xorl %edx,%edx # zero error code -+ call do_nmi -+ RESTORE_REGS -+ lss 12+4(%esp), %esp # back to espfix stack -+ CFI_ADJUST_CFA_OFFSET -24 -+1: INTERRUPT_RETURN -+ CFI_ENDPROC -+.section __ex_table,"a" -+ .align 4 -+ .long 1b,iret_exc -+.previous -+KPROBE_END(nmi) -+#else -+KPROBE_ENTRY(nmi) -+ RING0_INT_FRAME -+ pushl %eax -+ CFI_ADJUST_CFA_OFFSET 4 -+ SAVE_ALL -+ xorl %edx,%edx # zero error code -+ movl %esp,%eax # pt_regs pointer -+ call do_nmi -+ orl $NMI_MASK, PT_EFLAGS(%esp) -+ jmp restore_all -+ CFI_ENDPROC -+KPROBE_END(nmi) -+#endif -+ -+#ifdef CONFIG_PARAVIRT -+ENTRY(native_iret) -+1: iret -+.section __ex_table,"a" -+ .align 4 -+ .long 1b,iret_exc -+.previous -+END(native_iret) -+ -+ENTRY(native_irq_enable_sysexit) -+ sti -+ sysexit -+END(native_irq_enable_sysexit) -+#endif -+ -+KPROBE_ENTRY(int3) -+ RING0_INT_FRAME -+ pushl $-1 # mark this as an int -+ CFI_ADJUST_CFA_OFFSET 4 -+ SAVE_ALL -+ xorl %edx,%edx # zero error code -+ movl %esp,%eax # pt_regs pointer -+ call do_int3 -+ jmp ret_from_exception -+ CFI_ENDPROC -+KPROBE_END(int3) -+ -+ENTRY(overflow) -+ RING0_INT_FRAME -+ pushl $0 -+ CFI_ADJUST_CFA_OFFSET 4 -+ pushl $do_overflow -+ CFI_ADJUST_CFA_OFFSET 4 -+ jmp error_code -+ CFI_ENDPROC -+END(overflow) -+ -+ENTRY(bounds) -+ RING0_INT_FRAME -+ pushl $0 -+ CFI_ADJUST_CFA_OFFSET 4 -+ pushl $do_bounds -+ CFI_ADJUST_CFA_OFFSET 4 -+ jmp error_code -+ CFI_ENDPROC -+END(bounds) -+ -+ENTRY(invalid_op) -+ RING0_INT_FRAME -+ pushl $0 -+ CFI_ADJUST_CFA_OFFSET 4 -+ pushl $do_invalid_op -+ CFI_ADJUST_CFA_OFFSET 4 -+ jmp error_code -+ CFI_ENDPROC -+END(invalid_op) -+ -+ENTRY(coprocessor_segment_overrun) -+ RING0_INT_FRAME -+ pushl $0 -+ CFI_ADJUST_CFA_OFFSET 4 -+ pushl 
$do_coprocessor_segment_overrun -+ CFI_ADJUST_CFA_OFFSET 4 -+ jmp error_code -+ CFI_ENDPROC -+END(coprocessor_segment_overrun) -+ -+ENTRY(invalid_TSS) -+ RING0_EC_FRAME -+ pushl $do_invalid_TSS -+ CFI_ADJUST_CFA_OFFSET 4 -+ jmp error_code -+ CFI_ENDPROC -+END(invalid_TSS) -+ -+ENTRY(segment_not_present) -+ RING0_EC_FRAME -+ pushl $do_segment_not_present -+ CFI_ADJUST_CFA_OFFSET 4 -+ jmp error_code -+ CFI_ENDPROC -+END(segment_not_present) -+ -+ENTRY(stack_segment) -+ RING0_EC_FRAME -+ pushl $do_stack_segment -+ CFI_ADJUST_CFA_OFFSET 4 -+ jmp error_code -+ CFI_ENDPROC -+END(stack_segment) -+ -+KPROBE_ENTRY(general_protection) -+ RING0_EC_FRAME -+ pushl $do_general_protection -+ CFI_ADJUST_CFA_OFFSET 4 -+ jmp error_code -+ CFI_ENDPROC -+KPROBE_END(general_protection) -+ -+ENTRY(alignment_check) -+ RING0_EC_FRAME -+ pushl $do_alignment_check -+ CFI_ADJUST_CFA_OFFSET 4 -+ jmp error_code -+ CFI_ENDPROC -+END(alignment_check) -+ -+ENTRY(divide_error) -+ RING0_INT_FRAME -+ pushl $0 # no error code -+ CFI_ADJUST_CFA_OFFSET 4 -+ pushl $do_divide_error -+ CFI_ADJUST_CFA_OFFSET 4 -+ jmp error_code -+ CFI_ENDPROC -+END(divide_error) -+ -+#ifdef CONFIG_X86_MCE -+ENTRY(machine_check) -+ RING0_INT_FRAME -+ pushl $0 -+ CFI_ADJUST_CFA_OFFSET 4 -+ pushl machine_check_vector -+ CFI_ADJUST_CFA_OFFSET 4 -+ jmp error_code -+ CFI_ENDPROC -+END(machine_check) -+#endif -+ -+#ifndef CONFIG_XEN -+ENTRY(spurious_interrupt_bug) -+ RING0_INT_FRAME -+ pushl $0 -+ CFI_ADJUST_CFA_OFFSET 4 -+ pushl $do_spurious_interrupt_bug -+ CFI_ADJUST_CFA_OFFSET 4 -+ jmp error_code -+ CFI_ENDPROC -+END(spurious_interrupt_bug) -+#endif /* !CONFIG_XEN */ -+ -+ENTRY(kernel_thread_helper) -+ pushl $0 # fake return address for unwinder -+ CFI_STARTPROC -+ movl %edx,%eax -+ push %edx -+ CFI_ADJUST_CFA_OFFSET 4 -+ call *%ebx -+ push %eax -+ CFI_ADJUST_CFA_OFFSET 4 -+ call do_exit -+ CFI_ENDPROC -+ENDPROC(kernel_thread_helper) -+ -+ENTRY(fixup_4gb_segment) -+ RING0_EC_FRAME -+ pushl $do_fixup_4gb_segment -+ CFI_ADJUST_CFA_OFFSET 4 -+ jmp error_code -+ CFI_ENDPROC -+ -+.section .rodata,"a" -+.align 4 -+#include "syscall_table.S" -+ -+syscall_table_size=(.-sys_call_table) -diff -r 4edbf98e9507 -r 3990a07432f0 arch/i386/kernel/entry.S ---- a/arch/i386/kernel/entry.S Fri Jul 20 11:42:41 2007 -0300 -+++ b/arch/i386/kernel/entry.S Fri Jul 20 11:56:41 2007 -0300 -@@ -287,7 +287,7 @@ ENTRY(sysenter_entry) - CFI_SIGNAL_FRAME - CFI_DEF_CFA esp, 0 - CFI_REGISTER esp, ebp -- movl TSS_sysenter_esp0(%esp),%esp -+ movl SYSENTER_stack_esp0(%esp),%esp - sysenter_past_esp: - /* - * No need to follow this irqs on/off section: the syscall -@@ -752,7 +752,7 @@ END(device_not_available) - * that sets up the real kernel stack. Check here, since we can't - * allow the wrong stack to be used. - * -- * "TSS_sysenter_esp0+12" is because the NMI/debug handler will have -+ * "SYSENTER_stack_esp0+12" is because the NMI/debug handler will have - * already pushed 3 words if it hits on the sysenter instruction: - * eflags, cs and eip. 
- * -@@ -764,7 +764,7 @@ END(device_not_available) - cmpw $__KERNEL_CS,4(%esp); \ - jne ok; \ - label: \ -- movl TSS_sysenter_esp0+offset(%esp),%esp; \ -+ movl SYSENTER_stack_esp0+offset(%esp),%esp; \ - CFI_DEF_CFA esp, 0; \ - CFI_UNDEFINED eip; \ - pushfl; \ -diff -r 4edbf98e9507 -r 3990a07432f0 arch/i386/kernel/fixup.c ---- /dev/null Thu Jan 01 00:00:00 1970 +0000 -+++ b/arch/i386/kernel/fixup.c Fri Jul 20 11:56:41 2007 -0300 -@@ -0,0 +1,95 @@ -+/****************************************************************************** -+ * fixup.c -+ * -+ * Binary-rewriting of certain IA32 instructions, on notification by Xen. -+ * Used to avoid repeated slow emulation of common instructions used by the -+ * user-space TLS (Thread-Local Storage) libraries. -+ * -+ * **** NOTE **** -+ * Issues with the binary rewriting have caused it to be removed. Instead -+ * we rely on Xen's emulator to boot the kernel, and then print a banner -+ * message recommending that the user disables /lib/tls. -+ * -+ * Copyright (c) 2004, K A Fraser -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#define DP(_f, _args...) printk(KERN_ALERT " " _f "\n" , ## _args ) -+ -+fastcall void do_fixup_4gb_segment(struct pt_regs *regs, long error_code) -+{ -+#if 0 -+ static unsigned long printed = 0; -+ char info[100]; -+ int i; -+ -+ /* Ignore statically-linked init. */ -+ if (current->tgid == 1) -+ return; -+ -+ HYPERVISOR_vm_assist( -+ VMASST_CMD_disable, VMASST_TYPE_4gb_segments_notify); -+ -+ if (test_and_set_bit(0, &printed)) -+ return; -+ -+ sprintf(info, "%s (pid=%d)", current->comm, current->tgid); -+ -+ DP(""); -+ DP("***************************************************************"); -+ DP("***************************************************************"); -+ DP("** WARNING: Currently emulating unsupported memory accesses **"); -+ DP("** in /lib/tls glibc libraries. The emulation is **"); -+ DP("** slow. To ensure full performance you should **"); -+ DP("** install a 'xen-friendly' (nosegneg) version of **"); -+ DP("** the library, or disable tls support by executing **"); -+ DP("** the following as root: **"); -+ DP("** mv /lib/tls /lib/tls.disabled **"); -+ DP("** Offending process: %-38.38s **", info); -+ DP("***************************************************************"); -+ DP("***************************************************************"); -+ DP(""); -+ -+ for (i = 5; i > 0; i--) { -+ touch_softlockup_watchdog(); -+ printk("Pausing... 
%d", i); -+ mdelay(1000); -+ printk("\b\b\b\b\b\b\b\b\b\b\b\b"); -+ } -+ -+ printk("Continuing...\n\n"); -+#else -+ if (printk_ratelimit()) -+ printk(KERN_WARNING -+ "4gb seg fixup, process %s (pid %d), cs:ip %02x:%08lx\n", -+ current->comm, current->tgid, regs->xcs, regs->eip); -+#endif -+} -+ -+static int __init fixup_init(void) -+{ -+ HYPERVISOR_vm_assist( -+ VMASST_CMD_enable, VMASST_TYPE_4gb_segments_notify); -+ return 0; -+} -+__initcall(fixup_init); -diff -r 4edbf98e9507 -r 3990a07432f0 arch/i386/kernel/head-xen.S ---- /dev/null Thu Jan 01 00:00:00 1970 +0000 -+++ b/arch/i386/kernel/head-xen.S Fri Jul 20 11:56:41 2007 -0300 -@@ -0,0 +1,299 @@ -+ -+ -+.text -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* -+ * References to members of the new_cpu_data structure. -+ */ -+ -+#define X86 new_cpu_data+CPUINFO_x86 -+#define X86_VENDOR new_cpu_data+CPUINFO_x86_vendor -+#define X86_MODEL new_cpu_data+CPUINFO_x86_model -+#define X86_MASK new_cpu_data+CPUINFO_x86_mask -+#define X86_HARD_MATH new_cpu_data+CPUINFO_hard_math -+#define X86_CPUID new_cpu_data+CPUINFO_cpuid_level -+#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability -+#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id -+ -+#define VIRT_ENTRY_OFFSET 0x0 -+.org VIRT_ENTRY_OFFSET -+.section .text.head,"ax",@progbits -+ENTRY(startup_32) -+ -+#ifdef CONFIG_PARAVIRT -+ movl %cs, %eax -+ testl $0x3, %eax -+ jnz startup_paravirt -+#endif -+ -+ movl %esi,xen_start_info -+ cld -+ -+ call setup_pda -+ -+ /* Set up the stack pointer */ -+ movl $(init_thread_union+THREAD_SIZE),%esp -+ -+ /* get vendor info */ -+ xorl %eax,%eax # call CPUID with 0 -> return vendor ID -+ XEN_CPUID -+ movl %eax,X86_CPUID # save CPUID level -+ movl %ebx,X86_VENDOR_ID # lo 4 chars -+ movl %edx,X86_VENDOR_ID+4 # next 4 chars -+ movl %ecx,X86_VENDOR_ID+8 # last 4 chars -+ -+ movl $1,%eax # Use the CPUID instruction to get CPU type -+ XEN_CPUID -+ movb %al,%cl # save reg for future use -+ andb $0x0f,%ah # mask processor family -+ movb %ah,X86 -+ andb $0xf0,%al # mask model -+ shrb $4,%al -+ movb %al,X86_MODEL -+ andb $0x0f,%cl # mask mask revision -+ movb %cl,X86_MASK -+ movl %edx,X86_CAPABILITY -+ -+ movb $1,X86_HARD_MATH -+ -+ xorl %eax,%eax # Clear GS and LDT -+ movl %eax,%gs -+ -+ movl $(__KERNEL_PDA),%eax -+ mov %eax,%fs -+ -+ cld # gcc2 wants the direction flag cleared at all times -+ -+ pushl %eax # fake return address -+ jmp start_kernel -+ -+/* -+ * Point the GDT at this CPU's PDA. This will be -+ * cpu_gdt_table and boot_pda. -+ */ -+ENTRY(setup_pda) -+ /* get the PDA pointer */ -+ movl $boot_pda, %eax -+ -+ /* slot the PDA address into the GDT */ -+ mov $cpu_gdt_table, %ecx -+ mov %ax, (__KERNEL_PDA+0+2)(%ecx) /* base & 0x0000ffff */ -+ shr $16, %eax -+ mov %al, (__KERNEL_PDA+4+0)(%ecx) /* base & 0x00ff0000 */ -+ mov %ah, (__KERNEL_PDA+4+3)(%ecx) /* base & 0xff000000 */ -+ -+ # %esi still points to start_info, and no registers -+ # need to be preserved. 
-+ -+ movl XEN_START_mfn_list(%esi), %ebx -+ movl $(cpu_gdt_table - __PAGE_OFFSET), %eax -+ shrl $PAGE_SHIFT, %eax -+ movl (%ebx,%eax,4), %ecx -+ pushl %ecx # frame number for set_gdt below -+ -+ xorl %esi, %esi -+ xorl %edx, %edx -+ shldl $PAGE_SHIFT, %ecx, %edx -+ shll $PAGE_SHIFT, %ecx -+ orl $0x61, %ecx -+ movl $cpu_gdt_table, %ebx -+ movl $__HYPERVISOR_update_va_mapping, %eax -+ int $0x82 -+ -+ movl $(PAGE_SIZE_asm / 8), %ecx -+ movl %esp, %ebx -+ movl $__HYPERVISOR_set_gdt, %eax -+ int $0x82 -+ -+ popl %ecx -+ ret -+ -+#define HYPERCALL_PAGE_OFFSET 0x1000 -+.org HYPERCALL_PAGE_OFFSET -+ENTRY(hypercall_page) -+ CFI_STARTPROC -+.skip 0x1000 -+ CFI_ENDPROC -+ -+/* -+ * Real beginning of normal "text" segment -+ */ -+ENTRY(stext) -+ENTRY(_stext) -+ -+/* -+ * BSS section -+ */ -+.section ".bss.page_aligned","w" -+ENTRY(empty_zero_page) -+ .fill 4096,1,0 -+ -+/* -+ * This starts the data section. -+ */ -+.data -+ENTRY(start_pda) -+ .long boot_pda -+ -+.section .text -+#ifdef CONFIG_PARAVIRT -+startup_paravirt: -+ cld -+ movl $(init_thread_union+THREAD_SIZE),%esp -+ -+ /* We take pains to preserve all the regs. */ -+ pushl %edx -+ pushl %ecx -+ pushl %eax -+ -+ pushl $__start_paravirtprobe -+1: -+ movl 0(%esp), %eax -+ cmpl $__stop_paravirtprobe, %eax -+ je unhandled_paravirt -+ pushl (%eax) -+ movl 8(%esp), %eax -+ call *(%esp) -+ popl %eax -+ -+ movl 4(%esp), %eax -+ movl 8(%esp), %ecx -+ movl 12(%esp), %edx -+ -+ addl $4, (%esp) -+ jmp 1b -+ -+unhandled_paravirt: -+ /* Nothing wanted us: we're screwed. */ -+ ud2 -+#endif -+ -+/* -+ * The Global Descriptor Table contains 28 quadwords, per-CPU. -+ */ -+ .section .data.page_aligned, "aw" -+ .align PAGE_SIZE_asm -+ENTRY(cpu_gdt_table) -+ .quad 0x0000000000000000 /* NULL descriptor */ -+ .quad 0x0000000000000000 /* 0x0b reserved */ -+ .quad 0x0000000000000000 /* 0x13 reserved */ -+ .quad 0x0000000000000000 /* 0x1b reserved */ -+ .quad 0x0000000000000000 /* 0x20 unused */ -+ .quad 0x0000000000000000 /* 0x28 unused */ -+ .quad 0x0000000000000000 /* 0x33 TLS entry 1 */ -+ .quad 0x0000000000000000 /* 0x3b TLS entry 2 */ -+ .quad 0x0000000000000000 /* 0x43 TLS entry 3 */ -+ .quad 0x0000000000000000 /* 0x4b reserved */ -+ .quad 0x0000000000000000 /* 0x53 reserved */ -+ .quad 0x0000000000000000 /* 0x5b reserved */ -+ -+ .quad 0x00cf9a000000ffff /* 0x60 kernel 4GB code at 0x00000000 */ -+ .quad 0x00cf92000000ffff /* 0x68 kernel 4GB data at 0x00000000 */ -+ .quad 0x00cffa000000ffff /* 0x73 user 4GB code at 0x00000000 */ -+ .quad 0x00cff2000000ffff /* 0x7b user 4GB data at 0x00000000 */ -+ -+ .quad 0x0000000000000000 /* 0x80 TSS descriptor */ -+ .quad 0x0000000000000000 /* 0x88 LDT descriptor */ -+ -+ /* -+ * Segments used for calling PnP BIOS have byte granularity. -+ * They code segments and data segments have fixed 64k limits, -+ * the transfer segment sizes are set at run time. -+ */ -+ .quad 0x0000000000000000 /* 0x90 32-bit code */ -+ .quad 0x0000000000000000 /* 0x98 16-bit code */ -+ .quad 0x0000000000000000 /* 0xa0 16-bit data */ -+ .quad 0x0000000000000000 /* 0xa8 16-bit data */ -+ .quad 0x0000000000000000 /* 0xb0 16-bit data */ -+ -+ /* -+ * The APM segments have byte granularity and their bases -+ * are set at run time. All have 64k limits. 
-+ */ -+ .quad 0x0000000000000000 /* 0xb8 APM CS code */ -+ .quad 0x0000000000000000 /* 0xc0 APM CS 16 code (16 bit) */ -+ .quad 0x0000000000000000 /* 0xc8 APM DS data */ -+ -+ .quad 0x0000000000000000 /* 0xd0 - ESPFIX SS */ -+ .quad 0x00cf92000000ffff /* 0xd8 - PDA */ -+ .quad 0x0000000000000000 /* 0xe0 - unused */ -+ .quad 0x0000000000000000 /* 0xe8 - unused */ -+ .quad 0x0000000000000000 /* 0xf0 - unused */ -+ .quad 0x0000000000000000 /* 0xf8 - GDT entry 31: double-fault TSS */ -+ .align PAGE_SIZE_asm -+ -+#if CONFIG_XEN_COMPAT <= 0x030002 -+/* -+ * __xen_guest information -+ */ -+.macro utoa value -+ .if (\value) < 0 || (\value) >= 0x10 -+ utoa (((\value)>>4)&0x0fffffff) -+ .endif -+ .if ((\value) & 0xf) < 10 -+ .byte '0' + ((\value) & 0xf) -+ .else -+ .byte 'A' + ((\value) & 0xf) - 10 -+ .endif -+.endm -+ -+.section __xen_guest -+ .ascii "GUEST_OS=linux,GUEST_VER=2.6" -+ .ascii ",XEN_VER=xen-3.0" -+ .ascii ",VIRT_BASE=0x" -+ utoa __PAGE_OFFSET -+ .ascii ",ELF_PADDR_OFFSET=0x" -+ utoa __PAGE_OFFSET -+ .ascii ",VIRT_ENTRY=0x" -+ utoa (__PAGE_OFFSET + LOAD_PHYSICAL_ADDR + VIRT_ENTRY_OFFSET) -+ .ascii ",HYPERCALL_PAGE=0x" -+ utoa ((LOAD_PHYSICAL_ADDR+HYPERCALL_PAGE_OFFSET)>>PAGE_SHIFT) -+ .ascii ",FEATURES=writable_page_tables" -+ .ascii "|writable_descriptor_tables" -+ .ascii "|auto_translated_physmap" -+ .ascii "|pae_pgdir_above_4gb" -+ .ascii "|supervisor_mode_kernel" -+#ifdef CONFIG_X86_PAE -+ .ascii ",PAE=yes[extended-cr3]" -+#else -+ .ascii ",PAE=no" -+#endif -+ .ascii ",LOADER=generic" -+ .byte 0 -+#endif /* CONFIG_XEN_COMPAT <= 0x030002 */ -+ -+ -+ ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz, "linux") -+ ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz, "2.6") -+ ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz, "xen-3.0") -+ ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long, __PAGE_OFFSET) -+#if CONFIG_XEN_COMPAT <= 0x030002 -+ ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .long, __PAGE_OFFSET) -+#else -+ ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .long, 0) -+#endif -+ ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long, startup_32) -+ ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long, hypercall_page) -+ ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, .long, HYPERVISOR_VIRT_START) -+ ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz, "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|pae_pgdir_above_4gb|supervisor_mode_kernel") -+#ifdef CONFIG_X86_PAE -+ ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz, "yes") -+ ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .quad, _PAGE_PRESENT,_PAGE_PRESENT) -+#else -+ ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz, "no") -+ ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .long, _PAGE_PRESENT,_PAGE_PRESENT) -+#endif -+ ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz, "generic") -+ ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long, 1) -diff -r 4edbf98e9507 -r 3990a07432f0 arch/i386/kernel/init_task.c ---- a/arch/i386/kernel/init_task.c Fri Jul 20 11:42:41 2007 -0300 -+++ b/arch/i386/kernel/init_task.c Fri Jul 20 11:56:41 2007 -0300 -@@ -14,7 +14,14 @@ static struct files_struct init_files = - static struct files_struct init_files = INIT_FILES; - static struct signal_struct init_signals = INIT_SIGNALS(init_signals); - static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -+ -+#ifdef CONFIG_XEN -+#define swapper_pg_dir ((pgd_t *)NULL) - struct mm_struct init_mm = INIT_MM(init_mm); -+#undef swapper_pg_dir -+#else -+struct mm_struct init_mm = INIT_MM(init_mm); -+#endif - - EXPORT_SYMBOL(init_mm); - -@@ -38,9 +45,11 @@ struct task_struct init_task = INIT_TASK - - EXPORT_SYMBOL(init_task); - -+#ifndef 
CONFIG_X86_NO_TSS - /* - * per-CPU TSS segments. Threads are completely 'soft' on Linux, - * no more per-task TSS's. - */ - DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS; -+#endif - -diff -r 4edbf98e9507 -r 3990a07432f0 arch/i386/kernel/io_apic-xen.c ---- /dev/null Thu Jan 01 00:00:00 1970 +0000 -+++ b/arch/i386/kernel/io_apic-xen.c Fri Jul 20 11:56:41 2007 -0300 -@@ -0,0 +1,2976 @@ -+/* -+ * Intel IO-APIC support for multi-Pentium hosts. -+ * -+ * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo -+ * -+ * Many thanks to Stig Venaas for trying out countless experimental -+ * patches and reporting/debugging problems patiently! -+ * -+ * (c) 1999, Multiple IO-APIC support, developed by -+ * Ken-ichi Yaku and -+ * Hidemi Kishimoto , -+ * further tested and cleaned up by Zach Brown -+ * and Ingo Molnar -+ * -+ * Fixes -+ * Maciej W. Rozycki : Bits for genuine 82489DX APICs; -+ * thanks to Eric Gilmore -+ * and Rolf G. Tews -+ * for testing these extensively -+ * Paul Diefenbaugh : Added full ACPI support -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+ -+#include "io_ports.h" -+ -+#ifdef CONFIG_XEN -+ -+#include -+#include -+ -+/* Fake i8259 */ -+#define make_8259A_irq(_irq) (io_apic_irqs &= ~(1UL<<(_irq))) -+#define disable_8259A_irq(_irq) ((void)0) -+#define i8259A_irq_pending(_irq) (0) -+ -+unsigned long io_apic_irqs; -+ -+static inline unsigned int xen_io_apic_read(unsigned int apic, unsigned int reg) -+{ -+ struct physdev_apic apic_op; -+ int ret; -+ -+ apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr; -+ apic_op.reg = reg; -+ ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op); -+ if (ret) -+ return ret; -+ return apic_op.value; -+} -+ -+static inline void xen_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) -+{ -+ struct physdev_apic apic_op; -+ -+ apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr; -+ apic_op.reg = reg; -+ apic_op.value = value; -+ HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op); -+} -+ -+#define io_apic_read(a,r) xen_io_apic_read(a,r) -+#define io_apic_write(a,r,v) xen_io_apic_write(a,r,v) -+ -+#endif /* CONFIG_XEN */ -+ -+int (*ioapic_renumber_irq)(int ioapic, int irq); -+atomic_t irq_mis_count; -+ -+/* Where if anywhere is the i8259 connect in external int mode */ -+static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; -+ -+static DEFINE_SPINLOCK(ioapic_lock); -+static DEFINE_SPINLOCK(vector_lock); -+ -+int timer_over_8254 __initdata = 1; -+ -+/* -+ * Is the SiS APIC rmw bug present ? -+ * -1 = don't know, 0 = no, 1 = yes -+ */ -+int sis_apic_bug = -1; -+ -+/* -+ * # of IRQ routing registers -+ */ -+int nr_ioapic_registers[MAX_IO_APICS]; -+ -+static int disable_timer_pin_1 __initdata; -+ -+/* -+ * Rough estimation of how many shared IRQs there are, can -+ * be changed anytime. -+ */ -+#define MAX_PLUS_SHARED_IRQS NR_IRQS -+#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) -+ -+/* -+ * This is performance-critical, we want to do it O(1) -+ * -+ * the indexing order of this array favors 1:1 mappings -+ * between pins and IRQs. 
-+ */ -+ -+static struct irq_pin_list { -+ int apic, pin, next; -+} irq_2_pin[PIN_MAP_SIZE]; -+ -+#ifndef CONFIG_XEN -+struct io_apic { -+ unsigned int index; -+ unsigned int unused[3]; -+ unsigned int data; -+}; -+ -+static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) -+{ -+ return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) -+ + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK); -+} -+ -+static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) -+{ -+ struct io_apic __iomem *io_apic = io_apic_base(apic); -+ writel(reg, &io_apic->index); -+ return readl(&io_apic->data); -+} -+ -+static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) -+{ -+ struct io_apic __iomem *io_apic = io_apic_base(apic); -+ writel(reg, &io_apic->index); -+ writel(value, &io_apic->data); -+} -+ -+/* -+ * Re-write a value: to be used for read-modify-write -+ * cycles where the read already set up the index register. -+ * -+ * Older SiS APIC requires we rewrite the index register -+ */ -+static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) -+{ -+ volatile struct io_apic __iomem *io_apic = io_apic_base(apic); -+ if (sis_apic_bug) -+ writel(reg, &io_apic->index); -+ writel(value, &io_apic->data); -+} -+#endif /* !CONFIG_XEN */ -+ -+union entry_union { -+ struct { u32 w1, w2; }; -+ struct IO_APIC_route_entry entry; -+}; -+ -+static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) -+{ -+ union entry_union eu; -+ unsigned long flags; -+ spin_lock_irqsave(&ioapic_lock, flags); -+ eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); -+ eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); -+ spin_unlock_irqrestore(&ioapic_lock, flags); -+ return eu.entry; -+} -+ -+/* -+ * When we write a new IO APIC routing entry, we need to write the high -+ * word first! If the mask bit in the low word is clear, we will enable -+ * the interrupt, and we need to make sure the entry is fully populated -+ * before that happens. -+ */ -+static void -+__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) -+{ -+ union entry_union eu; -+ eu.entry = e; -+ io_apic_write(apic, 0x11 + 2*pin, eu.w2); -+ io_apic_write(apic, 0x10 + 2*pin, eu.w1); -+} -+ -+static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) -+{ -+ unsigned long flags; -+ spin_lock_irqsave(&ioapic_lock, flags); -+ __ioapic_write_entry(apic, pin, e); -+ spin_unlock_irqrestore(&ioapic_lock, flags); -+} -+ -+/* -+ * When we mask an IO APIC routing entry, we need to write the low -+ * word first, in order to set the mask bit before we change the -+ * high bits! -+ */ -+ -+#ifndef CONFIG_XEN -+static void ioapic_mask_entry(int apic, int pin) -+{ -+ unsigned long flags; -+ union entry_union eu = { .entry.mask = 1 }; -+ -+ spin_lock_irqsave(&ioapic_lock, flags); -+ io_apic_write(apic, 0x10 + 2*pin, eu.w1); -+ io_apic_write(apic, 0x11 + 2*pin, eu.w2); -+ spin_unlock_irqrestore(&ioapic_lock, flags); -+} -+#endif -+ -+/* -+ * The common case is 1:1 IRQ<->pin mappings. Sometimes there are -+ * shared ISA-space IRQs, so we have to support them. We are super -+ * fast in the common case, and fast for shared ISA-space IRQs. 
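-+ *
-+ * A short C sketch of how a pin chain is walked (illustrative only;
-+ * use() is a hypothetical consumer, and the loop mirrors the traversal
-+ * in __modify_IO_APIC_irq and set_ioapic_affinity_irq below):
-+ *
-+ *	struct irq_pin_list *entry = irq_2_pin + irq;
-+ *	for (;;) {
-+ *		if (entry->pin == -1)
-+ *			break;			/* empty slot: no pins yet */
-+ *		use(entry->apic, entry->pin);
-+ *		if (!entry->next)
-+ *			break;			/* a next of 0 ends the chain */
-+ *		entry = irq_2_pin + entry->next;
-+ *	}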
-+ */ -+static void add_pin_to_irq(unsigned int irq, int apic, int pin) -+{ -+ static int first_free_entry = NR_IRQS; -+ struct irq_pin_list *entry = irq_2_pin + irq; -+ -+ while (entry->next) -+ entry = irq_2_pin + entry->next; -+ -+ if (entry->pin != -1) { -+ entry->next = first_free_entry; -+ entry = irq_2_pin + entry->next; -+ if (++first_free_entry >= PIN_MAP_SIZE) -+ panic("io_apic.c: whoops"); -+ } -+ entry->apic = apic; -+ entry->pin = pin; -+} -+ -+#ifdef CONFIG_XEN -+#define clear_IO_APIC() ((void)0) -+#else -+/* -+ * Reroute an IRQ to a different pin. -+ */ -+static void __init replace_pin_at_irq(unsigned int irq, -+ int oldapic, int oldpin, -+ int newapic, int newpin) -+{ -+ struct irq_pin_list *entry = irq_2_pin + irq; -+ -+ while (1) { -+ if (entry->apic == oldapic && entry->pin == oldpin) { -+ entry->apic = newapic; -+ entry->pin = newpin; -+ } -+ if (!entry->next) -+ break; -+ entry = irq_2_pin + entry->next; -+ } -+} -+ -+static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable) -+{ -+ struct irq_pin_list *entry = irq_2_pin + irq; -+ unsigned int pin, reg; -+ -+ for (;;) { -+ pin = entry->pin; -+ if (pin == -1) -+ break; -+ reg = io_apic_read(entry->apic, 0x10 + pin*2); -+ reg &= ~disable; -+ reg |= enable; -+ io_apic_modify(entry->apic, 0x10 + pin*2, reg); -+ if (!entry->next) -+ break; -+ entry = irq_2_pin + entry->next; -+ } -+} -+ -+/* mask = 1 */ -+static void __mask_IO_APIC_irq (unsigned int irq) -+{ -+ __modify_IO_APIC_irq(irq, 0x00010000, 0); -+} -+ -+/* mask = 0 */ -+static void __unmask_IO_APIC_irq (unsigned int irq) -+{ -+ __modify_IO_APIC_irq(irq, 0, 0x00010000); -+} -+ -+/* mask = 1, trigger = 0 */ -+static void __mask_and_edge_IO_APIC_irq (unsigned int irq) -+{ -+ __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000); -+} -+ -+/* mask = 0, trigger = 1 */ -+static void __unmask_and_level_IO_APIC_irq (unsigned int irq) -+{ -+ __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000); -+} -+ -+static void mask_IO_APIC_irq (unsigned int irq) -+{ -+ unsigned long flags; -+ -+ spin_lock_irqsave(&ioapic_lock, flags); -+ __mask_IO_APIC_irq(irq); -+ spin_unlock_irqrestore(&ioapic_lock, flags); -+} -+ -+static void unmask_IO_APIC_irq (unsigned int irq) -+{ -+ unsigned long flags; -+ -+ spin_lock_irqsave(&ioapic_lock, flags); -+ __unmask_IO_APIC_irq(irq); -+ spin_unlock_irqrestore(&ioapic_lock, flags); -+} -+ -+static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) -+{ -+ struct IO_APIC_route_entry entry; -+ -+ /* Check delivery_mode to be sure we're not clearing an SMI pin */ -+ entry = ioapic_read_entry(apic, pin); -+ if (entry.delivery_mode == dest_SMI) -+ return; -+ -+ /* -+ * Disable it in the IO-APIC irq-routing table: -+ */ -+ ioapic_mask_entry(apic, pin); -+} -+ -+static void clear_IO_APIC (void) -+{ -+ int apic, pin; -+ -+ for (apic = 0; apic < nr_ioapics; apic++) -+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) -+ clear_IO_APIC_pin(apic, pin); -+} -+ -+#ifdef CONFIG_SMP -+static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) -+{ -+ unsigned long flags; -+ int pin; -+ struct irq_pin_list *entry = irq_2_pin + irq; -+ unsigned int apicid_value; -+ cpumask_t tmp; -+ -+ cpus_and(tmp, cpumask, cpu_online_map); -+ if (cpus_empty(tmp)) -+ tmp = TARGET_CPUS; -+ -+ cpus_and(cpumask, tmp, CPU_MASK_ALL); -+ -+ apicid_value = cpu_mask_to_apicid(cpumask); -+ /* Prepare to do the io_apic_write */ -+ apicid_value = apicid_value << 24; -+ spin_lock_irqsave(&ioapic_lock, flags); -+ for (;;) { -+ pin = entry->pin; 
-+ if (pin == -1) -+ break; -+ io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value); -+ if (!entry->next) -+ break; -+ entry = irq_2_pin + entry->next; -+ } -+ irq_desc[irq].affinity = cpumask; -+ spin_unlock_irqrestore(&ioapic_lock, flags); -+} -+ -+#if defined(CONFIG_IRQBALANCE) -+# include /* kernel_thread() */ -+# include /* kstat */ -+# include /* kmalloc() */ -+# include /* time_after() */ -+ -+#ifdef CONFIG_BALANCED_IRQ_DEBUG -+# define TDprintk(x...) do { printk("<%ld:%s:%d>: ", jiffies, __FILE__, __LINE__); printk(x); } while (0) -+# define Dprintk(x...) do { TDprintk(x); } while (0) -+# else -+# define TDprintk(x...) -+# define Dprintk(x...) -+# endif -+ -+#define IRQBALANCE_CHECK_ARCH -999 -+#define MAX_BALANCED_IRQ_INTERVAL (5*HZ) -+#define MIN_BALANCED_IRQ_INTERVAL (HZ/2) -+#define BALANCED_IRQ_MORE_DELTA (HZ/10) -+#define BALANCED_IRQ_LESS_DELTA (HZ) -+ -+static int irqbalance_disabled __read_mostly = IRQBALANCE_CHECK_ARCH; -+static int physical_balance __read_mostly; -+static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL; -+ -+static struct irq_cpu_info { -+ unsigned long * last_irq; -+ unsigned long * irq_delta; -+ unsigned long irq; -+} irq_cpu_data[NR_CPUS]; -+ -+#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq) -+#define LAST_CPU_IRQ(cpu,irq) (irq_cpu_data[cpu].last_irq[irq]) -+#define IRQ_DELTA(cpu,irq) (irq_cpu_data[cpu].irq_delta[irq]) -+ -+#define IDLE_ENOUGH(cpu,now) \ -+ (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1)) -+ -+#define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask) -+ -+#define CPU_TO_PACKAGEINDEX(i) (first_cpu(cpu_sibling_map[i])) -+ -+static cpumask_t balance_irq_affinity[NR_IRQS] = { -+ [0 ... NR_IRQS-1] = CPU_MASK_ALL -+}; -+ -+void set_balance_irq_affinity(unsigned int irq, cpumask_t mask) -+{ -+ balance_irq_affinity[irq] = mask; -+} -+ -+static unsigned long move(int curr_cpu, cpumask_t allowed_mask, -+ unsigned long now, int direction) -+{ -+ int search_idle = 1; -+ int cpu = curr_cpu; -+ -+ goto inside; -+ -+ do { -+ if (unlikely(cpu == curr_cpu)) -+ search_idle = 0; -+inside: -+ if (direction == 1) { -+ cpu++; -+ if (cpu >= NR_CPUS) -+ cpu = 0; -+ } else { -+ cpu--; -+ if (cpu == -1) -+ cpu = NR_CPUS-1; -+ } -+ } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) || -+ (search_idle && !IDLE_ENOUGH(cpu,now))); -+ -+ return cpu; -+} -+ -+static inline void balance_irq(int cpu, int irq) -+{ -+ unsigned long now = jiffies; -+ cpumask_t allowed_mask; -+ unsigned int new_cpu; -+ -+ if (irqbalance_disabled) -+ return; -+ -+ cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]); -+ new_cpu = move(cpu, allowed_mask, now, 1); -+ if (cpu != new_cpu) { -+ set_pending_irq(irq, cpumask_of_cpu(new_cpu)); -+ } -+} -+ -+static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold) -+{ -+ int i, j; -+ Dprintk("Rotating IRQs among CPUs.\n"); -+ for_each_online_cpu(i) { -+ for (j = 0; j < NR_IRQS; j++) { -+ if (!irq_desc[j].action) -+ continue; -+ /* Is it a significant load ? 
*/ -+ if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) < -+ useful_load_threshold) -+ continue; -+ balance_irq(i, j); -+ } -+ } -+ balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL, -+ balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); -+ return; -+} -+ -+static void do_irq_balance(void) -+{ -+ int i, j; -+ unsigned long max_cpu_irq = 0, min_cpu_irq = (~0); -+ unsigned long move_this_load = 0; -+ int max_loaded = 0, min_loaded = 0; -+ int load; -+ unsigned long useful_load_threshold = balanced_irq_interval + 10; -+ int selected_irq; -+ int tmp_loaded, first_attempt = 1; -+ unsigned long tmp_cpu_irq; -+ unsigned long imbalance = 0; -+ cpumask_t allowed_mask, target_cpu_mask, tmp; -+ -+ for_each_possible_cpu(i) { -+ int package_index; -+ CPU_IRQ(i) = 0; -+ if (!cpu_online(i)) -+ continue; -+ package_index = CPU_TO_PACKAGEINDEX(i); -+ for (j = 0; j < NR_IRQS; j++) { -+ unsigned long value_now, delta; -+ /* Is this an active IRQ or balancing disabled ? */ -+ if (!irq_desc[j].action || irq_balancing_disabled(j)) -+ continue; -+ if ( package_index == i ) -+ IRQ_DELTA(package_index,j) = 0; -+ /* Determine the total count per processor per IRQ */ -+ value_now = (unsigned long) kstat_cpu(i).irqs[j]; -+ -+ /* Determine the activity per processor per IRQ */ -+ delta = value_now - LAST_CPU_IRQ(i,j); -+ -+ /* Update last_cpu_irq[][] for the next time */ -+ LAST_CPU_IRQ(i,j) = value_now; -+ -+ /* Ignore IRQs whose rate is less than the clock */ -+ if (delta < useful_load_threshold) -+ continue; -+ /* update the load for the processor or package total */ -+ IRQ_DELTA(package_index,j) += delta; -+ -+ /* Keep track of the higher numbered sibling as well */ -+ if (i != package_index) -+ CPU_IRQ(i) += delta; -+ /* -+ * We have sibling A and sibling B in the package -+ * -+ * cpu_irq[A] = load for cpu A + load for cpu B -+ * cpu_irq[B] = load for cpu B -+ */ -+ CPU_IRQ(package_index) += delta; -+ } -+ } -+ /* Find the least loaded processor package */ -+ for_each_online_cpu(i) { -+ if (i != CPU_TO_PACKAGEINDEX(i)) -+ continue; -+ if (min_cpu_irq > CPU_IRQ(i)) { -+ min_cpu_irq = CPU_IRQ(i); -+ min_loaded = i; -+ } -+ } -+ max_cpu_irq = ULONG_MAX; -+ -+tryanothercpu: -+ /* Look for heaviest loaded processor. -+ * We may come back to get the next heaviest loaded processor. -+ * Skip processors with trivial loads. -+ */ -+ tmp_cpu_irq = 0; -+ tmp_loaded = -1; -+ for_each_online_cpu(i) { -+ if (i != CPU_TO_PACKAGEINDEX(i)) -+ continue; -+ if (max_cpu_irq <= CPU_IRQ(i)) -+ continue; -+ if (tmp_cpu_irq < CPU_IRQ(i)) { -+ tmp_cpu_irq = CPU_IRQ(i); -+ tmp_loaded = i; -+ } -+ } -+ -+ if (tmp_loaded == -1) { -+ /* In the case of small number of heavy interrupt sources, -+ * loading some of the cpus too much. We use Ingo's original -+ * approach to rotate them around. -+ */ -+ if (!first_attempt && imbalance >= useful_load_threshold) { -+ rotate_irqs_among_cpus(useful_load_threshold); -+ return; -+ } -+ goto not_worth_the_effort; -+ } -+ -+ first_attempt = 0; /* heaviest search */ -+ max_cpu_irq = tmp_cpu_irq; /* load */ -+ max_loaded = tmp_loaded; /* processor */ -+ imbalance = (max_cpu_irq - min_cpu_irq) / 2; -+ -+ Dprintk("max_loaded cpu = %d\n", max_loaded); -+ Dprintk("min_loaded cpu = %d\n", min_loaded); -+ Dprintk("max_cpu_irq load = %ld\n", max_cpu_irq); -+ Dprintk("min_cpu_irq load = %ld\n", min_cpu_irq); -+ Dprintk("load imbalance = %lu\n", imbalance); -+ -+ /* if imbalance is less than approx 10% of max load, then -+ * observe diminishing returns action. 
- quit -+ */ -+ if (imbalance < (max_cpu_irq >> 3)) { -+ Dprintk("Imbalance too trivial\n"); -+ goto not_worth_the_effort; -+ } -+ -+tryanotherirq: -+ /* if we select an IRQ to move that can't go where we want, then -+ * see if there is another one to try. -+ */ -+ move_this_load = 0; -+ selected_irq = -1; -+ for (j = 0; j < NR_IRQS; j++) { -+ /* Is this an active IRQ? */ -+ if (!irq_desc[j].action) -+ continue; -+ if (imbalance <= IRQ_DELTA(max_loaded,j)) -+ continue; -+ /* Try to find the IRQ that is closest to the imbalance -+ * without going over. -+ */ -+ if (move_this_load < IRQ_DELTA(max_loaded,j)) { -+ move_this_load = IRQ_DELTA(max_loaded,j); -+ selected_irq = j; -+ } -+ } -+ if (selected_irq == -1) { -+ goto tryanothercpu; -+ } -+ -+ imbalance = move_this_load; -+ -+ /* For physical_balance case, we accumlated both load -+ * values in the one of the siblings cpu_irq[], -+ * to use the same code for physical and logical processors -+ * as much as possible. -+ * -+ * NOTE: the cpu_irq[] array holds the sum of the load for -+ * sibling A and sibling B in the slot for the lowest numbered -+ * sibling (A), _AND_ the load for sibling B in the slot for -+ * the higher numbered sibling. -+ * -+ * We seek the least loaded sibling by making the comparison -+ * (A+B)/2 vs B -+ */ -+ load = CPU_IRQ(min_loaded) >> 1; -+ for_each_cpu_mask(j, cpu_sibling_map[min_loaded]) { -+ if (load > CPU_IRQ(j)) { -+ /* This won't change cpu_sibling_map[min_loaded] */ -+ load = CPU_IRQ(j); -+ min_loaded = j; -+ } -+ } -+ -+ cpus_and(allowed_mask, -+ cpu_online_map, -+ balance_irq_affinity[selected_irq]); -+ target_cpu_mask = cpumask_of_cpu(min_loaded); -+ cpus_and(tmp, target_cpu_mask, allowed_mask); -+ -+ if (!cpus_empty(tmp)) { -+ -+ Dprintk("irq = %d moved to cpu = %d\n", -+ selected_irq, min_loaded); -+ /* mark for change destination */ -+ set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded)); -+ -+ /* Since we made a change, come back sooner to -+ * check for more variation. -+ */ -+ balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL, -+ balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); -+ return; -+ } -+ goto tryanotherirq; -+ -+not_worth_the_effort: -+ /* -+ * if we did not find an IRQ to move, then adjust the time interval -+ * upward -+ */ -+ balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL, -+ balanced_irq_interval + BALANCED_IRQ_MORE_DELTA); -+ Dprintk("IRQ worth rotating not found\n"); -+ return; -+} -+ -+static int balanced_irq(void *unused) -+{ -+ int i; -+ unsigned long prev_balance_time = jiffies; -+ long time_remaining = balanced_irq_interval; -+ -+ daemonize("kirqd"); -+ -+ /* push everything to CPU 0 to give us a starting point. */ -+ for (i = 0 ; i < NR_IRQS ; i++) { -+ irq_desc[i].pending_mask = cpumask_of_cpu(0); -+ set_pending_irq(i, cpumask_of_cpu(0)); -+ } -+ -+ for ( ; ; ) { -+ time_remaining = schedule_timeout_interruptible(time_remaining); -+ try_to_freeze(); -+ if (time_after(jiffies, -+ prev_balance_time+balanced_irq_interval)) { -+ preempt_disable(); -+ do_irq_balance(); -+ prev_balance_time = jiffies; -+ time_remaining = balanced_irq_interval; -+ preempt_enable(); -+ } -+ } -+ return 0; -+} -+ -+static int __init balanced_irq_init(void) -+{ -+ int i; -+ struct cpuinfo_x86 *c; -+ cpumask_t tmp; -+ -+ cpus_shift_right(tmp, cpu_online_map, 2); -+ c = &boot_cpu_data; -+ /* When not overwritten by the command line ask subarchitecture. 
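/*
 * Standalone sketch (not part of the patch) of the sibling-load
 * bookkeeping described in the NOTE above: the slot of the lower
 * numbered sibling A holds load(A) + load(B) while B's slot holds
 * load(B) alone, so comparing (A+B)/2 against B selects the less
 * loaded hyperthread without a second array.  Numbers are invented.
 */
#include <stdio.h>

int main(void)
{
	unsigned long cpu_irq[2];
	unsigned long load_a = 700, load_b = 300;	/* per-sibling IRQ deltas */

	cpu_irq[0] = load_a + load_b;	/* package total lives in sibling A */
	cpu_irq[1] = load_b;		/* sibling B keeps only its own load */

	/* (A+B)/2 vs B, as in the min_loaded search above */
	int min_loaded = (cpu_irq[0] >> 1 > cpu_irq[1]) ? 1 : 0;
	printf("less loaded sibling: CPU%d\n", min_loaded);	/* CPU1 */
	return 0;
}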
*/ -+ if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH) -+ irqbalance_disabled = NO_BALANCE_IRQ; -+ if (irqbalance_disabled) -+ return 0; -+ -+ /* disable irqbalance completely if there is only one processor online */ -+ if (num_online_cpus() < 2) { -+ irqbalance_disabled = 1; -+ return 0; -+ } -+ /* -+ * Enable physical balance only if more than 1 physical processor -+ * is present -+ */ -+ if (smp_num_siblings > 1 && !cpus_empty(tmp)) -+ physical_balance = 1; -+ -+ for_each_online_cpu(i) { -+ irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); -+ irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); -+ if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) { -+ printk(KERN_ERR "balanced_irq_init: out of memory"); -+ goto failed; -+ } -+ memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS); -+ memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS); -+ } -+ -+ printk(KERN_INFO "Starting balanced_irq\n"); -+ if (kernel_thread(balanced_irq, NULL, CLONE_KERNEL) >= 0) -+ return 0; -+ else -+ printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq"); -+failed: -+ for_each_possible_cpu(i) { -+ kfree(irq_cpu_data[i].irq_delta); -+ irq_cpu_data[i].irq_delta = NULL; -+ kfree(irq_cpu_data[i].last_irq); -+ irq_cpu_data[i].last_irq = NULL; -+ } -+ return 0; -+} -+ -+int __devinit irqbalance_disable(char *str) -+{ -+ irqbalance_disabled = 1; -+ return 1; -+} -+ -+__setup("noirqbalance", irqbalance_disable); -+ -+late_initcall(balanced_irq_init); -+#endif /* CONFIG_IRQBALANCE */ -+#endif /* CONFIG_SMP */ -+#endif /* !CONFIG_XEN */ -+ -+#ifndef CONFIG_SMP -+void fastcall send_IPI_self(int vector) -+{ -+#ifndef CONFIG_XEN -+ unsigned int cfg; -+ -+ /* -+ * Wait for idle. -+ */ -+ apic_wait_icr_idle(); -+ cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL; -+ /* -+ * Send the IPI. The write to APIC_ICR fires this off. -+ */ -+ apic_write_around(APIC_ICR, cfg); -+#endif -+} -+#endif /* !CONFIG_SMP */ -+ -+ -+/* -+ * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to -+ * specific CPU-side IRQs. -+ */ -+ -+#define MAX_PIRQS 8 -+static int pirq_entries [MAX_PIRQS]; -+static int pirqs_enabled; -+int skip_ioapic_setup; -+ -+static int __init ioapic_setup(char *str) -+{ -+ skip_ioapic_setup = 1; -+ return 1; -+} -+ -+__setup("noapic", ioapic_setup); -+ -+static int __init ioapic_pirq_setup(char *str) -+{ -+ int i, max; -+ int ints[MAX_PIRQS+1]; -+ -+ get_options(str, ARRAY_SIZE(ints), ints); -+ -+ for (i = 0; i < MAX_PIRQS; i++) -+ pirq_entries[i] = -1; -+ -+ pirqs_enabled = 1; -+ apic_printk(APIC_VERBOSE, KERN_INFO -+ "PIRQ redirection, working around broken MP-BIOS.\n"); -+ max = MAX_PIRQS; -+ if (ints[0] < MAX_PIRQS) -+ max = ints[0]; -+ -+ for (i = 0; i < max; i++) { -+ apic_printk(APIC_VERBOSE, KERN_DEBUG -+ "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); -+ /* -+ * PIRQs are mapped upside down, usually. -+ */ -+ pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; -+ } -+ return 1; -+} -+ -+__setup("pirq=", ioapic_pirq_setup); -+ -+/* -+ * Find the IRQ entry number of a certain pin. 
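/*
 * Worked example (not part of the patch) of the "pirq=" handling in
 * ioapic_pirq_setup() above.  PIRQs are stored upside down, so booting
 * with "pirq=5,11" yields pirq_entries[7] == 5 (PIRQ7 -> IRQ 5) and
 * pirq_entries[6] == 11 (PIRQ6 -> IRQ 11); get_options() is modelled
 * here by a plain array with the argument count in slot 0.
 */
#include <stdio.h>

#define MAX_PIRQS 8

int main(void)
{
	int pirq_entries[MAX_PIRQS];
	int ints[] = { 2, 5, 11 };	/* as get_options() would fill it */
	int i;

	for (i = 0; i < MAX_PIRQS; i++)
		pirq_entries[i] = -1;
	for (i = 0; i < ints[0] && i < MAX_PIRQS; i++)
		pirq_entries[MAX_PIRQS - i - 1] = ints[i + 1];

	for (i = 0; i < MAX_PIRQS; i++)
		printf("pirq_entries[%d] = %d\n", i, pirq_entries[i]);
	return 0;
}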
-+ */ -+static int find_irq_entry(int apic, int pin, int type) -+{ -+ int i; -+ -+ for (i = 0; i < mp_irq_entries; i++) -+ if (mp_irqs[i].mpc_irqtype == type && -+ (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid || -+ mp_irqs[i].mpc_dstapic == MP_APIC_ALL) && -+ mp_irqs[i].mpc_dstirq == pin) -+ return i; -+ -+ return -1; -+} -+ -+/* -+ * Find the pin to which IRQ[irq] (ISA) is connected -+ */ -+static int __init find_isa_irq_pin(int irq, int type) -+{ -+ int i; -+ -+ for (i = 0; i < mp_irq_entries; i++) { -+ int lbus = mp_irqs[i].mpc_srcbus; -+ -+ if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || -+ mp_bus_id_to_type[lbus] == MP_BUS_EISA || -+ mp_bus_id_to_type[lbus] == MP_BUS_MCA -+ ) && -+ (mp_irqs[i].mpc_irqtype == type) && -+ (mp_irqs[i].mpc_srcbusirq == irq)) -+ -+ return mp_irqs[i].mpc_dstirq; -+ } -+ return -1; -+} -+ -+static int __init find_isa_irq_apic(int irq, int type) -+{ -+ int i; -+ -+ for (i = 0; i < mp_irq_entries; i++) { -+ int lbus = mp_irqs[i].mpc_srcbus; -+ -+ if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || -+ mp_bus_id_to_type[lbus] == MP_BUS_EISA || -+ mp_bus_id_to_type[lbus] == MP_BUS_MCA -+ ) && -+ (mp_irqs[i].mpc_irqtype == type) && -+ (mp_irqs[i].mpc_srcbusirq == irq)) -+ break; -+ } -+ if (i < mp_irq_entries) { -+ int apic; -+ for(apic = 0; apic < nr_ioapics; apic++) { -+ if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic) -+ return apic; -+ } -+ } -+ -+ return -1; -+} -+ -+/* -+ * Find a specific PCI IRQ entry. -+ * Not an __init, possibly needed by modules -+ */ -+static int pin_2_irq(int idx, int apic, int pin); -+ -+int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) -+{ -+ int apic, i, best_guess = -1; -+ -+ apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, " -+ "slot:%d, pin:%d.\n", bus, slot, pin); -+ if (mp_bus_id_to_pci_bus[bus] == -1) { -+ printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus); -+ return -1; -+ } -+ for (i = 0; i < mp_irq_entries; i++) { -+ int lbus = mp_irqs[i].mpc_srcbus; -+ -+ for (apic = 0; apic < nr_ioapics; apic++) -+ if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic || -+ mp_irqs[i].mpc_dstapic == MP_APIC_ALL) -+ break; -+ -+ if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) && -+ !mp_irqs[i].mpc_irqtype && -+ (bus == lbus) && -+ (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) { -+ int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq); -+ -+ if (!(apic || IO_APIC_IRQ(irq))) -+ continue; -+ -+ if (pin == (mp_irqs[i].mpc_srcbusirq & 3)) -+ return irq; -+ /* -+ * Use the first all-but-pin matching entry as a -+ * best-guess fuzzy result for broken mptables. 
-+ */ -+ if (best_guess < 0) -+ best_guess = irq; -+ } -+ } -+ return best_guess; -+} -+EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); -+ -+/* -+ * This function currently is only a helper for the i386 smp boot process where -+ * we need to reprogram the ioredtbls to cater for the cpus which have come online -+ * so mask in all cases should simply be TARGET_CPUS -+ */ -+#ifdef CONFIG_SMP -+#ifndef CONFIG_XEN -+void __init setup_ioapic_dest(void) -+{ -+ int pin, ioapic, irq, irq_entry; -+ -+ if (skip_ioapic_setup == 1) -+ return; -+ -+ for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { -+ for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { -+ irq_entry = find_irq_entry(ioapic, pin, mp_INT); -+ if (irq_entry == -1) -+ continue; -+ irq = pin_2_irq(irq_entry, ioapic, pin); -+ set_ioapic_affinity_irq(irq, TARGET_CPUS); -+ } -+ -+ } -+} -+#endif /* !CONFIG_XEN */ -+#endif -+ -+/* -+ * EISA Edge/Level control register, ELCR -+ */ -+static int EISA_ELCR(unsigned int irq) -+{ -+ if (irq < 16) { -+ unsigned int port = 0x4d0 + (irq >> 3); -+ return (inb(port) >> (irq & 7)) & 1; -+ } -+ apic_printk(APIC_VERBOSE, KERN_INFO -+ "Broken MPtable reports ISA irq %d\n", irq); -+ return 0; -+} -+ -+/* EISA interrupts are always polarity zero and can be edge or level -+ * trigger depending on the ELCR value. If an interrupt is listed as -+ * EISA conforming in the MP table, that means its trigger type must -+ * be read in from the ELCR */ -+ -+#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq)) -+#define default_EISA_polarity(idx) (0) -+ -+/* ISA interrupts are always polarity zero edge triggered, -+ * when listed as conforming in the MP table. */ -+ -+#define default_ISA_trigger(idx) (0) -+#define default_ISA_polarity(idx) (0) -+ -+/* PCI interrupts are always polarity one level triggered, -+ * when listed as conforming in the MP table. */ -+ -+#define default_PCI_trigger(idx) (1) -+#define default_PCI_polarity(idx) (1) -+ -+/* MCA interrupts are always polarity zero level triggered, -+ * when listed as conforming in the MP table. */ -+ -+#define default_MCA_trigger(idx) (1) -+#define default_MCA_polarity(idx) (0) -+ -+static int __init MPBIOS_polarity(int idx) -+{ -+ int bus = mp_irqs[idx].mpc_srcbus; -+ int polarity; -+ -+ /* -+ * Determine IRQ line polarity (high active or low active): -+ */ -+ switch (mp_irqs[idx].mpc_irqflag & 3) -+ { -+ case 0: /* conforms, ie. 
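/*
 * Standalone decode (not part of the patch) of the MP-table srcbusirq
 * field as matched by IO_APIC_get_PCI_irq_vector() above: for PCI
 * sources the device/slot number sits in bits 6..2 and the INTx pin
 * in bits 1..0 (0 = INTA).  The value 0x4e is an invented example.
 */
#include <stdio.h>

int main(void)
{
	unsigned char srcbusirq = 0x4e;
	int slot = (srcbusirq >> 2) & 0x1f;
	int pin  = srcbusirq & 3;

	printf("slot %d, INT%c#\n", slot, 'A' + pin);	/* slot 19, INTC# */
	return 0;
}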
bus-type dependent polarity */ -+ { -+ switch (mp_bus_id_to_type[bus]) -+ { -+ case MP_BUS_ISA: /* ISA pin */ -+ { -+ polarity = default_ISA_polarity(idx); -+ break; -+ } -+ case MP_BUS_EISA: /* EISA pin */ -+ { -+ polarity = default_EISA_polarity(idx); -+ break; -+ } -+ case MP_BUS_PCI: /* PCI pin */ -+ { -+ polarity = default_PCI_polarity(idx); -+ break; -+ } -+ case MP_BUS_MCA: /* MCA pin */ -+ { -+ polarity = default_MCA_polarity(idx); -+ break; -+ } -+ default: -+ { -+ printk(KERN_WARNING "broken BIOS!!\n"); -+ polarity = 1; -+ break; -+ } -+ } -+ break; -+ } -+ case 1: /* high active */ -+ { -+ polarity = 0; -+ break; -+ } -+ case 2: /* reserved */ -+ { -+ printk(KERN_WARNING "broken BIOS!!\n"); -+ polarity = 1; -+ break; -+ } -+ case 3: /* low active */ -+ { -+ polarity = 1; -+ break; -+ } -+ default: /* invalid */ -+ { -+ printk(KERN_WARNING "broken BIOS!!\n"); -+ polarity = 1; -+ break; -+ } -+ } -+ return polarity; -+} -+ -+static int MPBIOS_trigger(int idx) -+{ -+ int bus = mp_irqs[idx].mpc_srcbus; -+ int trigger; -+ -+ /* -+ * Determine IRQ trigger mode (edge or level sensitive): -+ */ -+ switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) -+ { -+ case 0: /* conforms, ie. bus-type dependent */ -+ { -+ switch (mp_bus_id_to_type[bus]) -+ { -+ case MP_BUS_ISA: /* ISA pin */ -+ { -+ trigger = default_ISA_trigger(idx); -+ break; -+ } -+ case MP_BUS_EISA: /* EISA pin */ -+ { -+ trigger = default_EISA_trigger(idx); -+ break; -+ } -+ case MP_BUS_PCI: /* PCI pin */ -+ { -+ trigger = default_PCI_trigger(idx); -+ break; -+ } -+ case MP_BUS_MCA: /* MCA pin */ -+ { -+ trigger = default_MCA_trigger(idx); -+ break; -+ } -+ default: -+ { -+ printk(KERN_WARNING "broken BIOS!!\n"); -+ trigger = 1; -+ break; -+ } -+ } -+ break; -+ } -+ case 1: /* edge */ -+ { -+ trigger = 0; -+ break; -+ } -+ case 2: /* reserved */ -+ { -+ printk(KERN_WARNING "broken BIOS!!\n"); -+ trigger = 1; -+ break; -+ } -+ case 3: /* level */ -+ { -+ trigger = 1; -+ break; -+ } -+ default: /* invalid */ -+ { -+ printk(KERN_WARNING "broken BIOS!!\n"); -+ trigger = 0; -+ break; -+ } -+ } -+ return trigger; -+} -+ -+static inline int irq_polarity(int idx) -+{ -+ return MPBIOS_polarity(idx); -+} -+ -+static inline int irq_trigger(int idx) -+{ -+ return MPBIOS_trigger(idx); -+} -+ -+static int pin_2_irq(int idx, int apic, int pin) -+{ -+ int irq, i; -+ int bus = mp_irqs[idx].mpc_srcbus; -+ -+ /* -+ * Debugging check, we are in big trouble if this message pops up! -+ */ -+ if (mp_irqs[idx].mpc_dstirq != pin) -+ printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); -+ -+ switch (mp_bus_id_to_type[bus]) -+ { -+ case MP_BUS_ISA: /* ISA pin */ -+ case MP_BUS_EISA: -+ case MP_BUS_MCA: -+ { -+ irq = mp_irqs[idx].mpc_srcbusirq; -+ break; -+ } -+ case MP_BUS_PCI: /* PCI pin */ -+ { -+ /* -+ * PCI IRQs are mapped in order -+ */ -+ i = irq = 0; -+ while (i < apic) -+ irq += nr_ioapic_registers[i++]; -+ irq += pin; -+ -+ /* -+ * For MPS mode, so far only needed by ES7000 platform -+ */ -+ if (ioapic_renumber_irq) -+ irq = ioapic_renumber_irq(apic, irq); -+ -+ break; -+ } -+ default: -+ { -+ printk(KERN_ERR "unknown bus type %d.\n",bus); -+ irq = 0; -+ break; -+ } -+ } -+ -+ /* -+ * PCI IRQ command line redirection. Yes, limits are hardcoded. 
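/*
 * Standalone decode (not part of the patch) of the MP-table irqflag
 * word interpreted by MPBIOS_polarity()/MPBIOS_trigger() above: bits
 * 1..0 give the polarity, bits 3..2 the trigger mode, and 0 means
 * "conforms to the bus" in both fields.  0x0f is an invented sample
 * (active low, level triggered).
 */
#include <stdio.h>

static const char *pol[4]  = { "conforms", "active high", "reserved", "active low" };
static const char *trig[4] = { "conforms", "edge", "reserved", "level" };

int main(void)
{
	unsigned short irqflag = 0x0f;

	printf("polarity: %s\n", pol[irqflag & 3]);		/* active low */
	printf("trigger : %s\n", trig[(irqflag >> 2) & 3]);	/* level */
	return 0;
}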
-+ */ -+ if ((pin >= 16) && (pin <= 23)) { -+ if (pirq_entries[pin-16] != -1) { -+ if (!pirq_entries[pin-16]) { -+ apic_printk(APIC_VERBOSE, KERN_DEBUG -+ "disabling PIRQ%d\n", pin-16); -+ } else { -+ irq = pirq_entries[pin-16]; -+ apic_printk(APIC_VERBOSE, KERN_DEBUG -+ "using PIRQ%d -> IRQ %d\n", -+ pin-16, irq); -+ } -+ } -+ } -+ return irq; -+} -+ -+static inline int IO_APIC_irq_trigger(int irq) -+{ -+ int apic, idx, pin; -+ -+ for (apic = 0; apic < nr_ioapics; apic++) { -+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { -+ idx = find_irq_entry(apic,pin,mp_INT); -+ if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin))) -+ return irq_trigger(idx); -+ } -+ } -+ /* -+ * nonexistent IRQs are edge default -+ */ -+ return 0; -+} -+ -+/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */ -+static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly; /* = { FIRST_DEVICE_VECTOR , 0 }; */ -+ -+static int __assign_irq_vector(int irq) -+{ -+ int vector; -+ struct physdev_irq irq_op; -+ -+ BUG_ON((unsigned)irq >= NR_IRQ_VECTORS); -+ -+ if (irq_vector[irq] > 0) -+ return irq_vector[irq]; -+ irq_op.irq = irq; -+ if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) -+ return -ENOSPC; -+ -+ vector = irq_op.vector; -+ irq_vector[irq] = vector; -+ -+ return vector; -+} -+ -+static int assign_irq_vector(int irq) -+{ -+ unsigned long flags; -+ int vector; -+ -+ spin_lock_irqsave(&vector_lock, flags); -+ vector = __assign_irq_vector(irq); -+ spin_unlock_irqrestore(&vector_lock, flags); -+ -+ return vector; -+} -+#ifndef CONFIG_XEN -+static struct irq_chip ioapic_chip; -+ -+#define IOAPIC_AUTO -1 -+#define IOAPIC_EDGE 0 -+#define IOAPIC_LEVEL 1 -+ -+static void ioapic_register_intr(int irq, int vector, unsigned long trigger) -+{ -+ if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || -+ trigger == IOAPIC_LEVEL) -+ set_irq_chip_and_handler_name(irq, &ioapic_chip, -+ handle_fasteoi_irq, "fasteoi"); -+ else -+ set_irq_chip_and_handler_name(irq, &ioapic_chip, -+ handle_edge_irq, "edge"); -+ set_intr_gate(vector, interrupt[irq]); -+} -+#else -+#define ioapic_register_intr(_irq,_vector,_trigger) ((void)0) -+#endif -+ -+static void __init setup_IO_APIC_irqs(void) -+{ -+ struct IO_APIC_route_entry entry; -+ int apic, pin, idx, irq, first_notcon = 1, vector; -+ unsigned long flags; -+ -+ apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); -+ -+ for (apic = 0; apic < nr_ioapics; apic++) { -+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { -+ -+ /* -+ * add it to the IO-APIC irq-routing table: -+ */ -+ memset(&entry,0,sizeof(entry)); -+ -+ entry.delivery_mode = INT_DELIVERY_MODE; -+ entry.dest_mode = INT_DEST_MODE; -+ entry.mask = 0; /* enable IRQ */ -+ entry.dest.logical.logical_dest = -+ cpu_mask_to_apicid(TARGET_CPUS); -+ -+ idx = find_irq_entry(apic,pin,mp_INT); -+ if (idx == -1) { -+ if (first_notcon) { -+ apic_printk(APIC_VERBOSE, KERN_DEBUG -+ " IO-APIC (apicid-pin) %d-%d", -+ mp_ioapics[apic].mpc_apicid, -+ pin); -+ first_notcon = 0; -+ } else -+ apic_printk(APIC_VERBOSE, ", %d-%d", -+ mp_ioapics[apic].mpc_apicid, pin); -+ continue; -+ } -+ -+ entry.trigger = irq_trigger(idx); -+ entry.polarity = irq_polarity(idx); -+ -+ if (irq_trigger(idx)) { -+ entry.trigger = 1; -+ entry.mask = 1; -+ } -+ -+ irq = pin_2_irq(idx, apic, pin); -+ /* -+ * skip adding the timer int on secondary nodes, which causes -+ * a small but painful rift in the time-space continuum -+ */ -+ if (multi_timer_check(apic, irq)) -+ continue; -+ else -+ add_pin_to_irq(irq, apic, pin); -+ -+ if 
(/*!apic &&*/ !IO_APIC_IRQ(irq)) -+ continue; -+ -+ if (IO_APIC_IRQ(irq)) { -+ vector = assign_irq_vector(irq); -+ entry.vector = vector; -+ ioapic_register_intr(irq, vector, IOAPIC_AUTO); -+ -+ if (!apic && (irq < 16)) -+ disable_8259A_irq(irq); -+ } -+ spin_lock_irqsave(&ioapic_lock, flags); -+ __ioapic_write_entry(apic, pin, entry); -+ spin_unlock_irqrestore(&ioapic_lock, flags); -+ } -+ } -+ -+ if (!first_notcon) -+ apic_printk(APIC_VERBOSE, " not connected.\n"); -+} -+ -+/* -+ * Set up the 8259A-master output pin: -+ */ -+#ifndef CONFIG_XEN -+static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector) -+{ -+ struct IO_APIC_route_entry entry; -+ -+ memset(&entry,0,sizeof(entry)); -+ -+ disable_8259A_irq(0); -+ -+ /* mask LVT0 */ -+ apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); -+ -+ /* -+ * We use logical delivery to get the timer IRQ -+ * to the first CPU. -+ */ -+ entry.dest_mode = INT_DEST_MODE; -+ entry.mask = 0; /* unmask IRQ now */ -+ entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); -+ entry.delivery_mode = INT_DELIVERY_MODE; -+ entry.polarity = 0; -+ entry.trigger = 0; -+ entry.vector = vector; -+ -+ /* -+ * The timer IRQ doesn't have to know that behind the -+ * scene we have a 8259A-master in AEOI mode ... -+ */ -+ irq_desc[0].chip = &ioapic_chip; -+ set_irq_handler(0, handle_edge_irq); -+ -+ /* -+ * Add it to the IO-APIC irq-routing table: -+ */ -+ ioapic_write_entry(apic, pin, entry); -+ -+ enable_8259A_irq(0); -+} -+ -+static inline void UNEXPECTED_IO_APIC(void) -+{ -+} -+ -+void __init print_IO_APIC(void) -+{ -+ int apic, i; -+ union IO_APIC_reg_00 reg_00; -+ union IO_APIC_reg_01 reg_01; -+ union IO_APIC_reg_02 reg_02; -+ union IO_APIC_reg_03 reg_03; -+ unsigned long flags; -+ -+ if (apic_verbosity == APIC_QUIET) -+ return; -+ -+ printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); -+ for (i = 0; i < nr_ioapics; i++) -+ printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", -+ mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]); -+ -+ /* -+ * We are a bit conservative about what we expect. We have to -+ * know about every hardware change ASAP. -+ */ -+ printk(KERN_INFO "testing the IO APIC.......................\n"); -+ -+ for (apic = 0; apic < nr_ioapics; apic++) { -+ -+ spin_lock_irqsave(&ioapic_lock, flags); -+ reg_00.raw = io_apic_read(apic, 0); -+ reg_01.raw = io_apic_read(apic, 1); -+ if (reg_01.bits.version >= 0x10) -+ reg_02.raw = io_apic_read(apic, 2); -+ if (reg_01.bits.version >= 0x20) -+ reg_03.raw = io_apic_read(apic, 3); -+ spin_unlock_irqrestore(&ioapic_lock, flags); -+ -+ printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid); -+ printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); -+ printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); -+ printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); -+ printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); -+ if (reg_00.bits.ID >= get_physical_broadcast()) -+ UNEXPECTED_IO_APIC(); -+ if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2) -+ UNEXPECTED_IO_APIC(); -+ -+ printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw); -+ printk(KERN_DEBUG "....... 
: max redirection entries: %04X\n", reg_01.bits.entries); -+ if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */ -+ (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */ -+ (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */ -+ (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */ -+ (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */ -+ (reg_01.bits.entries != 0x2E) && -+ (reg_01.bits.entries != 0x3F) -+ ) -+ UNEXPECTED_IO_APIC(); -+ -+ printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); -+ printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); -+ if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */ -+ (reg_01.bits.version != 0x10) && /* oldest IO-APICs */ -+ (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */ -+ (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */ -+ (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */ -+ ) -+ UNEXPECTED_IO_APIC(); -+ if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2) -+ UNEXPECTED_IO_APIC(); -+ -+ /* -+ * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, -+ * but the value of reg_02 is read as the previous read register -+ * value, so ignore it if reg_02 == reg_01. -+ */ -+ if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) { -+ printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); -+ printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); -+ if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2) -+ UNEXPECTED_IO_APIC(); -+ } -+ -+ /* -+ * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02 -+ * or reg_03, but the value of reg_0[23] is read as the previous read -+ * register value, so ignore it if reg_03 == reg_0[12]. -+ */ -+ if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw && -+ reg_03.raw != reg_01.raw) { -+ printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw); -+ printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT); -+ if (reg_03.bits.__reserved_1) -+ UNEXPECTED_IO_APIC(); -+ } -+ -+ printk(KERN_DEBUG ".... IRQ redirection table:\n"); -+ -+ printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol" -+ " Stat Dest Deli Vect: \n"); -+ -+ for (i = 0; i <= reg_01.bits.entries; i++) { -+ struct IO_APIC_route_entry entry; -+ -+ entry = ioapic_read_entry(apic, i); -+ -+ printk(KERN_DEBUG " %02x %03X %02X ", -+ i, -+ entry.dest.logical.logical_dest, -+ entry.dest.physical.physical_dest -+ ); -+ -+ printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", -+ entry.mask, -+ entry.trigger, -+ entry.irr, -+ entry.polarity, -+ entry.delivery_status, -+ entry.dest_mode, -+ entry.delivery_mode, -+ entry.vector -+ ); -+ } -+ } -+ printk(KERN_DEBUG "IRQ to pin mappings:\n"); -+ for (i = 0; i < NR_IRQS; i++) { -+ struct irq_pin_list *entry = irq_2_pin + i; -+ if (entry->pin < 0) -+ continue; -+ printk(KERN_DEBUG "IRQ%d ", i); -+ for (;;) { -+ printk("-> %d:%d", entry->apic, entry->pin); -+ if (!entry->next) -+ break; -+ entry = irq_2_pin + entry->next; -+ } -+ printk("\n"); -+ } -+ -+ printk(KERN_INFO ".................................... done.\n"); -+ -+ return; -+} -+ -+#if 0 -+ -+static void print_APIC_bitfield (int base) -+{ -+ unsigned int v; -+ int i, j; -+ -+ if (apic_verbosity == APIC_QUIET) -+ return; -+ -+ printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); -+ for (i = 0; i < 8; i++) { -+ v = apic_read(base + i*0x10); -+ for (j = 0; j < 32; j++) { -+ if (v & (1< 3) /* Due to the Pentium erratum 3AP. 
*/ -+ apic_write(APIC_ESR, 0); -+ v = apic_read(APIC_ESR); -+ printk(KERN_DEBUG "... APIC ESR: %08x\n", v); -+ } -+ -+ v = apic_read(APIC_ICR); -+ printk(KERN_DEBUG "... APIC ICR: %08x\n", v); -+ v = apic_read(APIC_ICR2); -+ printk(KERN_DEBUG "... APIC ICR2: %08x\n", v); -+ -+ v = apic_read(APIC_LVTT); -+ printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); -+ -+ if (maxlvt > 3) { /* PC is LVT#4. */ -+ v = apic_read(APIC_LVTPC); -+ printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v); -+ } -+ v = apic_read(APIC_LVT0); -+ printk(KERN_DEBUG "... APIC LVT0: %08x\n", v); -+ v = apic_read(APIC_LVT1); -+ printk(KERN_DEBUG "... APIC LVT1: %08x\n", v); -+ -+ if (maxlvt > 2) { /* ERR is LVT#3. */ -+ v = apic_read(APIC_LVTERR); -+ printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v); -+ } -+ -+ v = apic_read(APIC_TMICT); -+ printk(KERN_DEBUG "... APIC TMICT: %08x\n", v); -+ v = apic_read(APIC_TMCCT); -+ printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); -+ v = apic_read(APIC_TDCR); -+ printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); -+ printk("\n"); -+} -+ -+void print_all_local_APICs (void) -+{ -+ on_each_cpu(print_local_APIC, NULL, 1, 1); -+} -+ -+void /*__init*/ print_PIC(void) -+{ -+ unsigned int v; -+ unsigned long flags; -+ -+ if (apic_verbosity == APIC_QUIET) -+ return; -+ -+ printk(KERN_DEBUG "\nprinting PIC contents\n"); -+ -+ spin_lock_irqsave(&i8259A_lock, flags); -+ -+ v = inb(0xa1) << 8 | inb(0x21); -+ printk(KERN_DEBUG "... PIC IMR: %04x\n", v); -+ -+ v = inb(0xa0) << 8 | inb(0x20); -+ printk(KERN_DEBUG "... PIC IRR: %04x\n", v); -+ -+ outb(0x0b,0xa0); -+ outb(0x0b,0x20); -+ v = inb(0xa0) << 8 | inb(0x20); -+ outb(0x0a,0xa0); -+ outb(0x0a,0x20); -+ -+ spin_unlock_irqrestore(&i8259A_lock, flags); -+ -+ printk(KERN_DEBUG "... PIC ISR: %04x\n", v); -+ -+ v = inb(0x4d1) << 8 | inb(0x4d0); -+ printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); -+} -+ -+#endif /* 0 */ -+ -+#else -+void __init print_IO_APIC(void) { } -+#endif /* !CONFIG_XEN */ -+ -+static void __init enable_IO_APIC(void) -+{ -+ union IO_APIC_reg_01 reg_01; -+ int i8259_apic, i8259_pin; -+ int i, apic; -+ unsigned long flags; -+ -+ for (i = 0; i < PIN_MAP_SIZE; i++) { -+ irq_2_pin[i].pin = -1; -+ irq_2_pin[i].next = 0; -+ } -+ if (!pirqs_enabled) -+ for (i = 0; i < MAX_PIRQS; i++) -+ pirq_entries[i] = -1; -+ -+ /* -+ * The number of IO-APIC IRQ registers (== #pins): -+ */ -+ for (apic = 0; apic < nr_ioapics; apic++) { -+ spin_lock_irqsave(&ioapic_lock, flags); -+ reg_01.raw = io_apic_read(apic, 1); -+ spin_unlock_irqrestore(&ioapic_lock, flags); -+ nr_ioapic_registers[apic] = reg_01.bits.entries+1; -+ } -+ for(apic = 0; apic < nr_ioapics; apic++) { -+ int pin; -+ /* See if any of the pins is in ExtINT mode */ -+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { -+ struct IO_APIC_route_entry entry; -+ entry = ioapic_read_entry(apic, pin); -+ -+ -+ /* If the interrupt line is enabled and in ExtInt mode -+ * I have found the pin where the i8259 is connected. -+ */ -+ if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) { -+ ioapic_i8259.apic = apic; -+ ioapic_i8259.pin = pin; -+ goto found_i8259; -+ } -+ } -+ } -+ found_i8259: -+ /* Look to see what if the MP table has reported the ExtINT */ -+ /* If we could not find the appropriate pin by looking at the ioapic -+ * the i8259 probably is not connected the ioapic but give the -+ * mptable a chance anyway. 
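/*
 * Userspace sketch (not part of the patch; x86 Linux only, needs root)
 * of the OCW3 trick print_PIC() above relies on: writing 0x0a or 0x0b
 * to an 8259A command port selects whether the next read of that port
 * returns the IRR or the ISR.  inb()/outb() here are the glibc port
 * helpers, standing in for the kernel's accessors.
 */
#include <stdio.h>
#include <sys/io.h>	/* iopl(), inb(), outb() */

int main(void)
{
	unsigned int v;

	if (iopl(3)) { perror("iopl"); return 1; }

	v = inb(0xa1) << 8 | inb(0x21);		/* IMR: data ports directly */
	printf("PIC IMR: %04x\n", v);

	outb(0x0a, 0x20); outb(0x0a, 0xa0);	/* OCW3: next read = IRR */
	v = inb(0xa0) << 8 | inb(0x20);
	printf("PIC IRR: %04x\n", v);

	outb(0x0b, 0x20); outb(0x0b, 0xa0);	/* OCW3: next read = ISR */
	v = inb(0xa0) << 8 | inb(0x20);
	printf("PIC ISR: %04x\n", v);
	return 0;
}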
-+ */ -+ i8259_pin = find_isa_irq_pin(0, mp_ExtINT); -+ i8259_apic = find_isa_irq_apic(0, mp_ExtINT); -+ /* Trust the MP table if nothing is setup in the hardware */ -+ if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) { -+ printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n"); -+ ioapic_i8259.pin = i8259_pin; -+ ioapic_i8259.apic = i8259_apic; -+ } -+ /* Complain if the MP table and the hardware disagree */ -+ if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) && -+ (i8259_pin >= 0) && (ioapic_i8259.pin >= 0)) -+ { -+ printk(KERN_WARNING "ExtINT in hardware and MP table differ\n"); -+ } -+ -+ /* -+ * Do not trust the IO-APIC being empty at bootup -+ */ -+ clear_IO_APIC(); -+} -+ -+/* -+ * Not an __init, needed by the reboot code -+ */ -+void disable_IO_APIC(void) -+{ -+ /* -+ * Clear the IO-APIC before rebooting: -+ */ -+ clear_IO_APIC(); -+ -+#ifndef CONFIG_XEN -+ /* -+ * If the i8259 is routed through an IOAPIC -+ * Put that IOAPIC in virtual wire mode -+ * so legacy interrupts can be delivered. -+ */ -+ if (ioapic_i8259.pin != -1) { -+ struct IO_APIC_route_entry entry; -+ -+ memset(&entry, 0, sizeof(entry)); -+ entry.mask = 0; /* Enabled */ -+ entry.trigger = 0; /* Edge */ -+ entry.irr = 0; -+ entry.polarity = 0; /* High */ -+ entry.delivery_status = 0; -+ entry.dest_mode = 0; /* Physical */ -+ entry.delivery_mode = dest_ExtINT; /* ExtInt */ -+ entry.vector = 0; -+ entry.dest.physical.physical_dest = -+ GET_APIC_ID(apic_read(APIC_ID)); -+ -+ /* -+ * Add it to the IO-APIC irq-routing table: -+ */ -+ ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); -+ } -+ disconnect_bsp_APIC(ioapic_i8259.pin != -1); -+#endif -+} -+ -+/* -+ * function to set the IO-APIC physical IDs based on the -+ * values stored in the MPC table. -+ * -+ * by Matt Domsch Tue Dec 21 12:25:05 CST 1999 -+ */ -+ -+#if !defined(CONFIG_XEN) && !defined(CONFIG_X86_NUMAQ) -+static void __init setup_ioapic_ids_from_mpc(void) -+{ -+ union IO_APIC_reg_00 reg_00; -+ physid_mask_t phys_id_present_map; -+ int apic; -+ int i; -+ unsigned char old_id; -+ unsigned long flags; -+ -+ /* -+ * Don't check I/O APIC IDs for xAPIC systems. They have -+ * no meaning without the serial APIC bus. -+ */ -+ if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) -+ || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) -+ return; -+ /* -+ * This is broken; anything with a real cpu count has to -+ * circumvent this idiocy regardless. -+ */ -+ phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map); -+ -+ /* -+ * Set the IOAPIC ID to the value stored in the MPC table. -+ */ -+ for (apic = 0; apic < nr_ioapics; apic++) { -+ -+ /* Read the register 0 value */ -+ spin_lock_irqsave(&ioapic_lock, flags); -+ reg_00.raw = io_apic_read(apic, 0); -+ spin_unlock_irqrestore(&ioapic_lock, flags); -+ -+ old_id = mp_ioapics[apic].mpc_apicid; -+ -+ if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) { -+ printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", -+ apic, mp_ioapics[apic].mpc_apicid); -+ printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", -+ reg_00.bits.ID); -+ mp_ioapics[apic].mpc_apicid = reg_00.bits.ID; -+ } -+ -+ /* -+ * Sanity check, is the ID really free? Every APIC in a -+ * system must have a unique ID or we get lots of nice -+ * 'stuck on smp_invalidate_needed IPI wait' messages. 
-+ */ -+ if (check_apicid_used(phys_id_present_map, -+ mp_ioapics[apic].mpc_apicid)) { -+ printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", -+ apic, mp_ioapics[apic].mpc_apicid); -+ for (i = 0; i < get_physical_broadcast(); i++) -+ if (!physid_isset(i, phys_id_present_map)) -+ break; -+ if (i >= get_physical_broadcast()) -+ panic("Max APIC ID exceeded!\n"); -+ printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", -+ i); -+ physid_set(i, phys_id_present_map); -+ mp_ioapics[apic].mpc_apicid = i; -+ } else { -+ physid_mask_t tmp; -+ tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid); -+ apic_printk(APIC_VERBOSE, "Setting %d in the " -+ "phys_id_present_map\n", -+ mp_ioapics[apic].mpc_apicid); -+ physids_or(phys_id_present_map, phys_id_present_map, tmp); -+ } -+ -+ -+ /* -+ * We need to adjust the IRQ routing table -+ * if the ID changed. -+ */ -+ if (old_id != mp_ioapics[apic].mpc_apicid) -+ for (i = 0; i < mp_irq_entries; i++) -+ if (mp_irqs[i].mpc_dstapic == old_id) -+ mp_irqs[i].mpc_dstapic -+ = mp_ioapics[apic].mpc_apicid; -+ -+ /* -+ * Read the right value from the MPC table and -+ * write it into the ID register. -+ */ -+ apic_printk(APIC_VERBOSE, KERN_INFO -+ "...changing IO-APIC physical APIC ID to %d ...", -+ mp_ioapics[apic].mpc_apicid); -+ -+ reg_00.bits.ID = mp_ioapics[apic].mpc_apicid; -+ spin_lock_irqsave(&ioapic_lock, flags); -+ io_apic_write(apic, 0, reg_00.raw); -+ spin_unlock_irqrestore(&ioapic_lock, flags); -+ -+ /* -+ * Sanity check -+ */ -+ spin_lock_irqsave(&ioapic_lock, flags); -+ reg_00.raw = io_apic_read(apic, 0); -+ spin_unlock_irqrestore(&ioapic_lock, flags); -+ if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid) -+ printk("could not set ID!\n"); -+ else -+ apic_printk(APIC_VERBOSE, " ok.\n"); -+ } -+} -+#else -+static void __init setup_ioapic_ids_from_mpc(void) { } -+#endif -+ -+int no_timer_check __initdata; -+ -+static int __init notimercheck(char *s) -+{ -+ no_timer_check = 1; -+ return 1; -+} -+__setup("no_timer_check", notimercheck); -+ -+#ifndef CONFIG_XEN -+/* -+ * There is a nasty bug in some older SMP boards, their mptable lies -+ * about the timer IRQ. We do the following to work around the situation: -+ * -+ * - timer IRQ defaults to IO-APIC IRQ -+ * - if this function detects that timer IRQs are defunct, then we fall -+ * back to ISA timer IRQs -+ */ -+int __init timer_irq_works(void) -+{ -+ unsigned long t1 = jiffies; -+ -+ if (no_timer_check) -+ return 1; -+ -+ local_irq_enable(); -+ /* Let ten ticks pass... */ -+ mdelay((10 * 1000) / HZ); -+ -+ /* -+ * Expect a few ticks at least, to be sure some possible -+ * glue logic does not lock up after one or two first -+ * ticks in a non-ExtINT mode. Also the local APIC -+ * might have cached one ExtINT interrupt. Finally, at -+ * least one tick may be lost due to delays. -+ */ -+ if (jiffies - t1 > 4) -+ return 1; -+ -+ return 0; -+} -+ -+/* -+ * In the SMP+IOAPIC case it might happen that there are an unspecified -+ * number of pending IRQ events unhandled. These cases are very rare, -+ * so we 'resend' these IRQs via IPIs, to the same CPU. It's much -+ * better to do it this way as thus we do not have to be aware of -+ * 'pending' interrupts in the IRQ path, except at this point. -+ */ -+/* -+ * Edge triggered needs to resend any interrupt -+ * that was delayed but this is now handled in the device -+ * independent code. -+ */ -+ -+/* -+ * Startup quirk: -+ * -+ * Starting up a edge-triggered IO-APIC interrupt is -+ * nasty - we need to make sure that we get the edge. 
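/*
 * Arithmetic check (not part of the patch) for timer_irq_works()
 * above: mdelay((10 * 1000) / HZ) busy-waits roughly ten timer
 * periods, and seeing more than four elapsed jiffies afterwards
 * counts as "timer works", tolerating a few lost or cached ticks.
 */
#include <stdio.h>

int main(void)
{
	const int hz[] = { 100, 250, 1000 };	/* common HZ settings */
	int i;

	for (i = 0; i < 3; i++)
		printf("HZ=%4d: mdelay(%3d ms) ~ 10 ticks, pass if jiffies advanced by >4\n",
		       hz[i], (10 * 1000) / hz[i]);
	return 0;
}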
-+ * If it is already asserted for some reason, we need -+ * return 1 to indicate that is was pending. -+ * -+ * This is not complete - we should be able to fake -+ * an edge even if it isn't on the 8259A... -+ * -+ * (We do this for level-triggered IRQs too - it cannot hurt.) -+ */ -+static unsigned int startup_ioapic_irq(unsigned int irq) -+{ -+ int was_pending = 0; -+ unsigned long flags; -+ -+ spin_lock_irqsave(&ioapic_lock, flags); -+ if (irq < 16) { -+ disable_8259A_irq(irq); -+ if (i8259A_irq_pending(irq)) -+ was_pending = 1; -+ } -+ __unmask_IO_APIC_irq(irq); -+ spin_unlock_irqrestore(&ioapic_lock, flags); -+ -+ return was_pending; -+} -+ -+static void ack_ioapic_irq(unsigned int irq) -+{ -+ move_native_irq(irq); -+ ack_APIC_irq(); -+} -+ -+static void ack_ioapic_quirk_irq(unsigned int irq) -+{ -+ unsigned long v; -+ int i; -+ -+ move_native_irq(irq); -+/* -+ * It appears there is an erratum which affects at least version 0x11 -+ * of I/O APIC (that's the 82093AA and cores integrated into various -+ * chipsets). Under certain conditions a level-triggered interrupt is -+ * erroneously delivered as edge-triggered one but the respective IRR -+ * bit gets set nevertheless. As a result the I/O unit expects an EOI -+ * message but it will never arrive and further interrupts are blocked -+ * from the source. The exact reason is so far unknown, but the -+ * phenomenon was observed when two consecutive interrupt requests -+ * from a given source get delivered to the same CPU and the source is -+ * temporarily disabled in between. -+ * -+ * A workaround is to simulate an EOI message manually. We achieve it -+ * by setting the trigger mode to edge and then to level when the edge -+ * trigger mode gets detected in the TMR of a local APIC for a -+ * level-triggered interrupt. We mask the source for the time of the -+ * operation to prevent an edge-triggered interrupt escaping meanwhile. -+ * The idea is from Manfred Spraul. --macro -+ */ -+ i = irq_vector[irq]; -+ -+ v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); -+ -+ ack_APIC_irq(); -+ -+ if (!(v & (1 << (i & 0x1f)))) { -+ atomic_inc(&irq_mis_count); -+ spin_lock(&ioapic_lock); -+ __mask_and_edge_IO_APIC_irq(irq); -+ __unmask_and_level_IO_APIC_irq(irq); -+ spin_unlock(&ioapic_lock); -+ } -+} -+ -+static int ioapic_retrigger_irq(unsigned int irq) -+{ -+ send_IPI_self(irq_vector[irq]); -+ -+ return 1; -+} -+ -+static struct irq_chip ioapic_chip __read_mostly = { -+ .name = "IO-APIC", -+ .startup = startup_ioapic_irq, -+ .mask = mask_IO_APIC_irq, -+ .unmask = unmask_IO_APIC_irq, -+ .ack = ack_ioapic_irq, -+ .eoi = ack_ioapic_quirk_irq, -+#ifdef CONFIG_SMP -+ .set_affinity = set_ioapic_affinity_irq, -+#endif -+ .retrigger = ioapic_retrigger_irq, -+}; -+ -+#endif /* !CONFIG_XEN */ -+ -+static inline void init_IO_APIC_traps(void) -+{ -+ int irq; -+ -+ /* -+ * NOTE! The local APIC isn't very good at handling -+ * multiple interrupts at the same interrupt level. -+ * As the interrupt level is determined by taking the -+ * vector number and shifting that right by 4, we -+ * want to spread these out a bit so that they don't -+ * all fall in the same interrupt level. -+ * -+ * Also, we've got to be careful not to trash gate -+ * 0x80, because int 0x80 is hm, kind of importantish. ;) -+ */ -+ for (irq = 0; irq < NR_IRQS ; irq++) { -+ int tmp = irq; -+ if (IO_APIC_IRQ(tmp) && !irq_vector[tmp]) { -+ /* -+ * Hmm.. We don't have an entry for this, -+ * so default to an old-fashioned 8259 -+ * interrupt if we can.. 
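/*
 * Standalone check (not part of the patch) of the TMR indexing in
 * ack_ioapic_quirk_irq() above: local-APIC registers are 32 bits wide
 * and spaced 0x10 apart, so vector i is found in the TMR register at
 * byte offset ((i & ~0x1f) >> 1), bit (i & 0x1f).  Vector 0x31 is an
 * invented example.
 */
#include <stdio.h>

int main(void)
{
	int i = 0x31;				/* example vector */
	unsigned int off = (i & ~0x1f) >> 1;	/* 0x10: second TMR word */
	unsigned int bit = i & 0x1f;		/* bit 17 within it */

	printf("APIC_TMR + 0x%02x, bit %u\n", off, bit);
	return 0;
}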
-+ */ -+ if (irq < 16) -+ make_8259A_irq(irq); -+#ifndef CONFIG_XEN -+ else -+ /* Strange. Oh, well.. */ -+ irq_desc[irq].chip = &no_irq_chip; -+#endif -+ } -+ } -+} -+ -+#ifndef CONFIG_XEN -+/* -+ * The local APIC irq-chip implementation: -+ */ -+ -+static void ack_apic(unsigned int irq) -+{ -+ ack_APIC_irq(); -+} -+ -+static void mask_lapic_irq (unsigned int irq) -+{ -+ unsigned long v; -+ -+ v = apic_read(APIC_LVT0); -+ apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); -+} -+ -+static void unmask_lapic_irq (unsigned int irq) -+{ -+ unsigned long v; -+ -+ v = apic_read(APIC_LVT0); -+ apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED); -+} -+ -+static struct irq_chip lapic_chip __read_mostly = { -+ .name = "local-APIC-edge", -+ .mask = mask_lapic_irq, -+ .unmask = unmask_lapic_irq, -+ .eoi = ack_apic, -+}; -+ -+static void setup_nmi (void) -+{ -+ /* -+ * Dirty trick to enable the NMI watchdog ... -+ * We put the 8259A master into AEOI mode and -+ * unmask on all local APICs LVT0 as NMI. -+ * -+ * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') -+ * is from Maciej W. Rozycki - so we do not have to EOI from -+ * the NMI handler or the timer interrupt. -+ */ -+ apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); -+ -+ on_each_cpu(enable_NMI_through_LVT0, NULL, 1, 1); -+ -+ apic_printk(APIC_VERBOSE, " done.\n"); -+} -+ -+/* -+ * This looks a bit hackish but it's about the only one way of sending -+ * a few INTA cycles to 8259As and any associated glue logic. ICR does -+ * not support the ExtINT mode, unfortunately. We need to send these -+ * cycles as some i82489DX-based boards have glue logic that keeps the -+ * 8259A interrupt line asserted until INTA. --macro -+ */ -+static inline void unlock_ExtINT_logic(void) -+{ -+ int apic, pin, i; -+ struct IO_APIC_route_entry entry0, entry1; -+ unsigned char save_control, save_freq_select; -+ -+ pin = find_isa_irq_pin(8, mp_INT); -+ if (pin == -1) { -+ WARN_ON_ONCE(1); -+ return; -+ } -+ apic = find_isa_irq_apic(8, mp_INT); -+ if (apic == -1) { -+ WARN_ON_ONCE(1); -+ return; -+ } -+ -+ entry0 = ioapic_read_entry(apic, pin); -+ clear_IO_APIC_pin(apic, pin); -+ -+ memset(&entry1, 0, sizeof(entry1)); -+ -+ entry1.dest_mode = 0; /* physical delivery */ -+ entry1.mask = 0; /* unmask IRQ now */ -+ entry1.dest.physical.physical_dest = hard_smp_processor_id(); -+ entry1.delivery_mode = dest_ExtINT; -+ entry1.polarity = entry0.polarity; -+ entry1.trigger = 0; -+ entry1.vector = 0; -+ -+ ioapic_write_entry(apic, pin, entry1); -+ -+ save_control = CMOS_READ(RTC_CONTROL); -+ save_freq_select = CMOS_READ(RTC_FREQ_SELECT); -+ CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6, -+ RTC_FREQ_SELECT); -+ CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL); -+ -+ i = 100; -+ while (i-- > 0) { -+ mdelay(10); -+ if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF) -+ i -= 10; -+ } -+ -+ CMOS_WRITE(save_control, RTC_CONTROL); -+ CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); -+ clear_IO_APIC_pin(apic, pin); -+ -+ ioapic_write_entry(apic, pin, entry0); -+} -+#endif /* !CONFIG_XEN */ -+ -+int timer_uses_ioapic_pin_0; -+ -+#ifndef CONFIG_XEN -+/* -+ * This code may look a bit paranoid, but it's supposed to cooperate with -+ * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ -+ * is so screwy. Thanks to Brian Perkins for testing/hacking this beast -+ * fanatically on his truly buggy board. 
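/*
 * Side calculation (not part of the patch) for unlock_ExtINT_logic()
 * above: the MC146818 RTC fires periodic interrupts at
 * 32768 >> (rate - 1) Hz, so the rate 0x6 programmed into
 * RTC_FREQ_SELECT gives 1024 interrupts per second -- ample INTA
 * cycles for the polling loop that follows.
 */
#include <stdio.h>

int main(void)
{
	unsigned int rate = 0x6;

	printf("RTC rate 0x%x -> %u Hz\n", rate, 32768u >> (rate - 1));
	return 0;
}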
-+ */ -+static inline void __init check_timer(void) -+{ -+ int apic1, pin1, apic2, pin2; -+ int vector; -+ -+ /* -+ * get/set the timer IRQ vector: -+ */ -+ disable_8259A_irq(0); -+ vector = assign_irq_vector(0); -+ set_intr_gate(vector, interrupt[0]); -+ -+ /* -+ * Subtle, code in do_timer_interrupt() expects an AEOI -+ * mode for the 8259A whenever interrupts are routed -+ * through I/O APICs. Also IRQ0 has to be enabled in -+ * the 8259A which implies the virtual wire has to be -+ * disabled in the local APIC. -+ */ -+ apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); -+ init_8259A(1); -+ timer_ack = 1; -+ if (timer_over_8254 > 0) -+ enable_8259A_irq(0); -+ -+ pin1 = find_isa_irq_pin(0, mp_INT); -+ apic1 = find_isa_irq_apic(0, mp_INT); -+ pin2 = ioapic_i8259.pin; -+ apic2 = ioapic_i8259.apic; -+ -+ if (pin1 == 0) -+ timer_uses_ioapic_pin_0 = 1; -+ -+ printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", -+ vector, apic1, pin1, apic2, pin2); -+ -+ if (pin1 != -1) { -+ /* -+ * Ok, does IRQ0 through the IOAPIC work? -+ */ -+ unmask_IO_APIC_irq(0); -+ if (timer_irq_works()) { -+ if (nmi_watchdog == NMI_IO_APIC) { -+ disable_8259A_irq(0); -+ setup_nmi(); -+ enable_8259A_irq(0); -+ } -+ if (disable_timer_pin_1 > 0) -+ clear_IO_APIC_pin(0, pin1); -+ return; -+ } -+ clear_IO_APIC_pin(apic1, pin1); -+ printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to " -+ "IO-APIC\n"); -+ } -+ -+ printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... "); -+ if (pin2 != -1) { -+ printk("\n..... (found pin %d) ...", pin2); -+ /* -+ * legacy devices should be connected to IO APIC #0 -+ */ -+ setup_ExtINT_IRQ0_pin(apic2, pin2, vector); -+ if (timer_irq_works()) { -+ printk("works.\n"); -+ if (pin1 != -1) -+ replace_pin_at_irq(0, apic1, pin1, apic2, pin2); -+ else -+ add_pin_to_irq(0, apic2, pin2); -+ if (nmi_watchdog == NMI_IO_APIC) { -+ setup_nmi(); -+ } -+ return; -+ } -+ /* -+ * Cleanup, just in case ... -+ */ -+ clear_IO_APIC_pin(apic2, pin2); -+ } -+ printk(" failed.\n"); -+ -+ if (nmi_watchdog == NMI_IO_APIC) { -+ printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); -+ nmi_watchdog = 0; -+ } -+ -+ printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); -+ -+ disable_8259A_irq(0); -+ set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq, -+ "fasteoi"); -+ apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ -+ enable_8259A_irq(0); -+ -+ if (timer_irq_works()) { -+ printk(" works.\n"); -+ return; -+ } -+ apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector); -+ printk(" failed.\n"); -+ -+ printk(KERN_INFO "...trying to set up timer as ExtINT IRQ..."); -+ -+ timer_ack = 0; -+ init_8259A(0); -+ make_8259A_irq(0); -+ apic_write_around(APIC_LVT0, APIC_DM_EXTINT); -+ -+ unlock_ExtINT_logic(); -+ -+ if (timer_irq_works()) { -+ printk(" works.\n"); -+ return; -+ } -+ printk(" failed :(.\n"); -+ panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " -+ "report. Then try booting with the 'noapic' option"); -+} -+#else -+#define check_timer() ((void)0) -+#endif /* CONFIG_XEN */ -+ -+/* -+ * -+ * IRQ's that are handled by the PIC in the MPS IOAPIC case. -+ * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ. -+ * Linux doesn't really care, as it's not actually used -+ * for any interrupt handling anyway. 
-+ */ -+#define PIC_IRQS (1 << PIC_CASCADE_IR) -+ -+void __init setup_IO_APIC(void) -+{ -+ enable_IO_APIC(); -+ -+ if (acpi_ioapic) -+ io_apic_irqs = ~0; /* all IRQs go through IOAPIC */ -+ else -+ io_apic_irqs = ~PIC_IRQS; -+ -+ printk("ENABLING IO-APIC IRQs\n"); -+ -+ /* -+ * Set up IO-APIC IRQ routing. -+ */ -+ if (!acpi_ioapic) -+ setup_ioapic_ids_from_mpc(); -+#ifndef CONFIG_XEN -+ sync_Arb_IDs(); -+#endif -+ setup_IO_APIC_irqs(); -+ init_IO_APIC_traps(); -+ check_timer(); -+ if (!acpi_ioapic) -+ print_IO_APIC(); -+} -+ -+static int __init setup_disable_8254_timer(char *s) -+{ -+ timer_over_8254 = -1; -+ return 1; -+} -+static int __init setup_enable_8254_timer(char *s) -+{ -+ timer_over_8254 = 2; -+ return 1; -+} -+ -+__setup("disable_8254_timer", setup_disable_8254_timer); -+__setup("enable_8254_timer", setup_enable_8254_timer); -+ -+/* -+ * Called after all the initialization is done. If we didnt find any -+ * APIC bugs then we can allow the modify fast path -+ */ -+ -+static int __init io_apic_bug_finalize(void) -+{ -+ if(sis_apic_bug == -1) -+ sis_apic_bug = 0; -+ if (is_initial_xendomain()) { -+ struct xen_platform_op op = { .cmd = XENPF_platform_quirk }; -+ op.u.platform_quirk.quirk_id = sis_apic_bug ? -+ QUIRK_IOAPIC_BAD_REGSEL : QUIRK_IOAPIC_GOOD_REGSEL; -+ HYPERVISOR_platform_op(&op); -+ } -+ return 0; -+} -+ -+late_initcall(io_apic_bug_finalize); -+ -+struct sysfs_ioapic_data { -+ struct sys_device dev; -+ struct IO_APIC_route_entry entry[0]; -+}; -+static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS]; -+ -+static int ioapic_suspend(struct sys_device *dev, pm_message_t state) -+{ -+ struct IO_APIC_route_entry *entry; -+ struct sysfs_ioapic_data *data; -+ int i; -+ -+ data = container_of(dev, struct sysfs_ioapic_data, dev); -+ entry = data->entry; -+ for (i = 0; i < nr_ioapic_registers[dev->id]; i ++) -+ entry[i] = ioapic_read_entry(dev->id, i); -+ -+ return 0; -+} -+ -+static int ioapic_resume(struct sys_device *dev) -+{ -+ struct IO_APIC_route_entry *entry; -+ struct sysfs_ioapic_data *data; -+ unsigned long flags; -+ union IO_APIC_reg_00 reg_00; -+ int i; -+ -+ data = container_of(dev, struct sysfs_ioapic_data, dev); -+ entry = data->entry; -+ -+ spin_lock_irqsave(&ioapic_lock, flags); -+ reg_00.raw = io_apic_read(dev->id, 0); -+ if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) { -+ reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid; -+ io_apic_write(dev->id, 0, reg_00.raw); -+ } -+ spin_unlock_irqrestore(&ioapic_lock, flags); -+ for (i = 0; i < nr_ioapic_registers[dev->id]; i ++) -+ ioapic_write_entry(dev->id, i, entry[i]); -+ -+ return 0; -+} -+ -+static struct sysdev_class ioapic_sysdev_class = { -+ set_kset_name("ioapic"), -+ .suspend = ioapic_suspend, -+ .resume = ioapic_resume, -+}; -+ -+static int __init ioapic_init_sysfs(void) -+{ -+ struct sys_device * dev; -+ int i, size, error = 0; -+ -+ error = sysdev_class_register(&ioapic_sysdev_class); -+ if (error) -+ return error; -+ -+ for (i = 0; i < nr_ioapics; i++ ) { -+ size = sizeof(struct sys_device) + nr_ioapic_registers[i] -+ * sizeof(struct IO_APIC_route_entry); -+ mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL); -+ if (!mp_ioapic_data[i]) { -+ printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); -+ continue; -+ } -+ memset(mp_ioapic_data[i], 0, size); -+ dev = &mp_ioapic_data[i]->dev; -+ dev->id = i; -+ dev->cls = &ioapic_sysdev_class; -+ error = sysdev_register(dev); -+ if (error) { -+ kfree(mp_ioapic_data[i]); -+ mp_ioapic_data[i] = NULL; -+ printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); 
-+ continue; -+ } -+ } -+ -+ return 0; -+} -+ -+device_initcall(ioapic_init_sysfs); -+ -+/* -+ * Dynamic irq allocate and deallocation -+ */ -+int create_irq(void) -+{ -+ /* Allocate an unused irq */ -+ int irq, new, vector = 0; -+ unsigned long flags; -+ -+ irq = -ENOSPC; -+ spin_lock_irqsave(&vector_lock, flags); -+ for (new = (NR_IRQS - 1); new >= 0; new--) { -+ if (platform_legacy_irq(new)) -+ continue; -+ if (irq_vector[new] != 0) -+ continue; -+ vector = __assign_irq_vector(new); -+ if (likely(vector > 0)) -+ irq = new; -+ break; -+ } -+ spin_unlock_irqrestore(&vector_lock, flags); -+ -+ if (irq >= 0) { -+#ifndef CONFIG_XEN -+ set_intr_gate(vector, interrupt[irq]); -+#endif -+ dynamic_irq_init(irq); -+ } -+ return irq; -+} -+ -+void destroy_irq(unsigned int irq) -+{ -+ unsigned long flags; -+ -+ dynamic_irq_cleanup(irq); -+ -+ spin_lock_irqsave(&vector_lock, flags); -+ irq_vector[irq] = 0; -+ spin_unlock_irqrestore(&vector_lock, flags); -+} -+ -+/* -+ * MSI mesage composition -+ */ -+#ifdef CONFIG_PCI_MSI -+static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) -+{ -+ int vector; -+ unsigned dest; -+ -+ vector = assign_irq_vector(irq); -+ if (vector >= 0) { -+ dest = cpu_mask_to_apicid(TARGET_CPUS); -+ -+ msg->address_hi = MSI_ADDR_BASE_HI; -+ msg->address_lo = -+ MSI_ADDR_BASE_LO | -+ ((INT_DEST_MODE == 0) ? -+ MSI_ADDR_DEST_MODE_PHYSICAL: -+ MSI_ADDR_DEST_MODE_LOGICAL) | -+ ((INT_DELIVERY_MODE != dest_LowestPrio) ? -+ MSI_ADDR_REDIRECTION_CPU: -+ MSI_ADDR_REDIRECTION_LOWPRI) | -+ MSI_ADDR_DEST_ID(dest); -+ -+ msg->data = -+ MSI_DATA_TRIGGER_EDGE | -+ MSI_DATA_LEVEL_ASSERT | -+ ((INT_DELIVERY_MODE != dest_LowestPrio) ? -+ MSI_DATA_DELIVERY_FIXED: -+ MSI_DATA_DELIVERY_LOWPRI) | -+ MSI_DATA_VECTOR(vector); -+ } -+ return vector; -+} -+ -+#ifdef CONFIG_SMP -+static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) -+{ -+ struct msi_msg msg; -+ unsigned int dest; -+ cpumask_t tmp; -+ int vector; -+ -+ cpus_and(tmp, mask, cpu_online_map); -+ if (cpus_empty(tmp)) -+ tmp = TARGET_CPUS; -+ -+ vector = assign_irq_vector(irq); -+ if (vector < 0) -+ return; -+ -+ dest = cpu_mask_to_apicid(mask); -+ -+ read_msi_msg(irq, &msg); -+ -+ msg.data &= ~MSI_DATA_VECTOR_MASK; -+ msg.data |= MSI_DATA_VECTOR(vector); -+ msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; -+ msg.address_lo |= MSI_ADDR_DEST_ID(dest); -+ -+ write_msi_msg(irq, &msg); -+ irq_desc[irq].affinity = mask; -+} -+#endif /* CONFIG_SMP */ -+ -+/* -+ * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, -+ * which implement the MSI or MSI-X Capability Structure. 
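/*
 * Standalone sketch (not part of the patch) of the MSI message layout
 * filled in by msi_compose_msg() above, following the x86 MSI
 * convention: destination APIC id in address bits 19..12 above the
 * 0xfee00000 base, vector in data bits 7..0, with edge trigger and
 * fixed delivery as chosen above.  dest and vector are invented.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	unsigned int dest = 0x01, vector = 0x39;

	uint32_t address_lo = 0xfee00000u	/* MSI_ADDR_BASE_LO */
			    | (dest << 12);	/* MSI_ADDR_DEST_ID(dest) */
	uint32_t data = (0u << 15)		/* MSI_DATA_TRIGGER_EDGE */
		      | (1u << 14)		/* MSI_DATA_LEVEL_ASSERT */
		      | (0u << 8)		/* MSI_DATA_DELIVERY_FIXED */
		      | vector;			/* MSI_DATA_VECTOR(vector) */

	printf("address_lo=%08x data=%08x\n",
	       (unsigned)address_lo, (unsigned)data);
	return 0;
}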
-+ */ -+static struct irq_chip msi_chip = { -+ .name = "PCI-MSI", -+ .unmask = unmask_msi_irq, -+ .mask = mask_msi_irq, -+ .ack = ack_ioapic_irq, -+#ifdef CONFIG_SMP -+ .set_affinity = set_msi_irq_affinity, -+#endif -+ .retrigger = ioapic_retrigger_irq, -+}; -+ -+int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) -+{ -+ struct msi_msg msg; -+ int irq, ret; -+ irq = create_irq(); -+ if (irq < 0) -+ return irq; -+ -+ set_irq_msi(irq, desc); -+ ret = msi_compose_msg(dev, irq, &msg); -+ if (ret < 0) { -+ destroy_irq(irq); -+ return ret; -+ } -+ -+ write_msi_msg(irq, &msg); -+ -+ set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, -+ "edge"); -+ -+ return irq; -+} -+ -+void arch_teardown_msi_irq(unsigned int irq) -+{ -+ destroy_irq(irq); -+} -+ -+#endif /* CONFIG_PCI_MSI */ -+ -+/* -+ * Hypertransport interrupt support -+ */ -+#ifdef CONFIG_HT_IRQ -+ -+#ifdef CONFIG_SMP -+ -+static void target_ht_irq(unsigned int irq, unsigned int dest) -+{ -+ struct ht_irq_msg msg; -+ fetch_ht_irq_msg(irq, &msg); -+ -+ msg.address_lo &= ~(HT_IRQ_LOW_DEST_ID_MASK); -+ msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); -+ -+ msg.address_lo |= HT_IRQ_LOW_DEST_ID(dest); -+ msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest); -+ -+ write_ht_irq_msg(irq, &msg); -+} -+ -+static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) -+{ -+ unsigned int dest; -+ cpumask_t tmp; -+ -+ cpus_and(tmp, mask, cpu_online_map); -+ if (cpus_empty(tmp)) -+ tmp = TARGET_CPUS; -+ -+ cpus_and(mask, tmp, CPU_MASK_ALL); -+ -+ dest = cpu_mask_to_apicid(mask); -+ -+ target_ht_irq(irq, dest); -+ irq_desc[irq].affinity = mask; -+} -+#endif -+ -+static struct irq_chip ht_irq_chip = { -+ .name = "PCI-HT", -+ .mask = mask_ht_irq, -+ .unmask = unmask_ht_irq, -+ .ack = ack_ioapic_irq, -+#ifdef CONFIG_SMP -+ .set_affinity = set_ht_irq_affinity, -+#endif -+ .retrigger = ioapic_retrigger_irq, -+}; -+ -+int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) -+{ -+ int vector; -+ -+ vector = assign_irq_vector(irq); -+ if (vector >= 0) { -+ struct ht_irq_msg msg; -+ unsigned dest; -+ cpumask_t tmp; -+ -+ cpus_clear(tmp); -+ cpu_set(vector >> 8, tmp); -+ dest = cpu_mask_to_apicid(tmp); -+ -+ msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); -+ -+ msg.address_lo = -+ HT_IRQ_LOW_BASE | -+ HT_IRQ_LOW_DEST_ID(dest) | -+ HT_IRQ_LOW_VECTOR(vector) | -+ ((INT_DEST_MODE == 0) ? -+ HT_IRQ_LOW_DM_PHYSICAL : -+ HT_IRQ_LOW_DM_LOGICAL) | -+ HT_IRQ_LOW_RQEOI_EDGE | -+ ((INT_DELIVERY_MODE != dest_LowestPrio) ? -+ HT_IRQ_LOW_MT_FIXED : -+ HT_IRQ_LOW_MT_ARBITRATED) | -+ HT_IRQ_LOW_IRQ_MASKED; -+ -+ write_ht_irq_msg(irq, &msg); -+ -+ set_irq_chip_and_handler_name(irq, &ht_irq_chip, -+ handle_edge_irq, "edge"); -+ } -+ return vector; -+} -+#endif /* CONFIG_HT_IRQ */ -+ -+/* -------------------------------------------------------------------------- -+ ACPI-based IOAPIC Configuration -+ -------------------------------------------------------------------------- */ -+ -+#ifdef CONFIG_ACPI -+ -+int __init io_apic_get_unique_id (int ioapic, int apic_id) -+{ -+#ifndef CONFIG_XEN -+ union IO_APIC_reg_00 reg_00; -+ static physid_mask_t apic_id_map = PHYSID_MASK_NONE; -+ physid_mask_t tmp; -+ unsigned long flags; -+ int i = 0; -+ -+ /* -+ * The P4 platform supports up to 256 APIC IDs on two separate APIC -+ * buses (one for LAPICs, one for IOAPICs), where predecessors only -+ * supports up to 16 on one shared APIC bus. -+ * -+ * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full -+ * advantage of new APIC bus architecture. 
-+ */ -+ -+ if (physids_empty(apic_id_map)) -+ apic_id_map = ioapic_phys_id_map(phys_cpu_present_map); -+ -+ spin_lock_irqsave(&ioapic_lock, flags); -+ reg_00.raw = io_apic_read(ioapic, 0); -+ spin_unlock_irqrestore(&ioapic_lock, flags); -+ -+ if (apic_id >= get_physical_broadcast()) { -+ printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " -+ "%d\n", ioapic, apic_id, reg_00.bits.ID); -+ apic_id = reg_00.bits.ID; -+ } -+ -+ /* -+ * Every APIC in a system must have a unique ID or we get lots of nice -+ * 'stuck on smp_invalidate_needed IPI wait' messages. -+ */ -+ if (check_apicid_used(apic_id_map, apic_id)) { -+ -+ for (i = 0; i < get_physical_broadcast(); i++) { -+ if (!check_apicid_used(apic_id_map, i)) -+ break; -+ } -+ -+ if (i == get_physical_broadcast()) -+ panic("Max apic_id exceeded!\n"); -+ -+ printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, " -+ "trying %d\n", ioapic, apic_id, i); -+ -+ apic_id = i; -+ } -+ -+ tmp = apicid_to_cpu_present(apic_id); -+ physids_or(apic_id_map, apic_id_map, tmp); -+ -+ if (reg_00.bits.ID != apic_id) { -+ reg_00.bits.ID = apic_id; -+ -+ spin_lock_irqsave(&ioapic_lock, flags); -+ io_apic_write(ioapic, 0, reg_00.raw); -+ reg_00.raw = io_apic_read(ioapic, 0); -+ spin_unlock_irqrestore(&ioapic_lock, flags); -+ -+ /* Sanity check */ -+ if (reg_00.bits.ID != apic_id) { -+ printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic); -+ return -1; -+ } -+ } -+ -+ apic_printk(APIC_VERBOSE, KERN_INFO -+ "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); -+#endif /* !CONFIG_XEN */ -+ -+ return apic_id; -+} -+ -+ -+int __init io_apic_get_version (int ioapic) -+{ -+ union IO_APIC_reg_01 reg_01; -+ unsigned long flags; -+ -+ spin_lock_irqsave(&ioapic_lock, flags); -+ reg_01.raw = io_apic_read(ioapic, 1); -+ spin_unlock_irqrestore(&ioapic_lock, flags); -+ -+ return reg_01.bits.version; -+} -+ -+ -+int __init io_apic_get_redir_entries (int ioapic) -+{ -+ union IO_APIC_reg_01 reg_01; -+ unsigned long flags; -+ -+ spin_lock_irqsave(&ioapic_lock, flags); -+ reg_01.raw = io_apic_read(ioapic, 1); -+ spin_unlock_irqrestore(&ioapic_lock, flags); -+ -+ return reg_01.bits.entries; -+} -+ -+ -+int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low) -+{ -+ struct IO_APIC_route_entry entry; -+ unsigned long flags; -+ -+ if (!IO_APIC_IRQ(irq)) { -+ printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", -+ ioapic); -+ return -EINVAL; -+ } -+ -+ /* -+ * Generate a PCI IRQ routing entry and program the IOAPIC accordingly. -+ * Note that we mask (disable) IRQs now -- these get enabled when the -+ * corresponding device driver registers for this IRQ. 
-+ */ -+ -+ memset(&entry,0,sizeof(entry)); -+ -+ entry.delivery_mode = INT_DELIVERY_MODE; -+ entry.dest_mode = INT_DEST_MODE; -+ entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); -+ entry.trigger = edge_level; -+ entry.polarity = active_high_low; -+ entry.mask = 1; -+ -+ /* -+ * IRQs < 16 are already in the irq_2_pin[] map -+ */ -+ if (irq >= 16) -+ add_pin_to_irq(irq, ioapic, pin); -+ -+ entry.vector = assign_irq_vector(irq); -+ -+ apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry " -+ "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic, -+ mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq, -+ edge_level, active_high_low); -+ -+ ioapic_register_intr(irq, entry.vector, edge_level); -+ -+ if (!ioapic && (irq < 16)) -+ disable_8259A_irq(irq); -+ -+ spin_lock_irqsave(&ioapic_lock, flags); -+ __ioapic_write_entry(ioapic, pin, entry); -+ spin_unlock_irqrestore(&ioapic_lock, flags); -+ -+ return 0; -+} -+ -+#endif /* CONFIG_ACPI */ -+ -+static int __init parse_disable_timer_pin_1(char *arg) -+{ -+ disable_timer_pin_1 = 1; -+ return 0; -+} -+early_param("disable_timer_pin_1", parse_disable_timer_pin_1); -+ -+static int __init parse_enable_timer_pin_1(char *arg) -+{ -+ disable_timer_pin_1 = -1; -+ return 0; -+} -+early_param("enable_timer_pin_1", parse_enable_timer_pin_1); -+ -+static int __init parse_noapic(char *arg) -+{ -+ /* disable IO-APIC */ -+ disable_ioapic_setup(); -+ return 0; -+} -+early_param("noapic", parse_noapic); -diff -r 4edbf98e9507 -r 3990a07432f0 arch/i386/kernel/ioport-xen.c ---- /dev/null Thu Jan 01 00:00:00 1970 +0000 -+++ b/arch/i386/kernel/ioport-xen.c Fri Jul 20 11:56:41 2007 -0300 -@@ -0,0 +1,122 @@ -+/* -+ * linux/arch/i386/kernel/ioport.c -+ * -+ * This contains the io-permission bitmap code - written by obz, with changes -+ * by Linus. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ -+static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value) -+{ -+ unsigned long mask; -+ unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG); -+ unsigned int low_index = base & (BITS_PER_LONG-1); -+ int length = low_index + extent; -+ -+ if (low_index != 0) { -+ mask = (~0UL << low_index); -+ if (length < BITS_PER_LONG) -+ mask &= ~(~0UL << length); -+ if (new_value) -+ *bitmap_base++ |= mask; -+ else -+ *bitmap_base++ &= ~mask; -+ length -= BITS_PER_LONG; -+ } -+ -+ mask = (new_value ? ~0UL : 0UL); -+ while (length >= BITS_PER_LONG) { -+ *bitmap_base++ = mask; -+ length -= BITS_PER_LONG; -+ } -+ -+ if (length > 0) { -+ mask = ~(~0UL << length); -+ if (new_value) -+ *bitmap_base++ |= mask; -+ else -+ *bitmap_base++ &= ~mask; -+ } -+} -+ -+ -+/* -+ * this changes the io permissions bitmap in the current task. -+ */ -+asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) -+{ -+ struct thread_struct * t = &current->thread; -+ unsigned long *bitmap; -+ struct physdev_set_iobitmap set_iobitmap; -+ -+ if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) -+ return -EINVAL; -+ if (turn_on && !capable(CAP_SYS_RAWIO)) -+ return -EPERM; -+ -+ /* -+ * If it's the first ioperm() call in this thread's lifetime, set the -+ * IO bitmap up. ioperm() is much less timing critical than clone(), -+ * this is why we delay this operation until now: -+ */ -+ if (!t->io_bitmap_ptr) { -+ bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); -+ if (!bitmap) -+ return -ENOMEM; -+ -+ memset(bitmap, 0xff, IO_BITMAP_BYTES); -+ t->io_bitmap_ptr = bitmap; -+ set_thread_flag(TIF_IO_BITMAP); -+ -+ set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap); -+ set_iobitmap.nr_ports = IO_BITMAP_BITS; -+ HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, &set_iobitmap); -+ } -+ -+ set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); -+ -+ return 0; -+} -+ -+/* -+ * sys_iopl has to be used when you want to access the IO ports -+ * beyond the 0x3ff range: to get the full 65536 ports bitmapped -+ * you'd need 8kB of bitmaps/process, which is a bit excessive. -+ * -+ * Here we just change the eflags value on the stack: we allow -+ * only the super-user to do it. This depends on the stack-layout -+ * on system-call entry - see also fork() and the signal handling -+ * code. -+ */ -+ -+asmlinkage long sys_iopl(unsigned long unused) -+{ -+ volatile struct pt_regs * regs = (struct pt_regs *) &unused; -+ unsigned int level = regs->ebx; -+ struct thread_struct *t = &current->thread; -+ unsigned int old = (t->iopl >> 12) & 3; -+ -+ if (level > 3) -+ return -EINVAL; -+ /* Trying to gain more privileges? */ -+ if (level > old) { -+ if (!capable(CAP_SYS_RAWIO)) -+ return -EPERM; -+ } -+ t->iopl = level << 12; -+ set_iopl_mask(t->iopl); -+ return 0; -+} -diff -r 4edbf98e9507 -r 3990a07432f0 arch/i386/kernel/irq.c ---- a/arch/i386/kernel/irq.c Fri Jul 20 11:42:41 2007 -0300 -+++ b/arch/i386/kernel/irq.c Fri Jul 20 11:56:41 2007 -0300 -@@ -302,7 +302,9 @@ skip: - } - - #ifdef CONFIG_HOTPLUG_CPU -+#ifndef CONFIG_XEN - #include -+#endif - - void fixup_irqs(cpumask_t map) - { -@@ -316,7 +318,9 @@ void fixup_irqs(cpumask_t map) - - cpus_and(mask, irq_desc[irq].affinity, map); - if (any_online_cpu(mask) == NR_CPUS) { -+#ifndef CONFIG_XEN - printk("Breaking affinity for irq %i\n", irq); -+#endif - mask = map; - } - if (irq_desc[irq].chip->set_affinity) -diff -r 4edbf98e9507 -r 3990a07432f0 arch/i386/kernel/ldt-xen.c ---- /dev/null Thu Jan 01 00:00:00 1970 +0000 -+++ b/arch/i386/kernel/ldt-xen.c Fri Jul 20 11:56:41 2007 -0300 -@@ -0,0 +1,268 @@ -+/* -+ * linux/arch/i386/kernel/ldt.c -+ * -+ * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds -+ * Copyright (C) 1999 Ingo Molnar -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+#include -+#include -+#include -+ -+#ifdef CONFIG_SMP /* avoids "defined but not used" warning */ -+static void flush_ldt(void *null) -+{ -+ if (current->active_mm) -+ load_LDT(&current->active_mm->context); -+} -+#endif -+ -+static int alloc_ldt(mm_context_t *pc, int mincount, int reload) -+{ -+ void *oldldt; -+ void *newldt; -+ int oldsize; -+ -+ if (mincount <= pc->size) -+ return 0; -+ oldsize = pc->size; -+ mincount = (mincount+511)&(~511); -+ if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) -+ newldt = vmalloc(mincount*LDT_ENTRY_SIZE); -+ else -+ newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); -+ -+ if (!newldt) -+ return -ENOMEM; -+ -+ if (oldsize) -+ memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE); -+ oldldt = pc->ldt; -+ memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE); -+ pc->ldt = newldt; -+ wmb(); -+ pc->size = mincount; -+ wmb(); -+ -+ if (reload) { -+#ifdef CONFIG_SMP -+ cpumask_t mask; -+ preempt_disable(); -+#endif -+ make_pages_readonly( -+ pc->ldt,
-+ (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE, -+ XENFEAT_writable_descriptor_tables); -+ load_LDT(pc); -+#ifdef CONFIG_SMP -+ mask = cpumask_of_cpu(smp_processor_id()); -+ if (!cpus_equal(current->mm->cpu_vm_mask, mask)) -+ smp_call_function(flush_ldt, NULL, 1, 1); -+ preempt_enable(); -+#endif -+ } -+ if (oldsize) { -+ make_pages_writable( -+ oldldt, -+ (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE, -+ XENFEAT_writable_descriptor_tables); -+ if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) -+ vfree(oldldt); -+ else -+ kfree(oldldt); -+ } -+ return 0; -+} -+ -+static inline int copy_ldt(mm_context_t *new, mm_context_t *old) -+{ -+ int err = alloc_ldt(new, old->size, 0); -+ if (err < 0) -+ return err; -+ memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); -+ make_pages_readonly( -+ new->ldt, -+ (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE, -+ XENFEAT_writable_descriptor_tables); -+ return 0; -+} -+ -+/* -+ * we do not have to muck with descriptors here, that is -+ * done in switch_mm() as needed. -+ */ -+int init_new_context(struct task_struct *tsk, struct mm_struct *mm) -+{ -+ struct mm_struct * old_mm; -+ int retval = 0; -+ -+ init_MUTEX(&mm->context.sem); -+ mm->context.size = 0; -+ mm->context.has_foreign_mappings = 0; -+ old_mm = current->mm; -+ if (old_mm && old_mm->context.size > 0) { -+ down(&old_mm->context.sem); -+ retval = copy_ldt(&mm->context, &old_mm->context); -+ up(&old_mm->context.sem); -+ } -+ return retval; -+} -+ -+/* -+ * No need to lock the MM as we are the last user -+ */ -+void destroy_context(struct mm_struct *mm) -+{ -+ if (mm->context.size) { -+ if (mm == current->active_mm) -+ clear_LDT(); -+ make_pages_writable( -+ mm->context.ldt, -+ (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE, -+ XENFEAT_writable_descriptor_tables); -+ if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE) -+ vfree(mm->context.ldt); -+ else -+ kfree(mm->context.ldt); -+ mm->context.size = 0; -+ } -+} -+ -+static int read_ldt(void __user * ptr, unsigned long bytecount) -+{ -+ int err; -+ unsigned long size; -+ struct mm_struct * mm = current->mm; -+ -+ if (!mm->context.size) -+ return 0; -+ if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES) -+ bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; -+ -+ down(&mm->context.sem); -+ size = mm->context.size*LDT_ENTRY_SIZE; -+ if (size > bytecount) -+ size = bytecount; -+ -+ err = 0; -+ if (copy_to_user(ptr, mm->context.ldt, size)) -+ err = -EFAULT; -+ up(&mm->context.sem); -+ if (err < 0) -+ goto error_return; -+ if (size != bytecount) { -+ /* zero-fill the rest */ -+ if (clear_user(ptr+size, bytecount-size) != 0) { -+ err = -EFAULT; -+ goto error_return; -+ } -+ } -+ return bytecount; -+error_return: -+ return err; -+} -+ -+static int read_default_ldt(void __user * ptr, unsigned long bytecount) -+{ -+ int err; -+ unsigned long size; -+ -+ err = 0; -+ size = 5*sizeof(struct desc_struct); -+ if (size > bytecount) -+ size = bytecount; -+ -+ err = size; -+ if (clear_user(ptr, size)) -+ err = -EFAULT; -+ -+ return err; -+} -+ -+static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) -+{ -+ struct mm_struct * mm = current->mm; -+ __u32 entry_1, entry_2; -+ int error; -+ struct user_desc ldt_info; -+ -+ error = -EINVAL; -+ if (bytecount != sizeof(ldt_info)) -+ goto out; -+ error = -EFAULT; -+ if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info))) -+ goto out; -+ -+ error = -EINVAL; -+ if (ldt_info.entry_number >= LDT_ENTRIES) -+ goto out; -+ if (ldt_info.contents == 3) { -+ if (oldmode) -+ goto out; -+ if (ldt_info.seg_not_present == 0) -+ goto out; -+ } -+ -+ 
down(&mm->context.sem); -+ if (ldt_info.entry_number >= mm->context.size) { -+ error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1); -+ if (error < 0) -+ goto out_unlock; -+ } -+ -+ /* Allow LDTs to be cleared by the user. */ -+ if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { -+ if (oldmode || LDT_empty(&ldt_info)) { -+ entry_1 = 0; -+ entry_2 = 0; -+ goto install; -+ } -+ } -+ -+ entry_1 = LDT_entry_a(&ldt_info); -+ entry_2 = LDT_entry_b(&ldt_info); -+ if (oldmode) -+ entry_2 &= ~(1 << 20); -+ -+ /* Install the new entry ... */ -+install: -+ error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number, -+ entry_1, entry_2); -+ -+out_unlock: -+ up(&mm->context.sem); -+out: -+ return error; -+} -+ -+asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) -+{ -+ int ret = -ENOSYS; -+ -+ switch (func) { -+ case 0: -+ ret = read_ldt(ptr, bytecount); -+ break; -+ case 1: -+ ret = write_ldt(ptr, bytecount, 1); -+ break; -+ case 2: -+ ret = read_default_ldt(ptr, bytecount); -+ break; -+ case 0x11: -+ ret = write_ldt(ptr, bytecount, 0); -+ break; -+ } -+ return ret; -+} -diff -r 4edbf98e9507 -r 3990a07432f0 arch/i386/kernel/machine_kexec.c ---- a/arch/i386/kernel/machine_kexec.c Fri Jul 20 11:42:41 2007 -0300 -+++ b/arch/i386/kernel/machine_kexec.c Fri Jul 20 11:56:41 2007 -0300 -@@ -20,6 +20,10 @@ - #include - #include - -+#ifdef CONFIG_XEN -+#include -+#endif -+ - #define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) - static u32 kexec_pgd[1024] PAGE_ALIGNED; - #ifdef CONFIG_X86_PAE -@@ -28,6 +32,40 @@ static u32 kexec_pmd1[1024] PAGE_ALIGNED - #endif - static u32 kexec_pte0[1024] PAGE_ALIGNED; - static u32 kexec_pte1[1024] PAGE_ALIGNED; -+ -+#ifdef CONFIG_XEN -+ -+#define __ma(x) (pfn_to_mfn(__pa((x)) >> PAGE_SHIFT) << PAGE_SHIFT) -+ -+#if PAGES_NR > KEXEC_XEN_NO_PAGES -+#error PAGES_NR is greater than KEXEC_XEN_NO_PAGES - Xen support will break -+#endif -+ -+#if PA_CONTROL_PAGE != 0 -+#error PA_CONTROL_PAGE is non zero - Xen support will break -+#endif -+ -+void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, struct kimage *image) -+{ -+ void *control_page; -+ -+ memset(xki->page_list, 0, sizeof(xki->page_list)); -+ -+ control_page = page_address(image->control_code_page); -+ memcpy(control_page, relocate_kernel, PAGE_SIZE); -+ -+ xki->page_list[PA_CONTROL_PAGE] = __ma(control_page); -+ xki->page_list[PA_PGD] = __ma(kexec_pgd); -+#ifdef CONFIG_X86_PAE -+ xki->page_list[PA_PMD_0] = __ma(kexec_pmd0); -+ xki->page_list[PA_PMD_1] = __ma(kexec_pmd1); -+#endif -+ xki->page_list[PA_PTE_0] = __ma(kexec_pte0); -+ xki->page_list[PA_PTE_1] = __ma(kexec_pte1); -+ -+} -+ -+#else /* CONFIG_XEN */ - - static void set_idt(void *newidt, __u16 limit) - { -@@ -70,6 +108,7 @@ static void load_segments(void) - #undef STR - #undef __STR - } -+#endif /* !CONFIG_XEN */ - - /* - * A architecture hook called to validate the -@@ -97,6 +136,7 @@ void machine_kexec_cleanup(struct kimage - { - } - -+#ifndef CONFIG_XEN - /* - * Do not allocate memory (or fail in any way) in machine_kexec(). - * We are past the point of no return, committed to rebooting now. -@@ -147,6 +187,7 @@ NORET_TYPE void machine_kexec(struct kim - relocate_kernel((unsigned long)image->head, (unsigned long)page_list, - image->start, cpu_has_pae); - } -+#endif - - /* crashkernel=size@addr specifies the location to reserve for - * a crash kernel.
By reserving this memory we guarantee -diff -r 4edbf98e9507 -r 3990a07432f0 arch/i386/kernel/microcode-xen.c ---- /dev/null Thu Jan 01 00:00:00 1970 +0000 -+++ b/arch/i386/kernel/microcode-xen.c Fri Jul 20 11:56:41 2007 -0300 -@@ -0,0 +1,144 @@ -+/* -+ * Intel CPU Microcode Update Driver for Linux -+ * -+ * Copyright (C) 2000-2004 Tigran Aivazian -+ * -+ * This driver allows to upgrade microcode on Intel processors -+ * belonging to IA-32 family - PentiumPro, Pentium II, -+ * Pentium III, Xeon, Pentium 4, etc. -+ * -+ * Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual, -+ * Order Number 245472 or free download from: -+ * -+ * http://developer.intel.com/design/pentium4/manuals/245472.htm -+ * -+ * For more information, go to http://www.urbanmyth.org/microcode -+ * -+ * This program is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU General Public License -+ * as published by the Free Software Foundation; either version -+ * 2 of the License, or (at your option) any later version. -+ */ -+ -+//#define DEBUG /* pr_debug */ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+#include -+ -+MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver"); -+MODULE_AUTHOR("Tigran Aivazian "); -+MODULE_LICENSE("GPL"); -+ -+static int verbose; -+module_param(verbose, int, 0644); -+ -+#define MICROCODE_VERSION "1.14a-xen" -+ -+#define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */ -+#define MC_HEADER_SIZE (sizeof (microcode_header_t)) /* 48 bytes */ -+#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */ -+ -+/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ -+static DEFINE_MUTEX(microcode_mutex); -+ -+static int microcode_open (struct inode *unused1, struct file *unused2) -+{ -+ return capable(CAP_SYS_RAWIO) ? 
0 : -EPERM; -+} -+ -+ -+static int do_microcode_update (const void __user *ubuf, size_t len) -+{ -+ int err; -+ void *kbuf; -+ -+ kbuf = vmalloc(len); -+ if (!kbuf) -+ return -ENOMEM; -+ -+ if (copy_from_user(kbuf, ubuf, len) == 0) { -+ struct xen_platform_op op; -+ -+ op.cmd = XENPF_microcode_update; -+ set_xen_guest_handle(op.u.microcode.data, kbuf); -+ op.u.microcode.length = len; -+ err = HYPERVISOR_platform_op(&op); -+ } else -+ err = -EFAULT; -+ -+ vfree(kbuf); -+ -+ return err; -+} -+ -+static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos) -+{ -+ ssize_t ret; -+ -+ if (len < MC_HEADER_SIZE) { -+ printk(KERN_ERR "microcode: not enough data\n"); -+ return -EINVAL; -+ } -+ -+ mutex_lock(&microcode_mutex); -+ -+ ret = do_microcode_update(buf, len); -+ if (!ret) -+ ret = (ssize_t)len; -+ -+ mutex_unlock(&microcode_mutex); -+ -+ return ret; -+} -+ -+static struct file_operations microcode_fops = { -+ .owner = THIS_MODULE, -+ .write = microcode_write, -+ .open = microcode_open, -+}; -+ -+static struct miscdevice microcode_dev = { -+ .minor = MICROCODE_MINOR, -+ .name = "microcode", -+ .fops = &microcode_fops, -+}; -+ -+static int __init microcode_init (void) -+{ -+ int error; -+ -+ error = misc_register(&microcode_dev); -+ if (error) { -+ printk(KERN_ERR -+ "microcode: can't misc_register on minor=%d\n", -+ MICROCODE_MINOR); -+ return error; -+ } -+ -+ printk(KERN_INFO -+ "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " \n"); -+ return 0; -+} -+ -+static void __exit microcode_exit (void) -+{ -+ misc_deregister(&microcode_dev); -+} -+ -+module_init(microcode_init) -+module_exit(microcode_exit) -+MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); -diff -r 4edbf98e9507 -r 3990a07432f0 arch/i386/kernel/mpparse.c ---- a/arch/i386/kernel/mpparse.c Fri Jul 20 11:42:41 2007 -0300 -+++ b/arch/i386/kernel/mpparse.c Fri Jul 20 11:56:41 2007 -0300 -@@ -106,6 +106,7 @@ static struct mpc_config_translation *tr - - static void __cpuinit MP_processor_info (struct mpc_config_processor *m) - { -+#ifndef CONFIG_XEN - int ver, apicid; - physid_mask_t phys_cpu; - -@@ -196,8 +197,9 @@ static void __cpuinit MP_processor_info - } - - cpu_set(num_processors, cpu_possible_map); -+#endif /* CONFIG_XEN */ - num_processors++; -- -+#ifndef CONFIG_XEN - /* - * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y - * but we need to work other dependencies like SMP_SUSPEND etc -@@ -218,6 +220,7 @@ static void __cpuinit MP_processor_info - } - } - bios_cpu_apicid[num_processors - 1] = m->mpc_apicid; -+#endif /* CONFIG_XEN */ - } - - static void __init MP_bus_info (struct mpc_config_bus *m) -@@ -684,7 +687,11 @@ void __init get_smp_config (void) - * Read the physical hardware table. Anything here will - * override the defaults. - */ -+#ifdef CONFIG_XEN -+ if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) { -+#else - if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr))) { -+#endif - smp_found_config = 0; - printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"); - printk(KERN_ERR "... disabling SMP support.
(tell your hw vendor)\n"); -@@ -719,7 +726,11 @@ void __init get_smp_config (void) - - static int __init smp_scan_config (unsigned long base, unsigned long length) - { -+#ifdef CONFIG_XEN -+ unsigned long *bp = isa_bus_to_virt(base); -+#else - unsigned long *bp = phys_to_virt(base); -+#endif - struct intel_mp_floating *mpf; - - Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length); -@@ -735,6 +746,7 @@ static int __init smp_scan_config (unsig - || (mpf->mpf_specification == 4)) ) { - - smp_found_config = 1; -+#ifndef CONFIG_XEN - printk(KERN_INFO "found SMP MP-table at %08lx\n", - virt_to_phys(mpf)); - reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE); -@@ -754,6 +766,10 @@ static int __init smp_scan_config (unsig - size = end - mpf->mpf_physptr; - reserve_bootmem(mpf->mpf_physptr, size); - } -+#else -+ printk(KERN_INFO "found SMP MP-table at %08lx\n", -+ ((unsigned long)bp - (unsigned long)isa_bus_to_virt(base)) + base); -+#endif - - mpf_found = mpf; - return 1; -@@ -766,7 +782,9 @@ static int __init smp_scan_config (unsig - - void __init find_smp_config (void) - { -+#ifndef CONFIG_XEN - unsigned int address; -+#endif - - /* - * FIXME: Linux assumes you have 640K of base ram.. -@@ -797,9 +815,11 @@ void __init find_smp_config (void) - * MP1.4 SPEC states to only scan first 1K of 4K EBDA. - */ - -+#ifndef CONFIG_XEN - address = get_bios_ebda(); - if (address) - smp_scan_config(address, 0x400); -+#endif - } - - int es7000_plat; -@@ -812,6 +832,7 @@ int es7000_plat; - - void __init mp_register_lapic_address(u64 address) - { -+#ifndef CONFIG_XEN - mp_lapic_addr = (unsigned long) address; - - set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); -@@ -820,6 +841,7 @@ void __init mp_register_lapic_address(u6 - boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID)); - - Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid); -+#endif - } - - void __cpuinit mp_register_lapic (u8 id, u8 enabled) -@@ -836,6 +858,7 @@ void __cpuinit mp_register_lapic (u8 id, - if (id == boot_cpu_physical_apicid) - boot_cpu = 1; - -+#ifndef CONFIG_XEN - processor.mpc_type = MP_PROCESSOR; - processor.mpc_apicid = id; - processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR)); -@@ -846,6 +869,7 @@ void __cpuinit mp_register_lapic (u8 id, - processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; - processor.mpc_reserved[0] = 0; - processor.mpc_reserved[1] = 0; -+#endif - - MP_processor_info(&processor); - } -@@ -900,7 +924,9 @@ void __init mp_register_ioapic(u8 id, u3 - mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE; - mp_ioapics[idx].mpc_apicaddr = address; - -+#ifndef CONFIG_XEN - set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); -+#endif - if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - && !APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) - tmpid = io_apic_get_unique_id(idx, id); -diff -r 4edbf98e9507 -r 3990a07432f0 arch/i386/kernel/pci-dma-xen.c ---- /dev/null Thu Jan 01 00:00:00 1970 +0000 -+++ b/arch/i386/kernel/pci-dma-xen.c Fri Jul 20 11:56:41 2007 -0300 -@@ -0,0 +1,395 @@ -+/* -+ * Dynamic DMA mapping support. -+ * -+ * On i386 there is no hardware dynamic DMA address translation, -+ * so consistent alloc/free are merely page allocation/freeing. -+ * The rest of the dynamic DMA mapping interface is implemented -+ * in asm/pci.h. 
-+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#ifdef __x86_64__ -+#include -+#include -+ -+int iommu_merge __read_mostly = 0; -+EXPORT_SYMBOL(iommu_merge); -+ -+dma_addr_t bad_dma_address __read_mostly; -+EXPORT_SYMBOL(bad_dma_address); -+ -+/* This tells the BIO block layer to assume merging. Default to off -+ because we cannot guarantee merging later. */ -+int iommu_bio_merge __read_mostly = 0; -+EXPORT_SYMBOL(iommu_bio_merge); -+ -+int iommu_sac_force __read_mostly = 0; -+EXPORT_SYMBOL(iommu_sac_force); -+ -+int no_iommu __read_mostly; -+#ifdef CONFIG_IOMMU_DEBUG -+int panic_on_overflow __read_mostly = 1; -+int force_iommu __read_mostly = 1; -+#else -+int panic_on_overflow __read_mostly = 0; -+int force_iommu __read_mostly= 0; -+#endif -+ -+/* Set this to 1 if there is a HW IOMMU in the system */ -+int iommu_detected __read_mostly = 0; -+ -+void __init pci_iommu_alloc(void) -+{ -+ /* -+ * The order of these functions is important for -+ * fall-back/fail-over reasons -+ */ -+#ifdef CONFIG_IOMMU -+ iommu_hole_init(); -+#endif -+ -+#ifdef CONFIG_CALGARY_IOMMU -+#include -+ /* shut up compiler */ -+ use_calgary = use_calgary; -+ detect_calgary(); -+#endif -+ -+#ifdef CONFIG_SWIOTLB -+ pci_swiotlb_init(); -+#endif -+} -+ -+static int __init pci_iommu_init(void) -+{ -+#ifdef CONFIG_CALGARY_IOMMU -+ calgary_iommu_init(); -+#endif -+ -+#ifdef CONFIG_IOMMU -+ gart_iommu_init(); -+#endif -+ -+ no_iommu_init(); -+ return 0; -+} -+ -+/* Must execute after PCI subsystem */ -+fs_initcall(pci_iommu_init); -+#endif -+ -+struct dma_coherent_mem { -+ void *virt_base; -+ u32 device_base; -+ int size; -+ int flags; -+ unsigned long *bitmap; -+}; -+ -+#define IOMMU_BUG_ON(test) \ -+do { \ -+ if (unlikely(test)) { \ -+ printk(KERN_ALERT "Fatal DMA error! 
" \ -+ "Please use 'swiotlb=force'\n"); \ -+ BUG(); \ -+ } \ -+} while (0) -+ -+int -+dma_map_sg(struct device *hwdev, struct scatterlist *sg, int nents, -+ enum dma_data_direction direction) -+{ -+ int i, rc; -+ -+ BUG_ON(!valid_dma_direction(direction)); -+ WARN_ON(nents == 0 || sg[0].length == 0); -+ -+ if (swiotlb) { -+ rc = swiotlb_map_sg(hwdev, sg, nents, direction); -+ } else { -+ for (i = 0; i < nents; i++ ) { -+ sg[i].dma_address = -+ page_to_bus(sg[i].page) + sg[i].offset; -+ sg[i].dma_length = sg[i].length; -+ BUG_ON(!sg[i].page); -+ IOMMU_BUG_ON(address_needs_mapping( -+ hwdev, sg[i].dma_address)); -+ } -+ rc = nents; -+ } -+ -+ flush_write_buffers(); -+ return rc; -+} -+EXPORT_SYMBOL(dma_map_sg); -+ -+void -+dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents, -+ enum dma_data_direction direction) -+{ -+ BUG_ON(!valid_dma_direction(direction)); -+ if (swiotlb) -+ swiotlb_unmap_sg(hwdev, sg, nents, direction); -+} -+EXPORT_SYMBOL(dma_unmap_sg); -+ -+#ifdef CONFIG_HIGHMEM -+dma_addr_t -+dma_map_page(struct device *dev, struct page *page, unsigned long offset, -+ size_t size, enum dma_data_direction direction) -+{ -+ dma_addr_t dma_addr; -+ -+ BUG_ON(!valid_dma_direction(direction)); -+ -+ if (swiotlb) { -+ dma_addr = swiotlb_map_page( -+ dev, page, offset, size, direction); -+ } else { -+ dma_addr = page_to_bus(page) + offset; -+ IOMMU_BUG_ON(address_needs_mapping(dev, dma_addr)); -+ } -+ -+ return dma_addr; -+} -+EXPORT_SYMBOL(dma_map_page); -+ -+void -+dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size, -+ enum dma_data_direction direction) -+{ -+ BUG_ON(!valid_dma_direction(direction)); -+ if (swiotlb) -+ swiotlb_unmap_page(dev, dma_address, size, direction); -+} -+EXPORT_SYMBOL(dma_unmap_page); -+#endif /* CONFIG_HIGHMEM */ -+ -+int -+dma_mapping_error(dma_addr_t dma_addr) -+{ -+ if (swiotlb) -+ return swiotlb_dma_mapping_error(dma_addr); -+ return 0; -+} -+EXPORT_SYMBOL(dma_mapping_error); -+ -+int -+dma_supported(struct device *dev, u64 mask) -+{ -+ if (swiotlb) -+ return swiotlb_dma_supported(dev, mask); -+ /* -+ * By default we'll BUG when an infeasible DMA is requested, and -+ * request swiotlb=force (see IOMMU_BUG_ON). -+ */ -+ return 1; -+} -+EXPORT_SYMBOL(dma_supported); -+ -+void *dma_alloc_coherent(struct device *dev, size_t size, -+ dma_addr_t *dma_handle, gfp_t gfp) -+{ -+ void *ret; -+ struct dma_coherent_mem *mem = dev ? 
dev->dma_mem : NULL; -+ unsigned int order = get_order(size); -+ unsigned long vstart; -+ u64 mask; -+ -+ /* ignore region specifiers */ -+ gfp &= ~(__GFP_DMA | __GFP_HIGHMEM); -+ -+ if (mem) { -+ int page = bitmap_find_free_region(mem->bitmap, mem->size, -+ order); -+ if (page >= 0) { -+ *dma_handle = mem->device_base + (page << PAGE_SHIFT); -+ ret = mem->virt_base + (page << PAGE_SHIFT); -+ memset(ret, 0, size); -+ return ret; -+ } -+ if (mem->flags & DMA_MEMORY_EXCLUSIVE) -+ return NULL; -+ } -+ -+ if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff)) -+ gfp |= GFP_DMA; -+ -+ vstart = __get_free_pages(gfp, order); -+ ret = (void *)vstart; -+ -+ if (dev != NULL && dev->coherent_dma_mask) -+ mask = dev->coherent_dma_mask; -+ else -+ mask = 0xffffffff; -+ -+ if (ret != NULL) { -+ if (xen_create_contiguous_region(vstart, order, -+ fls64(mask)) != 0) { -+ free_pages(vstart, order); -+ return NULL; -+ } -+ memset(ret, 0, size); -+ *dma_handle = virt_to_bus(ret); -+ } -+ return ret; -+} -+EXPORT_SYMBOL(dma_alloc_coherent); -+ -+void dma_free_coherent(struct device *dev, size_t size, -+ void *vaddr, dma_addr_t dma_handle) -+{ -+ struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; -+ int order = get_order(size); -+ -+ if (mem && vaddr >= mem->virt_base && vaddr < (mem->virt_base + (mem->size << PAGE_SHIFT))) { -+ int page = (vaddr - mem->virt_base) >> PAGE_SHIFT; -+ -+ bitmap_release_region(mem->bitmap, page, order); -+ } else { -+ xen_destroy_contiguous_region((unsigned long)vaddr, order); -+ free_pages((unsigned long)vaddr, order); -+ } -+} -+EXPORT_SYMBOL(dma_free_coherent); -+ -+#ifdef ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY -+int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, -+ dma_addr_t device_addr, size_t size, int flags) -+{ -+ void __iomem *mem_base = NULL; -+ int pages = size >> PAGE_SHIFT; -+ int bitmap_size = (pages + 31)/32; -+ -+ if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0) -+ goto out; -+ if (!size) -+ goto out; -+ if (dev->dma_mem) -+ goto out; -+ -+ /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */ -+ -+ mem_base = ioremap(bus_addr, size); -+ if (!mem_base) -+ goto out; -+ -+ dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); -+ if (!dev->dma_mem) -+ goto out; -+ dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL); -+ if (!dev->dma_mem->bitmap) -+ goto free1_out; -+ -+ dev->dma_mem->virt_base = mem_base; -+ dev->dma_mem->device_base = device_addr; -+ dev->dma_mem->size = pages; -+ dev->dma_mem->flags = flags; -+ -+ if (flags & DMA_MEMORY_MAP) -+ return DMA_MEMORY_MAP; -+ -+ return DMA_MEMORY_IO; -+ -+ free1_out: -+ kfree(dev->dma_mem); -+ out: -+ if (mem_base) -+ iounmap(mem_base); -+ return 0; -+} -+EXPORT_SYMBOL(dma_declare_coherent_memory); -+ -+void dma_release_declared_memory(struct device *dev) -+{ -+ struct dma_coherent_mem *mem = dev->dma_mem; -+ -+ if(!mem) -+ return; -+ dev->dma_mem = NULL; -+ iounmap(mem->virt_base); -+ kfree(mem->bitmap); -+ kfree(mem); -+} -+EXPORT_SYMBOL(dma_release_declared_memory); -+ -+void *dma_mark_declared_memory_occupied(struct device *dev, -+ dma_addr_t device_addr, size_t size) -+{ -+ struct dma_coherent_mem *mem = dev->dma_mem; -+ int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT; -+ int pos, err; -+ -+ if (!mem) -+ return ERR_PTR(-EINVAL); -+ -+ pos = (device_addr - mem->device_base) >> PAGE_SHIFT; -+ err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages)); -+ if (err != 0) -+ return ERR_PTR(err); -+ return mem->virt_base + 
(pos << PAGE_SHIFT); -+} -+EXPORT_SYMBOL(dma_mark_declared_memory_occupied); -+#endif /* ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY */ -+ -+dma_addr_t -+dma_map_single(struct device *dev, void *ptr, size_t size, -+ enum dma_data_direction direction) -+{ -+ dma_addr_t dma; -+ -+ BUG_ON(!valid_dma_direction(direction)); -+ WARN_ON(size == 0); -+ -+ if (swiotlb) { -+ dma = swiotlb_map_single(dev, ptr, size, direction); -+ } else { -+ dma = virt_to_bus(ptr); -+ IOMMU_BUG_ON(range_straddles_page_boundary(ptr, size)); -+ IOMMU_BUG_ON(address_needs_mapping(dev, dma)); -+ } -+ -+ flush_write_buffers(); -+ return dma; -+} -+EXPORT_SYMBOL(dma_map_single); -+ -+void -+dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, -+ enum dma_data_direction direction) -+{ -+ BUG_ON(!valid_dma_direction(direction)); -+ if (swiotlb) -+ swiotlb_unmap_single(dev, dma_addr, size, direction); -+} -+EXPORT_SYMBOL(dma_unmap_single); -+ -+void -+dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, -+ enum dma_data_direction direction) -+{ -+ if (swiotlb) -+ swiotlb_sync_single_for_cpu(dev, dma_handle, size, direction); -+} -+EXPORT_SYMBOL(dma_sync_single_for_cpu); -+ -+void -+dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size, -+ enum dma_data_direction direction) -+{ -+ if (swiotlb) -+ swiotlb_sync_single_for_device(dev, dma_handle, size, direction); -+} -+EXPORT_SYMBOL(dma_sync_single_for_device); -diff -r 4edbf98e9507 -r 3990a07432f0 arch/i386/kernel/process-xen.c ---- /dev/null Thu Jan 01 00:00:00 1970 +0000 -+++ b/arch/i386/kernel/process-xen.c Fri Jul 20 11:56:41 2007 -0300 -@@ -0,0 +1,905 @@ -+/* -+ * linux/arch/i386/kernel/process.c -+ * -+ * Copyright (C) 1995 Linus Torvalds -+ * -+ * Pentium III FXSR, SSE support -+ * Gareth Hughes , May 2000 -+ */ -+ -+/* -+ * This file handles the architecture-dependent parts of process handling.. -+ */ -+ -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#ifdef CONFIG_MATH_EMULATION -+#include -+#endif -+ -+#include -+#include -+#include -+ -+#include -+ -+#include -+#include -+#include -+ -+asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); -+ -+static int hlt_counter; -+ -+unsigned long boot_option_idle_override = 0; -+EXPORT_SYMBOL(boot_option_idle_override); -+ -+/* -+ * Return saved PC of a blocked thread. -+ */ -+unsigned long thread_saved_pc(struct task_struct *tsk) -+{ -+ return ((unsigned long *)tsk->thread.esp)[3]; -+} -+ -+/* -+ * Powermanagement idle function, if any.. -+ */ -+void (*pm_idle)(void); -+EXPORT_SYMBOL(pm_idle); -+static DEFINE_PER_CPU(unsigned int, cpu_idle_state); -+ -+void disable_hlt(void) -+{ -+ hlt_counter++; -+} -+ -+EXPORT_SYMBOL(disable_hlt); -+ -+void enable_hlt(void) -+{ -+ hlt_counter--; -+} -+ -+EXPORT_SYMBOL(enable_hlt); -+ -+/* -+ * On SMP it's slightly faster (but much more power-consuming!) -+ * to poll the ->work.need_resched flag instead of waiting for the -+ * cross-CPU IPI to arrive. Use this option with caution. 
-+ */ -+static void poll_idle (void) -+{ -+ local_irq_enable(); -+ -+ asm volatile( -+ "2:" -+ "testl %0, %1;" -+ "rep; nop;" -+ "je 2b;" -+ : : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags)); -+} -+ -+static void xen_idle(void) -+{ -+ current_thread_info()->status &= ~TS_POLLING; -+ /* -+ * TS_POLLING-cleared state must be visible before we -+ * test NEED_RESCHED: -+ */ -+ smp_mb(); -+ -+ local_irq_disable(); -+ if (!need_resched()) -+ safe_halt(); /* enables interrupts racelessly */ -+ else -+ local_irq_enable(); -+ current_thread_info()->status |= TS_POLLING; -+} -+#ifdef CONFIG_APM_MODULE -+EXPORT_SYMBOL(default_idle); -+#endif -+ -+#ifdef CONFIG_HOTPLUG_CPU -+extern cpumask_t cpu_initialized; -+static inline void play_dead(void) -+{ -+ idle_task_exit(); -+ local_irq_disable(); -+ cpu_clear(smp_processor_id(), cpu_initialized); -+ preempt_enable_no_resched(); -+ HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); -+ cpu_bringup(); -+} -+#else -+static inline void play_dead(void) -+{ -+ BUG(); -+} -+#endif /* CONFIG_HOTPLUG_CPU */ -+ -+/* -+ * The idle thread. There's no useful work to be -+ * done, so just try to conserve power and have a -+ * low exit latency (ie sit in a loop waiting for -+ * somebody to say that they'd like to reschedule) -+ */ -+void cpu_idle(void) -+{ -+ int cpu = smp_processor_id(); -+ -+ current_thread_info()->status |= TS_POLLING; -+ -+ /* endless idle loop with no priority at all */ -+ while (1) { -+ tick_nohz_stop_sched_tick(); -+ while (!need_resched()) { -+ void (*idle)(void); -+ -+ if (__get_cpu_var(cpu_idle_state)) -+ __get_cpu_var(cpu_idle_state) = 0; -+ -+ rmb(); -+ idle = xen_idle; /* no alternatives */ -+ -+ if (cpu_is_offline(cpu)) -+ play_dead(); -+ -+ __get_cpu_var(irq_stat).idle_timestamp = jiffies; -+ idle(); -+ } -+ tick_nohz_restart_sched_tick(); -+ preempt_enable_no_resched(); -+ schedule(); -+ preempt_disable(); -+ } -+} -+ -+void cpu_idle_wait(void) -+{ -+ unsigned int cpu, this_cpu = get_cpu(); -+ cpumask_t map, tmp = current->cpus_allowed; -+ -+ set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); -+ put_cpu(); -+ -+ cpus_clear(map); -+ for_each_online_cpu(cpu) { -+ per_cpu(cpu_idle_state, cpu) = 1; -+ cpu_set(cpu, map); -+ } -+ -+ __get_cpu_var(cpu_idle_state) = 0; -+ -+ wmb(); -+ do { -+ ssleep(1); -+ for_each_online_cpu(cpu) { -+ if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu)) -+ cpu_clear(cpu, map); -+ } -+ cpus_and(map, map, cpu_online_map); -+ } while (!cpus_empty(map)); -+ -+ set_cpus_allowed(current, tmp); -+} -+EXPORT_SYMBOL_GPL(cpu_idle_wait); -+ -+/* XXX XEN doesn't use mwait_idle(), select_idle_routine(), idle_setup(). */ -+/* Always use xen_idle() instead. 
*/ -+void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) {} -+ -+void __devinit select_idle_routine(const struct cpuinfo_x86 *c) -+{ -+} -+ -+static int __init idle_setup (char *str) -+{ -+ if (!strncmp(str, "poll", 4)) { -+ printk("using polling idle threads.\n"); -+ pm_idle = poll_idle; -+ } -+ -+ boot_option_idle_override = 1; -+ return 1; -+} -+ -+__setup("idle=", idle_setup); -+ -+void show_regs(struct pt_regs * regs) -+{ -+ unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; -+ -+ printk("\n"); -+ printk("Pid: %d, comm: %20s\n", current->pid, current->comm); -+ printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id()); -+ print_symbol("EIP is at %s\n", regs->eip); -+ -+ if (user_mode_vm(regs)) -+ printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp); -+ printk(" EFLAGS: %08lx %s (%s %.*s)\n", -+ regs->eflags, print_tainted(), init_utsname()->release, -+ (int)strcspn(init_utsname()->version, " "), -+ init_utsname()->version); -+ printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", -+ regs->eax,regs->ebx,regs->ecx,regs->edx); -+ printk("ESI: %08lx EDI: %08lx EBP: %08lx", -+ regs->esi, regs->edi, regs->ebp); -+ printk(" DS: %04x ES: %04x FS: %04x\n", -+ 0xffff & regs->xds,0xffff & regs->xes, 0xffff & regs->xfs); -+ -+ cr0 = read_cr0(); -+ cr2 = read_cr2(); -+ cr3 = read_cr3(); -+ cr4 = read_cr4_safe(); -+ printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); -+ show_trace(NULL, regs, &regs->esp); -+} -+ -+/* -+ * This gets run with %ebx containing the -+ * function to call, and %edx containing -+ * the "args". -+ */ -+extern void kernel_thread_helper(void); -+ -+/* -+ * Create a kernel thread -+ */ -+int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) -+{ -+ struct pt_regs regs; -+ -+ memset(&regs, 0, sizeof(regs)); -+ -+ regs.ebx = (unsigned long) fn; -+ regs.edx = (unsigned long) arg; -+ -+ regs.xds = __USER_DS; -+ regs.xes = __USER_DS; -+ regs.xfs = __KERNEL_PDA; -+ regs.orig_eax = -1; -+ regs.eip = (unsigned long) kernel_thread_helper; -+ regs.xcs = __KERNEL_CS | get_kernel_rpl(); -+ regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; -+ -+ /* Ok, create the new process.. */ -+ return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL); -+} -+EXPORT_SYMBOL(kernel_thread); -+ -+/* -+ * Free current thread data structures etc.. -+ */ -+void exit_thread(void) -+{ -+ /* The process may have allocated an io port bitmap... nuke it. */ -+ if (unlikely(test_thread_flag(TIF_IO_BITMAP))) { -+ struct task_struct *tsk = current; -+ struct thread_struct *t = &tsk->thread; -+ struct physdev_set_iobitmap set_iobitmap; -+ memset(&set_iobitmap, 0, sizeof(set_iobitmap)); -+ HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, &set_iobitmap); -+ kfree(t->io_bitmap_ptr); -+ t->io_bitmap_ptr = NULL; -+ clear_thread_flag(TIF_IO_BITMAP); -+ } -+} -+ -+void flush_thread(void) -+{ -+ struct task_struct *tsk = current; -+ -+ memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8); -+ memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); -+ clear_tsk_thread_flag(tsk, TIF_DEBUG); -+ /* -+ * Forget coprocessor state.. -+ */ -+ clear_fpu(tsk); -+ clear_used_math(); -+} -+ -+void release_thread(struct task_struct *dead_task) -+{ -+ BUG_ON(dead_task->mm); -+ release_vm86_irqs(dead_task); -+} -+ -+/* -+ * This gets called before we allocate a new thread and copy -+ * the current task into it.
-+ */ -+void prepare_to_copy(struct task_struct *tsk) -+{ -+ unlazy_fpu(tsk); -+} -+ -+int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, -+ unsigned long unused, -+ struct task_struct * p, struct pt_regs * regs) -+{ -+ struct pt_regs * childregs; -+ struct task_struct *tsk; -+ int err; -+ -+ childregs = task_pt_regs(p); -+ *childregs = *regs; -+ childregs->eax = 0; -+ childregs->esp = esp; -+ -+ p->thread.esp = (unsigned long) childregs; -+ p->thread.esp0 = (unsigned long) (childregs+1); -+ -+ p->thread.eip = (unsigned long) ret_from_fork; -+ -+ savesegment(gs,p->thread.gs); -+ -+ tsk = current; -+ if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { -+ p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr, -+ IO_BITMAP_BYTES, GFP_KERNEL); -+ if (!p->thread.io_bitmap_ptr) { -+ p->thread.io_bitmap_max = 0; -+ return -ENOMEM; -+ } -+ set_tsk_thread_flag(p, TIF_IO_BITMAP); -+ } -+ -+ /* -+ * Set a new TLS for the child thread? -+ */ -+ if (clone_flags & CLONE_SETTLS) { -+ struct desc_struct *desc; -+ struct user_desc info; -+ int idx; -+ -+ err = -EFAULT; -+ if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info))) -+ goto out; -+ err = -EINVAL; -+ if (LDT_empty(&info)) -+ goto out; -+ -+ idx = info.entry_number; -+ if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) -+ goto out; -+ -+ desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; -+ desc->a = LDT_entry_a(&info); -+ desc->b = LDT_entry_b(&info); -+ } -+ -+ p->thread.iopl = current->thread.iopl; -+ -+ err = 0; -+ out: -+ if (err && p->thread.io_bitmap_ptr) { -+ kfree(p->thread.io_bitmap_ptr); -+ p->thread.io_bitmap_max = 0; -+ } -+ return err; -+} -+ -+/* -+ * fill in the user structure for a core dump.. -+ */ -+void dump_thread(struct pt_regs * regs, struct user * dump) -+{ -+ int i; -+ -+/* changed the size calculations - should hopefully work better. 
lbt */ -+ dump->magic = CMAGIC; -+ dump->start_code = 0; -+ dump->start_stack = regs->esp & ~(PAGE_SIZE - 1); -+ dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT; -+ dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT; -+ dump->u_dsize -= dump->u_tsize; -+ dump->u_ssize = 0; -+ for (i = 0; i < 8; i++) -+ dump->u_debugreg[i] = current->thread.debugreg[i]; -+ -+ if (dump->start_stack < TASK_SIZE) -+ dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT; -+ -+ dump->regs.ebx = regs->ebx; -+ dump->regs.ecx = regs->ecx; -+ dump->regs.edx = regs->edx; -+ dump->regs.esi = regs->esi; -+ dump->regs.edi = regs->edi; -+ dump->regs.ebp = regs->ebp; -+ dump->regs.eax = regs->eax; -+ dump->regs.ds = regs->xds; -+ dump->regs.es = regs->xes; -+ dump->regs.fs = regs->xfs; -+ savesegment(gs,dump->regs.gs); -+ dump->regs.orig_eax = regs->orig_eax; -+ dump->regs.eip = regs->eip; -+ dump->regs.cs = regs->xcs; -+ dump->regs.eflags = regs->eflags; -+ dump->regs.esp = regs->esp; -+ dump->regs.ss = regs->xss; -+ -+ dump->u_fpvalid = dump_fpu (regs, &dump->i387); -+} -+EXPORT_SYMBOL(dump_thread); -+ -+/* -+ * Capture the user space registers if the task is not running (in user space) -+ */ -+int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) -+{ -+ struct pt_regs ptregs = *task_pt_regs(tsk); -+ ptregs.xcs &= 0xffff; -+ ptregs.xds &= 0xffff; -+ ptregs.xes &= 0xffff; -+ ptregs.xss &= 0xffff; -+ -+ elf_core_copy_regs(regs, &ptregs); -+ -+ return 1; -+} -+ -+static noinline void __switch_to_xtra(struct task_struct *next_p) -+{ -+ struct thread_struct *next; -+ -+ next = &next_p->thread; -+ -+ if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { -+ set_debugreg(next->debugreg[0], 0); -+ set_debugreg(next->debugreg[1], 1); -+ set_debugreg(next->debugreg[2], 2); -+ set_debugreg(next->debugreg[3], 3); -+ /* no 4 and 5 */ -+ set_debugreg(next->debugreg[6], 6); -+ set_debugreg(next->debugreg[7], 7); -+ } -+#ifndef CONFIG_XEN -+ if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { -+ /* -+ * Disable the bitmap via an invalid offset. We still cache -+ * the previous bitmap owner and the IO bitmap contents: -+ */ -+ tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET; -+ return; -+ } -+ -+ if (likely(next == tss->io_bitmap_owner)) { -+ /* -+ * Previous owner of the bitmap (hence the bitmap content) -+ * matches the next task, we don't have to do anything but -+ * to set a valid offset in the TSS: -+ */ -+ tss->io_bitmap_base = IO_BITMAP_OFFSET; -+ return; -+ } -+ /* -+ * Lazy TSS's I/O bitmap copy. We set an invalid offset here -+ * and we let the task to get a GPF in case an I/O instruction -+ * is performed. The handler of the GPF will verify that the -+ * faulting task has a valid I/O bitmap and, if true, does the -+ * real copy and restart the instruction. This will save us -+ * redundant copies when the currently switched task does not -+ * perform any I/O during its timeslice. -+ */ -+ tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY; -+#endif /* !CONFIG_XEN */ -+} -+ -+/* -+ * This function selects if the context switch from prev to next -+ * has to tweak the TSC disable bit in the cr4. -+ */ -+static inline void disable_tsc(struct task_struct *prev_p, -+ struct task_struct *next_p) -+{ -+ struct thread_info *prev, *next; -+ -+ /* -+ * gcc should eliminate the ->thread_info dereference if -+ * has_secure_computing returns 0 at compile time (SECCOMP=n). -+ */ -+ prev = task_thread_info(prev_p); -+ next = task_thread_info(next_p); -+ -+ if (has_secure_computing(prev) || has_secure_computing(next)) { -+ /* slow path here */ -+ if (has_secure_computing(prev) && -+ !has_secure_computing(next)) { -+ write_cr4(read_cr4() & ~X86_CR4_TSD); -+ } else if (!has_secure_computing(prev) && -+ has_secure_computing(next)) -+ write_cr4(read_cr4() | X86_CR4_TSD); -+ } -+} -+ -+/* -+ * switch_to(x,y) should switch tasks from x to y. -+ * -+ * We fsave/fwait so that an exception goes off at the right time -+ * (as a call from the fsave or fwait in effect) rather than to -+ * the wrong process. Lazy FP saving no longer makes any sense -+ * with modern CPU's, and this simplifies a lot of things (SMP -+ * and UP become the same). -+ * -+ * NOTE! We used to use the x86 hardware context switching. The -+ * reason for not using it any more becomes apparent when you -+ * try to recover gracefully from saved state that is no longer -+ * valid (stale segment register values in particular). With the -+ * hardware task-switch, there is no way to fix up bad state in -+ * a reasonable manner. -+ * -+ * The fact that Intel documents the hardware task-switching to -+ * be slow is a fairly red herring - this code is not noticeably -+ * faster. However, there _is_ some room for improvement here, -+ * so the performance issues may eventually be a valid point. -+ * More important, however, is the fact that this allows us much -+ * more flexibility. -+ * -+ * The return value (in %eax) will be the "prev" task after -+ * the task-switch, and shows up in ret_from_fork in entry.S, -+ * for example. -+ */ -+struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) -+{ -+ struct thread_struct *prev = &prev_p->thread, -+ *next = &next_p->thread; -+ int cpu = smp_processor_id(); -+#ifndef CONFIG_X86_NO_TSS -+ struct tss_struct *tss = &per_cpu(init_tss, cpu); -+#endif -+ struct physdev_set_iobitmap iobmp_op; -+ multicall_entry_t _mcl[8], *mcl = _mcl; -+ -+ /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ -+ -+ /* -+ * This is basically '__unlazy_fpu', except that we queue a -+ * multicall to indicate FPU task switch, rather than -+ * synchronously trapping to Xen. -+ */ -+ if (prev_p->thread_info->status & TS_USEDFPU) { -+ __save_init_fpu(prev_p); /* _not_ save_init_fpu() */ -+ mcl->op = __HYPERVISOR_fpu_taskswitch; -+ mcl->args[0] = 1; -+ mcl++; -+ } -+#if 0 /* lazy fpu sanity check */ -+ else BUG_ON(!(read_cr0() & 8)); -+#endif -+ -+ /* we're going to use this soon, after a few expensive things */ -+ if (next_p->fpu_counter > 5) -+ prefetch(&next->i387.fxsave); -+ -+ /* -+ * Reload esp0. -+ * This is load_esp0(tss, next) with a multicall. -+ */ -+ mcl->op = __HYPERVISOR_stack_switch; -+ mcl->args[0] = __KERNEL_DS; -+ mcl->args[1] = next->esp0; -+ mcl++; -+ -+ /* XEN NOTE: FS/GS saved in switch_mm(), not here. */ -+ -+ /* -+ * Load the per-thread Thread-Local Storage descriptor. -+ * This is load_TLS(next, cpu) with multicalls. -+ */ -+#define C(i) do { \ -+ if (unlikely(next->tls_array[i].a != prev->tls_array[i].a || \ -+ next->tls_array[i].b != prev->tls_array[i].b)) { \ -+ mcl->op = __HYPERVISOR_update_descriptor; \ -+ *(u64 *)&mcl->args[0] = virt_to_machine( \ -+ &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\ -+ *(u64 *)&mcl->args[2] = *(u64 *)&next->tls_array[i]; \ -+ mcl++; \ -+ } \ -+} while (0) -+ C(0); C(1); C(2); -+#undef C -+ -+ if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) { -+ set_xen_guest_handle(iobmp_op.bitmap, -+ (char *)next->io_bitmap_ptr); -+ iobmp_op.nr_ports = next->io_bitmap_ptr ? IO_BITMAP_BITS : 0; -+ mcl->op = __HYPERVISOR_physdev_op; -+ mcl->args[0] = PHYSDEVOP_set_iobitmap; -+ mcl->args[1] = (unsigned long)&iobmp_op; -+ mcl++; -+ } -+ -+ (void)HYPERVISOR_multicall(_mcl, mcl - _mcl); -+ -+ /* -+ * Restore IOPL if needed. In normal use, the flags restore -+ * in the switch assembly will handle this. But if the kernel -+ * is running virtualized at a non-zero CPL, the popf will -+ * not restore flags, so it must be done in a separate step. -+ */ -+ if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl)) -+ set_iopl_mask(next->iopl); -+ -+ /* -+ * Now maybe handle debug registers -+ */ -+ if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW)) -+ __switch_to_xtra(next_p); -+ -+ disable_tsc(prev_p, next_p); -+ -+ /* -+ * Leave lazy mode, flushing any hypercalls made here. -+ * This must be done before restoring TLS segments so -+ * the GDT and LDT are properly updated, and must be -+ * done before math_state_restore, so the TS bit is up -+ * to date. -+ */ -+ arch_leave_lazy_cpu_mode(); -+ -+ /* If the task has used fpu the last 5 timeslices, just do a full -+ * restore of the math state immediately to avoid the trap; the -+ * chances of needing FPU soon are obviously high now -+ */ -+ if (next_p->fpu_counter > 5) -+ math_state_restore(); -+ -+ /* -+ * Restore %gs if needed (which is common) -+ */ -+ if (prev->gs | next->gs) -+ loadsegment(gs, next->gs); -+ -+ write_pda(pcurrent, next_p); -+ -+ return prev_p; -+} -+ -+asmlinkage int sys_fork(struct pt_regs regs) -+{ -+ return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL); -+} -+ -+asmlinkage int sys_clone(struct pt_regs regs) -+{ -+ unsigned long clone_flags; -+ unsigned long newsp; -+ int __user *parent_tidptr, *child_tidptr; -+ -+ clone_flags = regs.ebx; -+ newsp = regs.ecx; -+ parent_tidptr = (int __user *)regs.edx; -+ child_tidptr = (int __user *)regs.edi; -+ if (!newsp) -+ newsp = regs.esp; -+ return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr); -+} -+ -+/* -+ * This is trivial, and on the face of it looks like it -+ * could equally well be done in user mode. -+ * -+ * Not so, for quite unobvious reasons - register pressure. -+ * In user mode vfork() cannot have a stack frame, and if -+ * done by calling the "clone()" system call directly, you -+ * do not have enough call-clobbered registers to hold all -+ * the information you need. -+ */ -+asmlinkage int sys_vfork(struct pt_regs regs) -+{ -+ return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL); -+} -+ -+/* -+ * sys_execve() executes a new program. -+ */ -+asmlinkage int sys_execve(struct pt_regs regs) -+{ -+ int error; -+ char * filename; -+ -+ filename = getname((char __user *) regs.ebx); -+ error = PTR_ERR(filename); -+ if (IS_ERR(filename)) -+ goto out; -+ error = do_execve(filename, -+ (char __user * __user *) regs.ecx, -+ (char __user * __user *) regs.edx, -+ &regs); -+ if (error == 0) { -+ task_lock(current); -+ current->ptrace &= ~PT_DTRACE; -+ task_unlock(current); -+ /* Make sure we don't return using sysenter.. */ -+ set_thread_flag(TIF_IRET); -+ } -+ putname(filename); -+out: -+ return error; -+} -+ -+#define top_esp (THREAD_SIZE - sizeof(unsigned long)) -+#define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long)) -+ -+unsigned long get_wchan(struct task_struct *p) -+{ -+ unsigned long ebp, esp, eip; -+ unsigned long stack_page; -+ int count = 0; -+ if (!p || p == current || p->state == TASK_RUNNING) -+ return 0; -+ stack_page = (unsigned long)task_stack_page(p); -+ esp = p->thread.esp; -+ if (!stack_page || esp < stack_page || esp > top_esp+stack_page) -+ return 0; -+ /* include/asm-i386/system.h:switch_to() pushes ebp last. */ -+ ebp = *(unsigned long *) esp; -+ do { -+ if (ebp < stack_page || ebp > top_ebp+stack_page) -+ return 0; -+ eip = *(unsigned long *) (ebp+4); -+ if (!in_sched_functions(eip)) -+ return eip; -+ ebp = *(unsigned long *) ebp; -+ } while (count++ < 16); -+ return 0; -+} -+ -+/* -+ * sys_alloc_thread_area: get a yet unused TLS descriptor index. -+ */ -+static int get_free_idx(void) -+{ -+ struct thread_struct *t = &current->thread; -+ int idx; -+ -+ for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++) -+ if (desc_empty(t->tls_array + idx)) -+ return idx + GDT_ENTRY_TLS_MIN; -+ return -ESRCH; -+} -+ -+/* -+ * Set a given TLS descriptor: -+ */ -+asmlinkage int sys_set_thread_area(struct user_desc __user *u_info) -+{ -+ struct thread_struct *t = &current->thread; -+ struct user_desc info; -+ struct desc_struct *desc; -+ int cpu, idx; -+ -+ if (copy_from_user(&info, u_info, sizeof(info))) -+ return -EFAULT; -+ idx = info.entry_number; -+ -+ /* -+ * index -1 means the kernel should try to find and -+ * allocate an empty descriptor: -+ */ -+ if (idx == -1) { -+ idx = get_free_idx(); -+ if (idx < 0) -+ return idx; -+ if (put_user(idx, &u_info->entry_number)) -+ return -EFAULT; -+ } -+ -+ if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) -+ return -EINVAL; -+ -+ desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN; -+ -+ /* -+ * We must not get preempted while modifying the TLS.
-+ */ -+ cpu = get_cpu(); -+ -+ if (LDT_empty(&info)) { -+ desc->a = 0; -+ desc->b = 0; -+ } else { -+ desc->a = LDT_entry_a(&info); -+ desc->b = LDT_entry_b(&info); -+ } -+ load_TLS(t, cpu); -+ -+ put_cpu(); -+ -+ return 0; -+} -+ -+/* -+ * Get the current Thread-Local Storage area: -+ */ -+ -+#define GET_BASE(desc) ( \ -+ (((desc)->a >> 16) & 0x0000ffff) | \ -+ (((desc)->b << 16) & 0x00ff0000) | \ -+ ( (desc)->b & 0xff000000) ) -+ -+#define GET_LIMIT(desc) ( \ -+ ((desc)->a & 0x0ffff) | \ -+ ((desc)->b & 0xf0000) ) -+ -+#define GET_32BIT(desc) (((desc)->b >> 22) & 1) -+#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3) -+#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1) -+#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1) -+#define GET_PRESENT(desc) (((desc)->b >> 15) & 1) -+#define GET_USEABLE(desc) (((desc)->b >> 20) & 1) -+ -+asmlinkage int sys_get_thread_area(struct user_desc __user *u_info) -+{ -+ struct user_desc info; -+ struct desc_struct *desc; -+ int idx; -+ -+ if (get_user(idx, &u_info->entry_number)) -+ return -EFAULT; -+ if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) -+ return -EINVAL; -+ -+ memset(&info, 0, sizeof(info)); -+ -+ desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; -+ -+ info.entry_number = idx; -+ info.base_addr = GET_BASE(desc); -+ info.limit = GET_LIMIT(desc); -+ info.seg_32bit = GET_32BIT(desc); -+ info.contents = GET_CONTENTS(desc); -+ info.read_exec_only = !GET_WRITABLE(desc); -+ info.limit_in_pages = GET_LIMIT_PAGES(desc); -+ info.seg_not_present = !GET_PRESENT(desc); -+ info.useable = GET_USEABLE(desc); -+ -+ if (copy_to_user(u_info, &info, sizeof(info))) -+ return -EFAULT; -+ return 0; -+} -+ -+unsigned long arch_align_stack(unsigned long sp) -+{ -+ if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) -+ sp -= get_random_int() % 8192; -+ return sp & ~0xf; -+} -diff -r 4edbf98e9507 -r 3990a07432f0 arch/i386/kernel/quirks.c ---- a/arch/i386/kernel/quirks.c Fri Jul 20 11:42:41 2007 -0300 -+++ b/arch/i386/kernel/quirks.c Fri Jul 20 11:56:41 2007 -0300 -@@ -7,7 +7,7 @@ - #include - #include - --#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI) -+#if defined(CONFIG_X86_IO_APIC) && (defined(CONFIG_SMP) || defined(CONFIG_XEN)) && defined(CONFIG_PCI) - static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev) - { - u8 config, rev; -@@ -68,11 +68,19 @@ void __init quirk_intel_irqbalance(void) - word = read_pci_config_16(0, 0, 0x40, 0x4c); - - if (!(word & (1 << 13))) { -+#ifdef CONFIG_XEN -+ struct xen_platform_op op; -+ printk(KERN_INFO "Disabling irq balancing and affinity\n"); -+ op.cmd = XENPF_platform_quirk; -+ op.u.platform_quirk.quirk_id = QUIRK_NOIRQBALANCING; -+ (void)HYPERVISOR_platform_op(&op); -+#else - printk(KERN_INFO "Disabling irq balancing and affinity\n"); - #ifdef CONFIG_IRQBALANCE - irqbalance_disable(""); - #endif - noirqdebug_setup(""); -+#endif /* CONFIG_XEN */ - #ifdef CONFIG_PROC_FS - no_irq_affinity = 1; - #endif -@@ -80,12 +88,12 @@ void __init quirk_intel_irqbalance(void) - printk(KERN_INFO "Disabling cpu hotplug control\n"); - enable_cpu_hotplug = 0; - #endif --#ifdef CONFIG_X86_64 -+#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN) - /* force the genapic selection to flat mode so that - * interrupts can be redirected to more than one CPU. 
- */ - genapic_force = &apic_flat; --#endif -+#endif /* CONFIG_XEN */ - } - - /* put back the original value for config space */ -diff -r 4edbf98e9507 -r 3990a07432f0 arch/i386/kernel/setup-xen.c ---- /dev/null Thu Jan 01 00:00:00 1970 +0000 -+++ b/arch/i386/kernel/setup-xen.c Fri Jul 20 11:56:41 2007 -0300 -@@ -0,0 +1,825 @@ -+/* -+ * linux/arch/i386/kernel/setup.c -+ * -+ * Copyright (C) 1995 Linus Torvalds -+ * -+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 -+ * -+ * Memory region support -+ * David Parsons , July-August 1999 -+ * -+ * Added E820 sanitization routine (removes overlapping memory regions); -+ * Brian Moyle , February 2001 -+ * -+ * Moved CPU detection code to cpu/${cpu}.c -+ * Patrick Mochel , March 2002 -+ * -+ * Provisions for empty E820 memory regions (reported by certain BIOSes). -+ * Alex Achenbach , December 2002. -+ * -+ */ -+ -+/* -+ * This file handles the architecture-dependent parts of initialization -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include