diff options
Diffstat (limited to 'hw/device-assignment.c')
-rw-r--r-- | hw/device-assignment.c | 1378 |
1 files changed, 1378 insertions, 0 deletions
diff --git a/hw/device-assignment.c b/hw/device-assignment.c new file mode 100644 index 000000000..801950eaf --- /dev/null +++ b/hw/device-assignment.c @@ -0,0 +1,1378 @@ +/* + * Copyright (c) 2007, Neocleus Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * + * Assign a PCI device from the host to a guest VM. + * + * Adapted for KVM by Qumranet. + * + * Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com) + * Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com) + * Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com) + * Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com) + * Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com) + */ +#include <stdio.h> +#include <unistd.h> +#include <sys/io.h> +#include <sys/types.h> +#include <sys/stat.h> +#include "qemu-kvm.h" +#include "hw.h" +#include "pc.h" +#include "sysemu.h" +#include "console.h" +#include "device-assignment.h" +#include "loader.h" +#include <pci/pci.h> + +/* From linux/ioport.h */ +#define IORESOURCE_IO 0x00000100 /* Resource type */ +#define IORESOURCE_MEM 0x00000200 +#define IORESOURCE_IRQ 0x00000400 +#define IORESOURCE_DMA 0x00000800 +#define IORESOURCE_PREFETCH 0x00001000 /* No side effects */ + +/* #define DEVICE_ASSIGNMENT_DEBUG 1 */ + +#ifdef DEVICE_ASSIGNMENT_DEBUG +#define DEBUG(fmt, ...) \ + do { \ + fprintf(stderr, "%s: " fmt, __func__ , __VA_ARGS__); \ + } while (0) +#else +#define DEBUG(fmt, ...) do { } while(0) +#endif + +static void assigned_dev_load_option_rom(AssignedDevice *dev); + +static uint32_t guest_to_host_ioport(AssignedDevRegion *region, uint32_t addr) +{ + return region->u.r_baseport + (addr - region->e_physbase); +} + +static void assigned_dev_ioport_writeb(void *opaque, uint32_t addr, + uint32_t value) +{ + AssignedDevRegion *r_access = opaque; + uint32_t r_pio = guest_to_host_ioport(r_access, addr); + + DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n", + r_pio, (int)r_access->e_physbase, + (unsigned long)r_access->u.r_baseport, value); + + outb(value, r_pio); +} + +static void assigned_dev_ioport_writew(void *opaque, uint32_t addr, + uint32_t value) +{ + AssignedDevRegion *r_access = opaque; + uint32_t r_pio = guest_to_host_ioport(r_access, addr); + + DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n", + r_pio, (int)r_access->e_physbase, + (unsigned long)r_access->u.r_baseport, value); + + outw(value, r_pio); +} + +static void assigned_dev_ioport_writel(void *opaque, uint32_t addr, + uint32_t value) +{ + AssignedDevRegion *r_access = opaque; + uint32_t r_pio = guest_to_host_ioport(r_access, addr); + + DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n", + r_pio, (int)r_access->e_physbase, + (unsigned long)r_access->u.r_baseport, value); + + outl(value, r_pio); +} + +static uint32_t assigned_dev_ioport_readb(void *opaque, uint32_t addr) +{ + AssignedDevRegion *r_access = opaque; + uint32_t r_pio = guest_to_host_ioport(r_access, addr); + uint32_t value; + + value = inb(r_pio); + + DEBUG("r_pio=%08x e_physbase=%08x r_=%08lx value=%08x\n", + r_pio, (int)r_access->e_physbase, + (unsigned long)r_access->u.r_baseport, value); + + return value; +} + +static uint32_t assigned_dev_ioport_readw(void *opaque, uint32_t addr) +{ + AssignedDevRegion *r_access = opaque; + uint32_t r_pio = guest_to_host_ioport(r_access, addr); + uint32_t value; + + value = inw(r_pio); + + DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n", + r_pio, (int)r_access->e_physbase, + (unsigned long)r_access->u.r_baseport, value); + + return value; +} + +static uint32_t assigned_dev_ioport_readl(void *opaque, uint32_t addr) +{ + AssignedDevRegion *r_access = opaque; + uint32_t r_pio = guest_to_host_ioport(r_access, addr); + uint32_t value; + + value = inl(r_pio); + + DEBUG("r_pio=%08x e_physbase=%08x r_baseport=%08lx value=%08x\n", + r_pio, (int)r_access->e_physbase, + (unsigned long)r_access->u.r_baseport, value); + + return value; +} + +static void assigned_dev_iomem_map(PCIDevice *pci_dev, int region_num, + pcibus_t e_phys, pcibus_t e_size, int type) +{ + AssignedDevice *r_dev = container_of(pci_dev, AssignedDevice, dev); + AssignedDevRegion *region = &r_dev->v_addrs[region_num]; + PCIRegion *real_region = &r_dev->real_device.regions[region_num]; + pcibus_t old_ephys = region->e_physbase; + pcibus_t old_esize = region->e_size; + int first_map = (region->e_size == 0); + int ret = 0; + + DEBUG("e_phys=%08x r_virt=%p type=%d len=%08x region_num=%d \n", + e_phys, region->u.r_virtbase, type, e_size, region_num); + + region->e_physbase = e_phys; + region->e_size = e_size; + + if (!first_map) + kvm_destroy_phys_mem(kvm_context, old_ephys, + TARGET_PAGE_ALIGN(old_esize)); + + if (e_size > 0) { + /* deal with MSI-X MMIO page */ + if (real_region->base_addr <= r_dev->msix_table_addr && + real_region->base_addr + real_region->size >= + r_dev->msix_table_addr) { + int offset = r_dev->msix_table_addr - real_region->base_addr; + ret = munmap(region->u.r_virtbase + offset, TARGET_PAGE_SIZE); + if (ret == 0) + DEBUG("munmap done, virt_base 0x%p\n", + region->u.r_virtbase + offset); + else { + fprintf(stderr, "%s: fail munmap msix table!\n", __func__); + exit(1); + } + cpu_register_physical_memory(e_phys + offset, + TARGET_PAGE_SIZE, r_dev->mmio_index); + } + ret = kvm_register_phys_mem(kvm_context, e_phys, + region->u.r_virtbase, + TARGET_PAGE_ALIGN(e_size), 0); + } + + if (ret != 0) { + fprintf(stderr, "%s: Error: create new mapping failed\n", __func__); + exit(1); + } +} + +static void assigned_dev_ioport_map(PCIDevice *pci_dev, int region_num, + pcibus_t addr, pcibus_t size, int type) +{ + AssignedDevice *r_dev = container_of(pci_dev, AssignedDevice, dev); + AssignedDevRegion *region = &r_dev->v_addrs[region_num]; + int first_map = (region->e_size == 0); + CPUState *env; + + region->e_physbase = addr; + region->e_size = size; + + DEBUG("e_phys=0x%x r_baseport=%x type=0x%x len=%d region_num=%d \n", + addr, region->u.r_baseport, type, size, region_num); + + if (first_map) { + struct ioperm_data *data; + + data = qemu_mallocz(sizeof(struct ioperm_data)); + if (data == NULL) { + fprintf(stderr, "%s: Out of memory\n", __func__); + exit(1); + } + + data->start_port = region->u.r_baseport; + data->num = region->r_size; + data->turn_on = 1; + + kvm_add_ioperm_data(data); + + for (env = first_cpu; env; env = env->next_cpu) + kvm_ioperm(env, data); + } + + register_ioport_read(addr, size, 1, assigned_dev_ioport_readb, + (r_dev->v_addrs + region_num)); + register_ioport_read(addr, size, 2, assigned_dev_ioport_readw, + (r_dev->v_addrs + region_num)); + register_ioport_read(addr, size, 4, assigned_dev_ioport_readl, + (r_dev->v_addrs + region_num)); + register_ioport_write(addr, size, 1, assigned_dev_ioport_writeb, + (r_dev->v_addrs + region_num)); + register_ioport_write(addr, size, 2, assigned_dev_ioport_writew, + (r_dev->v_addrs + region_num)); + register_ioport_write(addr, size, 4, assigned_dev_ioport_writel, + (r_dev->v_addrs + region_num)); +} + +static uint8_t pci_find_cap_offset(struct pci_dev *pci_dev, uint8_t cap) +{ + int id; + int max_cap = 48; + int pos = PCI_CAPABILITY_LIST; + int status; + + status = pci_read_byte(pci_dev, PCI_STATUS); + if ((status & PCI_STATUS_CAP_LIST) == 0) + return 0; + + while (max_cap--) { + pos = pci_read_byte(pci_dev, pos); + if (pos < 0x40) + break; + + pos &= ~3; + id = pci_read_byte(pci_dev, pos + PCI_CAP_LIST_ID); + + if (id == 0xff) + break; + if (id == cap) + return pos; + + pos += PCI_CAP_LIST_NEXT; + } + return 0; +} + +static void assigned_dev_pci_write_config(PCIDevice *d, uint32_t address, + uint32_t val, int len) +{ + int fd; + ssize_t ret; + AssignedDevice *pci_dev = container_of(d, AssignedDevice, dev); + + DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n", + ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7), + (uint16_t) address, val, len); + + if (address == 0x4) { + pci_default_write_config(d, address, val, len); + /* Continue to program the card */ + } + + if ((address >= 0x10 && address <= 0x24) || address == 0x30 || + address == 0x34 || address == 0x3c || address == 0x3d || + pci_access_cap_config(d, address, len)) { + /* used for update-mappings (BAR emulation) */ + pci_default_write_config(d, address, val, len); + return; + } + + DEBUG("NON BAR (%x.%x): address=%04x val=0x%08x len=%d\n", + ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7), + (uint16_t) address, val, len); + + fd = pci_dev->real_device.config_fd; + +again: + ret = pwrite(fd, &val, len, address); + if (ret != len) { + if ((ret < 0) && (errno == EINTR || errno == EAGAIN)) + goto again; + + fprintf(stderr, "%s: pwrite failed, ret = %zd errno = %d\n", + __func__, ret, errno); + + exit(1); + } +} + +static uint32_t assigned_dev_pci_read_config(PCIDevice *d, uint32_t address, + int len) +{ + uint32_t val = 0; + int fd; + ssize_t ret; + AssignedDevice *pci_dev = container_of(d, AssignedDevice, dev); + + if (address < 0x4 || (pci_dev->need_emulate_cmd && address == 0x4) || + (address >= 0x10 && address <= 0x24) || address == 0x30 || + address == 0x34 || address == 0x3c || address == 0x3d || + pci_access_cap_config(d, address, len)) { + val = pci_default_read_config(d, address, len); + DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n", + (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, len); + return val; + } + + /* vga specific, remove later */ + if (address == 0xFC) + goto do_log; + + fd = pci_dev->real_device.config_fd; + +again: + ret = pread(fd, &val, len, address); + if (ret != len) { + if ((ret < 0) && (errno == EINTR || errno == EAGAIN)) + goto again; + + fprintf(stderr, "%s: pread failed, ret = %zd errno = %d\n", + __func__, ret, errno); + + exit(1); + } + +do_log: + DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n", + (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, len); + + if (!pci_dev->cap.available) { + /* kill the special capabilities */ + if (address == 4 && len == 4) + val &= ~0x100000; + else if (address == 6) + val &= ~0x10; + } + + return val; +} + +static int assigned_dev_register_regions(PCIRegion *io_regions, + unsigned long regions_num, + AssignedDevice *pci_dev) +{ + uint32_t i; + PCIRegion *cur_region = io_regions; + + for (i = 0; i < regions_num; i++, cur_region++) { + if (!cur_region->valid) + continue; + pci_dev->v_addrs[i].num = i; + + /* handle memory io regions */ + if (cur_region->type & IORESOURCE_MEM) { + int t = cur_region->type & IORESOURCE_PREFETCH + ? PCI_BASE_ADDRESS_MEM_PREFETCH + : PCI_BASE_ADDRESS_SPACE_MEMORY; + if (cur_region->size & 0xFFF) { + fprintf(stderr, "Unable to assign device: PCI region %d " + "at address 0x%llx has size 0x%x, " + " which is not a multiple of 4K\n", + i, (unsigned long long)cur_region->base_addr, + cur_region->size); + return -1; + } + + /* map physical memory */ + pci_dev->v_addrs[i].e_physbase = cur_region->base_addr; + if (i == PCI_ROM_SLOT) { + pci_dev->v_addrs[i].u.r_virtbase = + mmap(NULL, + (cur_region->size + 0xFFF) & 0xFFFFF000, + PROT_WRITE | PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, + 0, (off_t) 0); + + } else { + pci_dev->v_addrs[i].u.r_virtbase = + mmap(NULL, + (cur_region->size + 0xFFF) & 0xFFFFF000, + PROT_WRITE | PROT_READ, MAP_SHARED, + cur_region->resource_fd, (off_t) 0); + } + + if (pci_dev->v_addrs[i].u.r_virtbase == MAP_FAILED) { + pci_dev->v_addrs[i].u.r_virtbase = NULL; + fprintf(stderr, "%s: Error: Couldn't mmap 0x%x!" + "\n", __func__, + (uint32_t) (cur_region->base_addr)); + return -1; + } + + if (i == PCI_ROM_SLOT) { + memset(pci_dev->v_addrs[i].u.r_virtbase, 0, + (cur_region->size + 0xFFF) & 0xFFFFF000); + mprotect(pci_dev->v_addrs[PCI_ROM_SLOT].u.r_virtbase, + (cur_region->size + 0xFFF) & 0xFFFFF000, PROT_READ); + } + + pci_dev->v_addrs[i].r_size = cur_region->size; + pci_dev->v_addrs[i].e_size = 0; + + /* add offset */ + pci_dev->v_addrs[i].u.r_virtbase += + (cur_region->base_addr & 0xFFF); + + pci_register_bar((PCIDevice *) pci_dev, i, + cur_region->size, t, + assigned_dev_iomem_map); + continue; + } + /* handle port io regions */ + pci_dev->v_addrs[i].e_physbase = cur_region->base_addr; + pci_dev->v_addrs[i].u.r_baseport = cur_region->base_addr; + pci_dev->v_addrs[i].r_size = cur_region->size; + pci_dev->v_addrs[i].e_size = 0; + + pci_register_bar((PCIDevice *) pci_dev, i, + cur_region->size, PCI_BASE_ADDRESS_SPACE_IO, + assigned_dev_ioport_map); + + /* not relevant for port io */ + pci_dev->v_addrs[i].memory_index = 0; + } + + /* success */ + return 0; +} + +static int get_real_device(AssignedDevice *pci_dev, uint8_t r_bus, + uint8_t r_dev, uint8_t r_func) +{ + char dir[128], name[128]; + int fd, r = 0; + FILE *f; + unsigned long long start, end, size, flags; + unsigned long id; + struct stat statbuf; + PCIRegion *rp; + PCIDevRegions *dev = &pci_dev->real_device; + + dev->region_number = 0; + + snprintf(dir, sizeof(dir), "/sys/bus/pci/devices/0000:%02x:%02x.%x/", + r_bus, r_dev, r_func); + + snprintf(name, sizeof(name), "%sconfig", dir); + + fd = open(name, O_RDWR); + if (fd == -1) { + fprintf(stderr, "%s: %s: %m\n", __func__, name); + return 1; + } + dev->config_fd = fd; +again: + r = read(fd, pci_dev->dev.config, pci_config_size(&pci_dev->dev)); + if (r < 0) { + if (errno == EINTR || errno == EAGAIN) + goto again; + fprintf(stderr, "%s: read failed, errno = %d\n", __func__, errno); + } + + snprintf(name, sizeof(name), "%sresource", dir); + + f = fopen(name, "r"); + if (f == NULL) { + fprintf(stderr, "%s: %s: %m\n", __func__, name); + return 1; + } + + for (r = 0; r < PCI_NUM_REGIONS; r++) { + if (fscanf(f, "%lli %lli %lli\n", &start, &end, &flags) != 3) + break; + + rp = dev->regions + r; + rp->valid = 0; + size = end - start + 1; + flags &= IORESOURCE_IO | IORESOURCE_MEM | IORESOURCE_PREFETCH; + if (size == 0 || (flags & ~IORESOURCE_PREFETCH) == 0) + continue; + if (flags & IORESOURCE_MEM) { + flags &= ~IORESOURCE_IO; + if (r != PCI_ROM_SLOT) { + snprintf(name, sizeof(name), "%sresource%d", dir, r); + fd = open(name, O_RDWR); + if (fd == -1) + continue; + rp->resource_fd = fd; + } + } else + flags &= ~IORESOURCE_PREFETCH; + + rp->type = flags; + rp->valid = 1; + rp->base_addr = start; + rp->size = size; + DEBUG("region %d size %d start 0x%llx type %d resource_fd %d\n", + r, rp->size, start, rp->type, rp->resource_fd); + } + + fclose(f); + + /* read and fill device ID */ + snprintf(name, sizeof(name), "%svendor", dir); + f = fopen(name, "r"); + if (f == NULL) { + fprintf(stderr, "%s: %s: %m\n", __func__, name); + return 1; + } + if (fscanf(f, "%li\n", &id) == 1) { + pci_dev->dev.config[0] = id & 0xff; + pci_dev->dev.config[1] = (id & 0xff00) >> 8; + } + fclose(f); + + /* read and fill vendor ID */ + snprintf(name, sizeof(name), "%sdevice", dir); + f = fopen(name, "r"); + if (f == NULL) { + fprintf(stderr, "%s: %s: %m\n", __func__, name); + return 1; + } + if (fscanf(f, "%li\n", &id) == 1) { + pci_dev->dev.config[2] = id & 0xff; + pci_dev->dev.config[3] = (id & 0xff00) >> 8; + } + fclose(f); + + /* dealing with virtual function device */ + snprintf(name, sizeof(name), "%sphysfn/", dir); + if (!stat(name, &statbuf)) + pci_dev->need_emulate_cmd = 1; + else + pci_dev->need_emulate_cmd = 0; + + dev->region_number = r; + return 0; +} + +static QLIST_HEAD(, AssignedDevice) devs = QLIST_HEAD_INITIALIZER(devs); + +#ifdef KVM_CAP_IRQ_ROUTING +static void free_dev_irq_entries(AssignedDevice *dev) +{ + int i; + + for (i = 0; i < dev->irq_entries_nr; i++) + kvm_del_routing_entry(kvm_context, &dev->entry[i]); + free(dev->entry); + dev->entry = NULL; + dev->irq_entries_nr = 0; +} +#endif + +static void free_assigned_device(AssignedDevice *dev) +{ + if (dev) { + int i; + + for (i = 0; i < dev->real_device.region_number; i++) { + PCIRegion *pci_region = &dev->real_device.regions[i]; + AssignedDevRegion *region = &dev->v_addrs[i]; + + if (!pci_region->valid) + continue; + + if (pci_region->type & IORESOURCE_IO) { + kvm_remove_ioperm_data(region->u.r_baseport, region->r_size); + continue; + } else if (pci_region->type & IORESOURCE_MEM) { + if (region->e_size > 0) + kvm_destroy_phys_mem(kvm_context, region->e_physbase, + TARGET_PAGE_ALIGN(region->e_size)); + + if (region->u.r_virtbase) { + int ret = munmap(region->u.r_virtbase, + (pci_region->size + 0xFFF) & 0xFFFFF000); + if (ret != 0) + fprintf(stderr, + "Failed to unmap assigned device region: %s\n", + strerror(errno)); + } + } + } + + if (dev->real_device.config_fd) { + close(dev->real_device.config_fd); + dev->real_device.config_fd = 0; + } + +#ifdef KVM_CAP_IRQ_ROUTING + free_dev_irq_entries(dev); +#endif + } +} + +static uint32_t calc_assigned_dev_id(uint8_t bus, uint8_t devfn) +{ + return (uint32_t)bus << 8 | (uint32_t)devfn; +} + +static int assign_device(AssignedDevice *dev) +{ + struct kvm_assigned_pci_dev assigned_dev_data; + int r; + + memset(&assigned_dev_data, 0, sizeof(assigned_dev_data)); + assigned_dev_data.assigned_dev_id = + calc_assigned_dev_id(dev->h_busnr, dev->h_devfn); + assigned_dev_data.busnr = dev->h_busnr; + assigned_dev_data.devfn = dev->h_devfn; + +#ifdef KVM_CAP_IOMMU + /* We always enable the IOMMU unless disabled on the command line */ + if (dev->use_iommu) { + if (!kvm_check_extension(kvm_state, KVM_CAP_IOMMU)) { + fprintf(stderr, "No IOMMU found. Unable to assign device \"%s\"\n", + dev->dev.qdev.id); + return -ENODEV; + } + assigned_dev_data.flags |= KVM_DEV_ASSIGN_ENABLE_IOMMU; + } +#else + dev->use_iommu = 0; +#endif + + r = kvm_assign_pci_device(kvm_context, &assigned_dev_data); + if (r < 0) + fprintf(stderr, "Failed to assign device \"%s\" : %s\n", + dev->dev.qdev.id, strerror(-r)); + return r; +} + +static int assign_irq(AssignedDevice *dev) +{ + struct kvm_assigned_irq assigned_irq_data; + int irq, r = 0; + + /* Interrupt PIN 0 means don't use INTx */ + if (pci_read_byte(dev->pdev, PCI_INTERRUPT_PIN) == 0) + return 0; + + irq = pci_map_irq(&dev->dev, dev->intpin); + irq = piix_get_irq(irq); + +#ifdef TARGET_IA64 + irq = ipf_map_irq(&dev->dev, irq); +#endif + + if (dev->girq == irq) + return r; + + memset(&assigned_irq_data, 0, sizeof(assigned_irq_data)); + assigned_irq_data.assigned_dev_id = + calc_assigned_dev_id(dev->h_busnr, dev->h_devfn); + assigned_irq_data.guest_irq = irq; + assigned_irq_data.host_irq = dev->real_device.irq; +#ifdef KVM_CAP_ASSIGN_DEV_IRQ + if (dev->irq_requested_type) { + assigned_irq_data.flags = dev->irq_requested_type; + r = kvm_deassign_irq(kvm_context, &assigned_irq_data); + /* -ENXIO means no assigned irq */ + if (r && r != -ENXIO) + perror("assign_irq: deassign"); + } + + assigned_irq_data.flags = KVM_DEV_IRQ_GUEST_INTX; + if (dev->cap.available & ASSIGNED_DEVICE_CAP_MSI) + assigned_irq_data.flags |= KVM_DEV_IRQ_HOST_MSI; + else + assigned_irq_data.flags |= KVM_DEV_IRQ_HOST_INTX; +#endif + + r = kvm_assign_irq(kvm_context, &assigned_irq_data); + if (r < 0) { + fprintf(stderr, "Failed to assign irq for \"%s\": %s\n", + dev->dev.qdev.id, strerror(-r)); + fprintf(stderr, "Perhaps you are assigning a device " + "that shares an IRQ with another device?\n"); + return r; + } + + dev->girq = irq; + dev->irq_requested_type = assigned_irq_data.flags; + return r; +} + +static void deassign_device(AssignedDevice *dev) +{ +#ifdef KVM_CAP_DEVICE_DEASSIGNMENT + struct kvm_assigned_pci_dev assigned_dev_data; + int r; + + memset(&assigned_dev_data, 0, sizeof(assigned_dev_data)); + assigned_dev_data.assigned_dev_id = + calc_assigned_dev_id(dev->h_busnr, dev->h_devfn); + + r = kvm_deassign_pci_device(kvm_context, &assigned_dev_data); + if (r < 0) + fprintf(stderr, "Failed to deassign device \"%s\" : %s\n", + dev->dev.qdev.id, strerror(-r)); +#endif +} + +#if 0 +AssignedDevInfo *get_assigned_device(int pcibus, int slot) +{ + AssignedDevice *assigned_dev = NULL; + AssignedDevInfo *adev = NULL; + + QLIST_FOREACH(adev, &adev_head, next) { + assigned_dev = adev->assigned_dev; + if (pci_bus_num(assigned_dev->dev.bus) == pcibus && + PCI_SLOT(assigned_dev->dev.devfn) == slot) + return adev; + } + + return NULL; +} +#endif + +/* The pci config space got updated. Check if irq numbers have changed + * for our devices + */ +void assigned_dev_update_irqs(void) +{ + AssignedDevice *dev, *next; + int r; + + dev = QLIST_FIRST(&devs); + while (dev) { + next = QLIST_NEXT(dev, next); + r = assign_irq(dev); + if (r < 0) + qdev_unplug(&dev->dev.qdev); + dev = next; + } +} + +#ifdef KVM_CAP_IRQ_ROUTING + +#ifdef KVM_CAP_DEVICE_MSI +static void assigned_dev_update_msi(PCIDevice *pci_dev, unsigned int ctrl_pos) +{ + struct kvm_assigned_irq assigned_irq_data; + AssignedDevice *assigned_dev = container_of(pci_dev, AssignedDevice, dev); + uint8_t ctrl_byte = pci_dev->config[ctrl_pos]; + int r; + + memset(&assigned_irq_data, 0, sizeof assigned_irq_data); + assigned_irq_data.assigned_dev_id = + calc_assigned_dev_id(assigned_dev->h_busnr, + (uint8_t)assigned_dev->h_devfn); + + if (assigned_dev->irq_requested_type) { + assigned_irq_data.flags = assigned_dev->irq_requested_type; + free_dev_irq_entries(assigned_dev); + r = kvm_deassign_irq(kvm_context, &assigned_irq_data); + /* -ENXIO means no assigned irq */ + if (r && r != -ENXIO) + perror("assigned_dev_update_msi: deassign irq"); + } + + if (ctrl_byte & PCI_MSI_FLAGS_ENABLE) { + assigned_dev->entry = calloc(1, sizeof(struct kvm_irq_routing_entry)); + if (!assigned_dev->entry) { + perror("assigned_dev_update_msi: "); + return; + } + assigned_dev->entry->u.msi.address_lo = + *(uint32_t *)(pci_dev->config + pci_dev->cap.start + + PCI_MSI_ADDRESS_LO); + assigned_dev->entry->u.msi.address_hi = 0; + assigned_dev->entry->u.msi.data = *(uint16_t *)(pci_dev->config + + pci_dev->cap.start + PCI_MSI_DATA_32); + assigned_dev->entry->type = KVM_IRQ_ROUTING_MSI; + r = kvm_get_irq_route_gsi(kvm_context); + if (r < 0) { + perror("assigned_dev_update_msi: kvm_get_irq_route_gsi"); + return; + } + assigned_dev->entry->gsi = r; + + kvm_add_routing_entry(kvm_context, assigned_dev->entry); + if (kvm_commit_irq_routes(kvm_context) < 0) { + perror("assigned_dev_update_msi: kvm_commit_irq_routes"); + assigned_dev->cap.state &= ~ASSIGNED_DEVICE_MSI_ENABLED; + return; + } + assigned_dev->irq_entries_nr = 1; + + assigned_irq_data.guest_irq = assigned_dev->entry->gsi; + assigned_irq_data.flags = KVM_DEV_IRQ_HOST_MSI | KVM_DEV_IRQ_GUEST_MSI; + if (kvm_assign_irq(kvm_context, &assigned_irq_data) < 0) + perror("assigned_dev_enable_msi: assign irq"); + + assigned_dev->irq_requested_type = assigned_irq_data.flags; + } +} +#endif + +#ifdef KVM_CAP_DEVICE_MSIX +static int assigned_dev_update_msix_mmio(PCIDevice *pci_dev) +{ + AssignedDevice *adev = container_of(pci_dev, AssignedDevice, dev); + u16 entries_nr = 0, entries_max_nr; + int pos = 0, i, r = 0; + u32 msg_addr, msg_upper_addr, msg_data, msg_ctrl; + struct kvm_assigned_msix_nr msix_nr; + struct kvm_assigned_msix_entry msix_entry; + void *va = adev->msix_table_page; + + if (adev->cap.available & ASSIGNED_DEVICE_CAP_MSI) + pos = pci_dev->cap.start + PCI_CAPABILITY_CONFIG_MSI_LENGTH; + else + pos = pci_dev->cap.start; + + entries_max_nr = pci_dev->config[pos + 2]; + entries_max_nr &= PCI_MSIX_TABSIZE; + entries_max_nr += 1; + + /* Get the usable entry number for allocating */ + for (i = 0; i < entries_max_nr; i++) { + memcpy(&msg_ctrl, va + i * 16 + 12, 4); + memcpy(&msg_data, va + i * 16 + 8, 4); + /* Ignore unused entry even it's unmasked */ + if (msg_data == 0) + continue; + entries_nr ++; + } + + if (entries_nr == 0) { + fprintf(stderr, "MSI-X entry number is zero!\n"); + return -EINVAL; + } + msix_nr.assigned_dev_id = calc_assigned_dev_id(adev->h_busnr, + (uint8_t)adev->h_devfn); + msix_nr.entry_nr = entries_nr; + r = kvm_assign_set_msix_nr(kvm_context, &msix_nr); + if (r != 0) { + fprintf(stderr, "fail to set MSI-X entry number for MSIX! %s\n", + strerror(-r)); + return r; + } + + free_dev_irq_entries(adev); + adev->irq_entries_nr = entries_nr; + adev->entry = calloc(entries_nr, sizeof(struct kvm_irq_routing_entry)); + if (!adev->entry) { + perror("assigned_dev_update_msix_mmio: "); + return -errno; + } + + msix_entry.assigned_dev_id = msix_nr.assigned_dev_id; + entries_nr = 0; + for (i = 0; i < entries_max_nr; i++) { + if (entries_nr >= msix_nr.entry_nr) + break; + memcpy(&msg_ctrl, va + i * 16 + 12, 4); + memcpy(&msg_data, va + i * 16 + 8, 4); + if (msg_data == 0) + continue; + + memcpy(&msg_addr, va + i * 16, 4); + memcpy(&msg_upper_addr, va + i * 16 + 4, 4); + + r = kvm_get_irq_route_gsi(kvm_context); + if (r < 0) + return r; + + adev->entry[entries_nr].gsi = r; + adev->entry[entries_nr].type = KVM_IRQ_ROUTING_MSI; + adev->entry[entries_nr].flags = 0; + adev->entry[entries_nr].u.msi.address_lo = msg_addr; + adev->entry[entries_nr].u.msi.address_hi = msg_upper_addr; + adev->entry[entries_nr].u.msi.data = msg_data; + DEBUG("MSI-X data 0x%x, MSI-X addr_lo 0x%x\n!", msg_data, msg_addr); + kvm_add_routing_entry(kvm_context, &adev->entry[entries_nr]); + + msix_entry.gsi = adev->entry[entries_nr].gsi; + msix_entry.entry = i; + r = kvm_assign_set_msix_entry(kvm_context, &msix_entry); + if (r) { + fprintf(stderr, "fail to set MSI-X entry! %s\n", strerror(-r)); + break; + } + DEBUG("MSI-X entry gsi 0x%x, entry %d\n!", + msix_entry.gsi, msix_entry.entry); + entries_nr ++; + } + + if (r == 0 && kvm_commit_irq_routes(kvm_context) < 0) { + perror("assigned_dev_update_msix_mmio: kvm_commit_irq_routes"); + return -EINVAL; + } + + return r; +} + +static void assigned_dev_update_msix(PCIDevice *pci_dev, unsigned int ctrl_pos) +{ + struct kvm_assigned_irq assigned_irq_data; + AssignedDevice *assigned_dev = container_of(pci_dev, AssignedDevice, dev); + uint16_t *ctrl_word = (uint16_t *)(pci_dev->config + ctrl_pos); + int r; + + memset(&assigned_irq_data, 0, sizeof assigned_irq_data); + assigned_irq_data.assigned_dev_id = + calc_assigned_dev_id(assigned_dev->h_busnr, + (uint8_t)assigned_dev->h_devfn); + + if (assigned_dev->irq_requested_type) { + assigned_irq_data.flags = assigned_dev->irq_requested_type; + free_dev_irq_entries(assigned_dev); + r = kvm_deassign_irq(kvm_context, &assigned_irq_data); + /* -ENXIO means no assigned irq */ + if (r && r != -ENXIO) + perror("assigned_dev_update_msix: deassign irq"); + } + assigned_irq_data.flags = KVM_DEV_IRQ_HOST_MSIX | KVM_DEV_IRQ_GUEST_MSIX; + + if (*ctrl_word & PCI_MSIX_ENABLE) { + if (assigned_dev_update_msix_mmio(pci_dev) < 0) { + perror("assigned_dev_update_msix_mmio"); + return; + } + if (kvm_assign_irq(kvm_context, &assigned_irq_data) < 0) { + perror("assigned_dev_enable_msix: assign irq"); + return; + } + assigned_dev->irq_requested_type = assigned_irq_data.flags; + } +} +#endif +#endif + +static void assigned_device_pci_cap_write_config(PCIDevice *pci_dev, uint32_t address, + uint32_t val, int len) +{ + AssignedDevice *assigned_dev = container_of(pci_dev, AssignedDevice, dev); + unsigned int pos = pci_dev->cap.start, ctrl_pos; + + pci_default_cap_write_config(pci_dev, address, val, len); +#ifdef KVM_CAP_IRQ_ROUTING +#ifdef KVM_CAP_DEVICE_MSI + if (assigned_dev->cap.available & ASSIGNED_DEVICE_CAP_MSI) { + ctrl_pos = pos + PCI_MSI_FLAGS; + if (address <= ctrl_pos && address + len > ctrl_pos) + assigned_dev_update_msi(pci_dev, ctrl_pos); + pos += PCI_CAPABILITY_CONFIG_MSI_LENGTH; + } +#endif +#ifdef KVM_CAP_DEVICE_MSIX + if (assigned_dev->cap.available & ASSIGNED_DEVICE_CAP_MSIX) { + ctrl_pos = pos + 3; + if (address <= ctrl_pos && address + len > ctrl_pos) { + ctrl_pos--; /* control is word long */ + assigned_dev_update_msix(pci_dev, ctrl_pos); + } + pos += PCI_CAPABILITY_CONFIG_MSIX_LENGTH; + } +#endif +#endif + return; +} + +static int assigned_device_pci_cap_init(PCIDevice *pci_dev) +{ + AssignedDevice *dev = container_of(pci_dev, AssignedDevice, dev); + PCIRegion *pci_region = dev->real_device.regions; + int next_cap_pt = 0; + + pci_dev->cap.length = 0; +#ifdef KVM_CAP_IRQ_ROUTING +#ifdef KVM_CAP_DEVICE_MSI + /* Expose MSI capability + * MSI capability is the 1st capability in capability config */ + if (pci_find_cap_offset(dev->pdev, PCI_CAP_ID_MSI)) { + dev->cap.available |= ASSIGNED_DEVICE_CAP_MSI; + memset(&pci_dev->config[pci_dev->cap.start + pci_dev->cap.length], + 0, PCI_CAPABILITY_CONFIG_MSI_LENGTH); + pci_dev->config[pci_dev->cap.start + pci_dev->cap.length] = + PCI_CAP_ID_MSI; + pci_dev->cap.length += PCI_CAPABILITY_CONFIG_MSI_LENGTH; + next_cap_pt = 1; + } +#endif +#ifdef KVM_CAP_DEVICE_MSIX + /* Expose MSI-X capability */ + if (pci_find_cap_offset(dev->pdev, PCI_CAP_ID_MSIX)) { + int pos, entry_nr, bar_nr; + u32 msix_table_entry; + dev->cap.available |= ASSIGNED_DEVICE_CAP_MSIX; + memset(&pci_dev->config[pci_dev->cap.start + pci_dev->cap.length], + 0, PCI_CAPABILITY_CONFIG_MSIX_LENGTH); + pos = pci_find_cap_offset(dev->pdev, PCI_CAP_ID_MSIX); + entry_nr = pci_read_word(dev->pdev, pos + 2) & PCI_MSIX_TABSIZE; + pci_dev->config[pci_dev->cap.start + pci_dev->cap.length] = 0x11; + pci_dev->config[pci_dev->cap.start + + pci_dev->cap.length + 2] = entry_nr; + msix_table_entry = pci_read_long(dev->pdev, pos + PCI_MSIX_TABLE); + *(uint32_t *)(pci_dev->config + pci_dev->cap.start + + pci_dev->cap.length + PCI_MSIX_TABLE) = msix_table_entry; + *(uint32_t *)(pci_dev->config + pci_dev->cap.start + + pci_dev->cap.length + PCI_MSIX_PBA) = + pci_read_long(dev->pdev, pos + PCI_MSIX_PBA); + bar_nr = msix_table_entry & PCI_MSIX_BIR; + msix_table_entry &= ~PCI_MSIX_BIR; + dev->msix_table_addr = pci_region[bar_nr].base_addr + msix_table_entry; + if (next_cap_pt != 0) { + pci_dev->config[pci_dev->cap.start + next_cap_pt] = + pci_dev->cap.start + pci_dev->cap.length; + next_cap_pt += PCI_CAPABILITY_CONFIG_MSI_LENGTH; + } else + next_cap_pt = 1; + pci_dev->cap.length += PCI_CAPABILITY_CONFIG_MSIX_LENGTH; + } +#endif +#endif + + return 0; +} + +static uint32_t msix_mmio_readl(void *opaque, target_phys_addr_t addr) +{ + AssignedDevice *adev = opaque; + unsigned int offset = addr & 0xfff; + void *page = adev->msix_table_page; + uint32_t val = 0; + + memcpy(&val, (void *)((char *)page + offset), 4); + + return val; +} + +static uint32_t msix_mmio_readb(void *opaque, target_phys_addr_t addr) +{ + return ((msix_mmio_readl(opaque, addr & ~3)) >> + (8 * (addr & 3))) & 0xff; +} + +static uint32_t msix_mmio_readw(void *opaque, target_phys_addr_t addr) +{ + return ((msix_mmio_readl(opaque, addr & ~3)) >> + (8 * (addr & 3))) & 0xffff; +} + +static void msix_mmio_writel(void *opaque, + target_phys_addr_t addr, uint32_t val) +{ + AssignedDevice *adev = opaque; + unsigned int offset = addr & 0xfff; + void *page = adev->msix_table_page; + + DEBUG("write to MSI-X entry table mmio offset 0x%lx, val 0x%lx\n", + addr, val); + memcpy((void *)((char *)page + offset), &val, 4); +} + +static void msix_mmio_writew(void *opaque, + target_phys_addr_t addr, uint32_t val) +{ + msix_mmio_writel(opaque, addr & ~3, + (val & 0xffff) << (8*(addr & 3))); +} + +static void msix_mmio_writeb(void *opaque, + target_phys_addr_t addr, uint32_t val) +{ + msix_mmio_writel(opaque, addr & ~3, + (val & 0xff) << (8*(addr & 3))); +} + +static CPUWriteMemoryFunc *msix_mmio_write[] = { + msix_mmio_writeb, msix_mmio_writew, msix_mmio_writel +}; + +static CPUReadMemoryFunc *msix_mmio_read[] = { + msix_mmio_readb, msix_mmio_readw, msix_mmio_readl +}; + +static int assigned_dev_register_msix_mmio(AssignedDevice *dev) +{ + dev->msix_table_page = mmap(NULL, 0x1000, + PROT_READ|PROT_WRITE, + MAP_ANONYMOUS|MAP_PRIVATE, 0, 0); + if (dev->msix_table_page == MAP_FAILED) { + fprintf(stderr, "fail allocate msix_table_page! %s\n", + strerror(errno)); + return -EFAULT; + } + memset(dev->msix_table_page, 0, 0x1000); + dev->mmio_index = cpu_register_io_memory( + msix_mmio_read, msix_mmio_write, dev); + return 0; +} + +static int assigned_initfn(struct PCIDevice *pci_dev) +{ + AssignedDevice *dev = DO_UPCAST(AssignedDevice, dev, pci_dev); + struct pci_access *pacc; + uint8_t e_device, e_intx; + int r; + + if (!dev->host.bus && !dev->host.dev && !dev->host.func) { + qemu_error("pci-assign: error: no host device specified\n"); + goto out; + } + + if (get_real_device(dev, dev->host.bus, dev->host.dev, dev->host.func)) { + qemu_error("pci-assign: Error: Couldn't get real device (%s)!\n", + dev->dev.qdev.id); + goto out; + } + + /* handle real device's MMIO/PIO BARs */ + if (assigned_dev_register_regions(dev->real_device.regions, + dev->real_device.region_number, + dev)) + goto out; + + /* handle interrupt routing */ + e_device = (dev->dev.devfn >> 3) & 0x1f; + e_intx = dev->dev.config[0x3d] - 1; + dev->intpin = e_intx; + dev->run = 0; + dev->girq = 0; + dev->h_busnr = dev->host.bus; + dev->h_devfn = PCI_DEVFN(dev->host.dev, dev->host.func); + + pacc = pci_alloc(); + pci_init(pacc); + dev->pdev = pci_get_dev(pacc, 0, dev->host.bus, dev->host.dev, dev->host.func); + + if (pci_enable_capability_support(pci_dev, 0, NULL, + assigned_device_pci_cap_write_config, + assigned_device_pci_cap_init) < 0) + goto assigned_out; + + /* assign device to guest */ + r = assign_device(dev); + if (r < 0) + goto assigned_out; + + /* assign irq for the device */ + r = assign_irq(dev); + if (r < 0) + goto assigned_out; + + /* intercept MSI-X entry page in the MMIO */ + if (dev->cap.available & ASSIGNED_DEVICE_CAP_MSIX) + if (assigned_dev_register_msix_mmio(dev)) + goto assigned_out; + + assigned_dev_load_option_rom(dev); + QLIST_INSERT_HEAD(&devs, dev, next); + return 0; + +assigned_out: + deassign_device(dev); +out: + free_assigned_device(dev); + return -1; +} + +static int assigned_exitfn(struct PCIDevice *pci_dev) +{ + AssignedDevice *dev = DO_UPCAST(AssignedDevice, dev, pci_dev); + + deassign_device(dev); + free_assigned_device(dev); + return 0; +} + +static int parse_hostaddr(DeviceState *dev, Property *prop, const char *str) +{ + PCIHostDevice *ptr = qdev_get_prop_ptr(dev, prop); + int rc; + + rc = pci_parse_host_devaddr(str, &ptr->bus, &ptr->dev, &ptr->func); + if (rc != 0) + return -1; + return 0; +} + +static int print_hostaddr(DeviceState *dev, Property *prop, char *dest, size_t len) +{ + PCIHostDevice *ptr = qdev_get_prop_ptr(dev, prop); + + return snprintf(dest, len, "%02x:%02x.%x", ptr->bus, ptr->dev, ptr->func); +} + +PropertyInfo qdev_prop_hostaddr = { + .name = "pci-hostaddr", + .type = -1, + .size = sizeof(PCIHostDevice), + .parse = parse_hostaddr, + .print = print_hostaddr, +}; + +static PCIDeviceInfo assign_info = { + .qdev.name = "pci-assign", + .qdev.desc = "pass through host pci devices to the guest", + .qdev.size = sizeof(AssignedDevice), + .init = assigned_initfn, + .exit = assigned_exitfn, + .config_read = assigned_dev_pci_read_config, + .config_write = assigned_dev_pci_write_config, + .qdev.props = (Property[]) { + DEFINE_PROP("host", AssignedDevice, host, qdev_prop_hostaddr, PCIHostDevice), + DEFINE_PROP_UINT32("iommu", AssignedDevice, use_iommu, 1), + DEFINE_PROP_END_OF_LIST(), + }, +}; + +static void assign_register_devices(void) +{ + pci_qdev_register(&assign_info); +} + +device_init(assign_register_devices) + + +/* + * Syntax to assign device: + * + * -pcidevice host=bus:dev.func[,dma=none][,name=Foo] + * + * Example: + * -pcidevice host=00:13.0,dma=pvdma + * + * dma can currently only be 'none' to disable iommu support. + */ +QemuOpts *add_assigned_device(const char *arg) +{ + QemuOpts *opts = NULL; + char host[64], id[64], dma[8]; + int r; + + r = get_param_value(host, sizeof(host), "host", arg); + if (!r) + goto bad; + r = get_param_value(id, sizeof(id), "id", arg); + if (!r) + r = get_param_value(id, sizeof(id), "name", arg); + if (!r) + r = get_param_value(id, sizeof(id), "host", arg); + + opts = qemu_opts_create(&qemu_device_opts, id, 0); + if (!opts) + goto bad; + qemu_opt_set(opts, "driver", "pci-assign"); + qemu_opt_set(opts, "host", host); + +#ifdef KVM_CAP_IOMMU + r = get_param_value(dma, sizeof(dma), "dma", arg); + if (r && !strncmp(dma, "none", 4)) + qemu_opt_set(opts, "iommu", "0"); +#endif + qemu_opts_print(opts, NULL); + return opts; + +bad: + fprintf(stderr, "pcidevice argument parse error; " + "please check the help text for usage\n"); + if (opts) + qemu_opts_del(opts); + return NULL; +} + +void add_assigned_devices(PCIBus *bus, const char **devices, int n_devices) +{ + QemuOpts *opts; + int i; + + for (i = 0; i < n_devices; i++) { + opts = add_assigned_device(devices[i]); + if (opts == NULL) { + fprintf(stderr, "Could not add assigned device %s\n", devices[i]); + exit(1); + } + /* generic code will call qdev_device_add() for the device */ + } +} + +/* + * Scan the assigned devices for the devices that have an option ROM, and then + * load the corresponding ROM data to RAM. If an error occurs while loading an + * option ROM, we just ignore that option ROM and continue with the next one. + */ +static void assigned_dev_load_option_rom(AssignedDevice *dev) +{ + int size, len; + void *buf; + FILE *fp; + uint8_t i = 1; + char rom_file[64]; + + snprintf(rom_file, sizeof(rom_file), + "/sys/bus/pci/devices/0000:%02x:%02x.%01x/rom", + dev->host.bus, dev->host.dev, dev->host.func); + + if (access(rom_file, F_OK)) + return; + + /* Write something to the ROM file to enable it */ + fp = fopen(rom_file, "wb"); + if (fp == NULL) + return; + len = fwrite(&i, 1, 1, fp); + fclose(fp); + if (len != 1) + return; + + /* The file has to be closed and reopened, otherwise it won't work */ + fp = fopen(rom_file, "rb"); + if (fp == NULL) + return; + + fseek(fp, 0, SEEK_END); + size = ftell(fp); + fseek(fp, 0, SEEK_SET); + + buf = malloc(size); + if (buf == NULL) { + fclose(fp); + return; + } + + fread(buf, size, 1, fp); + if (!feof(fp) || ferror(fp)) { + free(buf); + fclose(fp); + return; + } + fclose(fp); + + /* Copy ROM contents into the space backing the ROM BAR */ + if (dev->v_addrs[PCI_ROM_SLOT].r_size >= size && + dev->v_addrs[PCI_ROM_SLOT].u.r_virtbase) { + mprotect(dev->v_addrs[PCI_ROM_SLOT].u.r_virtbase, + size, PROT_READ | PROT_WRITE); + memcpy(dev->v_addrs[PCI_ROM_SLOT].u.r_virtbase, + buf, size); + mprotect(dev->v_addrs[PCI_ROM_SLOT].u.r_virtbase, + size, PROT_READ); + } + + free(buf); +} |